#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/prctl.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/pm.h>
#include <linux/clockchips.h>
#include <linux/ftrace.h>
#include <asm/system.h>
#include <asm/apic.h>
#include <asm/idle.h>
#include <asm/uaccess.h>
#include <asm/i387.h>

unsigned long idle_halt;
EXPORT_SYMBOL(idle_halt);
unsigned long idle_nomwait;
EXPORT_SYMBOL(idle_nomwait);

struct kmem_cache *task_xstate_cachep;

int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
	*dst = *src;
	if (src->thread.xstate) {
		dst->thread.xstate = kmem_cache_alloc(task_xstate_cachep,
						      GFP_KERNEL);
		if (!dst->thread.xstate)
			return -ENOMEM;
		WARN_ON((unsigned long)dst->thread.xstate & 15);
		memcpy(dst->thread.xstate, src->thread.xstate, xstate_size);
	}
	return 0;
}

void free_thread_xstate(struct task_struct *tsk)
{
	if (tsk->thread.xstate) {
		kmem_cache_free(task_xstate_cachep, tsk->thread.xstate);
		tsk->thread.xstate = NULL;
	}
}

void free_thread_info(struct thread_info *ti)
{
	free_thread_xstate(ti->task);
	free_pages((unsigned long)ti, get_order(THREAD_SIZE));
}

void arch_task_cache_init(void)
{
	task_xstate_cachep =
		kmem_cache_create("task_xstate", xstate_size,
				  __alignof__(union thread_xstate),
				  SLAB_PANIC, NULL);
}

/*
 * Free current thread data structures etc..
 */
void exit_thread(void)
{
	struct task_struct *me = current;
	struct thread_struct *t = &me->thread;
	unsigned long *bp = t->io_bitmap_ptr;

	if (bp) {
		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
		kfree(bp);
	}

	ds_exit_thread(current);
}

void flush_thread(void)
{
	struct task_struct *tsk = current;

#ifdef CONFIG_X86_64
	if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
		clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
		if (test_tsk_thread_flag(tsk, TIF_IA32)) {
			clear_tsk_thread_flag(tsk, TIF_IA32);
		} else {
			set_tsk_thread_flag(tsk, TIF_IA32);
			current_thread_info()->status |= TS_COMPAT;
		}
	}
#endif

	clear_tsk_thread_flag(tsk, TIF_DEBUG);

	tsk->thread.debugreg0 = 0;
	tsk->thread.debugreg1 = 0;
	tsk->thread.debugreg2 = 0;
	tsk->thread.debugreg3 = 0;
	tsk->thread.debugreg6 = 0;
	tsk->thread.debugreg7 = 0;
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
	/*
	 * Forget coprocessor state..
	 */
	tsk->fpu_counter = 0;
	clear_fpu(tsk);
	clear_used_math();
}

static void hard_disable_TSC(void)
{
	write_cr4(read_cr4() | X86_CR4_TSD);
}

void disable_TSC(void)
{
	preempt_disable();
	if (!test_and_set_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_disable_TSC();
	preempt_enable();
}

static void hard_enable_TSC(void)
{
	write_cr4(read_cr4() & ~X86_CR4_TSD);
}

static void enable_TSC(void)
{
	preempt_disable();
	if (test_and_clear_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_enable_TSC();
	preempt_enable();
}

int get_tsc_mode(unsigned long adr)
{
	unsigned int val;

	if (test_thread_flag(TIF_NOTSC))
		val = PR_TSC_SIGSEGV;
	else
		val = PR_TSC_ENABLE;

	return put_user(val, (unsigned int __user *)adr);
}

int set_tsc_mode(unsigned int val)
{
	if (val == PR_TSC_SIGSEGV)
		disable_TSC();
	else if (val == PR_TSC_ENABLE)
		enable_TSC();
	else
		return -EINVAL;

	return 0;
}
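
/*
 * Illustrative sketch (not part of the original file): get_tsc_mode() and
 * set_tsc_mode() back the PR_GET_TSC/PR_SET_TSC prctl(2) operations, so a
 * userspace task could, for example, make RDTSC fault for itself with:
 *
 *	#include <sys/prctl.h>
 *
 *	prctl(PR_SET_TSC, PR_TSC_SIGSEGV, 0, 0, 0);
 *	...				// any RDTSC now raises SIGSEGV
 *	prctl(PR_SET_TSC, PR_TSC_ENABLE, 0, 0, 0);
 */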

void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
		      struct tss_struct *tss)
{
	struct thread_struct *prev, *next;

	prev = &prev_p->thread;
	next = &next_p->thread;

	if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
	    test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
		ds_switch_to(prev_p, next_p);
	else if (next->debugctlmsr != prev->debugctlmsr)
		update_debugctlmsr(next->debugctlmsr);

	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
		set_debugreg(next->debugreg0, 0);
		set_debugreg(next->debugreg1, 1);
		set_debugreg(next->debugreg2, 2);
		set_debugreg(next->debugreg3, 3);
		/* no 4 and 5 */
		set_debugreg(next->debugreg6, 6);
		set_debugreg(next->debugreg7, 7);
	}

	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
		/* prev and next are different */
		if (test_tsk_thread_flag(next_p, TIF_NOTSC))
			hard_disable_TSC();
		else
			hard_enable_TSC();
	}

	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
		       max(prev->io_bitmap_max, next->io_bitmap_max));
	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
	}
}
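
/*
 * Illustrative note (assumption, not part of the original file): the
 * per-thread I/O bitmap copied above is normally set up by the ioperm(2)
 * syscall, which allocates thread.io_bitmap_ptr and sets TIF_IO_BITMAP.
 * A privileged userspace task would typically request port access with
 * something like:
 *
 *	#include <sys/io.h>
 *
 *	if (ioperm(0x378, 8, 1) == 0)	// allow ports 0x378..0x37f
 *		outb(0x00, 0x378);
 */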

int sys_fork(struct pt_regs *regs)
{
	return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
}

/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
int sys_vfork(struct pt_regs *regs)
{
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
		       NULL, NULL);
}


/*
 * Idle related variables and functions
 */
unsigned long boot_option_idle_override = 0;
EXPORT_SYMBOL(boot_option_idle_override);

/*
 * Power management idle function, if any..
 */
void (*pm_idle)(void);
EXPORT_SYMBOL(pm_idle);

#ifdef CONFIG_X86_32
/*
 * This halt magic was a workaround for ancient floppy DMA
 * wreckage. It should be safe to remove.
 */
static int hlt_counter;
void disable_hlt(void)
{
	hlt_counter++;
}
EXPORT_SYMBOL(disable_hlt);

void enable_hlt(void)
{
	hlt_counter--;
}
EXPORT_SYMBOL(enable_hlt);

static inline int hlt_use_halt(void)
{
	return (!hlt_counter && boot_cpu_data.hlt_works_ok);
}
#else
static inline int hlt_use_halt(void)
{
	return 1;
}
#endif

/*
 * We use this if we don't have any better
 * idle routine..
 */
void default_idle(void)
{
	if (hlt_use_halt()) {
		struct power_trace it;

		trace_power_start(&it, POWER_CSTATE, 1);
		current_thread_info()->status &= ~TS_POLLING;
		/*
		 * TS_POLLING-cleared state must be visible before we
		 * test NEED_RESCHED:
		 */
		smp_mb();

		if (!need_resched())
			safe_halt();	/* enables interrupts racelessly */
		else
			local_irq_enable();
		current_thread_info()->status |= TS_POLLING;
		trace_power_end(&it);
	} else {
		local_irq_enable();
		/* loop is done by the caller */
		cpu_relax();
	}
}
#ifdef CONFIG_APM_MODULE
EXPORT_SYMBOL(default_idle);
#endif

void stop_this_cpu(void *dummy)
{
	local_irq_disable();
	/*
	 * Remove this CPU:
	 */
	cpu_clear(smp_processor_id(), cpu_online_map);
	disable_local_APIC();

	for (;;) {
		if (hlt_works(smp_processor_id()))
			halt();
	}
}

static void do_nothing(void *unused)
{
}

/*
 * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
 * pm_idle and update to new pm_idle value. Required while changing pm_idle
 * handler on SMP systems.
 *
 * Caller must have changed pm_idle to the new value before the call. Old
 * pm_idle value will not be used by any CPU after the return of this function.
 */
void cpu_idle_wait(void)
{
	smp_mb();
	/* kick all the CPUs so that they exit out of pm_idle */
	smp_call_function(do_nothing, NULL, 1);
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);
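
/*
 * Illustrative sketch (assumption, not part of the original file): a caller
 * that installs its own idle handler is expected to follow the protocol
 * documented above, i.e. publish the new handler first and only then wait
 * for every CPU to leave the old one:
 *
 *	static void my_idle(void);	// hypothetical idle handler
 *
 *	pm_idle = my_idle;		// publish the new handler
 *	cpu_idle_wait();		// no CPU still runs the old handler
 */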

/*
 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
 * which can obviate IPI to trigger checking of need_resched.
 * We execute MONITOR against need_resched and enter optimized wait state
 * through MWAIT. Whenever someone changes need_resched, we would be woken
 * up from MWAIT (without an IPI).
 *
 * New with Core Duo processors, MWAIT can take some hints based on CPU
 * capability.
 */
void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
{
	struct power_trace it;

	trace_power_start(&it, POWER_CSTATE, (ax >> 4) + 1);
	if (!need_resched()) {
		if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
			clflush((void *)&current_thread_info()->flags);

		__monitor((void *)&current_thread_info()->flags, 0, 0);
		smp_mb();
		if (!need_resched())
			__mwait(ax, cx);
	}
	trace_power_end(&it);
}

/* Default MONITOR/MWAIT with no hints, used for default C1 state */
static void mwait_idle(void)
{
	struct power_trace it;
	if (!need_resched()) {
		trace_power_start(&it, POWER_CSTATE, 1);
		if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
			clflush((void *)&current_thread_info()->flags);

		__monitor((void *)&current_thread_info()->flags, 0, 0);
		smp_mb();
		if (!need_resched())
			__sti_mwait(0, 0);
		else
			local_irq_enable();
		trace_power_end(&it);
	} else
		local_irq_enable();
}

/*
 * On SMP it's slightly faster (but much more power-consuming!)
 * to poll the ->work.need_resched flag instead of waiting for the
 * cross-CPU IPI to arrive. Use this option with caution.
 */
static void poll_idle(void)
{
	struct power_trace it;

	trace_power_start(&it, POWER_CSTATE, 0);
	local_irq_enable();
	while (!need_resched())
		cpu_relax();
	trace_power_end(&it);
}

/*
 * mwait selection logic:
 *
 * It depends on the CPU. For AMD CPUs that support MWAIT this is
 * wrong. Family 0x10 and 0x11 CPUs will enter C1 on HLT. Powersavings
 * then depend on a clock divisor and current Pstate of the core. If
 * all cores of a processor are in halt state (C1) the processor can
 * enter the C1E (C1 enhanced) state. If mwait is used this will never
 * happen.
 *
 * idle=mwait overrides this decision and forces the usage of mwait.
 */
static int __cpuinitdata force_mwait;

#define MWAIT_INFO			0x05
#define MWAIT_ECX_EXTENDED_INFO		0x01
#define MWAIT_EDX_C1			0xf0

static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
{
	u32 eax, ebx, ecx, edx;

	if (force_mwait)
		return 1;

	if (c->cpuid_level < MWAIT_INFO)
		return 0;

	cpuid(MWAIT_INFO, &eax, &ebx, &ecx, &edx);
	/* Check whether EDX has extended info about MWAIT */
	if (!(ecx & MWAIT_ECX_EXTENDED_INFO))
		return 1;

	/*
	 * edx enumerates MONITOR/MWAIT extensions. Check whether
	 * C1 supports MWAIT
	 */
	return (edx & MWAIT_EDX_C1);
}
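
/*
 * Illustrative note (assumption, not part of the original file): for the
 * check above, CPUID leaf 0x05 reports in ECX bit 0 whether EDX enumerates
 * the MWAIT C-state sub-states, and EDX[7:4] gives the number of C1
 * sub-states.  MWAIT_EDX_C1 (0xf0) therefore tests "at least one C1
 * sub-state exists":
 *
 *	cpuid(0x05, &eax, &ebx, &ecx, &edx);
 *	c1_substates = (edx >> 4) & 0xf;	// non-zero => C1 via MWAIT
 */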

/*
 * Check for AMD CPUs, which potentially have C1E support
 */
static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
{
	if (c->x86_vendor != X86_VENDOR_AMD)
		return 0;

	if (c->x86 < 0x0F)
		return 0;

	/* Family 0x0f models < rev F do not have C1E */
	if (c->x86 == 0x0f && c->x86_model < 0x40)
		return 0;

	return 1;
}

static cpumask_t c1e_mask = CPU_MASK_NONE;
static int c1e_detected;

void c1e_remove_cpu(int cpu)
{
	cpu_clear(cpu, c1e_mask);
}

/*
 * C1E aware idle routine. We check for C1E active in the interrupt
 * pending message MSR. If we detect C1E, then we handle it the same
 * way as C3 power states (local apic timer and TSC stop)
 */
static void c1e_idle(void)
{
	if (need_resched())
		return;

	if (!c1e_detected) {
		u32 lo, hi;

		rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
		if (lo & K8_INTP_C1E_ACTIVE_MASK) {
			c1e_detected = 1;
			if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
				mark_tsc_unstable("TSC halt in AMD C1E");
			printk(KERN_INFO "System has AMD C1E enabled\n");
			set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
		}
	}

	if (c1e_detected) {
		int cpu = smp_processor_id();

		if (!cpu_isset(cpu, c1e_mask)) {
			cpu_set(cpu, c1e_mask);
			/*
			 * Force broadcast so ACPI can not interfere. Needs
			 * to run with interrupts enabled as it uses
			 * smp_call_function.
			 */
			local_irq_enable();
			clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
					   &cpu);
			printk(KERN_INFO "Switch to broadcast mode on CPU%d\n",
			       cpu);
			local_irq_disable();
		}
		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);

		default_idle();

		/*
		 * The switch back from broadcast mode needs to be
		 * called with interrupts disabled.
		 */
		local_irq_disable();
		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
		local_irq_enable();
	} else
		default_idle();
}

void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
	if (pm_idle == poll_idle && smp_num_siblings > 1) {
		printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
			" performance may degrade.\n");
	}
#endif
	if (pm_idle)
		return;

	if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
		/*
		 * One CPU supports mwait => All CPUs support mwait
		 */
		printk(KERN_INFO "using mwait in idle threads.\n");
		pm_idle = mwait_idle;
	} else if (check_c1e_idle(c)) {
		printk(KERN_INFO "using C1E aware idle routine\n");
		pm_idle = c1e_idle;
	} else
		pm_idle = default_idle;
}

static int __init idle_setup(char *str)
{
	if (!str)
		return -EINVAL;

	if (!strcmp(str, "poll")) {
		printk("using polling idle threads.\n");
		pm_idle = poll_idle;
	} else if (!strcmp(str, "mwait"))
		force_mwait = 1;
	else if (!strcmp(str, "halt")) {
		/*
		 * When the boot option of idle=halt is added, halt is
		 * forced to be used for CPU idle. In such case CPU C2/C3
		 * won't be used again.
		 * To continue to load the CPU idle driver, don't touch
		 * the boot_option_idle_override.
		 */
		pm_idle = default_idle;
		idle_halt = 1;
		return 0;
	} else if (!strcmp(str, "nomwait")) {
		/*
		 * If the boot option of "idle=nomwait" is added,
		 * it means that mwait will be disabled for CPU C2/C3
		 * states. In such case it won't touch the variable
		 * of boot_option_idle_override.
		 */
		idle_nomwait = 1;
		return 0;
	} else
		return -1;

	boot_option_idle_override = 1;
	return 0;
}
early_param("idle", idle_setup);
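
/*
 * Illustrative note (not part of the original file): the parser above
 * handles the "idle=" kernel command line parameter, e.g.:
 *
 *	idle=poll	- busy-wait in the idle loop (lowest wakeup latency,
 *			  highest power consumption)
 *	idle=mwait	- force MWAIT even where the selection logic above
 *			  would avoid it
 *	idle=halt	- always use HLT; deeper C2/C3 states are not entered
 *	idle=nomwait	- do not use MWAIT for C-states
 */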