1 /* 2 * intel_idle.c - native hardware idle loop for modern Intel processors 3 * 4 * Copyright (c) 2010, Intel Corporation. 5 * Len Brown <len.brown@intel.com> 6 * 7 * This program is free software; you can redistribute it and/or modify it 8 * under the terms and conditions of the GNU General Public License, 9 * version 2, as published by the Free Software Foundation. 10 * 11 * This program is distributed in the hope it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 14 * more details. 15 * 16 * You should have received a copy of the GNU General Public License along with 17 * this program; if not, write to the Free Software Foundation, Inc., 18 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 19 */ 20 21 /* 22 * intel_idle is a cpuidle driver that loads on specific Intel processors 23 * in lieu of the legacy ACPI processor_idle driver. The intent is to 24 * make Linux more efficient on these processors, as intel_idle knows 25 * more than ACPI, as well as make Linux more immune to ACPI BIOS bugs. 26 */ 27 28 /* 29 * Design Assumptions 30 * 31 * All CPUs have same idle states as boot CPU 32 * 33 * Chipset BM_STS (bus master status) bit is a NOP 34 * for preventing entry into deep C-stats 35 */ 36 37 /* 38 * Known limitations 39 * 40 * The driver currently initializes for_each_online_cpu() upon modprobe. 41 * It it unaware of subsequent processors hot-added to the system. 42 * This means that if you boot with maxcpus=n and later online 43 * processors above n, those processors will use C1 only. 44 * 45 * ACPI has a .suspend hack to turn off deep c-statees during suspend 46 * to avoid complications with the lapic timer workaround. 47 * Have not seen issues with suspend, but may need same workaround here. 48 * 49 * There is currently no kernel-based automatic probing/loading mechanism 50 * if the driver is built as a module. 51 */ 52 53 /* un-comment DEBUG to enable pr_debug() statements */ 54 #define DEBUG 55 56 #include <linux/kernel.h> 57 #include <linux/cpuidle.h> 58 #include <linux/clockchips.h> 59 #include <linux/hrtimer.h> /* ktime_get_real() */ 60 #include <trace/events/power.h> 61 #include <linux/sched.h> 62 #include <linux/notifier.h> 63 #include <linux/cpu.h> 64 #include <linux/module.h> 65 #include <asm/cpu_device_id.h> 66 #include <asm/mwait.h> 67 #include <asm/msr.h> 68 69 #define INTEL_IDLE_VERSION "0.4" 70 #define PREFIX "intel_idle: " 71 72 static struct cpuidle_driver intel_idle_driver = { 73 .name = "intel_idle", 74 .owner = THIS_MODULE, 75 }; 76 /* intel_idle.max_cstate=0 disables driver */ 77 static int max_cstate = MWAIT_MAX_NUM_CSTATES - 1; 78 79 static unsigned int mwait_substates; 80 81 #define LAPIC_TIMER_ALWAYS_RELIABLE 0xFFFFFFFF 82 /* Reliable LAPIC Timer States, bit 1 for C1 etc. */ 83 static unsigned int lapic_timer_reliable_states = (1 << 1); /* Default to only C1 */ 84 85 struct idle_cpu { 86 struct cpuidle_state *state_table; 87 88 /* 89 * Hardware C-state auto-demotion may not always be optimal. 90 * Indicate which enable bits to clear here. 91 */ 92 unsigned long auto_demotion_disable_flags; 93 }; 94 95 static const struct idle_cpu *icpu; 96 static struct cpuidle_device __percpu *intel_idle_cpuidle_devices; 97 static int intel_idle(struct cpuidle_device *dev, 98 struct cpuidle_driver *drv, int index); 99 100 static struct cpuidle_state *cpuidle_state_table; 101 102 /* 103 * Set this flag for states where the HW flushes the TLB for us 104 * and so we don't need cross-calls to keep it consistent. 105 * If this flag is set, SW flushes the TLB, so even if the 106 * HW doesn't do the flushing, this flag is safe to use. 107 */ 108 #define CPUIDLE_FLAG_TLB_FLUSHED 0x10000 109 110 /* 111 * States are indexed by the cstate number, 112 * which is also the index into the MWAIT hint array. 113 * Thus C0 is a dummy. 114 */ 115 static struct cpuidle_state nehalem_cstates[MWAIT_MAX_NUM_CSTATES] = { 116 { /* MWAIT C0 */ }, 117 { /* MWAIT C1 */ 118 .name = "C1-NHM", 119 .desc = "MWAIT 0x00", 120 .flags = CPUIDLE_FLAG_TIME_VALID, 121 .exit_latency = 3, 122 .target_residency = 6, 123 .enter = &intel_idle }, 124 { /* MWAIT C2 */ 125 .name = "C3-NHM", 126 .desc = "MWAIT 0x10", 127 .flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, 128 .exit_latency = 20, 129 .target_residency = 80, 130 .enter = &intel_idle }, 131 { /* MWAIT C3 */ 132 .name = "C6-NHM", 133 .desc = "MWAIT 0x20", 134 .flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, 135 .exit_latency = 200, 136 .target_residency = 800, 137 .enter = &intel_idle }, 138 }; 139 140 static struct cpuidle_state snb_cstates[MWAIT_MAX_NUM_CSTATES] = { 141 { /* MWAIT C0 */ }, 142 { /* MWAIT C1 */ 143 .name = "C1-SNB", 144 .desc = "MWAIT 0x00", 145 .flags = CPUIDLE_FLAG_TIME_VALID, 146 .exit_latency = 1, 147 .target_residency = 1, 148 .enter = &intel_idle }, 149 { /* MWAIT C2 */ 150 .name = "C3-SNB", 151 .desc = "MWAIT 0x10", 152 .flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, 153 .exit_latency = 80, 154 .target_residency = 211, 155 .enter = &intel_idle }, 156 { /* MWAIT C3 */ 157 .name = "C6-SNB", 158 .desc = "MWAIT 0x20", 159 .flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, 160 .exit_latency = 104, 161 .target_residency = 345, 162 .enter = &intel_idle }, 163 { /* MWAIT C4 */ 164 .name = "C7-SNB", 165 .desc = "MWAIT 0x30", 166 .flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, 167 .exit_latency = 109, 168 .target_residency = 345, 169 .enter = &intel_idle }, 170 }; 171 172 static struct cpuidle_state atom_cstates[MWAIT_MAX_NUM_CSTATES] = { 173 { /* MWAIT C0 */ }, 174 { /* MWAIT C1 */ 175 .name = "C1-ATM", 176 .desc = "MWAIT 0x00", 177 .flags = CPUIDLE_FLAG_TIME_VALID, 178 .exit_latency = 1, 179 .target_residency = 4, 180 .enter = &intel_idle }, 181 { /* MWAIT C2 */ 182 .name = "C2-ATM", 183 .desc = "MWAIT 0x10", 184 .flags = CPUIDLE_FLAG_TIME_VALID, 185 .exit_latency = 20, 186 .target_residency = 80, 187 .enter = &intel_idle }, 188 { /* MWAIT C3 */ }, 189 { /* MWAIT C4 */ 190 .name = "C4-ATM", 191 .desc = "MWAIT 0x30", 192 .flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, 193 .exit_latency = 100, 194 .target_residency = 400, 195 .enter = &intel_idle }, 196 { /* MWAIT C5 */ }, 197 { /* MWAIT C6 */ 198 .name = "C6-ATM", 199 .desc = "MWAIT 0x52", 200 .flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, 201 .exit_latency = 140, 202 .target_residency = 560, 203 .enter = &intel_idle }, 204 }; 205 206 static long get_driver_data(int cstate) 207 { 208 int driver_data; 209 switch (cstate) { 210 211 case 1: /* MWAIT C1 */ 212 driver_data = 0x00; 213 break; 214 case 2: /* MWAIT C2 */ 215 driver_data = 0x10; 216 break; 217 case 3: /* MWAIT C3 */ 218 driver_data = 0x20; 219 break; 220 case 4: /* MWAIT C4 */ 221 driver_data = 0x30; 222 break; 223 case 5: /* MWAIT C5 */ 224 driver_data = 0x40; 225 break; 226 case 6: /* MWAIT C6 */ 227 driver_data = 0x52; 228 break; 229 default: 230 driver_data = 0x00; 231 } 232 return driver_data; 233 } 234 235 /** 236 * intel_idle 237 * @dev: cpuidle_device 238 * @drv: cpuidle driver 239 * @index: index of cpuidle state 240 * 241 * Must be called under local_irq_disable(). 242 */ 243 static int intel_idle(struct cpuidle_device *dev, 244 struct cpuidle_driver *drv, int index) 245 { 246 unsigned long ecx = 1; /* break on interrupt flag */ 247 struct cpuidle_state *state = &drv->states[index]; 248 struct cpuidle_state_usage *state_usage = &dev->states_usage[index]; 249 unsigned long eax = (unsigned long)cpuidle_get_statedata(state_usage); 250 unsigned int cstate; 251 ktime_t kt_before, kt_after; 252 s64 usec_delta; 253 int cpu = smp_processor_id(); 254 255 cstate = (((eax) >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) + 1; 256 257 /* 258 * leave_mm() to avoid costly and often unnecessary wakeups 259 * for flushing the user TLB's associated with the active mm. 260 */ 261 if (state->flags & CPUIDLE_FLAG_TLB_FLUSHED) 262 leave_mm(cpu); 263 264 if (!(lapic_timer_reliable_states & (1 << (cstate)))) 265 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); 266 267 kt_before = ktime_get_real(); 268 269 stop_critical_timings(); 270 if (!need_resched()) { 271 272 __monitor((void *)¤t_thread_info()->flags, 0, 0); 273 smp_mb(); 274 if (!need_resched()) 275 __mwait(eax, ecx); 276 } 277 278 start_critical_timings(); 279 280 kt_after = ktime_get_real(); 281 usec_delta = ktime_to_us(ktime_sub(kt_after, kt_before)); 282 283 local_irq_enable(); 284 285 if (!(lapic_timer_reliable_states & (1 << (cstate)))) 286 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu); 287 288 /* Update cpuidle counters */ 289 dev->last_residency = (int)usec_delta; 290 291 return index; 292 } 293 294 static void __setup_broadcast_timer(void *arg) 295 { 296 unsigned long reason = (unsigned long)arg; 297 int cpu = smp_processor_id(); 298 299 reason = reason ? 300 CLOCK_EVT_NOTIFY_BROADCAST_ON : CLOCK_EVT_NOTIFY_BROADCAST_OFF; 301 302 clockevents_notify(reason, &cpu); 303 } 304 305 static int setup_broadcast_cpuhp_notify(struct notifier_block *n, 306 unsigned long action, void *hcpu) 307 { 308 int hotcpu = (unsigned long)hcpu; 309 310 switch (action & 0xf) { 311 case CPU_ONLINE: 312 smp_call_function_single(hotcpu, __setup_broadcast_timer, 313 (void *)true, 1); 314 break; 315 } 316 return NOTIFY_OK; 317 } 318 319 static struct notifier_block setup_broadcast_notifier = { 320 .notifier_call = setup_broadcast_cpuhp_notify, 321 }; 322 323 static void auto_demotion_disable(void *dummy) 324 { 325 unsigned long long msr_bits; 326 327 rdmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr_bits); 328 msr_bits &= ~(icpu->auto_demotion_disable_flags); 329 wrmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr_bits); 330 } 331 332 static const struct idle_cpu idle_cpu_nehalem = { 333 .state_table = nehalem_cstates, 334 .auto_demotion_disable_flags = NHM_C1_AUTO_DEMOTE | NHM_C3_AUTO_DEMOTE, 335 }; 336 337 static const struct idle_cpu idle_cpu_atom = { 338 .state_table = atom_cstates, 339 }; 340 341 static const struct idle_cpu idle_cpu_lincroft = { 342 .state_table = atom_cstates, 343 .auto_demotion_disable_flags = ATM_LNC_C6_AUTO_DEMOTE, 344 }; 345 346 static const struct idle_cpu idle_cpu_snb = { 347 .state_table = snb_cstates, 348 }; 349 350 #define ICPU(model, cpu) \ 351 { X86_VENDOR_INTEL, 6, model, X86_FEATURE_MWAIT, (unsigned long)&cpu } 352 353 static const struct x86_cpu_id intel_idle_ids[] = { 354 ICPU(0x1a, idle_cpu_nehalem), 355 ICPU(0x1e, idle_cpu_nehalem), 356 ICPU(0x1f, idle_cpu_nehalem), 357 ICPU(0x25, idle_cpu_nehalem), 358 ICPU(0x2c, idle_cpu_nehalem), 359 ICPU(0x2e, idle_cpu_nehalem), 360 ICPU(0x1c, idle_cpu_atom), 361 ICPU(0x26, idle_cpu_lincroft), 362 ICPU(0x2f, idle_cpu_nehalem), 363 ICPU(0x2a, idle_cpu_snb), 364 ICPU(0x2d, idle_cpu_snb), 365 {} 366 }; 367 MODULE_DEVICE_TABLE(x86cpu, intel_idle_ids); 368 369 /* 370 * intel_idle_probe() 371 */ 372 static int intel_idle_probe(void) 373 { 374 unsigned int eax, ebx, ecx; 375 const struct x86_cpu_id *id; 376 377 if (max_cstate == 0) { 378 pr_debug(PREFIX "disabled\n"); 379 return -EPERM; 380 } 381 382 id = x86_match_cpu(intel_idle_ids); 383 if (!id) { 384 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && 385 boot_cpu_data.x86 == 6) 386 pr_debug(PREFIX "does not run on family %d model %d\n", 387 boot_cpu_data.x86, boot_cpu_data.x86_model); 388 return -ENODEV; 389 } 390 391 if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) 392 return -ENODEV; 393 394 cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &mwait_substates); 395 396 if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) || 397 !(ecx & CPUID5_ECX_INTERRUPT_BREAK) || 398 !mwait_substates) 399 return -ENODEV; 400 401 pr_debug(PREFIX "MWAIT substates: 0x%x\n", mwait_substates); 402 403 icpu = (const struct idle_cpu *)id->driver_data; 404 cpuidle_state_table = icpu->state_table; 405 406 if (boot_cpu_has(X86_FEATURE_ARAT)) /* Always Reliable APIC Timer */ 407 lapic_timer_reliable_states = LAPIC_TIMER_ALWAYS_RELIABLE; 408 else { 409 on_each_cpu(__setup_broadcast_timer, (void *)true, 1); 410 register_cpu_notifier(&setup_broadcast_notifier); 411 } 412 413 pr_debug(PREFIX "v" INTEL_IDLE_VERSION 414 " model 0x%X\n", boot_cpu_data.x86_model); 415 416 pr_debug(PREFIX "lapic_timer_reliable_states 0x%x\n", 417 lapic_timer_reliable_states); 418 return 0; 419 } 420 421 /* 422 * intel_idle_cpuidle_devices_uninit() 423 * unregister, free cpuidle_devices 424 */ 425 static void intel_idle_cpuidle_devices_uninit(void) 426 { 427 int i; 428 struct cpuidle_device *dev; 429 430 for_each_online_cpu(i) { 431 dev = per_cpu_ptr(intel_idle_cpuidle_devices, i); 432 cpuidle_unregister_device(dev); 433 } 434 435 free_percpu(intel_idle_cpuidle_devices); 436 return; 437 } 438 /* 439 * intel_idle_cpuidle_driver_init() 440 * allocate, initialize cpuidle_states 441 */ 442 static int intel_idle_cpuidle_driver_init(void) 443 { 444 int cstate; 445 struct cpuidle_driver *drv = &intel_idle_driver; 446 447 drv->state_count = 1; 448 449 for (cstate = 1; cstate < MWAIT_MAX_NUM_CSTATES; ++cstate) { 450 int num_substates; 451 452 if (cstate > max_cstate) { 453 printk(PREFIX "max_cstate %d reached\n", 454 max_cstate); 455 break; 456 } 457 458 /* does the state exist in CPUID.MWAIT? */ 459 num_substates = (mwait_substates >> ((cstate) * 4)) 460 & MWAIT_SUBSTATE_MASK; 461 if (num_substates == 0) 462 continue; 463 /* is the state not enabled? */ 464 if (cpuidle_state_table[cstate].enter == NULL) { 465 /* does the driver not know about the state? */ 466 if (*cpuidle_state_table[cstate].name == '\0') 467 pr_debug(PREFIX "unaware of model 0x%x" 468 " MWAIT %d please" 469 " contact lenb@kernel.org", 470 boot_cpu_data.x86_model, cstate); 471 continue; 472 } 473 474 if ((cstate > 2) && 475 !boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) 476 mark_tsc_unstable("TSC halts in idle" 477 " states deeper than C2"); 478 479 drv->states[drv->state_count] = /* structure copy */ 480 cpuidle_state_table[cstate]; 481 482 drv->state_count += 1; 483 } 484 485 if (icpu->auto_demotion_disable_flags) 486 on_each_cpu(auto_demotion_disable, NULL, 1); 487 488 return 0; 489 } 490 491 492 /* 493 * intel_idle_cpu_init() 494 * allocate, initialize, register cpuidle_devices 495 * @cpu: cpu/core to initialize 496 */ 497 int intel_idle_cpu_init(int cpu) 498 { 499 int cstate; 500 struct cpuidle_device *dev; 501 502 dev = per_cpu_ptr(intel_idle_cpuidle_devices, cpu); 503 504 dev->state_count = 1; 505 506 for (cstate = 1; cstate < MWAIT_MAX_NUM_CSTATES; ++cstate) { 507 int num_substates; 508 509 if (cstate > max_cstate) { 510 printk(PREFIX "max_cstate %d reached\n", max_cstate); 511 break; 512 } 513 514 /* does the state exist in CPUID.MWAIT? */ 515 num_substates = (mwait_substates >> ((cstate) * 4)) 516 & MWAIT_SUBSTATE_MASK; 517 if (num_substates == 0) 518 continue; 519 /* is the state not enabled? */ 520 if (cpuidle_state_table[cstate].enter == NULL) 521 continue; 522 523 dev->states_usage[dev->state_count].driver_data = 524 (void *)get_driver_data(cstate); 525 526 dev->state_count += 1; 527 } 528 529 dev->cpu = cpu; 530 531 if (cpuidle_register_device(dev)) { 532 pr_debug(PREFIX "cpuidle_register_device %d failed!\n", cpu); 533 intel_idle_cpuidle_devices_uninit(); 534 return -EIO; 535 } 536 537 if (icpu->auto_demotion_disable_flags) 538 smp_call_function_single(cpu, auto_demotion_disable, NULL, 1); 539 540 return 0; 541 } 542 EXPORT_SYMBOL_GPL(intel_idle_cpu_init); 543 544 static int __init intel_idle_init(void) 545 { 546 int retval, i; 547 548 /* Do not load intel_idle at all for now if idle= is passed */ 549 if (boot_option_idle_override != IDLE_NO_OVERRIDE) 550 return -ENODEV; 551 552 retval = intel_idle_probe(); 553 if (retval) 554 return retval; 555 556 intel_idle_cpuidle_driver_init(); 557 retval = cpuidle_register_driver(&intel_idle_driver); 558 if (retval) { 559 printk(KERN_DEBUG PREFIX "intel_idle yielding to %s", 560 cpuidle_get_driver()->name); 561 return retval; 562 } 563 564 intel_idle_cpuidle_devices = alloc_percpu(struct cpuidle_device); 565 if (intel_idle_cpuidle_devices == NULL) 566 return -ENOMEM; 567 568 for_each_online_cpu(i) { 569 retval = intel_idle_cpu_init(i); 570 if (retval) { 571 cpuidle_unregister_driver(&intel_idle_driver); 572 return retval; 573 } 574 } 575 576 return 0; 577 } 578 579 static void __exit intel_idle_exit(void) 580 { 581 intel_idle_cpuidle_devices_uninit(); 582 cpuidle_unregister_driver(&intel_idle_driver); 583 584 if (lapic_timer_reliable_states != LAPIC_TIMER_ALWAYS_RELIABLE) { 585 on_each_cpu(__setup_broadcast_timer, (void *)false, 1); 586 unregister_cpu_notifier(&setup_broadcast_notifier); 587 } 588 589 return; 590 } 591 592 module_init(intel_idle_init); 593 module_exit(intel_idle_exit); 594 595 module_param(max_cstate, int, 0444); 596 597 MODULE_AUTHOR("Len Brown <len.brown@intel.com>"); 598 MODULE_DESCRIPTION("Cpuidle driver for Intel Hardware v" INTEL_IDLE_VERSION); 599 MODULE_LICENSE("GPL"); 600