1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * intel_pstate.c: Native P state management for Intel processors 4 * 5 * (C) Copyright 2012 Intel Corporation 6 * Author: Dirk Brandewie <dirk.j.brandewie@intel.com> 7 */ 8 9 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 10 11 #include <linux/kernel.h> 12 #include <linux/kernel_stat.h> 13 #include <linux/module.h> 14 #include <linux/ktime.h> 15 #include <linux/hrtimer.h> 16 #include <linux/tick.h> 17 #include <linux/slab.h> 18 #include <linux/sched/cpufreq.h> 19 #include <linux/list.h> 20 #include <linux/cpu.h> 21 #include <linux/cpufreq.h> 22 #include <linux/sysfs.h> 23 #include <linux/types.h> 24 #include <linux/fs.h> 25 #include <linux/acpi.h> 26 #include <linux/vmalloc.h> 27 #include <linux/pm_qos.h> 28 #include <trace/events/power.h> 29 30 #include <asm/div64.h> 31 #include <asm/msr.h> 32 #include <asm/cpu_device_id.h> 33 #include <asm/cpufeature.h> 34 #include <asm/intel-family.h> 35 36 #define INTEL_PSTATE_SAMPLING_INTERVAL (10 * NSEC_PER_MSEC) 37 38 #define INTEL_CPUFREQ_TRANSITION_LATENCY 20000 39 #define INTEL_CPUFREQ_TRANSITION_DELAY_HWP 5000 40 #define INTEL_CPUFREQ_TRANSITION_DELAY 500 41 42 #ifdef CONFIG_ACPI 43 #include <acpi/processor.h> 44 #include <acpi/cppc_acpi.h> 45 #endif 46 47 #define FRAC_BITS 8 48 #define int_tofp(X) ((int64_t)(X) << FRAC_BITS) 49 #define fp_toint(X) ((X) >> FRAC_BITS) 50 51 #define ONE_EIGHTH_FP ((int64_t)1 << (FRAC_BITS - 3)) 52 53 #define EXT_BITS 6 54 #define EXT_FRAC_BITS (EXT_BITS + FRAC_BITS) 55 #define fp_ext_toint(X) ((X) >> EXT_FRAC_BITS) 56 #define int_ext_tofp(X) ((int64_t)(X) << EXT_FRAC_BITS) 57 58 static inline int32_t mul_fp(int32_t x, int32_t y) 59 { 60 return ((int64_t)x * (int64_t)y) >> FRAC_BITS; 61 } 62 63 static inline int32_t div_fp(s64 x, s64 y) 64 { 65 return div64_s64((int64_t)x << FRAC_BITS, y); 66 } 67 68 static inline int ceiling_fp(int32_t x) 69 { 70 int mask, ret; 71 72 ret = fp_toint(x); 73 mask = (1 << FRAC_BITS) - 1; 74 if (x & mask) 75 ret += 1; 76 return ret; 77 } 78 79 static inline int32_t percent_fp(int percent) 80 { 81 return div_fp(percent, 100); 82 } 83 84 static inline u64 mul_ext_fp(u64 x, u64 y) 85 { 86 return (x * y) >> EXT_FRAC_BITS; 87 } 88 89 static inline u64 div_ext_fp(u64 x, u64 y) 90 { 91 return div64_u64(x << EXT_FRAC_BITS, y); 92 } 93 94 static inline int32_t percent_ext_fp(int percent) 95 { 96 return div_ext_fp(percent, 100); 97 } 98 99 /** 100 * struct sample - Store performance sample 101 * @core_avg_perf: Ratio of APERF/MPERF which is the actual average 102 * performance during last sample period 103 * @busy_scaled: Scaled busy value which is used to calculate next 104 * P state. This can be different than core_avg_perf 105 * to account for cpu idle period 106 * @aperf: Difference of actual performance frequency clock count 107 * read from APERF MSR between last and current sample 108 * @mperf: Difference of maximum performance frequency clock count 109 * read from MPERF MSR between last and current sample 110 * @tsc: Difference of time stamp counter between last and 111 * current sample 112 * @time: Current time from scheduler 113 * 114 * This structure is used in the cpudata structure to store performance sample 115 * data for choosing next P State. 
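 *
 * Illustrative example (hypothetical numbers, not taken from any specific
 * part): with FRAC_BITS = 8 and EXT_BITS = 6 above, @core_avg_perf carries
 * 14 fractional bits, so an @aperf delta of 8000 against an @mperf delta
 * of 10000 is stored as div_ext_fp(8000, 10000) = 13107, i.e. roughly
 * 0.8 * 2^14 (rounded down).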
116 */ 117 struct sample { 118 int32_t core_avg_perf; 119 int32_t busy_scaled; 120 u64 aperf; 121 u64 mperf; 122 u64 tsc; 123 u64 time; 124 }; 125 126 /** 127 * struct pstate_data - Store P state data 128 * @current_pstate: Current requested P state 129 * @min_pstate: Min P state possible for this platform 130 * @max_pstate: Max P state possible for this platform 131 * @max_pstate_physical:This is physical Max P state for a processor 132 * This can be higher than the max_pstate which can 133 * be limited by platform thermal design power limits 134 * @scaling: Scaling factor to convert frequency to cpufreq 135 * frequency units 136 * @turbo_pstate: Max Turbo P state possible for this platform 137 * @max_freq: @max_pstate frequency in cpufreq units 138 * @turbo_freq: @turbo_pstate frequency in cpufreq units 139 * 140 * Stores the per cpu model P state limits and current P state. 141 */ 142 struct pstate_data { 143 int current_pstate; 144 int min_pstate; 145 int max_pstate; 146 int max_pstate_physical; 147 int scaling; 148 int turbo_pstate; 149 unsigned int max_freq; 150 unsigned int turbo_freq; 151 }; 152 153 /** 154 * struct vid_data - Stores voltage information data 155 * @min: VID data for this platform corresponding to 156 * the lowest P state 157 * @max: VID data corresponding to the highest P State. 158 * @turbo: VID data for turbo P state 159 * @ratio: Ratio of (vid max - vid min) / 160 * (max P state - Min P State) 161 * 162 * Stores the voltage data for DVFS (Dynamic Voltage and Frequency Scaling) 163 * This data is used in Atom platforms, where in addition to target P state, 164 * the voltage data needs to be specified to select next P State. 165 */ 166 struct vid_data { 167 int min; 168 int max; 169 int turbo; 170 int32_t ratio; 171 }; 172 173 /** 174 * struct global_params - Global parameters, mostly tunable via sysfs. 175 * @no_turbo: Whether or not to use turbo P-states. 176 * @turbo_disabled: Whether or not turbo P-states are available at all, 177 * based on the MSR_IA32_MISC_ENABLE value and whether or 178 * not the maximum reported turbo P-state is different from 179 * the maximum reported non-turbo one. 180 * @turbo_disabled_mf: The @turbo_disabled value reflected by cpuinfo.max_freq. 181 * @min_perf_pct: Minimum capacity limit in percent of the maximum turbo 182 * P-state capacity. 183 * @max_perf_pct: Maximum capacity limit in percent of the maximum turbo 184 * P-state capacity. 185 */ 186 struct global_params { 187 bool no_turbo; 188 bool turbo_disabled; 189 bool turbo_disabled_mf; 190 int max_perf_pct; 191 int min_perf_pct; 192 }; 193 194 /** 195 * struct cpudata - Per CPU instance data storage 196 * @cpu: CPU number for this instance data 197 * @policy: CPUFreq policy value 198 * @update_util: CPUFreq utility callback information 199 * @update_util_set: CPUFreq utility callback is set 200 * @iowait_boost: iowait-related boost fraction 201 * @last_update: Time of the last update. 
202 * @pstate: Stores P state limits for this CPU 203 * @vid: Stores VID limits for this CPU 204 * @last_sample_time: Last Sample time 205 * @aperf_mperf_shift: APERF vs MPERF counting frequency difference 206 * @prev_aperf: Last APERF value read from APERF MSR 207 * @prev_mperf: Last MPERF value read from MPERF MSR 208 * @prev_tsc: Last timestamp counter (TSC) value 209 * @prev_cummulative_iowait: IO Wait time difference from last and 210 * current sample 211 * @sample: Storage for storing last Sample data 212 * @min_perf_ratio: Minimum capacity in terms of PERF or HWP ratios 213 * @max_perf_ratio: Maximum capacity in terms of PERF or HWP ratios 214 * @acpi_perf_data: Stores ACPI perf information read from _PSS 215 * @valid_pss_table: Set to true for valid ACPI _PSS entries found 216 * @epp_powersave: Last saved HWP energy performance preference 217 * (EPP) or energy performance bias (EPB), 218 * when policy switched to performance 219 * @epp_policy: Last saved policy used to set EPP/EPB 220 * @epp_default: Power on default HWP energy performance 221 * preference/bias 222 * @epp_saved: Saved EPP/EPB during system suspend or CPU offline 223 * operation 224 * @epp_cached Cached HWP energy-performance preference value 225 * @hwp_req_cached: Cached value of the last HWP Request MSR 226 * @hwp_cap_cached: Cached value of the last HWP Capabilities MSR 227 * @last_io_update: Last time when IO wake flag was set 228 * @sched_flags: Store scheduler flags for possible cross CPU update 229 * @hwp_boost_min: Last HWP boosted min performance 230 * 231 * This structure stores per CPU instance data for all CPUs. 232 */ 233 struct cpudata { 234 int cpu; 235 236 unsigned int policy; 237 struct update_util_data update_util; 238 bool update_util_set; 239 240 struct pstate_data pstate; 241 struct vid_data vid; 242 243 u64 last_update; 244 u64 last_sample_time; 245 u64 aperf_mperf_shift; 246 u64 prev_aperf; 247 u64 prev_mperf; 248 u64 prev_tsc; 249 u64 prev_cummulative_iowait; 250 struct sample sample; 251 int32_t min_perf_ratio; 252 int32_t max_perf_ratio; 253 #ifdef CONFIG_ACPI 254 struct acpi_processor_performance acpi_perf_data; 255 bool valid_pss_table; 256 #endif 257 unsigned int iowait_boost; 258 s16 epp_powersave; 259 s16 epp_policy; 260 s16 epp_default; 261 s16 epp_saved; 262 s16 epp_cached; 263 u64 hwp_req_cached; 264 u64 hwp_cap_cached; 265 u64 last_io_update; 266 unsigned int sched_flags; 267 u32 hwp_boost_min; 268 }; 269 270 static struct cpudata **all_cpu_data; 271 272 /** 273 * struct pstate_funcs - Per CPU model specific callbacks 274 * @get_max: Callback to get maximum non turbo effective P state 275 * @get_max_physical: Callback to get maximum non turbo physical P state 276 * @get_min: Callback to get minimum P state 277 * @get_turbo: Callback to get turbo P state 278 * @get_scaling: Callback to get frequency scaling factor 279 * @get_aperf_mperf_shift: Callback to get the APERF vs MPERF frequency difference 280 * @get_val: Callback to convert P state to actual MSR write value 281 * @get_vid: Callback to get VID data for Atom platforms 282 * 283 * Core and Atom CPU models have different way to get P State limits. This 284 * structure is used to store those callbacks. 
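 *
 * For instance (hypothetical values): if get_max() reports a ratio of 24
 * and get_scaling() returns 100000 (kHz per ratio unit on core platforms),
 * the driver exposes a maximum non-turbo frequency of 24 * 100000 =
 * 2400000 kHz to cpufreq.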
285 */ 286 struct pstate_funcs { 287 int (*get_max)(void); 288 int (*get_max_physical)(void); 289 int (*get_min)(void); 290 int (*get_turbo)(void); 291 int (*get_scaling)(void); 292 int (*get_aperf_mperf_shift)(void); 293 u64 (*get_val)(struct cpudata*, int pstate); 294 void (*get_vid)(struct cpudata *); 295 }; 296 297 static struct pstate_funcs pstate_funcs __read_mostly; 298 299 static int hwp_active __read_mostly; 300 static int hwp_mode_bdw __read_mostly; 301 static bool per_cpu_limits __read_mostly; 302 static bool hwp_boost __read_mostly; 303 304 static struct cpufreq_driver *intel_pstate_driver __read_mostly; 305 306 #ifdef CONFIG_ACPI 307 static bool acpi_ppc; 308 #endif 309 310 static struct global_params global; 311 312 static DEFINE_MUTEX(intel_pstate_driver_lock); 313 static DEFINE_MUTEX(intel_pstate_limits_lock); 314 315 #ifdef CONFIG_ACPI 316 317 static bool intel_pstate_acpi_pm_profile_server(void) 318 { 319 if (acpi_gbl_FADT.preferred_profile == PM_ENTERPRISE_SERVER || 320 acpi_gbl_FADT.preferred_profile == PM_PERFORMANCE_SERVER) 321 return true; 322 323 return false; 324 } 325 326 static bool intel_pstate_get_ppc_enable_status(void) 327 { 328 if (intel_pstate_acpi_pm_profile_server()) 329 return true; 330 331 return acpi_ppc; 332 } 333 334 #ifdef CONFIG_ACPI_CPPC_LIB 335 336 /* The work item is needed to avoid CPU hotplug locking issues */ 337 static void intel_pstste_sched_itmt_work_fn(struct work_struct *work) 338 { 339 sched_set_itmt_support(); 340 } 341 342 static DECLARE_WORK(sched_itmt_work, intel_pstste_sched_itmt_work_fn); 343 344 static void intel_pstate_set_itmt_prio(int cpu) 345 { 346 struct cppc_perf_caps cppc_perf; 347 static u32 max_highest_perf = 0, min_highest_perf = U32_MAX; 348 int ret; 349 350 ret = cppc_get_perf_caps(cpu, &cppc_perf); 351 if (ret) 352 return; 353 354 /* 355 * The priorities can be set regardless of whether or not 356 * sched_set_itmt_support(true) has been called and it is valid to 357 * update them at any time after it has been called. 358 */ 359 sched_set_itmt_core_prio(cppc_perf.highest_perf, cpu); 360 361 if (max_highest_perf <= min_highest_perf) { 362 if (cppc_perf.highest_perf > max_highest_perf) 363 max_highest_perf = cppc_perf.highest_perf; 364 365 if (cppc_perf.highest_perf < min_highest_perf) 366 min_highest_perf = cppc_perf.highest_perf; 367 368 if (max_highest_perf > min_highest_perf) { 369 /* 370 * This code can be run during CPU online under the 371 * CPU hotplug locks, so sched_set_itmt_support() 372 * cannot be called from here. Queue up a work item 373 * to invoke it. 
374 */ 375 schedule_work(&sched_itmt_work); 376 } 377 } 378 } 379 380 static int intel_pstate_get_cppc_guranteed(int cpu) 381 { 382 struct cppc_perf_caps cppc_perf; 383 int ret; 384 385 ret = cppc_get_perf_caps(cpu, &cppc_perf); 386 if (ret) 387 return ret; 388 389 if (cppc_perf.guaranteed_perf) 390 return cppc_perf.guaranteed_perf; 391 392 return cppc_perf.nominal_perf; 393 } 394 395 #else /* CONFIG_ACPI_CPPC_LIB */ 396 static void intel_pstate_set_itmt_prio(int cpu) 397 { 398 } 399 #endif /* CONFIG_ACPI_CPPC_LIB */ 400 401 static void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy) 402 { 403 struct cpudata *cpu; 404 int ret; 405 int i; 406 407 if (hwp_active) { 408 intel_pstate_set_itmt_prio(policy->cpu); 409 return; 410 } 411 412 if (!intel_pstate_get_ppc_enable_status()) 413 return; 414 415 cpu = all_cpu_data[policy->cpu]; 416 417 ret = acpi_processor_register_performance(&cpu->acpi_perf_data, 418 policy->cpu); 419 if (ret) 420 return; 421 422 /* 423 * Check if the control value in _PSS is for PERF_CTL MSR, which should 424 * guarantee that the states returned by it map to the states in our 425 * list directly. 426 */ 427 if (cpu->acpi_perf_data.control_register.space_id != 428 ACPI_ADR_SPACE_FIXED_HARDWARE) 429 goto err; 430 431 /* 432 * If there is only one entry _PSS, simply ignore _PSS and continue as 433 * usual without taking _PSS into account 434 */ 435 if (cpu->acpi_perf_data.state_count < 2) 436 goto err; 437 438 pr_debug("CPU%u - ACPI _PSS perf data\n", policy->cpu); 439 for (i = 0; i < cpu->acpi_perf_data.state_count; i++) { 440 pr_debug(" %cP%d: %u MHz, %u mW, 0x%x\n", 441 (i == cpu->acpi_perf_data.state ? '*' : ' '), i, 442 (u32) cpu->acpi_perf_data.states[i].core_frequency, 443 (u32) cpu->acpi_perf_data.states[i].power, 444 (u32) cpu->acpi_perf_data.states[i].control); 445 } 446 447 /* 448 * The _PSS table doesn't contain whole turbo frequency range. 449 * This just contains +1 MHZ above the max non turbo frequency, 450 * with control value corresponding to max turbo ratio. But 451 * when cpufreq set policy is called, it will call with this 452 * max frequency, which will cause a reduced performance as 453 * this driver uses real max turbo frequency as the max 454 * frequency. So correct this frequency in _PSS table to 455 * correct max turbo frequency based on the turbo state. 456 * Also need to convert to MHz as _PSS freq is in MHz. 
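 * For example (hypothetical value): a cpuinfo.max_freq of 3600000 kHz is
 * stored below as 3600000 / 1000 = 3600 MHz in states[0].core_frequency.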
457 */ 458 if (!global.turbo_disabled) 459 cpu->acpi_perf_data.states[0].core_frequency = 460 policy->cpuinfo.max_freq / 1000; 461 cpu->valid_pss_table = true; 462 pr_debug("_PPC limits will be enforced\n"); 463 464 return; 465 466 err: 467 cpu->valid_pss_table = false; 468 acpi_processor_unregister_performance(policy->cpu); 469 } 470 471 static void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy) 472 { 473 struct cpudata *cpu; 474 475 cpu = all_cpu_data[policy->cpu]; 476 if (!cpu->valid_pss_table) 477 return; 478 479 acpi_processor_unregister_performance(policy->cpu); 480 } 481 #else /* CONFIG_ACPI */ 482 static inline void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy) 483 { 484 } 485 486 static inline void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy) 487 { 488 } 489 490 static inline bool intel_pstate_acpi_pm_profile_server(void) 491 { 492 return false; 493 } 494 #endif /* CONFIG_ACPI */ 495 496 #ifndef CONFIG_ACPI_CPPC_LIB 497 static int intel_pstate_get_cppc_guranteed(int cpu) 498 { 499 return -ENOTSUPP; 500 } 501 #endif /* CONFIG_ACPI_CPPC_LIB */ 502 503 static inline void update_turbo_state(void) 504 { 505 u64 misc_en; 506 struct cpudata *cpu; 507 508 cpu = all_cpu_data[0]; 509 rdmsrl(MSR_IA32_MISC_ENABLE, misc_en); 510 global.turbo_disabled = 511 (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE || 512 cpu->pstate.max_pstate == cpu->pstate.turbo_pstate); 513 } 514 515 static int min_perf_pct_min(void) 516 { 517 struct cpudata *cpu = all_cpu_data[0]; 518 int turbo_pstate = cpu->pstate.turbo_pstate; 519 520 return turbo_pstate ? 521 (cpu->pstate.min_pstate * 100 / turbo_pstate) : 0; 522 } 523 524 static s16 intel_pstate_get_epb(struct cpudata *cpu_data) 525 { 526 u64 epb; 527 int ret; 528 529 if (!boot_cpu_has(X86_FEATURE_EPB)) 530 return -ENXIO; 531 532 ret = rdmsrl_on_cpu(cpu_data->cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); 533 if (ret) 534 return (s16)ret; 535 536 return (s16)(epb & 0x0f); 537 } 538 539 static s16 intel_pstate_get_epp(struct cpudata *cpu_data, u64 hwp_req_data) 540 { 541 s16 epp; 542 543 if (boot_cpu_has(X86_FEATURE_HWP_EPP)) { 544 /* 545 * When hwp_req_data is 0, means that caller didn't read 546 * MSR_HWP_REQUEST, so need to read and get EPP. 
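 * EPP itself lives in bits 31:24 of MSR_HWP_REQUEST, which is why the
 * value is extracted below with (hwp_req_data >> 24) & 0xff.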
547 */ 548 if (!hwp_req_data) { 549 epp = rdmsrl_on_cpu(cpu_data->cpu, MSR_HWP_REQUEST, 550 &hwp_req_data); 551 if (epp) 552 return epp; 553 } 554 epp = (hwp_req_data >> 24) & 0xff; 555 } else { 556 /* When there is no EPP present, HWP uses EPB settings */ 557 epp = intel_pstate_get_epb(cpu_data); 558 } 559 560 return epp; 561 } 562 563 static int intel_pstate_set_epb(int cpu, s16 pref) 564 { 565 u64 epb; 566 int ret; 567 568 if (!boot_cpu_has(X86_FEATURE_EPB)) 569 return -ENXIO; 570 571 ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); 572 if (ret) 573 return ret; 574 575 epb = (epb & ~0x0f) | pref; 576 wrmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, epb); 577 578 return 0; 579 } 580 581 /* 582 * EPP/EPB display strings corresponding to EPP index in the 583 * energy_perf_strings[] 584 * index String 585 *------------------------------------- 586 * 0 default 587 * 1 performance 588 * 2 balance_performance 589 * 3 balance_power 590 * 4 power 591 */ 592 static const char * const energy_perf_strings[] = { 593 "default", 594 "performance", 595 "balance_performance", 596 "balance_power", 597 "power", 598 NULL 599 }; 600 static const unsigned int epp_values[] = { 601 HWP_EPP_PERFORMANCE, 602 HWP_EPP_BALANCE_PERFORMANCE, 603 HWP_EPP_BALANCE_POWERSAVE, 604 HWP_EPP_POWERSAVE 605 }; 606 607 static int intel_pstate_get_energy_pref_index(struct cpudata *cpu_data, int *raw_epp) 608 { 609 s16 epp; 610 int index = -EINVAL; 611 612 *raw_epp = 0; 613 epp = intel_pstate_get_epp(cpu_data, 0); 614 if (epp < 0) 615 return epp; 616 617 if (boot_cpu_has(X86_FEATURE_HWP_EPP)) { 618 if (epp == HWP_EPP_PERFORMANCE) 619 return 1; 620 if (epp == HWP_EPP_BALANCE_PERFORMANCE) 621 return 2; 622 if (epp == HWP_EPP_BALANCE_POWERSAVE) 623 return 3; 624 if (epp == HWP_EPP_POWERSAVE) 625 return 4; 626 *raw_epp = epp; 627 return 0; 628 } else if (boot_cpu_has(X86_FEATURE_EPB)) { 629 /* 630 * Range: 631 * 0x00-0x03 : Performance 632 * 0x04-0x07 : Balance performance 633 * 0x08-0x0B : Balance power 634 * 0x0C-0x0F : Power 635 * The EPB is a 4 bit value, but our ranges restrict the 636 * value which can be set. Here only using top two bits 637 * effectively. 638 */ 639 index = (epp >> 2) + 1; 640 } 641 642 return index; 643 } 644 645 static int intel_pstate_set_epp(struct cpudata *cpu, u32 epp) 646 { 647 /* 648 * Use the cached HWP Request MSR value, because in the active mode the 649 * register itself may be updated by intel_pstate_hwp_boost_up() or 650 * intel_pstate_hwp_boost_down() at any time. 651 */ 652 u64 value = READ_ONCE(cpu->hwp_req_cached); 653 654 value &= ~GENMASK_ULL(31, 24); 655 value |= (u64)epp << 24; 656 /* 657 * The only other updater of hwp_req_cached in the active mode, 658 * intel_pstate_hwp_set(), is called under the same lock as this 659 * function, so it cannot run in parallel with the update below. 
660 */ 661 WRITE_ONCE(cpu->hwp_req_cached, value); 662 return wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value); 663 } 664 665 static int intel_pstate_set_energy_pref_index(struct cpudata *cpu_data, 666 int pref_index, bool use_raw, 667 u32 raw_epp) 668 { 669 int epp = -EINVAL; 670 int ret; 671 672 if (!pref_index) 673 epp = cpu_data->epp_default; 674 675 if (boot_cpu_has(X86_FEATURE_HWP_EPP)) { 676 if (use_raw) 677 epp = raw_epp; 678 else if (epp == -EINVAL) 679 epp = epp_values[pref_index - 1]; 680 681 ret = intel_pstate_set_epp(cpu_data, epp); 682 } else { 683 if (epp == -EINVAL) 684 epp = (pref_index - 1) << 2; 685 ret = intel_pstate_set_epb(cpu_data->cpu, epp); 686 } 687 688 return ret; 689 } 690 691 static ssize_t show_energy_performance_available_preferences( 692 struct cpufreq_policy *policy, char *buf) 693 { 694 int i = 0; 695 int ret = 0; 696 697 while (energy_perf_strings[i] != NULL) 698 ret += sprintf(&buf[ret], "%s ", energy_perf_strings[i++]); 699 700 ret += sprintf(&buf[ret], "\n"); 701 702 return ret; 703 } 704 705 cpufreq_freq_attr_ro(energy_performance_available_preferences); 706 707 static struct cpufreq_driver intel_pstate; 708 709 static ssize_t store_energy_performance_preference( 710 struct cpufreq_policy *policy, const char *buf, size_t count) 711 { 712 struct cpudata *cpu = all_cpu_data[policy->cpu]; 713 char str_preference[21]; 714 bool raw = false; 715 ssize_t ret; 716 u32 epp = 0; 717 718 ret = sscanf(buf, "%20s", str_preference); 719 if (ret != 1) 720 return -EINVAL; 721 722 ret = match_string(energy_perf_strings, -1, str_preference); 723 if (ret < 0) { 724 if (!boot_cpu_has(X86_FEATURE_HWP_EPP)) 725 return ret; 726 727 ret = kstrtouint(buf, 10, &epp); 728 if (ret) 729 return ret; 730 731 if (epp > 255) 732 return -EINVAL; 733 734 raw = true; 735 } 736 737 /* 738 * This function runs with the policy R/W semaphore held, which 739 * guarantees that the driver pointer will not change while it is 740 * running. 741 */ 742 if (!intel_pstate_driver) 743 return -EAGAIN; 744 745 mutex_lock(&intel_pstate_limits_lock); 746 747 if (intel_pstate_driver == &intel_pstate) { 748 ret = intel_pstate_set_energy_pref_index(cpu, ret, raw, epp); 749 } else { 750 /* 751 * In the passive mode the governor needs to be stopped on the 752 * target CPU before the EPP update and restarted after it, 753 * which is super-heavy-weight, so make sure it is worth doing 754 * upfront. 755 */ 756 if (!raw) 757 epp = ret ? 
epp_values[ret - 1] : cpu->epp_default; 758 759 if (cpu->epp_cached != epp) { 760 int err; 761 762 cpufreq_stop_governor(policy); 763 ret = intel_pstate_set_epp(cpu, epp); 764 err = cpufreq_start_governor(policy); 765 if (!ret) { 766 cpu->epp_cached = epp; 767 ret = err; 768 } 769 } 770 } 771 772 mutex_unlock(&intel_pstate_limits_lock); 773 774 return ret ?: count; 775 } 776 777 static ssize_t show_energy_performance_preference( 778 struct cpufreq_policy *policy, char *buf) 779 { 780 struct cpudata *cpu_data = all_cpu_data[policy->cpu]; 781 int preference, raw_epp; 782 783 preference = intel_pstate_get_energy_pref_index(cpu_data, &raw_epp); 784 if (preference < 0) 785 return preference; 786 787 if (raw_epp) 788 return sprintf(buf, "%d\n", raw_epp); 789 else 790 return sprintf(buf, "%s\n", energy_perf_strings[preference]); 791 } 792 793 cpufreq_freq_attr_rw(energy_performance_preference); 794 795 static ssize_t show_base_frequency(struct cpufreq_policy *policy, char *buf) 796 { 797 struct cpudata *cpu; 798 u64 cap; 799 int ratio; 800 801 ratio = intel_pstate_get_cppc_guranteed(policy->cpu); 802 if (ratio <= 0) { 803 rdmsrl_on_cpu(policy->cpu, MSR_HWP_CAPABILITIES, &cap); 804 ratio = HWP_GUARANTEED_PERF(cap); 805 } 806 807 cpu = all_cpu_data[policy->cpu]; 808 809 return sprintf(buf, "%d\n", ratio * cpu->pstate.scaling); 810 } 811 812 cpufreq_freq_attr_ro(base_frequency); 813 814 static struct freq_attr *hwp_cpufreq_attrs[] = { 815 &energy_performance_preference, 816 &energy_performance_available_preferences, 817 &base_frequency, 818 NULL, 819 }; 820 821 static void intel_pstate_get_hwp_max(unsigned int cpu, int *phy_max, 822 int *current_max) 823 { 824 u64 cap; 825 826 rdmsrl_on_cpu(cpu, MSR_HWP_CAPABILITIES, &cap); 827 WRITE_ONCE(all_cpu_data[cpu]->hwp_cap_cached, cap); 828 if (global.no_turbo) 829 *current_max = HWP_GUARANTEED_PERF(cap); 830 else 831 *current_max = HWP_HIGHEST_PERF(cap); 832 833 *phy_max = HWP_HIGHEST_PERF(cap); 834 } 835 836 static void intel_pstate_hwp_set(unsigned int cpu) 837 { 838 struct cpudata *cpu_data = all_cpu_data[cpu]; 839 int max, min; 840 u64 value; 841 s16 epp; 842 843 max = cpu_data->max_perf_ratio; 844 min = cpu_data->min_perf_ratio; 845 846 if (cpu_data->policy == CPUFREQ_POLICY_PERFORMANCE) 847 min = max; 848 849 rdmsrl_on_cpu(cpu, MSR_HWP_REQUEST, &value); 850 851 value &= ~HWP_MIN_PERF(~0L); 852 value |= HWP_MIN_PERF(min); 853 854 value &= ~HWP_MAX_PERF(~0L); 855 value |= HWP_MAX_PERF(max); 856 857 if (cpu_data->epp_policy == cpu_data->policy) 858 goto skip_epp; 859 860 cpu_data->epp_policy = cpu_data->policy; 861 862 if (cpu_data->epp_saved >= 0) { 863 epp = cpu_data->epp_saved; 864 cpu_data->epp_saved = -EINVAL; 865 goto update_epp; 866 } 867 868 if (cpu_data->policy == CPUFREQ_POLICY_PERFORMANCE) { 869 epp = intel_pstate_get_epp(cpu_data, value); 870 cpu_data->epp_powersave = epp; 871 /* If EPP read was failed, then don't try to write */ 872 if (epp < 0) 873 goto skip_epp; 874 875 epp = 0; 876 } else { 877 /* skip setting EPP, when saved value is invalid */ 878 if (cpu_data->epp_powersave < 0) 879 goto skip_epp; 880 881 /* 882 * No need to restore EPP when it is not zero. 
This 883 * means: 884 * - Policy is not changed 885 * - user has manually changed 886 * - Error reading EPB 887 */ 888 epp = intel_pstate_get_epp(cpu_data, value); 889 if (epp) 890 goto skip_epp; 891 892 epp = cpu_data->epp_powersave; 893 } 894 update_epp: 895 if (boot_cpu_has(X86_FEATURE_HWP_EPP)) { 896 value &= ~GENMASK_ULL(31, 24); 897 value |= (u64)epp << 24; 898 } else { 899 intel_pstate_set_epb(cpu, epp); 900 } 901 skip_epp: 902 WRITE_ONCE(cpu_data->hwp_req_cached, value); 903 wrmsrl_on_cpu(cpu, MSR_HWP_REQUEST, value); 904 } 905 906 static void intel_pstate_hwp_force_min_perf(int cpu) 907 { 908 u64 value; 909 int min_perf; 910 911 value = all_cpu_data[cpu]->hwp_req_cached; 912 value &= ~GENMASK_ULL(31, 0); 913 min_perf = HWP_LOWEST_PERF(all_cpu_data[cpu]->hwp_cap_cached); 914 915 /* Set hwp_max = hwp_min */ 916 value |= HWP_MAX_PERF(min_perf); 917 value |= HWP_MIN_PERF(min_perf); 918 919 /* Set EPP to min */ 920 if (boot_cpu_has(X86_FEATURE_HWP_EPP)) 921 value |= HWP_ENERGY_PERF_PREFERENCE(HWP_EPP_POWERSAVE); 922 923 wrmsrl_on_cpu(cpu, MSR_HWP_REQUEST, value); 924 } 925 926 static int intel_pstate_hwp_save_state(struct cpufreq_policy *policy) 927 { 928 struct cpudata *cpu_data = all_cpu_data[policy->cpu]; 929 930 if (!hwp_active) 931 return 0; 932 933 cpu_data->epp_saved = intel_pstate_get_epp(cpu_data, 0); 934 935 return 0; 936 } 937 938 #define POWER_CTL_EE_ENABLE 1 939 #define POWER_CTL_EE_DISABLE 2 940 941 static int power_ctl_ee_state; 942 943 static void set_power_ctl_ee_state(bool input) 944 { 945 u64 power_ctl; 946 947 mutex_lock(&intel_pstate_driver_lock); 948 rdmsrl(MSR_IA32_POWER_CTL, power_ctl); 949 if (input) { 950 power_ctl &= ~BIT(MSR_IA32_POWER_CTL_BIT_EE); 951 power_ctl_ee_state = POWER_CTL_EE_ENABLE; 952 } else { 953 power_ctl |= BIT(MSR_IA32_POWER_CTL_BIT_EE); 954 power_ctl_ee_state = POWER_CTL_EE_DISABLE; 955 } 956 wrmsrl(MSR_IA32_POWER_CTL, power_ctl); 957 mutex_unlock(&intel_pstate_driver_lock); 958 } 959 960 static void intel_pstate_hwp_enable(struct cpudata *cpudata); 961 962 static int intel_pstate_resume(struct cpufreq_policy *policy) 963 { 964 965 /* Only restore if the system default is changed */ 966 if (power_ctl_ee_state == POWER_CTL_EE_ENABLE) 967 set_power_ctl_ee_state(true); 968 else if (power_ctl_ee_state == POWER_CTL_EE_DISABLE) 969 set_power_ctl_ee_state(false); 970 971 if (!hwp_active) 972 return 0; 973 974 mutex_lock(&intel_pstate_limits_lock); 975 976 if (policy->cpu == 0) 977 intel_pstate_hwp_enable(all_cpu_data[policy->cpu]); 978 979 all_cpu_data[policy->cpu]->epp_policy = 0; 980 intel_pstate_hwp_set(policy->cpu); 981 982 mutex_unlock(&intel_pstate_limits_lock); 983 984 return 0; 985 } 986 987 static void intel_pstate_update_policies(void) 988 { 989 int cpu; 990 991 for_each_possible_cpu(cpu) 992 cpufreq_update_policy(cpu); 993 } 994 995 static void intel_pstate_update_max_freq(unsigned int cpu) 996 { 997 struct cpufreq_policy *policy = cpufreq_cpu_acquire(cpu); 998 struct cpudata *cpudata; 999 1000 if (!policy) 1001 return; 1002 1003 cpudata = all_cpu_data[cpu]; 1004 policy->cpuinfo.max_freq = global.turbo_disabled_mf ? 
1005 cpudata->pstate.max_freq : cpudata->pstate.turbo_freq; 1006 1007 refresh_frequency_limits(policy); 1008 1009 cpufreq_cpu_release(policy); 1010 } 1011 1012 static void intel_pstate_update_limits(unsigned int cpu) 1013 { 1014 mutex_lock(&intel_pstate_driver_lock); 1015 1016 update_turbo_state(); 1017 /* 1018 * If turbo has been turned on or off globally, policy limits for 1019 * all CPUs need to be updated to reflect that. 1020 */ 1021 if (global.turbo_disabled_mf != global.turbo_disabled) { 1022 global.turbo_disabled_mf = global.turbo_disabled; 1023 arch_set_max_freq_ratio(global.turbo_disabled); 1024 for_each_possible_cpu(cpu) 1025 intel_pstate_update_max_freq(cpu); 1026 } else { 1027 cpufreq_update_policy(cpu); 1028 } 1029 1030 mutex_unlock(&intel_pstate_driver_lock); 1031 } 1032 1033 /************************** sysfs begin ************************/ 1034 #define show_one(file_name, object) \ 1035 static ssize_t show_##file_name \ 1036 (struct kobject *kobj, struct kobj_attribute *attr, char *buf) \ 1037 { \ 1038 return sprintf(buf, "%u\n", global.object); \ 1039 } 1040 1041 static ssize_t intel_pstate_show_status(char *buf); 1042 static int intel_pstate_update_status(const char *buf, size_t size); 1043 1044 static ssize_t show_status(struct kobject *kobj, 1045 struct kobj_attribute *attr, char *buf) 1046 { 1047 ssize_t ret; 1048 1049 mutex_lock(&intel_pstate_driver_lock); 1050 ret = intel_pstate_show_status(buf); 1051 mutex_unlock(&intel_pstate_driver_lock); 1052 1053 return ret; 1054 } 1055 1056 static ssize_t store_status(struct kobject *a, struct kobj_attribute *b, 1057 const char *buf, size_t count) 1058 { 1059 char *p = memchr(buf, '\n', count); 1060 int ret; 1061 1062 mutex_lock(&intel_pstate_driver_lock); 1063 ret = intel_pstate_update_status(buf, p ? p - buf : count); 1064 mutex_unlock(&intel_pstate_driver_lock); 1065 1066 return ret < 0 ? 
ret : count; 1067 } 1068 1069 static ssize_t show_turbo_pct(struct kobject *kobj, 1070 struct kobj_attribute *attr, char *buf) 1071 { 1072 struct cpudata *cpu; 1073 int total, no_turbo, turbo_pct; 1074 uint32_t turbo_fp; 1075 1076 mutex_lock(&intel_pstate_driver_lock); 1077 1078 if (!intel_pstate_driver) { 1079 mutex_unlock(&intel_pstate_driver_lock); 1080 return -EAGAIN; 1081 } 1082 1083 cpu = all_cpu_data[0]; 1084 1085 total = cpu->pstate.turbo_pstate - cpu->pstate.min_pstate + 1; 1086 no_turbo = cpu->pstate.max_pstate - cpu->pstate.min_pstate + 1; 1087 turbo_fp = div_fp(no_turbo, total); 1088 turbo_pct = 100 - fp_toint(mul_fp(turbo_fp, int_tofp(100))); 1089 1090 mutex_unlock(&intel_pstate_driver_lock); 1091 1092 return sprintf(buf, "%u\n", turbo_pct); 1093 } 1094 1095 static ssize_t show_num_pstates(struct kobject *kobj, 1096 struct kobj_attribute *attr, char *buf) 1097 { 1098 struct cpudata *cpu; 1099 int total; 1100 1101 mutex_lock(&intel_pstate_driver_lock); 1102 1103 if (!intel_pstate_driver) { 1104 mutex_unlock(&intel_pstate_driver_lock); 1105 return -EAGAIN; 1106 } 1107 1108 cpu = all_cpu_data[0]; 1109 total = cpu->pstate.turbo_pstate - cpu->pstate.min_pstate + 1; 1110 1111 mutex_unlock(&intel_pstate_driver_lock); 1112 1113 return sprintf(buf, "%u\n", total); 1114 } 1115 1116 static ssize_t show_no_turbo(struct kobject *kobj, 1117 struct kobj_attribute *attr, char *buf) 1118 { 1119 ssize_t ret; 1120 1121 mutex_lock(&intel_pstate_driver_lock); 1122 1123 if (!intel_pstate_driver) { 1124 mutex_unlock(&intel_pstate_driver_lock); 1125 return -EAGAIN; 1126 } 1127 1128 update_turbo_state(); 1129 if (global.turbo_disabled) 1130 ret = sprintf(buf, "%u\n", global.turbo_disabled); 1131 else 1132 ret = sprintf(buf, "%u\n", global.no_turbo); 1133 1134 mutex_unlock(&intel_pstate_driver_lock); 1135 1136 return ret; 1137 } 1138 1139 static ssize_t store_no_turbo(struct kobject *a, struct kobj_attribute *b, 1140 const char *buf, size_t count) 1141 { 1142 unsigned int input; 1143 int ret; 1144 1145 ret = sscanf(buf, "%u", &input); 1146 if (ret != 1) 1147 return -EINVAL; 1148 1149 mutex_lock(&intel_pstate_driver_lock); 1150 1151 if (!intel_pstate_driver) { 1152 mutex_unlock(&intel_pstate_driver_lock); 1153 return -EAGAIN; 1154 } 1155 1156 mutex_lock(&intel_pstate_limits_lock); 1157 1158 update_turbo_state(); 1159 if (global.turbo_disabled) { 1160 pr_notice_once("Turbo disabled by BIOS or unavailable on processor\n"); 1161 mutex_unlock(&intel_pstate_limits_lock); 1162 mutex_unlock(&intel_pstate_driver_lock); 1163 return -EPERM; 1164 } 1165 1166 global.no_turbo = clamp_t(int, input, 0, 1); 1167 1168 if (global.no_turbo) { 1169 struct cpudata *cpu = all_cpu_data[0]; 1170 int pct = cpu->pstate.max_pstate * 100 / cpu->pstate.turbo_pstate; 1171 1172 /* Squash the global minimum into the permitted range. 
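 * For instance (hypothetical ratios): with max_pstate = 28 and
 * turbo_pstate = 40, pct = 28 * 100 / 40 = 70, so a min_perf_pct of 80
 * would be lowered to 70 here.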
*/ 1173 if (global.min_perf_pct > pct) 1174 global.min_perf_pct = pct; 1175 } 1176 1177 mutex_unlock(&intel_pstate_limits_lock); 1178 1179 intel_pstate_update_policies(); 1180 1181 mutex_unlock(&intel_pstate_driver_lock); 1182 1183 return count; 1184 } 1185 1186 static void update_qos_request(enum freq_qos_req_type type) 1187 { 1188 int max_state, turbo_max, freq, i, perf_pct; 1189 struct freq_qos_request *req; 1190 struct cpufreq_policy *policy; 1191 1192 for_each_possible_cpu(i) { 1193 struct cpudata *cpu = all_cpu_data[i]; 1194 1195 policy = cpufreq_cpu_get(i); 1196 if (!policy) 1197 continue; 1198 1199 req = policy->driver_data; 1200 cpufreq_cpu_put(policy); 1201 1202 if (!req) 1203 continue; 1204 1205 if (hwp_active) 1206 intel_pstate_get_hwp_max(i, &turbo_max, &max_state); 1207 else 1208 turbo_max = cpu->pstate.turbo_pstate; 1209 1210 if (type == FREQ_QOS_MIN) { 1211 perf_pct = global.min_perf_pct; 1212 } else { 1213 req++; 1214 perf_pct = global.max_perf_pct; 1215 } 1216 1217 freq = DIV_ROUND_UP(turbo_max * perf_pct, 100); 1218 freq *= cpu->pstate.scaling; 1219 1220 if (freq_qos_update_request(req, freq) < 0) 1221 pr_warn("Failed to update freq constraint: CPU%d\n", i); 1222 } 1223 } 1224 1225 static ssize_t store_max_perf_pct(struct kobject *a, struct kobj_attribute *b, 1226 const char *buf, size_t count) 1227 { 1228 unsigned int input; 1229 int ret; 1230 1231 ret = sscanf(buf, "%u", &input); 1232 if (ret != 1) 1233 return -EINVAL; 1234 1235 mutex_lock(&intel_pstate_driver_lock); 1236 1237 if (!intel_pstate_driver) { 1238 mutex_unlock(&intel_pstate_driver_lock); 1239 return -EAGAIN; 1240 } 1241 1242 mutex_lock(&intel_pstate_limits_lock); 1243 1244 global.max_perf_pct = clamp_t(int, input, global.min_perf_pct, 100); 1245 1246 mutex_unlock(&intel_pstate_limits_lock); 1247 1248 if (intel_pstate_driver == &intel_pstate) 1249 intel_pstate_update_policies(); 1250 else 1251 update_qos_request(FREQ_QOS_MAX); 1252 1253 mutex_unlock(&intel_pstate_driver_lock); 1254 1255 return count; 1256 } 1257 1258 static ssize_t store_min_perf_pct(struct kobject *a, struct kobj_attribute *b, 1259 const char *buf, size_t count) 1260 { 1261 unsigned int input; 1262 int ret; 1263 1264 ret = sscanf(buf, "%u", &input); 1265 if (ret != 1) 1266 return -EINVAL; 1267 1268 mutex_lock(&intel_pstate_driver_lock); 1269 1270 if (!intel_pstate_driver) { 1271 mutex_unlock(&intel_pstate_driver_lock); 1272 return -EAGAIN; 1273 } 1274 1275 mutex_lock(&intel_pstate_limits_lock); 1276 1277 global.min_perf_pct = clamp_t(int, input, 1278 min_perf_pct_min(), global.max_perf_pct); 1279 1280 mutex_unlock(&intel_pstate_limits_lock); 1281 1282 if (intel_pstate_driver == &intel_pstate) 1283 intel_pstate_update_policies(); 1284 else 1285 update_qos_request(FREQ_QOS_MIN); 1286 1287 mutex_unlock(&intel_pstate_driver_lock); 1288 1289 return count; 1290 } 1291 1292 static ssize_t show_hwp_dynamic_boost(struct kobject *kobj, 1293 struct kobj_attribute *attr, char *buf) 1294 { 1295 return sprintf(buf, "%u\n", hwp_boost); 1296 } 1297 1298 static ssize_t store_hwp_dynamic_boost(struct kobject *a, 1299 struct kobj_attribute *b, 1300 const char *buf, size_t count) 1301 { 1302 unsigned int input; 1303 int ret; 1304 1305 ret = kstrtouint(buf, 10, &input); 1306 if (ret) 1307 return ret; 1308 1309 mutex_lock(&intel_pstate_driver_lock); 1310 hwp_boost = !!input; 1311 intel_pstate_update_policies(); 1312 mutex_unlock(&intel_pstate_driver_lock); 1313 1314 return count; 1315 } 1316 1317 static ssize_t show_energy_efficiency(struct kobject *kobj, struct 
kobj_attribute *attr, 1318 char *buf) 1319 { 1320 u64 power_ctl; 1321 int enable; 1322 1323 rdmsrl(MSR_IA32_POWER_CTL, power_ctl); 1324 enable = !!(power_ctl & BIT(MSR_IA32_POWER_CTL_BIT_EE)); 1325 return sprintf(buf, "%d\n", !enable); 1326 } 1327 1328 static ssize_t store_energy_efficiency(struct kobject *a, struct kobj_attribute *b, 1329 const char *buf, size_t count) 1330 { 1331 bool input; 1332 int ret; 1333 1334 ret = kstrtobool(buf, &input); 1335 if (ret) 1336 return ret; 1337 1338 set_power_ctl_ee_state(input); 1339 1340 return count; 1341 } 1342 1343 show_one(max_perf_pct, max_perf_pct); 1344 show_one(min_perf_pct, min_perf_pct); 1345 1346 define_one_global_rw(status); 1347 define_one_global_rw(no_turbo); 1348 define_one_global_rw(max_perf_pct); 1349 define_one_global_rw(min_perf_pct); 1350 define_one_global_ro(turbo_pct); 1351 define_one_global_ro(num_pstates); 1352 define_one_global_rw(hwp_dynamic_boost); 1353 define_one_global_rw(energy_efficiency); 1354 1355 static struct attribute *intel_pstate_attributes[] = { 1356 &status.attr, 1357 &no_turbo.attr, 1358 &turbo_pct.attr, 1359 &num_pstates.attr, 1360 NULL 1361 }; 1362 1363 static const struct attribute_group intel_pstate_attr_group = { 1364 .attrs = intel_pstate_attributes, 1365 }; 1366 1367 static const struct x86_cpu_id intel_pstate_cpu_ee_disable_ids[]; 1368 1369 static struct kobject *intel_pstate_kobject; 1370 1371 static void __init intel_pstate_sysfs_expose_params(void) 1372 { 1373 int rc; 1374 1375 intel_pstate_kobject = kobject_create_and_add("intel_pstate", 1376 &cpu_subsys.dev_root->kobj); 1377 if (WARN_ON(!intel_pstate_kobject)) 1378 return; 1379 1380 rc = sysfs_create_group(intel_pstate_kobject, &intel_pstate_attr_group); 1381 if (WARN_ON(rc)) 1382 return; 1383 1384 /* 1385 * If per cpu limits are enforced there are no global limits, so 1386 * return without creating max/min_perf_pct attributes 1387 */ 1388 if (per_cpu_limits) 1389 return; 1390 1391 rc = sysfs_create_file(intel_pstate_kobject, &max_perf_pct.attr); 1392 WARN_ON(rc); 1393 1394 rc = sysfs_create_file(intel_pstate_kobject, &min_perf_pct.attr); 1395 WARN_ON(rc); 1396 1397 if (x86_match_cpu(intel_pstate_cpu_ee_disable_ids)) { 1398 rc = sysfs_create_file(intel_pstate_kobject, &energy_efficiency.attr); 1399 WARN_ON(rc); 1400 } 1401 } 1402 1403 static void intel_pstate_sysfs_expose_hwp_dynamic_boost(void) 1404 { 1405 int rc; 1406 1407 if (!hwp_active) 1408 return; 1409 1410 rc = sysfs_create_file(intel_pstate_kobject, &hwp_dynamic_boost.attr); 1411 WARN_ON_ONCE(rc); 1412 } 1413 1414 static void intel_pstate_sysfs_hide_hwp_dynamic_boost(void) 1415 { 1416 if (!hwp_active) 1417 return; 1418 1419 sysfs_remove_file(intel_pstate_kobject, &hwp_dynamic_boost.attr); 1420 } 1421 1422 /************************** sysfs end ************************/ 1423 1424 static void intel_pstate_hwp_enable(struct cpudata *cpudata) 1425 { 1426 /* First disable HWP notification interrupt as we don't process them */ 1427 if (boot_cpu_has(X86_FEATURE_HWP_NOTIFY)) 1428 wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x00); 1429 1430 wrmsrl_on_cpu(cpudata->cpu, MSR_PM_ENABLE, 0x1); 1431 cpudata->epp_policy = 0; 1432 if (cpudata->epp_default == -EINVAL) 1433 cpudata->epp_default = intel_pstate_get_epp(cpudata, 0); 1434 } 1435 1436 static int atom_get_min_pstate(void) 1437 { 1438 u64 value; 1439 1440 rdmsrl(MSR_ATOM_CORE_RATIOS, value); 1441 return (value >> 8) & 0x7F; 1442 } 1443 1444 static int atom_get_max_pstate(void) 1445 { 1446 u64 value; 1447 1448 rdmsrl(MSR_ATOM_CORE_RATIOS, 
value); 1449 return (value >> 16) & 0x7F; 1450 } 1451 1452 static int atom_get_turbo_pstate(void) 1453 { 1454 u64 value; 1455 1456 rdmsrl(MSR_ATOM_CORE_TURBO_RATIOS, value); 1457 return value & 0x7F; 1458 } 1459 1460 static u64 atom_get_val(struct cpudata *cpudata, int pstate) 1461 { 1462 u64 val; 1463 int32_t vid_fp; 1464 u32 vid; 1465 1466 val = (u64)pstate << 8; 1467 if (global.no_turbo && !global.turbo_disabled) 1468 val |= (u64)1 << 32; 1469 1470 vid_fp = cpudata->vid.min + mul_fp( 1471 int_tofp(pstate - cpudata->pstate.min_pstate), 1472 cpudata->vid.ratio); 1473 1474 vid_fp = clamp_t(int32_t, vid_fp, cpudata->vid.min, cpudata->vid.max); 1475 vid = ceiling_fp(vid_fp); 1476 1477 if (pstate > cpudata->pstate.max_pstate) 1478 vid = cpudata->vid.turbo; 1479 1480 return val | vid; 1481 } 1482 1483 static int silvermont_get_scaling(void) 1484 { 1485 u64 value; 1486 int i; 1487 /* Defined in Table 35-6 from SDM (Sept 2015) */ 1488 static int silvermont_freq_table[] = { 1489 83300, 100000, 133300, 116700, 80000}; 1490 1491 rdmsrl(MSR_FSB_FREQ, value); 1492 i = value & 0x7; 1493 WARN_ON(i > 4); 1494 1495 return silvermont_freq_table[i]; 1496 } 1497 1498 static int airmont_get_scaling(void) 1499 { 1500 u64 value; 1501 int i; 1502 /* Defined in Table 35-10 from SDM (Sept 2015) */ 1503 static int airmont_freq_table[] = { 1504 83300, 100000, 133300, 116700, 80000, 1505 93300, 90000, 88900, 87500}; 1506 1507 rdmsrl(MSR_FSB_FREQ, value); 1508 i = value & 0xF; 1509 WARN_ON(i > 8); 1510 1511 return airmont_freq_table[i]; 1512 } 1513 1514 static void atom_get_vid(struct cpudata *cpudata) 1515 { 1516 u64 value; 1517 1518 rdmsrl(MSR_ATOM_CORE_VIDS, value); 1519 cpudata->vid.min = int_tofp((value >> 8) & 0x7f); 1520 cpudata->vid.max = int_tofp((value >> 16) & 0x7f); 1521 cpudata->vid.ratio = div_fp( 1522 cpudata->vid.max - cpudata->vid.min, 1523 int_tofp(cpudata->pstate.max_pstate - 1524 cpudata->pstate.min_pstate)); 1525 1526 rdmsrl(MSR_ATOM_CORE_TURBO_VIDS, value); 1527 cpudata->vid.turbo = value & 0x7f; 1528 } 1529 1530 static int core_get_min_pstate(void) 1531 { 1532 u64 value; 1533 1534 rdmsrl(MSR_PLATFORM_INFO, value); 1535 return (value >> 40) & 0xFF; 1536 } 1537 1538 static int core_get_max_pstate_physical(void) 1539 { 1540 u64 value; 1541 1542 rdmsrl(MSR_PLATFORM_INFO, value); 1543 return (value >> 8) & 0xFF; 1544 } 1545 1546 static int core_get_tdp_ratio(u64 plat_info) 1547 { 1548 /* Check how many TDP levels present */ 1549 if (plat_info & 0x600000000) { 1550 u64 tdp_ctrl; 1551 u64 tdp_ratio; 1552 int tdp_msr; 1553 int err; 1554 1555 /* Get the TDP level (0, 1, 2) to get ratios */ 1556 err = rdmsrl_safe(MSR_CONFIG_TDP_CONTROL, &tdp_ctrl); 1557 if (err) 1558 return err; 1559 1560 /* TDP MSR are continuous starting at 0x648 */ 1561 tdp_msr = MSR_CONFIG_TDP_NOMINAL + (tdp_ctrl & 0x03); 1562 err = rdmsrl_safe(tdp_msr, &tdp_ratio); 1563 if (err) 1564 return err; 1565 1566 /* For level 1 and 2, bits[23:16] contain the ratio */ 1567 if (tdp_ctrl & 0x03) 1568 tdp_ratio >>= 16; 1569 1570 tdp_ratio &= 0xff; /* ratios are only 8 bits long */ 1571 pr_debug("tdp_ratio %x\n", (int)tdp_ratio); 1572 1573 return (int)tdp_ratio; 1574 } 1575 1576 return -ENXIO; 1577 } 1578 1579 static int core_get_max_pstate(void) 1580 { 1581 u64 tar; 1582 u64 plat_info; 1583 int max_pstate; 1584 int tdp_ratio; 1585 int err; 1586 1587 rdmsrl(MSR_PLATFORM_INFO, plat_info); 1588 max_pstate = (plat_info >> 8) & 0xFF; 1589 1590 tdp_ratio = core_get_tdp_ratio(plat_info); 1591 if (tdp_ratio <= 0) 1592 return max_pstate; 1593 1594 if 
(hwp_active) { 1595 /* Turbo activation ratio is not used on HWP platforms */ 1596 return tdp_ratio; 1597 } 1598 1599 err = rdmsrl_safe(MSR_TURBO_ACTIVATION_RATIO, &tar); 1600 if (!err) { 1601 int tar_levels; 1602 1603 /* Do some sanity checking for safety */ 1604 tar_levels = tar & 0xff; 1605 if (tdp_ratio - 1 == tar_levels) { 1606 max_pstate = tar_levels; 1607 pr_debug("max_pstate=TAC %x\n", max_pstate); 1608 } 1609 } 1610 1611 return max_pstate; 1612 } 1613 1614 static int core_get_turbo_pstate(void) 1615 { 1616 u64 value; 1617 int nont, ret; 1618 1619 rdmsrl(MSR_TURBO_RATIO_LIMIT, value); 1620 nont = core_get_max_pstate(); 1621 ret = (value) & 255; 1622 if (ret <= nont) 1623 ret = nont; 1624 return ret; 1625 } 1626 1627 static inline int core_get_scaling(void) 1628 { 1629 return 100000; 1630 } 1631 1632 static u64 core_get_val(struct cpudata *cpudata, int pstate) 1633 { 1634 u64 val; 1635 1636 val = (u64)pstate << 8; 1637 if (global.no_turbo && !global.turbo_disabled) 1638 val |= (u64)1 << 32; 1639 1640 return val; 1641 } 1642 1643 static int knl_get_aperf_mperf_shift(void) 1644 { 1645 return 10; 1646 } 1647 1648 static int knl_get_turbo_pstate(void) 1649 { 1650 u64 value; 1651 int nont, ret; 1652 1653 rdmsrl(MSR_TURBO_RATIO_LIMIT, value); 1654 nont = core_get_max_pstate(); 1655 ret = (((value) >> 8) & 0xFF); 1656 if (ret <= nont) 1657 ret = nont; 1658 return ret; 1659 } 1660 1661 static void intel_pstate_set_pstate(struct cpudata *cpu, int pstate) 1662 { 1663 trace_cpu_frequency(pstate * cpu->pstate.scaling, cpu->cpu); 1664 cpu->pstate.current_pstate = pstate; 1665 /* 1666 * Generally, there is no guarantee that this code will always run on 1667 * the CPU being updated, so force the register update to run on the 1668 * right CPU. 1669 */ 1670 wrmsrl_on_cpu(cpu->cpu, MSR_IA32_PERF_CTL, 1671 pstate_funcs.get_val(cpu, pstate)); 1672 } 1673 1674 static void intel_pstate_set_min_pstate(struct cpudata *cpu) 1675 { 1676 intel_pstate_set_pstate(cpu, cpu->pstate.min_pstate); 1677 } 1678 1679 static void intel_pstate_max_within_limits(struct cpudata *cpu) 1680 { 1681 int pstate = max(cpu->pstate.min_pstate, cpu->max_perf_ratio); 1682 1683 update_turbo_state(); 1684 intel_pstate_set_pstate(cpu, pstate); 1685 } 1686 1687 static void intel_pstate_get_cpu_pstates(struct cpudata *cpu) 1688 { 1689 cpu->pstate.min_pstate = pstate_funcs.get_min(); 1690 cpu->pstate.max_pstate = pstate_funcs.get_max(); 1691 cpu->pstate.max_pstate_physical = pstate_funcs.get_max_physical(); 1692 cpu->pstate.turbo_pstate = pstate_funcs.get_turbo(); 1693 cpu->pstate.scaling = pstate_funcs.get_scaling(); 1694 cpu->pstate.max_freq = cpu->pstate.max_pstate * cpu->pstate.scaling; 1695 1696 if (hwp_active && !hwp_mode_bdw) { 1697 unsigned int phy_max, current_max; 1698 1699 intel_pstate_get_hwp_max(cpu->cpu, &phy_max, &current_max); 1700 cpu->pstate.turbo_freq = phy_max * cpu->pstate.scaling; 1701 cpu->pstate.turbo_pstate = phy_max; 1702 } else { 1703 cpu->pstate.turbo_freq = cpu->pstate.turbo_pstate * cpu->pstate.scaling; 1704 } 1705 1706 if (pstate_funcs.get_aperf_mperf_shift) 1707 cpu->aperf_mperf_shift = pstate_funcs.get_aperf_mperf_shift(); 1708 1709 if (pstate_funcs.get_vid) 1710 pstate_funcs.get_vid(cpu); 1711 1712 intel_pstate_set_min_pstate(cpu); 1713 } 1714 1715 /* 1716 * Long hold time will keep high perf limits for long time, 1717 * which negatively impacts perf/watt for some workloads, 1718 * like specpower. 3ms is based on experiments on some 1719 * workloads.
1720 */ 1721 static int hwp_boost_hold_time_ns = 3 * NSEC_PER_MSEC; 1722 1723 static inline void intel_pstate_hwp_boost_up(struct cpudata *cpu) 1724 { 1725 u64 hwp_req = READ_ONCE(cpu->hwp_req_cached); 1726 u32 max_limit = (hwp_req & 0xff00) >> 8; 1727 u32 min_limit = (hwp_req & 0xff); 1728 u32 boost_level1; 1729 1730 /* 1731 * Cases to consider (User changes via sysfs or boot time): 1732 * If, P0 (Turbo max) = P1 (Guaranteed max) = min: 1733 * No boost, return. 1734 * If, P0 (Turbo max) > P1 (Guaranteed max) = min: 1735 * Should result in one level boost only for P0. 1736 * If, P0 (Turbo max) = P1 (Guaranteed max) > min: 1737 * Should result in two level boost: 1738 * (min + p1)/2 and P1. 1739 * If, P0 (Turbo max) > P1 (Guaranteed max) > min: 1740 * Should result in three level boost: 1741 * (min + p1)/2, P1 and P0. 1742 */ 1743 1744 /* If max and min are equal or already at max, nothing to boost */ 1745 if (max_limit == min_limit || cpu->hwp_boost_min >= max_limit) 1746 return; 1747 1748 if (!cpu->hwp_boost_min) 1749 cpu->hwp_boost_min = min_limit; 1750 1751 /* level at halfway mark between min and guaranteed */ 1752 boost_level1 = (HWP_GUARANTEED_PERF(cpu->hwp_cap_cached) + min_limit) >> 1; 1753 1754 if (cpu->hwp_boost_min < boost_level1) 1755 cpu->hwp_boost_min = boost_level1; 1756 else if (cpu->hwp_boost_min < HWP_GUARANTEED_PERF(cpu->hwp_cap_cached)) 1757 cpu->hwp_boost_min = HWP_GUARANTEED_PERF(cpu->hwp_cap_cached); 1758 else if (cpu->hwp_boost_min == HWP_GUARANTEED_PERF(cpu->hwp_cap_cached) && 1759 max_limit != HWP_GUARANTEED_PERF(cpu->hwp_cap_cached)) 1760 cpu->hwp_boost_min = max_limit; 1761 else 1762 return; 1763 1764 hwp_req = (hwp_req & ~GENMASK_ULL(7, 0)) | cpu->hwp_boost_min; 1765 wrmsrl(MSR_HWP_REQUEST, hwp_req); 1766 cpu->last_update = cpu->sample.time; 1767 } 1768 1769 static inline void intel_pstate_hwp_boost_down(struct cpudata *cpu) 1770 { 1771 if (cpu->hwp_boost_min) { 1772 bool expired; 1773 1774 /* Check if we have been idle for the hold time needed to boost down */ 1775 expired = time_after64(cpu->sample.time, cpu->last_update + 1776 hwp_boost_hold_time_ns); 1777 if (expired) { 1778 wrmsrl(MSR_HWP_REQUEST, cpu->hwp_req_cached); 1779 cpu->hwp_boost_min = 0; 1780 } 1781 } 1782 cpu->last_update = cpu->sample.time; 1783 } 1784 1785 static inline void intel_pstate_update_util_hwp_local(struct cpudata *cpu, 1786 u64 time) 1787 { 1788 cpu->sample.time = time; 1789 1790 if (cpu->sched_flags & SCHED_CPUFREQ_IOWAIT) { 1791 bool do_io = false; 1792 1793 cpu->sched_flags = 0; 1794 /* 1795 * Set iowait_boost flag and update time. Since the IO WAIT flag 1796 * is set all the time, we can't just conclude from a single 1797 * occurrence that some IO-bound activity is scheduled on this 1798 * CPU. If we receive at least two in two consecutive ticks, 1799 * then treat it as a boost candidate.
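 * In other words, only a second SCHED_CPUFREQ_IOWAIT update arriving
 * within 2 * TICK_NSEC of the previous one (checked below) triggers
 * intel_pstate_hwp_boost_up(); an isolated update only records the time.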
1800 */ 1801 if (time_before64(time, cpu->last_io_update + 2 * TICK_NSEC)) 1802 do_io = true; 1803 1804 cpu->last_io_update = time; 1805 1806 if (do_io) 1807 intel_pstate_hwp_boost_up(cpu); 1808 1809 } else { 1810 intel_pstate_hwp_boost_down(cpu); 1811 } 1812 } 1813 1814 static inline void intel_pstate_update_util_hwp(struct update_util_data *data, 1815 u64 time, unsigned int flags) 1816 { 1817 struct cpudata *cpu = container_of(data, struct cpudata, update_util); 1818 1819 cpu->sched_flags |= flags; 1820 1821 if (smp_processor_id() == cpu->cpu) 1822 intel_pstate_update_util_hwp_local(cpu, time); 1823 } 1824 1825 static inline void intel_pstate_calc_avg_perf(struct cpudata *cpu) 1826 { 1827 struct sample *sample = &cpu->sample; 1828 1829 sample->core_avg_perf = div_ext_fp(sample->aperf, sample->mperf); 1830 } 1831 1832 static inline bool intel_pstate_sample(struct cpudata *cpu, u64 time) 1833 { 1834 u64 aperf, mperf; 1835 unsigned long flags; 1836 u64 tsc; 1837 1838 local_irq_save(flags); 1839 rdmsrl(MSR_IA32_APERF, aperf); 1840 rdmsrl(MSR_IA32_MPERF, mperf); 1841 tsc = rdtsc(); 1842 if (cpu->prev_mperf == mperf || cpu->prev_tsc == tsc) { 1843 local_irq_restore(flags); 1844 return false; 1845 } 1846 local_irq_restore(flags); 1847 1848 cpu->last_sample_time = cpu->sample.time; 1849 cpu->sample.time = time; 1850 cpu->sample.aperf = aperf; 1851 cpu->sample.mperf = mperf; 1852 cpu->sample.tsc = tsc; 1853 cpu->sample.aperf -= cpu->prev_aperf; 1854 cpu->sample.mperf -= cpu->prev_mperf; 1855 cpu->sample.tsc -= cpu->prev_tsc; 1856 1857 cpu->prev_aperf = aperf; 1858 cpu->prev_mperf = mperf; 1859 cpu->prev_tsc = tsc; 1860 /* 1861 * First time this function is invoked in a given cycle, all of the 1862 * previous sample data fields are equal to zero or stale and they must 1863 * be populated with meaningful numbers for things to work, so assume 1864 * that sample.time will always be reset before setting the utilization 1865 * update hook and make the caller skip the sample then. 1866 */ 1867 if (cpu->last_sample_time) { 1868 intel_pstate_calc_avg_perf(cpu); 1869 return true; 1870 } 1871 return false; 1872 } 1873 1874 static inline int32_t get_avg_frequency(struct cpudata *cpu) 1875 { 1876 return mul_ext_fp(cpu->sample.core_avg_perf, cpu_khz); 1877 } 1878 1879 static inline int32_t get_avg_pstate(struct cpudata *cpu) 1880 { 1881 return mul_ext_fp(cpu->pstate.max_pstate_physical, 1882 cpu->sample.core_avg_perf); 1883 } 1884 1885 static inline int32_t get_target_pstate(struct cpudata *cpu) 1886 { 1887 struct sample *sample = &cpu->sample; 1888 int32_t busy_frac; 1889 int target, avg_pstate; 1890 1891 busy_frac = div_fp(sample->mperf << cpu->aperf_mperf_shift, 1892 sample->tsc); 1893 1894 if (busy_frac < cpu->iowait_boost) 1895 busy_frac = cpu->iowait_boost; 1896 1897 sample->busy_scaled = busy_frac * 100; 1898 1899 target = global.no_turbo || global.turbo_disabled ? 1900 cpu->pstate.max_pstate : cpu->pstate.turbo_pstate; 1901 target += target >> 2; 1902 target = mul_fp(target, busy_frac); 1903 if (target < cpu->pstate.min_pstate) 1904 target = cpu->pstate.min_pstate; 1905 1906 /* 1907 * If the average P-state during the previous cycle was higher than the 1908 * current target, add 50% of the difference to the target to reduce 1909 * possible performance oscillations and offset possible performance 1910 * loss related to moving the workload from one CPU to another within 1911 * a package/module. 
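 *
 * Worked example (hypothetical numbers): with turbo off, max_pstate = 24
 * and busy_frac = 0.5, target = (24 + 24/4) * 0.5 = 15 (assuming
 * min_pstate is below that); if the average P-state over the previous
 * cycle was 19, the final target becomes 15 + (19 - 15) / 2 = 17.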
1912 */ 1913 avg_pstate = get_avg_pstate(cpu); 1914 if (avg_pstate > target) 1915 target += (avg_pstate - target) >> 1; 1916 1917 return target; 1918 } 1919 1920 static int intel_pstate_prepare_request(struct cpudata *cpu, int pstate) 1921 { 1922 int min_pstate = max(cpu->pstate.min_pstate, cpu->min_perf_ratio); 1923 int max_pstate = max(min_pstate, cpu->max_perf_ratio); 1924 1925 return clamp_t(int, pstate, min_pstate, max_pstate); 1926 } 1927 1928 static void intel_pstate_update_pstate(struct cpudata *cpu, int pstate) 1929 { 1930 if (pstate == cpu->pstate.current_pstate) 1931 return; 1932 1933 cpu->pstate.current_pstate = pstate; 1934 wrmsrl(MSR_IA32_PERF_CTL, pstate_funcs.get_val(cpu, pstate)); 1935 } 1936 1937 static void intel_pstate_adjust_pstate(struct cpudata *cpu) 1938 { 1939 int from = cpu->pstate.current_pstate; 1940 struct sample *sample; 1941 int target_pstate; 1942 1943 update_turbo_state(); 1944 1945 target_pstate = get_target_pstate(cpu); 1946 target_pstate = intel_pstate_prepare_request(cpu, target_pstate); 1947 trace_cpu_frequency(target_pstate * cpu->pstate.scaling, cpu->cpu); 1948 intel_pstate_update_pstate(cpu, target_pstate); 1949 1950 sample = &cpu->sample; 1951 trace_pstate_sample(mul_ext_fp(100, sample->core_avg_perf), 1952 fp_toint(sample->busy_scaled), 1953 from, 1954 cpu->pstate.current_pstate, 1955 sample->mperf, 1956 sample->aperf, 1957 sample->tsc, 1958 get_avg_frequency(cpu), 1959 fp_toint(cpu->iowait_boost * 100)); 1960 } 1961 1962 static void intel_pstate_update_util(struct update_util_data *data, u64 time, 1963 unsigned int flags) 1964 { 1965 struct cpudata *cpu = container_of(data, struct cpudata, update_util); 1966 u64 delta_ns; 1967 1968 /* Don't allow remote callbacks */ 1969 if (smp_processor_id() != cpu->cpu) 1970 return; 1971 1972 delta_ns = time - cpu->last_update; 1973 if (flags & SCHED_CPUFREQ_IOWAIT) { 1974 /* Start over if the CPU may have been idle. */ 1975 if (delta_ns > TICK_NSEC) { 1976 cpu->iowait_boost = ONE_EIGHTH_FP; 1977 } else if (cpu->iowait_boost >= ONE_EIGHTH_FP) { 1978 cpu->iowait_boost <<= 1; 1979 if (cpu->iowait_boost > int_tofp(1)) 1980 cpu->iowait_boost = int_tofp(1); 1981 } else { 1982 cpu->iowait_boost = ONE_EIGHTH_FP; 1983 } 1984 } else if (cpu->iowait_boost) { 1985 /* Clear iowait_boost if the CPU may have been idle. 
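 * Otherwise the boost is halved so that it decays gradually: e.g. a
 * saturated boost of int_tofp(1) = 256 drops to 128, 64, ... on
 * successive updates (ONE_EIGHTH_FP = 32 being the initial value).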
*/ 1986 if (delta_ns > TICK_NSEC) 1987 cpu->iowait_boost = 0; 1988 else 1989 cpu->iowait_boost >>= 1; 1990 } 1991 cpu->last_update = time; 1992 delta_ns = time - cpu->sample.time; 1993 if ((s64)delta_ns < INTEL_PSTATE_SAMPLING_INTERVAL) 1994 return; 1995 1996 if (intel_pstate_sample(cpu, time)) 1997 intel_pstate_adjust_pstate(cpu); 1998 } 1999 2000 static struct pstate_funcs core_funcs = { 2001 .get_max = core_get_max_pstate, 2002 .get_max_physical = core_get_max_pstate_physical, 2003 .get_min = core_get_min_pstate, 2004 .get_turbo = core_get_turbo_pstate, 2005 .get_scaling = core_get_scaling, 2006 .get_val = core_get_val, 2007 }; 2008 2009 static const struct pstate_funcs silvermont_funcs = { 2010 .get_max = atom_get_max_pstate, 2011 .get_max_physical = atom_get_max_pstate, 2012 .get_min = atom_get_min_pstate, 2013 .get_turbo = atom_get_turbo_pstate, 2014 .get_val = atom_get_val, 2015 .get_scaling = silvermont_get_scaling, 2016 .get_vid = atom_get_vid, 2017 }; 2018 2019 static const struct pstate_funcs airmont_funcs = { 2020 .get_max = atom_get_max_pstate, 2021 .get_max_physical = atom_get_max_pstate, 2022 .get_min = atom_get_min_pstate, 2023 .get_turbo = atom_get_turbo_pstate, 2024 .get_val = atom_get_val, 2025 .get_scaling = airmont_get_scaling, 2026 .get_vid = atom_get_vid, 2027 }; 2028 2029 static const struct pstate_funcs knl_funcs = { 2030 .get_max = core_get_max_pstate, 2031 .get_max_physical = core_get_max_pstate_physical, 2032 .get_min = core_get_min_pstate, 2033 .get_turbo = knl_get_turbo_pstate, 2034 .get_aperf_mperf_shift = knl_get_aperf_mperf_shift, 2035 .get_scaling = core_get_scaling, 2036 .get_val = core_get_val, 2037 }; 2038 2039 #define X86_MATCH(model, policy) \ 2040 X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, INTEL_FAM6_##model, \ 2041 X86_FEATURE_APERFMPERF, &policy) 2042 2043 static const struct x86_cpu_id intel_pstate_cpu_ids[] = { 2044 X86_MATCH(SANDYBRIDGE, core_funcs), 2045 X86_MATCH(SANDYBRIDGE_X, core_funcs), 2046 X86_MATCH(ATOM_SILVERMONT, silvermont_funcs), 2047 X86_MATCH(IVYBRIDGE, core_funcs), 2048 X86_MATCH(HASWELL, core_funcs), 2049 X86_MATCH(BROADWELL, core_funcs), 2050 X86_MATCH(IVYBRIDGE_X, core_funcs), 2051 X86_MATCH(HASWELL_X, core_funcs), 2052 X86_MATCH(HASWELL_L, core_funcs), 2053 X86_MATCH(HASWELL_G, core_funcs), 2054 X86_MATCH(BROADWELL_G, core_funcs), 2055 X86_MATCH(ATOM_AIRMONT, airmont_funcs), 2056 X86_MATCH(SKYLAKE_L, core_funcs), 2057 X86_MATCH(BROADWELL_X, core_funcs), 2058 X86_MATCH(SKYLAKE, core_funcs), 2059 X86_MATCH(BROADWELL_D, core_funcs), 2060 X86_MATCH(XEON_PHI_KNL, knl_funcs), 2061 X86_MATCH(XEON_PHI_KNM, knl_funcs), 2062 X86_MATCH(ATOM_GOLDMONT, core_funcs), 2063 X86_MATCH(ATOM_GOLDMONT_PLUS, core_funcs), 2064 X86_MATCH(SKYLAKE_X, core_funcs), 2065 {} 2066 }; 2067 MODULE_DEVICE_TABLE(x86cpu, intel_pstate_cpu_ids); 2068 2069 static const struct x86_cpu_id intel_pstate_cpu_oob_ids[] __initconst = { 2070 X86_MATCH(BROADWELL_D, core_funcs), 2071 X86_MATCH(BROADWELL_X, core_funcs), 2072 X86_MATCH(SKYLAKE_X, core_funcs), 2073 {} 2074 }; 2075 2076 static const struct x86_cpu_id intel_pstate_cpu_ee_disable_ids[] = { 2077 X86_MATCH(KABYLAKE, core_funcs), 2078 {} 2079 }; 2080 2081 static const struct x86_cpu_id intel_pstate_hwp_boost_ids[] = { 2082 X86_MATCH(SKYLAKE_X, core_funcs), 2083 X86_MATCH(SKYLAKE, core_funcs), 2084 {} 2085 }; 2086 2087 static int intel_pstate_init_cpu(unsigned int cpunum) 2088 { 2089 struct cpudata *cpu; 2090 2091 cpu = all_cpu_data[cpunum]; 2092 2093 if (!cpu) { 2094 cpu = kzalloc(sizeof(*cpu), GFP_KERNEL); 2095 
if (!cpu) 2096 return -ENOMEM; 2097 2098 all_cpu_data[cpunum] = cpu; 2099 2100 cpu->epp_default = -EINVAL; 2101 cpu->epp_powersave = -EINVAL; 2102 cpu->epp_saved = -EINVAL; 2103 } 2104 2105 cpu = all_cpu_data[cpunum]; 2106 2107 cpu->cpu = cpunum; 2108 2109 if (hwp_active) { 2110 const struct x86_cpu_id *id; 2111 2112 intel_pstate_hwp_enable(cpu); 2113 2114 id = x86_match_cpu(intel_pstate_hwp_boost_ids); 2115 if (id && intel_pstate_acpi_pm_profile_server()) 2116 hwp_boost = true; 2117 } 2118 2119 intel_pstate_get_cpu_pstates(cpu); 2120 2121 pr_debug("controlling: cpu %d\n", cpunum); 2122 2123 return 0; 2124 } 2125 2126 static void intel_pstate_set_update_util_hook(unsigned int cpu_num) 2127 { 2128 struct cpudata *cpu = all_cpu_data[cpu_num]; 2129 2130 if (hwp_active && !hwp_boost) 2131 return; 2132 2133 if (cpu->update_util_set) 2134 return; 2135 2136 /* Prevent intel_pstate_update_util() from using stale data. */ 2137 cpu->sample.time = 0; 2138 cpufreq_add_update_util_hook(cpu_num, &cpu->update_util, 2139 (hwp_active ? 2140 intel_pstate_update_util_hwp : 2141 intel_pstate_update_util)); 2142 cpu->update_util_set = true; 2143 } 2144 2145 static void intel_pstate_clear_update_util_hook(unsigned int cpu) 2146 { 2147 struct cpudata *cpu_data = all_cpu_data[cpu]; 2148 2149 if (!cpu_data->update_util_set) 2150 return; 2151 2152 cpufreq_remove_update_util_hook(cpu); 2153 cpu_data->update_util_set = false; 2154 synchronize_rcu(); 2155 } 2156 2157 static int intel_pstate_get_max_freq(struct cpudata *cpu) 2158 { 2159 return global.turbo_disabled || global.no_turbo ? 2160 cpu->pstate.max_freq : cpu->pstate.turbo_freq; 2161 } 2162 2163 static void intel_pstate_update_perf_limits(struct cpudata *cpu, 2164 unsigned int policy_min, 2165 unsigned int policy_max) 2166 { 2167 int max_freq = intel_pstate_get_max_freq(cpu); 2168 int32_t max_policy_perf, min_policy_perf; 2169 int max_state, turbo_max; 2170 2171 /* 2172 * HWP needs some special consideration, because on BDX the 2173 * HWP_REQUEST uses abstract value to represent performance 2174 * rather than pure ratios. 2175 */ 2176 if (hwp_active) { 2177 intel_pstate_get_hwp_max(cpu->cpu, &turbo_max, &max_state); 2178 } else { 2179 max_state = global.no_turbo || global.turbo_disabled ? 2180 cpu->pstate.max_pstate : cpu->pstate.turbo_pstate; 2181 turbo_max = cpu->pstate.turbo_pstate; 2182 } 2183 2184 max_policy_perf = max_state * policy_max / max_freq; 2185 if (policy_max == policy_min) { 2186 min_policy_perf = max_policy_perf; 2187 } else { 2188 min_policy_perf = max_state * policy_min / max_freq; 2189 min_policy_perf = clamp_t(int32_t, min_policy_perf, 2190 0, max_policy_perf); 2191 } 2192 2193 pr_debug("cpu:%d max_state %d min_policy_perf:%d max_policy_perf:%d\n", 2194 cpu->cpu, max_state, min_policy_perf, max_policy_perf); 2195 2196 /* Normalize user input to [min_perf, max_perf] */ 2197 if (per_cpu_limits) { 2198 cpu->min_perf_ratio = min_policy_perf; 2199 cpu->max_perf_ratio = max_policy_perf; 2200 } else { 2201 int32_t global_min, global_max; 2202 2203 /* Global limits are in percent of the maximum turbo P-state. 
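		 * For example (illustrative numbers only): with turbo_max == 40 and
		 * global.max_perf_pct == 80, the DIV_ROUND_UP() below yields
		 * global_max == DIV_ROUND_UP(40 * 80, 100) == 32, i.e. the limit is
		 * expressed as a P-state ratio rather than a frequency.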
		 */
		global_max = DIV_ROUND_UP(turbo_max * global.max_perf_pct, 100);
		global_min = DIV_ROUND_UP(turbo_max * global.min_perf_pct, 100);
		global_min = clamp_t(int32_t, global_min, 0, global_max);

		pr_debug("cpu:%d global_min:%d global_max:%d\n", cpu->cpu,
			 global_min, global_max);

		cpu->min_perf_ratio = max(min_policy_perf, global_min);
		cpu->min_perf_ratio = min(cpu->min_perf_ratio, max_policy_perf);
		cpu->max_perf_ratio = min(max_policy_perf, global_max);
		cpu->max_perf_ratio = max(min_policy_perf, cpu->max_perf_ratio);

		/* Make sure min_perf <= max_perf */
		cpu->min_perf_ratio = min(cpu->min_perf_ratio,
					  cpu->max_perf_ratio);

	}
	pr_debug("cpu:%d max_perf_ratio:%d min_perf_ratio:%d\n", cpu->cpu,
		 cpu->max_perf_ratio,
		 cpu->min_perf_ratio);
}

static int intel_pstate_set_policy(struct cpufreq_policy *policy)
{
	struct cpudata *cpu;

	if (!policy->cpuinfo.max_freq)
		return -ENODEV;

	pr_debug("set_policy cpuinfo.max %u policy->max %u\n",
		 policy->cpuinfo.max_freq, policy->max);

	cpu = all_cpu_data[policy->cpu];
	cpu->policy = policy->policy;

	mutex_lock(&intel_pstate_limits_lock);

	intel_pstate_update_perf_limits(cpu, policy->min, policy->max);

	if (cpu->policy == CPUFREQ_POLICY_PERFORMANCE) {
		/*
		 * NOHZ_FULL CPUs need this as the governor callback may not
		 * be invoked on them.
		 */
		intel_pstate_clear_update_util_hook(policy->cpu);
		intel_pstate_max_within_limits(cpu);
	} else {
		intel_pstate_set_update_util_hook(policy->cpu);
	}

	if (hwp_active) {
		/*
		 * If hwp_boost was active before and has been turned off
		 * dynamically since then, the update util hook needs to be
		 * cleared.
2259 */ 2260 if (!hwp_boost) 2261 intel_pstate_clear_update_util_hook(policy->cpu); 2262 intel_pstate_hwp_set(policy->cpu); 2263 } 2264 2265 mutex_unlock(&intel_pstate_limits_lock); 2266 2267 return 0; 2268 } 2269 2270 static void intel_pstate_adjust_policy_max(struct cpudata *cpu, 2271 struct cpufreq_policy_data *policy) 2272 { 2273 if (!hwp_active && 2274 cpu->pstate.max_pstate_physical > cpu->pstate.max_pstate && 2275 policy->max < policy->cpuinfo.max_freq && 2276 policy->max > cpu->pstate.max_freq) { 2277 pr_debug("policy->max > max non turbo frequency\n"); 2278 policy->max = policy->cpuinfo.max_freq; 2279 } 2280 } 2281 2282 static void intel_pstate_verify_cpu_policy(struct cpudata *cpu, 2283 struct cpufreq_policy_data *policy) 2284 { 2285 update_turbo_state(); 2286 cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq, 2287 intel_pstate_get_max_freq(cpu)); 2288 2289 intel_pstate_adjust_policy_max(cpu, policy); 2290 } 2291 2292 static int intel_pstate_verify_policy(struct cpufreq_policy_data *policy) 2293 { 2294 intel_pstate_verify_cpu_policy(all_cpu_data[policy->cpu], policy); 2295 2296 return 0; 2297 } 2298 2299 static void intel_cpufreq_stop_cpu(struct cpufreq_policy *policy) 2300 { 2301 if (hwp_active) 2302 intel_pstate_hwp_force_min_perf(policy->cpu); 2303 else 2304 intel_pstate_set_min_pstate(all_cpu_data[policy->cpu]); 2305 } 2306 2307 static void intel_pstate_stop_cpu(struct cpufreq_policy *policy) 2308 { 2309 pr_debug("CPU %d exiting\n", policy->cpu); 2310 2311 intel_pstate_clear_update_util_hook(policy->cpu); 2312 if (hwp_active) 2313 intel_pstate_hwp_save_state(policy); 2314 2315 intel_cpufreq_stop_cpu(policy); 2316 } 2317 2318 static int intel_pstate_cpu_exit(struct cpufreq_policy *policy) 2319 { 2320 intel_pstate_exit_perf_limits(policy); 2321 2322 policy->fast_switch_possible = false; 2323 2324 return 0; 2325 } 2326 2327 static int __intel_pstate_cpu_init(struct cpufreq_policy *policy) 2328 { 2329 struct cpudata *cpu; 2330 int rc; 2331 2332 rc = intel_pstate_init_cpu(policy->cpu); 2333 if (rc) 2334 return rc; 2335 2336 cpu = all_cpu_data[policy->cpu]; 2337 2338 cpu->max_perf_ratio = 0xFF; 2339 cpu->min_perf_ratio = 0; 2340 2341 policy->min = cpu->pstate.min_pstate * cpu->pstate.scaling; 2342 policy->max = cpu->pstate.turbo_pstate * cpu->pstate.scaling; 2343 2344 /* cpuinfo and default policy values */ 2345 policy->cpuinfo.min_freq = cpu->pstate.min_pstate * cpu->pstate.scaling; 2346 update_turbo_state(); 2347 global.turbo_disabled_mf = global.turbo_disabled; 2348 policy->cpuinfo.max_freq = global.turbo_disabled ? 2349 cpu->pstate.max_pstate : cpu->pstate.turbo_pstate; 2350 policy->cpuinfo.max_freq *= cpu->pstate.scaling; 2351 2352 if (hwp_active) { 2353 unsigned int max_freq; 2354 2355 max_freq = global.turbo_disabled ? 2356 cpu->pstate.max_freq : cpu->pstate.turbo_freq; 2357 if (max_freq < policy->cpuinfo.max_freq) 2358 policy->cpuinfo.max_freq = max_freq; 2359 } 2360 2361 intel_pstate_init_acpi_perf_limits(policy); 2362 2363 policy->fast_switch_possible = true; 2364 2365 return 0; 2366 } 2367 2368 static int intel_pstate_cpu_init(struct cpufreq_policy *policy) 2369 { 2370 int ret = __intel_pstate_cpu_init(policy); 2371 2372 if (ret) 2373 return ret; 2374 2375 /* 2376 * Set the policy to powersave to provide a valid fallback value in case 2377 * the default cpufreq governor is neither powersave nor performance. 
	 */
	policy->policy = CPUFREQ_POLICY_POWERSAVE;

	return 0;
}

static struct cpufreq_driver intel_pstate = {
	.flags = CPUFREQ_CONST_LOOPS,
	.verify = intel_pstate_verify_policy,
	.setpolicy = intel_pstate_set_policy,
	.suspend = intel_pstate_hwp_save_state,
	.resume = intel_pstate_resume,
	.init = intel_pstate_cpu_init,
	.exit = intel_pstate_cpu_exit,
	.stop_cpu = intel_pstate_stop_cpu,
	.update_limits = intel_pstate_update_limits,
	.name = "intel_pstate",
};

static int intel_cpufreq_verify_policy(struct cpufreq_policy_data *policy)
{
	struct cpudata *cpu = all_cpu_data[policy->cpu];

	intel_pstate_verify_cpu_policy(cpu, policy);
	intel_pstate_update_perf_limits(cpu, policy->min, policy->max);

	return 0;
}

/* Use of trace in passive mode:
 *
 * In passive mode the trace core_busy field (also known as the
 * performance field, and labelled as such on the graphs; also known as
 * core_avg_perf) is not needed and so is re-assigned to indicate if the
 * driver call was via the normal or fast switch path. Various graphs
 * output from the intel_pstate_tracer.py utility that include core_busy
 * (or performance or core_avg_perf) have a fixed y-axis from 0 to 100%,
 * so we use 10 to indicate the normal path through the driver, and
 * 90 to indicate the fast switch path through the driver.
 * The scaled_busy field is not used, and is set to 0.
 */

#define INTEL_PSTATE_TRACE_TARGET	10
#define INTEL_PSTATE_TRACE_FAST_SWITCH	90

static void intel_cpufreq_trace(struct cpudata *cpu, unsigned int trace_type, int old_pstate)
{
	struct sample *sample;

	if (!trace_pstate_sample_enabled())
		return;

	if (!intel_pstate_sample(cpu, ktime_get()))
		return;

	sample = &cpu->sample;
	trace_pstate_sample(trace_type,
		0,
		old_pstate,
		cpu->pstate.current_pstate,
		sample->mperf,
		sample->aperf,
		sample->tsc,
		get_avg_frequency(cpu),
		fp_toint(cpu->iowait_boost * 100));
}

static void intel_cpufreq_adjust_hwp(struct cpudata *cpu, u32 target_pstate,
				     bool fast_switch)
{
	u64 prev = READ_ONCE(cpu->hwp_req_cached), value = prev;

	value &= ~HWP_MIN_PERF(~0L);
	value |= HWP_MIN_PERF(target_pstate);

	/*
	 * The entire MSR needs to be updated in order to update the HWP min
	 * field in it, so opportunistically update the max too if needed.
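	 * If neither the min nor the max field actually changes, the early
	 * return below skips both the hwp_req_cached update and the MSR
	 * write.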
2456 */ 2457 value &= ~HWP_MAX_PERF(~0L); 2458 value |= HWP_MAX_PERF(cpu->max_perf_ratio); 2459 2460 if (value == prev) 2461 return; 2462 2463 WRITE_ONCE(cpu->hwp_req_cached, value); 2464 if (fast_switch) 2465 wrmsrl(MSR_HWP_REQUEST, value); 2466 else 2467 wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value); 2468 } 2469 2470 static void intel_cpufreq_adjust_perf_ctl(struct cpudata *cpu, 2471 u32 target_pstate, bool fast_switch) 2472 { 2473 if (fast_switch) 2474 wrmsrl(MSR_IA32_PERF_CTL, 2475 pstate_funcs.get_val(cpu, target_pstate)); 2476 else 2477 wrmsrl_on_cpu(cpu->cpu, MSR_IA32_PERF_CTL, 2478 pstate_funcs.get_val(cpu, target_pstate)); 2479 } 2480 2481 static int intel_cpufreq_update_pstate(struct cpudata *cpu, int target_pstate, 2482 bool fast_switch) 2483 { 2484 int old_pstate = cpu->pstate.current_pstate; 2485 2486 target_pstate = intel_pstate_prepare_request(cpu, target_pstate); 2487 if (target_pstate != old_pstate) { 2488 cpu->pstate.current_pstate = target_pstate; 2489 if (hwp_active) 2490 intel_cpufreq_adjust_hwp(cpu, target_pstate, 2491 fast_switch); 2492 else 2493 intel_cpufreq_adjust_perf_ctl(cpu, target_pstate, 2494 fast_switch); 2495 } 2496 2497 intel_cpufreq_trace(cpu, fast_switch ? INTEL_PSTATE_TRACE_FAST_SWITCH : 2498 INTEL_PSTATE_TRACE_TARGET, old_pstate); 2499 2500 return target_pstate; 2501 } 2502 2503 static int intel_cpufreq_target(struct cpufreq_policy *policy, 2504 unsigned int target_freq, 2505 unsigned int relation) 2506 { 2507 struct cpudata *cpu = all_cpu_data[policy->cpu]; 2508 struct cpufreq_freqs freqs; 2509 int target_pstate; 2510 2511 update_turbo_state(); 2512 2513 freqs.old = policy->cur; 2514 freqs.new = target_freq; 2515 2516 cpufreq_freq_transition_begin(policy, &freqs); 2517 2518 switch (relation) { 2519 case CPUFREQ_RELATION_L: 2520 target_pstate = DIV_ROUND_UP(freqs.new, cpu->pstate.scaling); 2521 break; 2522 case CPUFREQ_RELATION_H: 2523 target_pstate = freqs.new / cpu->pstate.scaling; 2524 break; 2525 default: 2526 target_pstate = DIV_ROUND_CLOSEST(freqs.new, cpu->pstate.scaling); 2527 break; 2528 } 2529 2530 target_pstate = intel_cpufreq_update_pstate(cpu, target_pstate, false); 2531 2532 freqs.new = target_pstate * cpu->pstate.scaling; 2533 2534 cpufreq_freq_transition_end(policy, &freqs, false); 2535 2536 return 0; 2537 } 2538 2539 static unsigned int intel_cpufreq_fast_switch(struct cpufreq_policy *policy, 2540 unsigned int target_freq) 2541 { 2542 struct cpudata *cpu = all_cpu_data[policy->cpu]; 2543 int target_pstate; 2544 2545 update_turbo_state(); 2546 2547 target_pstate = DIV_ROUND_UP(target_freq, cpu->pstate.scaling); 2548 2549 target_pstate = intel_cpufreq_update_pstate(cpu, target_pstate, true); 2550 2551 return target_pstate * cpu->pstate.scaling; 2552 } 2553 2554 static int intel_cpufreq_cpu_init(struct cpufreq_policy *policy) 2555 { 2556 int max_state, turbo_max, min_freq, max_freq, ret; 2557 struct freq_qos_request *req; 2558 struct cpudata *cpu; 2559 struct device *dev; 2560 2561 dev = get_cpu_device(policy->cpu); 2562 if (!dev) 2563 return -ENODEV; 2564 2565 ret = __intel_pstate_cpu_init(policy); 2566 if (ret) 2567 return ret; 2568 2569 policy->cpuinfo.transition_latency = INTEL_CPUFREQ_TRANSITION_LATENCY; 2570 /* This reflects the intel_pstate_get_cpu_pstates() setting. 
*/ 2571 policy->cur = policy->cpuinfo.min_freq; 2572 2573 req = kcalloc(2, sizeof(*req), GFP_KERNEL); 2574 if (!req) { 2575 ret = -ENOMEM; 2576 goto pstate_exit; 2577 } 2578 2579 cpu = all_cpu_data[policy->cpu]; 2580 2581 if (hwp_active) { 2582 u64 value; 2583 2584 intel_pstate_get_hwp_max(policy->cpu, &turbo_max, &max_state); 2585 policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY_HWP; 2586 rdmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, &value); 2587 WRITE_ONCE(cpu->hwp_req_cached, value); 2588 cpu->epp_cached = (value & GENMASK_ULL(31, 24)) >> 24; 2589 } else { 2590 turbo_max = cpu->pstate.turbo_pstate; 2591 policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY; 2592 } 2593 2594 min_freq = DIV_ROUND_UP(turbo_max * global.min_perf_pct, 100); 2595 min_freq *= cpu->pstate.scaling; 2596 max_freq = DIV_ROUND_UP(turbo_max * global.max_perf_pct, 100); 2597 max_freq *= cpu->pstate.scaling; 2598 2599 ret = freq_qos_add_request(&policy->constraints, req, FREQ_QOS_MIN, 2600 min_freq); 2601 if (ret < 0) { 2602 dev_err(dev, "Failed to add min-freq constraint (%d)\n", ret); 2603 goto free_req; 2604 } 2605 2606 ret = freq_qos_add_request(&policy->constraints, req + 1, FREQ_QOS_MAX, 2607 max_freq); 2608 if (ret < 0) { 2609 dev_err(dev, "Failed to add max-freq constraint (%d)\n", ret); 2610 goto remove_min_req; 2611 } 2612 2613 policy->driver_data = req; 2614 2615 return 0; 2616 2617 remove_min_req: 2618 freq_qos_remove_request(req); 2619 free_req: 2620 kfree(req); 2621 pstate_exit: 2622 intel_pstate_exit_perf_limits(policy); 2623 2624 return ret; 2625 } 2626 2627 static int intel_cpufreq_cpu_exit(struct cpufreq_policy *policy) 2628 { 2629 struct freq_qos_request *req; 2630 2631 req = policy->driver_data; 2632 2633 freq_qos_remove_request(req + 1); 2634 freq_qos_remove_request(req); 2635 kfree(req); 2636 2637 return intel_pstate_cpu_exit(policy); 2638 } 2639 2640 static struct cpufreq_driver intel_cpufreq = { 2641 .flags = CPUFREQ_CONST_LOOPS, 2642 .verify = intel_cpufreq_verify_policy, 2643 .target = intel_cpufreq_target, 2644 .fast_switch = intel_cpufreq_fast_switch, 2645 .init = intel_cpufreq_cpu_init, 2646 .exit = intel_cpufreq_cpu_exit, 2647 .stop_cpu = intel_cpufreq_stop_cpu, 2648 .update_limits = intel_pstate_update_limits, 2649 .name = "intel_cpufreq", 2650 }; 2651 2652 static struct cpufreq_driver *default_driver; 2653 2654 static void intel_pstate_driver_cleanup(void) 2655 { 2656 unsigned int cpu; 2657 2658 get_online_cpus(); 2659 for_each_online_cpu(cpu) { 2660 if (all_cpu_data[cpu]) { 2661 if (intel_pstate_driver == &intel_pstate) 2662 intel_pstate_clear_update_util_hook(cpu); 2663 2664 kfree(all_cpu_data[cpu]); 2665 all_cpu_data[cpu] = NULL; 2666 } 2667 } 2668 put_online_cpus(); 2669 2670 if (intel_pstate_driver == &intel_pstate) 2671 intel_pstate_sysfs_hide_hwp_dynamic_boost(); 2672 2673 intel_pstate_driver = NULL; 2674 } 2675 2676 static int intel_pstate_register_driver(struct cpufreq_driver *driver) 2677 { 2678 int ret; 2679 2680 if (driver == &intel_pstate) 2681 intel_pstate_sysfs_expose_hwp_dynamic_boost(); 2682 2683 memset(&global, 0, sizeof(global)); 2684 global.max_perf_pct = 100; 2685 2686 intel_pstate_driver = driver; 2687 ret = cpufreq_register_driver(intel_pstate_driver); 2688 if (ret) { 2689 intel_pstate_driver_cleanup(); 2690 return ret; 2691 } 2692 2693 global.min_perf_pct = min_perf_pct_min(); 2694 2695 return 0; 2696 } 2697 2698 static int intel_pstate_unregister_driver(void) 2699 { 2700 cpufreq_unregister_driver(intel_pstate_driver); 2701 
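	/* Free all per-CPU data and clear intel_pstate_driver. */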
intel_pstate_driver_cleanup(); 2702 2703 return 0; 2704 } 2705 2706 static ssize_t intel_pstate_show_status(char *buf) 2707 { 2708 if (!intel_pstate_driver) 2709 return sprintf(buf, "off\n"); 2710 2711 return sprintf(buf, "%s\n", intel_pstate_driver == &intel_pstate ? 2712 "active" : "passive"); 2713 } 2714 2715 static int intel_pstate_update_status(const char *buf, size_t size) 2716 { 2717 int ret; 2718 2719 if (size == 3 && !strncmp(buf, "off", size)) 2720 return intel_pstate_driver ? 2721 intel_pstate_unregister_driver() : -EINVAL; 2722 2723 if (size == 6 && !strncmp(buf, "active", size)) { 2724 if (intel_pstate_driver) { 2725 if (intel_pstate_driver == &intel_pstate) 2726 return 0; 2727 2728 ret = intel_pstate_unregister_driver(); 2729 if (ret) 2730 return ret; 2731 } 2732 2733 return intel_pstate_register_driver(&intel_pstate); 2734 } 2735 2736 if (size == 7 && !strncmp(buf, "passive", size)) { 2737 if (intel_pstate_driver) { 2738 if (intel_pstate_driver == &intel_cpufreq) 2739 return 0; 2740 2741 ret = intel_pstate_unregister_driver(); 2742 if (ret) 2743 return ret; 2744 } 2745 2746 return intel_pstate_register_driver(&intel_cpufreq); 2747 } 2748 2749 return -EINVAL; 2750 } 2751 2752 static int no_load __initdata; 2753 static int no_hwp __initdata; 2754 static int hwp_only __initdata; 2755 static unsigned int force_load __initdata; 2756 2757 static int __init intel_pstate_msrs_not_valid(void) 2758 { 2759 if (!pstate_funcs.get_max() || 2760 !pstate_funcs.get_min() || 2761 !pstate_funcs.get_turbo()) 2762 return -ENODEV; 2763 2764 return 0; 2765 } 2766 2767 static void __init copy_cpu_funcs(struct pstate_funcs *funcs) 2768 { 2769 pstate_funcs.get_max = funcs->get_max; 2770 pstate_funcs.get_max_physical = funcs->get_max_physical; 2771 pstate_funcs.get_min = funcs->get_min; 2772 pstate_funcs.get_turbo = funcs->get_turbo; 2773 pstate_funcs.get_scaling = funcs->get_scaling; 2774 pstate_funcs.get_val = funcs->get_val; 2775 pstate_funcs.get_vid = funcs->get_vid; 2776 pstate_funcs.get_aperf_mperf_shift = funcs->get_aperf_mperf_shift; 2777 } 2778 2779 #ifdef CONFIG_ACPI 2780 2781 static bool __init intel_pstate_no_acpi_pss(void) 2782 { 2783 int i; 2784 2785 for_each_possible_cpu(i) { 2786 acpi_status status; 2787 union acpi_object *pss; 2788 struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; 2789 struct acpi_processor *pr = per_cpu(processors, i); 2790 2791 if (!pr) 2792 continue; 2793 2794 status = acpi_evaluate_object(pr->handle, "_PSS", NULL, &buffer); 2795 if (ACPI_FAILURE(status)) 2796 continue; 2797 2798 pss = buffer.pointer; 2799 if (pss && pss->type == ACPI_TYPE_PACKAGE) { 2800 kfree(pss); 2801 return false; 2802 } 2803 2804 kfree(pss); 2805 } 2806 2807 pr_debug("ACPI _PSS not found\n"); 2808 return true; 2809 } 2810 2811 static bool __init intel_pstate_no_acpi_pcch(void) 2812 { 2813 acpi_status status; 2814 acpi_handle handle; 2815 2816 status = acpi_get_handle(NULL, "\\_SB", &handle); 2817 if (ACPI_FAILURE(status)) 2818 goto not_found; 2819 2820 if (acpi_has_method(handle, "PCCH")) 2821 return false; 2822 2823 not_found: 2824 pr_debug("ACPI PCCH not found\n"); 2825 return true; 2826 } 2827 2828 static bool __init intel_pstate_has_acpi_ppc(void) 2829 { 2830 int i; 2831 2832 for_each_possible_cpu(i) { 2833 struct acpi_processor *pr = per_cpu(processors, i); 2834 2835 if (!pr) 2836 continue; 2837 if (acpi_has_method(pr->handle, "_PPC")) 2838 return true; 2839 } 2840 pr_debug("ACPI _PPC not found\n"); 2841 return false; 2842 } 2843 2844 enum { 2845 PSS, 2846 PPC, 2847 }; 2848 
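/*
 * PSS and PPC above are used as plat_info[].data below and select which
 * ACPI check intel_pstate_platform_pwr_mgmt_exists() applies to a matching
 * platform: absence of _PSS/PCCH, or presence of _PPC (unless force_load
 * is set).
 */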
2849 /* Hardware vendor-specific info that has its own power management modes */ 2850 static struct acpi_platform_list plat_info[] __initdata = { 2851 {"HP ", "ProLiant", 0, ACPI_SIG_FADT, all_versions, NULL, PSS}, 2852 {"ORACLE", "X4-2 ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, 2853 {"ORACLE", "X4-2L ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, 2854 {"ORACLE", "X4-2B ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, 2855 {"ORACLE", "X3-2 ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, 2856 {"ORACLE", "X3-2L ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, 2857 {"ORACLE", "X3-2B ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, 2858 {"ORACLE", "X4470M2 ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, 2859 {"ORACLE", "X4270M3 ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, 2860 {"ORACLE", "X4270M2 ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, 2861 {"ORACLE", "X4170M2 ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, 2862 {"ORACLE", "X4170 M3", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, 2863 {"ORACLE", "X4275 M3", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, 2864 {"ORACLE", "X6-2 ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, 2865 {"ORACLE", "Sudbury ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, 2866 { } /* End */ 2867 }; 2868 2869 #define BITMASK_OOB (BIT(8) | BIT(18)) 2870 2871 static bool __init intel_pstate_platform_pwr_mgmt_exists(void) 2872 { 2873 const struct x86_cpu_id *id; 2874 u64 misc_pwr; 2875 int idx; 2876 2877 id = x86_match_cpu(intel_pstate_cpu_oob_ids); 2878 if (id) { 2879 rdmsrl(MSR_MISC_PWR_MGMT, misc_pwr); 2880 if (misc_pwr & BITMASK_OOB) { 2881 pr_debug("Bit 8 or 18 in the MISC_PWR_MGMT MSR set\n"); 2882 pr_debug("P states are controlled in Out of Band mode by the firmware/hardware\n"); 2883 return true; 2884 } 2885 } 2886 2887 idx = acpi_match_platform_list(plat_info); 2888 if (idx < 0) 2889 return false; 2890 2891 switch (plat_info[idx].data) { 2892 case PSS: 2893 if (!intel_pstate_no_acpi_pss()) 2894 return false; 2895 2896 return intel_pstate_no_acpi_pcch(); 2897 case PPC: 2898 return intel_pstate_has_acpi_ppc() && !force_load; 2899 } 2900 2901 return false; 2902 } 2903 2904 static void intel_pstate_request_control_from_smm(void) 2905 { 2906 /* 2907 * It may be unsafe to request P-states control from SMM if _PPC support 2908 * has not been enabled. 
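	 * (acpi_ppc is set by the "support_acpi_ppc" option handled in
	 * intel_pstate_setup() below.)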
2909 */ 2910 if (acpi_ppc) 2911 acpi_processor_pstate_control(); 2912 } 2913 #else /* CONFIG_ACPI not enabled */ 2914 static inline bool intel_pstate_platform_pwr_mgmt_exists(void) { return false; } 2915 static inline bool intel_pstate_has_acpi_ppc(void) { return false; } 2916 static inline void intel_pstate_request_control_from_smm(void) {} 2917 #endif /* CONFIG_ACPI */ 2918 2919 #define INTEL_PSTATE_HWP_BROADWELL 0x01 2920 2921 #define X86_MATCH_HWP(model, hwp_mode) \ 2922 X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, INTEL_FAM6_##model, \ 2923 X86_FEATURE_HWP, hwp_mode) 2924 2925 static const struct x86_cpu_id hwp_support_ids[] __initconst = { 2926 X86_MATCH_HWP(BROADWELL_X, INTEL_PSTATE_HWP_BROADWELL), 2927 X86_MATCH_HWP(BROADWELL_D, INTEL_PSTATE_HWP_BROADWELL), 2928 X86_MATCH_HWP(ANY, 0), 2929 {} 2930 }; 2931 2932 static int __init intel_pstate_init(void) 2933 { 2934 const struct x86_cpu_id *id; 2935 int rc; 2936 2937 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) 2938 return -ENODEV; 2939 2940 if (no_load) 2941 return -ENODEV; 2942 2943 id = x86_match_cpu(hwp_support_ids); 2944 if (id) { 2945 copy_cpu_funcs(&core_funcs); 2946 /* 2947 * Avoid enabling HWP for processors without EPP support, 2948 * because that means incomplete HWP implementation which is a 2949 * corner case and supporting it is generally problematic. 2950 */ 2951 if (!no_hwp && boot_cpu_has(X86_FEATURE_HWP_EPP)) { 2952 hwp_active++; 2953 hwp_mode_bdw = id->driver_data; 2954 intel_pstate.attr = hwp_cpufreq_attrs; 2955 intel_cpufreq.attr = hwp_cpufreq_attrs; 2956 if (!default_driver) 2957 default_driver = &intel_pstate; 2958 2959 goto hwp_cpu_matched; 2960 } 2961 } else { 2962 id = x86_match_cpu(intel_pstate_cpu_ids); 2963 if (!id) { 2964 pr_info("CPU model not supported\n"); 2965 return -ENODEV; 2966 } 2967 2968 copy_cpu_funcs((struct pstate_funcs *)id->driver_data); 2969 } 2970 2971 if (intel_pstate_msrs_not_valid()) { 2972 pr_info("Invalid MSRs\n"); 2973 return -ENODEV; 2974 } 2975 /* Without HWP start in the passive mode. */ 2976 if (!default_driver) 2977 default_driver = &intel_cpufreq; 2978 2979 hwp_cpu_matched: 2980 /* 2981 * The Intel pstate driver will be ignored if the platform 2982 * firmware has its own power management modes. 
	 */
	if (intel_pstate_platform_pwr_mgmt_exists()) {
		pr_info("P-states controlled by the platform\n");
		return -ENODEV;
	}

	if (!hwp_active && hwp_only)
		return -ENOTSUPP;

	pr_info("Intel P-state driver initializing\n");

	all_cpu_data = vzalloc(array_size(sizeof(void *), num_possible_cpus()));
	if (!all_cpu_data)
		return -ENOMEM;

	intel_pstate_request_control_from_smm();

	intel_pstate_sysfs_expose_params();

	mutex_lock(&intel_pstate_driver_lock);
	rc = intel_pstate_register_driver(default_driver);
	mutex_unlock(&intel_pstate_driver_lock);
	if (rc)
		return rc;

	if (hwp_active) {
		const struct x86_cpu_id *id;

		id = x86_match_cpu(intel_pstate_cpu_ee_disable_ids);
		if (id) {
			set_power_ctl_ee_state(false);
			pr_info("Disabling energy efficiency optimization\n");
		}

		pr_info("HWP enabled\n");
	}

	return 0;
}
device_initcall(intel_pstate_init);

static int __init intel_pstate_setup(char *str)
{
	if (!str)
		return -EINVAL;

	if (!strcmp(str, "disable"))
		no_load = 1;
	else if (!strcmp(str, "active"))
		default_driver = &intel_pstate;
	else if (!strcmp(str, "passive"))
		default_driver = &intel_cpufreq;

	if (!strcmp(str, "no_hwp")) {
		pr_info("HWP disabled\n");
		no_hwp = 1;
	}
	if (!strcmp(str, "force"))
		force_load = 1;
	if (!strcmp(str, "hwp_only"))
		hwp_only = 1;
	if (!strcmp(str, "per_cpu_perf_limits"))
		per_cpu_limits = true;

#ifdef CONFIG_ACPI
	if (!strcmp(str, "support_acpi_ppc"))
		acpi_ppc = true;
#endif

	return 0;
}
early_param("intel_pstate", intel_pstate_setup);

MODULE_AUTHOR("Dirk Brandewie <dirk.j.brandewie@intel.com>");
MODULE_DESCRIPTION("'intel_pstate' - P-state driver for Intel Core processors");
MODULE_LICENSE("GPL");