1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * intel_pstate.c: Native P state management for Intel processors 4 * 5 * (C) Copyright 2012 Intel Corporation 6 * Author: Dirk Brandewie <dirk.j.brandewie@intel.com> 7 */ 8 9 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 10 11 #include <linux/kernel.h> 12 #include <linux/kernel_stat.h> 13 #include <linux/module.h> 14 #include <linux/ktime.h> 15 #include <linux/hrtimer.h> 16 #include <linux/tick.h> 17 #include <linux/slab.h> 18 #include <linux/sched/cpufreq.h> 19 #include <linux/list.h> 20 #include <linux/cpu.h> 21 #include <linux/cpufreq.h> 22 #include <linux/sysfs.h> 23 #include <linux/types.h> 24 #include <linux/fs.h> 25 #include <linux/acpi.h> 26 #include <linux/vmalloc.h> 27 #include <linux/pm_qos.h> 28 #include <trace/events/power.h> 29 30 #include <asm/div64.h> 31 #include <asm/msr.h> 32 #include <asm/cpu_device_id.h> 33 #include <asm/cpufeature.h> 34 #include <asm/intel-family.h> 35 #include "../drivers/thermal/intel/thermal_interrupt.h" 36 37 #define INTEL_PSTATE_SAMPLING_INTERVAL (10 * NSEC_PER_MSEC) 38 39 #define INTEL_CPUFREQ_TRANSITION_LATENCY 20000 40 #define INTEL_CPUFREQ_TRANSITION_DELAY_HWP 5000 41 #define INTEL_CPUFREQ_TRANSITION_DELAY 500 42 43 #ifdef CONFIG_ACPI 44 #include <acpi/processor.h> 45 #include <acpi/cppc_acpi.h> 46 #endif 47 48 #define FRAC_BITS 8 49 #define int_tofp(X) ((int64_t)(X) << FRAC_BITS) 50 #define fp_toint(X) ((X) >> FRAC_BITS) 51 52 #define ONE_EIGHTH_FP ((int64_t)1 << (FRAC_BITS - 3)) 53 54 #define EXT_BITS 6 55 #define EXT_FRAC_BITS (EXT_BITS + FRAC_BITS) 56 #define fp_ext_toint(X) ((X) >> EXT_FRAC_BITS) 57 #define int_ext_tofp(X) ((int64_t)(X) << EXT_FRAC_BITS) 58 59 static inline int32_t mul_fp(int32_t x, int32_t y) 60 { 61 return ((int64_t)x * (int64_t)y) >> FRAC_BITS; 62 } 63 64 static inline int32_t div_fp(s64 x, s64 y) 65 { 66 return div64_s64((int64_t)x << FRAC_BITS, y); 67 } 68 69 static inline int ceiling_fp(int32_t x) 70 { 71 int mask, ret; 72 73 ret = fp_toint(x); 74 mask = (1 << FRAC_BITS) - 1; 75 if (x & mask) 76 ret += 1; 77 return ret; 78 } 79 80 static inline u64 mul_ext_fp(u64 x, u64 y) 81 { 82 return (x * y) >> EXT_FRAC_BITS; 83 } 84 85 static inline u64 div_ext_fp(u64 x, u64 y) 86 { 87 return div64_u64(x << EXT_FRAC_BITS, y); 88 } 89 90 /** 91 * struct sample - Store performance sample 92 * @core_avg_perf: Ratio of APERF/MPERF which is the actual average 93 * performance during last sample period 94 * @busy_scaled: Scaled busy value which is used to calculate next 95 * P state. This can be different than core_avg_perf 96 * to account for cpu idle period 97 * @aperf: Difference of actual performance frequency clock count 98 * read from APERF MSR between last and current sample 99 * @mperf: Difference of maximum performance frequency clock count 100 * read from MPERF MSR between last and current sample 101 * @tsc: Difference of time stamp counter between last and 102 * current sample 103 * @time: Current time from scheduler 104 * 105 * This structure is used in the cpudata structure to store performance sample 106 * data for choosing next P State. 
107 */ 108 struct sample { 109 int32_t core_avg_perf; 110 int32_t busy_scaled; 111 u64 aperf; 112 u64 mperf; 113 u64 tsc; 114 u64 time; 115 }; 116 117 /** 118 * struct pstate_data - Store P state data 119 * @current_pstate: Current requested P state 120 * @min_pstate: Min P state possible for this platform 121 * @max_pstate: Max P state possible for this platform 122 * @max_pstate_physical:This is physical Max P state for a processor 123 * This can be higher than the max_pstate which can 124 * be limited by platform thermal design power limits 125 * @perf_ctl_scaling: PERF_CTL P-state to frequency scaling factor 126 * @scaling: Scaling factor between performance and frequency 127 * @turbo_pstate: Max Turbo P state possible for this platform 128 * @min_freq: @min_pstate frequency in cpufreq units 129 * @max_freq: @max_pstate frequency in cpufreq units 130 * @turbo_freq: @turbo_pstate frequency in cpufreq units 131 * 132 * Stores the per cpu model P state limits and current P state. 133 */ 134 struct pstate_data { 135 int current_pstate; 136 int min_pstate; 137 int max_pstate; 138 int max_pstate_physical; 139 int perf_ctl_scaling; 140 int scaling; 141 int turbo_pstate; 142 unsigned int min_freq; 143 unsigned int max_freq; 144 unsigned int turbo_freq; 145 }; 146 147 /** 148 * struct vid_data - Stores voltage information data 149 * @min: VID data for this platform corresponding to 150 * the lowest P state 151 * @max: VID data corresponding to the highest P State. 152 * @turbo: VID data for turbo P state 153 * @ratio: Ratio of (vid max - vid min) / 154 * (max P state - Min P State) 155 * 156 * Stores the voltage data for DVFS (Dynamic Voltage and Frequency Scaling) 157 * This data is used in Atom platforms, where in addition to target P state, 158 * the voltage data needs to be specified to select next P State. 159 */ 160 struct vid_data { 161 int min; 162 int max; 163 int turbo; 164 int32_t ratio; 165 }; 166 167 /** 168 * struct global_params - Global parameters, mostly tunable via sysfs. 169 * @no_turbo: Whether or not to use turbo P-states. 170 * @turbo_disabled: Whether or not turbo P-states are available at all, 171 * based on the MSR_IA32_MISC_ENABLE value and whether or 172 * not the maximum reported turbo P-state is different from 173 * the maximum reported non-turbo one. 174 * @turbo_disabled_mf: The @turbo_disabled value reflected by cpuinfo.max_freq. 175 * @min_perf_pct: Minimum capacity limit in percent of the maximum turbo 176 * P-state capacity. 177 * @max_perf_pct: Maximum capacity limit in percent of the maximum turbo 178 * P-state capacity. 179 */ 180 struct global_params { 181 bool no_turbo; 182 bool turbo_disabled; 183 bool turbo_disabled_mf; 184 int max_perf_pct; 185 int min_perf_pct; 186 }; 187 188 /** 189 * struct cpudata - Per CPU instance data storage 190 * @cpu: CPU number for this instance data 191 * @policy: CPUFreq policy value 192 * @update_util: CPUFreq utility callback information 193 * @update_util_set: CPUFreq utility callback is set 194 * @iowait_boost: iowait-related boost fraction 195 * @last_update: Time of the last update. 
196 * @pstate: Stores P state limits for this CPU 197 * @vid: Stores VID limits for this CPU 198 * @last_sample_time: Last Sample time 199 * @aperf_mperf_shift: APERF vs MPERF counting frequency difference 200 * @prev_aperf: Last APERF value read from APERF MSR 201 * @prev_mperf: Last MPERF value read from MPERF MSR 202 * @prev_tsc: Last timestamp counter (TSC) value 203 * @prev_cummulative_iowait: IO Wait time difference from last and 204 * current sample 205 * @sample: Storage for storing last Sample data 206 * @min_perf_ratio: Minimum capacity in terms of PERF or HWP ratios 207 * @max_perf_ratio: Maximum capacity in terms of PERF or HWP ratios 208 * @acpi_perf_data: Stores ACPI perf information read from _PSS 209 * @valid_pss_table: Set to true for valid ACPI _PSS entries found 210 * @epp_powersave: Last saved HWP energy performance preference 211 * (EPP) or energy performance bias (EPB), 212 * when policy switched to performance 213 * @epp_policy: Last saved policy used to set EPP/EPB 214 * @epp_default: Power on default HWP energy performance 215 * preference/bias 216 * @epp_cached Cached HWP energy-performance preference value 217 * @hwp_req_cached: Cached value of the last HWP Request MSR 218 * @hwp_cap_cached: Cached value of the last HWP Capabilities MSR 219 * @last_io_update: Last time when IO wake flag was set 220 * @sched_flags: Store scheduler flags for possible cross CPU update 221 * @hwp_boost_min: Last HWP boosted min performance 222 * @suspended: Whether or not the driver has been suspended. 223 * @hwp_notify_work: workqueue for HWP notifications. 224 * 225 * This structure stores per CPU instance data for all CPUs. 226 */ 227 struct cpudata { 228 int cpu; 229 230 unsigned int policy; 231 struct update_util_data update_util; 232 bool update_util_set; 233 234 struct pstate_data pstate; 235 struct vid_data vid; 236 237 u64 last_update; 238 u64 last_sample_time; 239 u64 aperf_mperf_shift; 240 u64 prev_aperf; 241 u64 prev_mperf; 242 u64 prev_tsc; 243 u64 prev_cummulative_iowait; 244 struct sample sample; 245 int32_t min_perf_ratio; 246 int32_t max_perf_ratio; 247 #ifdef CONFIG_ACPI 248 struct acpi_processor_performance acpi_perf_data; 249 bool valid_pss_table; 250 #endif 251 unsigned int iowait_boost; 252 s16 epp_powersave; 253 s16 epp_policy; 254 s16 epp_default; 255 s16 epp_cached; 256 u64 hwp_req_cached; 257 u64 hwp_cap_cached; 258 u64 last_io_update; 259 unsigned int sched_flags; 260 u32 hwp_boost_min; 261 bool suspended; 262 struct delayed_work hwp_notify_work; 263 }; 264 265 static struct cpudata **all_cpu_data; 266 267 /** 268 * struct pstate_funcs - Per CPU model specific callbacks 269 * @get_max: Callback to get maximum non turbo effective P state 270 * @get_max_physical: Callback to get maximum non turbo physical P state 271 * @get_min: Callback to get minimum P state 272 * @get_turbo: Callback to get turbo P state 273 * @get_scaling: Callback to get frequency scaling factor 274 * @get_aperf_mperf_shift: Callback to get the APERF vs MPERF frequency difference 275 * @get_val: Callback to convert P state to actual MSR write value 276 * @get_vid: Callback to get VID data for Atom platforms 277 * 278 * Core and Atom CPU models have different way to get P State limits. This 279 * structure is used to store those callbacks. 
280 */ 281 struct pstate_funcs { 282 int (*get_max)(void); 283 int (*get_max_physical)(void); 284 int (*get_min)(void); 285 int (*get_turbo)(void); 286 int (*get_scaling)(void); 287 int (*get_aperf_mperf_shift)(void); 288 u64 (*get_val)(struct cpudata*, int pstate); 289 void (*get_vid)(struct cpudata *); 290 }; 291 292 static struct pstate_funcs pstate_funcs __read_mostly; 293 294 static int hwp_active __read_mostly; 295 static int hwp_mode_bdw __read_mostly; 296 static bool per_cpu_limits __read_mostly; 297 static bool hwp_boost __read_mostly; 298 299 static struct cpufreq_driver *intel_pstate_driver __read_mostly; 300 301 #ifdef CONFIG_ACPI 302 static bool acpi_ppc; 303 #endif 304 305 static struct global_params global; 306 307 static DEFINE_MUTEX(intel_pstate_driver_lock); 308 static DEFINE_MUTEX(intel_pstate_limits_lock); 309 310 #ifdef CONFIG_ACPI 311 312 static bool intel_pstate_acpi_pm_profile_server(void) 313 { 314 if (acpi_gbl_FADT.preferred_profile == PM_ENTERPRISE_SERVER || 315 acpi_gbl_FADT.preferred_profile == PM_PERFORMANCE_SERVER) 316 return true; 317 318 return false; 319 } 320 321 static bool intel_pstate_get_ppc_enable_status(void) 322 { 323 if (intel_pstate_acpi_pm_profile_server()) 324 return true; 325 326 return acpi_ppc; 327 } 328 329 #ifdef CONFIG_ACPI_CPPC_LIB 330 331 /* The work item is needed to avoid CPU hotplug locking issues */ 332 static void intel_pstste_sched_itmt_work_fn(struct work_struct *work) 333 { 334 sched_set_itmt_support(); 335 } 336 337 static DECLARE_WORK(sched_itmt_work, intel_pstste_sched_itmt_work_fn); 338 339 static void intel_pstate_set_itmt_prio(int cpu) 340 { 341 struct cppc_perf_caps cppc_perf; 342 static u32 max_highest_perf = 0, min_highest_perf = U32_MAX; 343 int ret; 344 345 ret = cppc_get_perf_caps(cpu, &cppc_perf); 346 if (ret) 347 return; 348 349 /* 350 * The priorities can be set regardless of whether or not 351 * sched_set_itmt_support(true) has been called and it is valid to 352 * update them at any time after it has been called. 353 */ 354 sched_set_itmt_core_prio(cppc_perf.highest_perf, cpu); 355 356 if (max_highest_perf <= min_highest_perf) { 357 if (cppc_perf.highest_perf > max_highest_perf) 358 max_highest_perf = cppc_perf.highest_perf; 359 360 if (cppc_perf.highest_perf < min_highest_perf) 361 min_highest_perf = cppc_perf.highest_perf; 362 363 if (max_highest_perf > min_highest_perf) { 364 /* 365 * This code can be run during CPU online under the 366 * CPU hotplug locks, so sched_set_itmt_support() 367 * cannot be called from here. Queue up a work item 368 * to invoke it. 
			 */
			schedule_work(&sched_itmt_work);
		}
	}
}

static int intel_pstate_get_cppc_guaranteed(int cpu)
{
	struct cppc_perf_caps cppc_perf;
	int ret;

	ret = cppc_get_perf_caps(cpu, &cppc_perf);
	if (ret)
		return ret;

	if (cppc_perf.guaranteed_perf)
		return cppc_perf.guaranteed_perf;

	return cppc_perf.nominal_perf;
}

#else /* CONFIG_ACPI_CPPC_LIB */
static inline void intel_pstate_set_itmt_prio(int cpu)
{
}
#endif /* CONFIG_ACPI_CPPC_LIB */

static void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy)
{
	struct cpudata *cpu;
	int ret;
	int i;

	if (hwp_active) {
		intel_pstate_set_itmt_prio(policy->cpu);
		return;
	}

	if (!intel_pstate_get_ppc_enable_status())
		return;

	cpu = all_cpu_data[policy->cpu];

	ret = acpi_processor_register_performance(&cpu->acpi_perf_data,
						   policy->cpu);
	if (ret)
		return;

	/*
	 * Check if the control value in _PSS is for PERF_CTL MSR, which should
	 * guarantee that the states returned by it map to the states in our
	 * list directly.
	 */
	if (cpu->acpi_perf_data.control_register.space_id !=
					ACPI_ADR_SPACE_FIXED_HARDWARE)
		goto err;

	/*
	 * If there is only one entry in _PSS, simply ignore _PSS and continue
	 * as usual without taking it into account.
	 */
	if (cpu->acpi_perf_data.state_count < 2)
		goto err;

	pr_debug("CPU%u - ACPI _PSS perf data\n", policy->cpu);
	for (i = 0; i < cpu->acpi_perf_data.state_count; i++) {
		pr_debug("     %cP%d: %u MHz, %u mW, 0x%x\n",
			 (i == cpu->acpi_perf_data.state ? '*' : ' '), i,
			 (u32) cpu->acpi_perf_data.states[i].core_frequency,
			 (u32) cpu->acpi_perf_data.states[i].power,
			 (u32) cpu->acpi_perf_data.states[i].control);
	}

	/*
	 * The _PSS table does not contain the whole turbo frequency range.
	 * It only contains the frequency 1 MHz above the maximum non-turbo
	 * frequency, with a control value corresponding to the maximum turbo
	 * ratio.  However, when cpufreq's set_policy() is called with that
	 * frequency as the maximum, performance is reduced, because this
	 * driver uses the real maximum turbo frequency as the maximum.  So
	 * correct this entry in the _PSS table to the real maximum turbo
	 * frequency, depending on the turbo state.  The value also needs to
	 * be converted to MHz, since _PSS frequencies are in MHz.
	 */
	if (!global.turbo_disabled)
		cpu->acpi_perf_data.states[0].core_frequency =
					policy->cpuinfo.max_freq / 1000;
	cpu->valid_pss_table = true;
	pr_debug("_PPC limits will be enforced\n");

	return;

 err:
	cpu->valid_pss_table = false;
	acpi_processor_unregister_performance(policy->cpu);
}

static void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy)
{
	struct cpudata *cpu;

	cpu = all_cpu_data[policy->cpu];
	if (!cpu->valid_pss_table)
		return;

	acpi_processor_unregister_performance(policy->cpu);
}

static bool intel_pstate_cppc_perf_valid(u32 perf, struct cppc_perf_caps *caps)
{
	return perf && perf <= caps->highest_perf && perf >= caps->lowest_perf;
}

static bool intel_pstate_cppc_perf_caps(struct cpudata *cpu,
					struct cppc_perf_caps *caps)
{
	if (cppc_get_perf_caps(cpu->cpu, caps))
		return false;

	return caps->highest_perf && caps->lowest_perf <= caps->highest_perf;
}
#else /* CONFIG_ACPI */
static inline void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy)
{
}

static inline void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy)
{
}

static inline bool intel_pstate_acpi_pm_profile_server(void)
{
	return false;
}
#endif /* CONFIG_ACPI */

#ifndef CONFIG_ACPI_CPPC_LIB
static inline int intel_pstate_get_cppc_guaranteed(int cpu)
{
	return -ENOTSUPP;
}
#endif /* CONFIG_ACPI_CPPC_LIB */

static void intel_pstate_hybrid_hwp_perf_ctl_parity(struct cpudata *cpu)
{
	pr_debug("CPU%d: Using PERF_CTL scaling for HWP\n", cpu->cpu);

	cpu->pstate.scaling = cpu->pstate.perf_ctl_scaling;
}

/**
 * intel_pstate_hybrid_hwp_calibrate - Calibrate HWP performance levels.
 * @cpu: Target CPU.
 *
 * On hybrid processors, HWP may expose more performance levels than there are
 * P-states accessible through the PERF_CTL interface.  If that happens, the
 * scaling factor between HWP performance levels and CPU frequency will be less
 * than the scaling factor between P-state values and CPU frequency.
 *
 * In that case, the scaling factor between HWP performance levels and CPU
 * frequency needs to be determined, which can be done using the observation
 * that certain HWP performance levels should correspond to certain P-states;
 * for example, the HWP highest performance should correspond to the maximum
 * turbo P-state of the CPU.
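 *
 * For example (illustrative numbers, not taken from any particular processor):
 * if the maximum turbo P-state reported via PERF_CTL is 40 with a scaling
 * factor of 100000 kHz, the maximum turbo frequency is 4000000 kHz; if the
 * HWP highest performance level is 60, the HWP-to-frequency scaling factor
 * works out to DIV_ROUND_UP(4000000, 60) = 66667 kHz per HWP level.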
 */
static void intel_pstate_hybrid_hwp_calibrate(struct cpudata *cpu)
{
	int perf_ctl_max_phys = cpu->pstate.max_pstate_physical;
	int perf_ctl_scaling = cpu->pstate.perf_ctl_scaling;
	int perf_ctl_turbo = pstate_funcs.get_turbo();
	int turbo_freq = perf_ctl_turbo * perf_ctl_scaling;
	int perf_ctl_max = pstate_funcs.get_max();
	int max_freq = perf_ctl_max * perf_ctl_scaling;
	int scaling = INT_MAX;
	int freq;

	pr_debug("CPU%d: perf_ctl_max_phys = %d\n", cpu->cpu, perf_ctl_max_phys);
	pr_debug("CPU%d: perf_ctl_max = %d\n", cpu->cpu, perf_ctl_max);
	pr_debug("CPU%d: perf_ctl_turbo = %d\n", cpu->cpu, perf_ctl_turbo);
	pr_debug("CPU%d: perf_ctl_scaling = %d\n", cpu->cpu, perf_ctl_scaling);

	pr_debug("CPU%d: HWP_CAP guaranteed = %d\n", cpu->cpu, cpu->pstate.max_pstate);
	pr_debug("CPU%d: HWP_CAP highest = %d\n", cpu->cpu, cpu->pstate.turbo_pstate);

#ifdef CONFIG_ACPI
	if (IS_ENABLED(CONFIG_ACPI_CPPC_LIB)) {
		struct cppc_perf_caps caps;

		if (intel_pstate_cppc_perf_caps(cpu, &caps)) {
			if (intel_pstate_cppc_perf_valid(caps.nominal_perf, &caps)) {
				pr_debug("CPU%d: Using CPPC nominal\n", cpu->cpu);

				/*
				 * If the CPPC nominal performance is valid, it
				 * can be assumed to correspond to cpu_khz.
				 */
				if (caps.nominal_perf == perf_ctl_max_phys) {
					intel_pstate_hybrid_hwp_perf_ctl_parity(cpu);
					return;
				}
				scaling = DIV_ROUND_UP(cpu_khz, caps.nominal_perf);
			} else if (intel_pstate_cppc_perf_valid(caps.guaranteed_perf, &caps)) {
				pr_debug("CPU%d: Using CPPC guaranteed\n", cpu->cpu);

				/*
				 * If the CPPC guaranteed performance is valid,
				 * it can be assumed to correspond to max_freq.
				 */
				if (caps.guaranteed_perf == perf_ctl_max) {
					intel_pstate_hybrid_hwp_perf_ctl_parity(cpu);
					return;
				}
				scaling = DIV_ROUND_UP(max_freq, caps.guaranteed_perf);
			}
		}
	}
#endif
	/*
	 * If the CPPC data cannot be used to compute the HWP-to-frequency
	 * scaling factor, use the HWP_CAP guaranteed performance for this
	 * purpose, under the assumption that it corresponds to max_freq.
	 */
	if (scaling > perf_ctl_scaling) {
		pr_debug("CPU%d: Using HWP_CAP guaranteed\n", cpu->cpu);

		if (cpu->pstate.max_pstate == perf_ctl_max) {
			intel_pstate_hybrid_hwp_perf_ctl_parity(cpu);
			return;
		}
		scaling = DIV_ROUND_UP(max_freq, cpu->pstate.max_pstate);
		if (scaling > perf_ctl_scaling) {
			/*
			 * This should not happen, because it would mean that
			 * the number of HWP perf levels was less than the
			 * number of P-states, so use the PERF_CTL scaling in
			 * that case.
			 */
			pr_debug("CPU%d: scaling (%d) out of range\n", cpu->cpu,
				 scaling);

			intel_pstate_hybrid_hwp_perf_ctl_parity(cpu);
			return;
		}
	}

	/*
	 * If the product of the HWP performance scaling factor obtained above
	 * and the HWP_CAP highest performance is greater than the maximum turbo
	 * frequency corresponding to the pstate_funcs.get_turbo() return value,
	 * the scaling factor is too high, so recompute it so that the HWP_CAP
	 * highest performance corresponds to the maximum turbo frequency.
	 */
	if (turbo_freq < cpu->pstate.turbo_pstate * scaling) {
		pr_debug("CPU%d: scaling too high (%d)\n", cpu->cpu, scaling);

		cpu->pstate.turbo_freq = turbo_freq;
		scaling = DIV_ROUND_UP(turbo_freq, cpu->pstate.turbo_pstate);
	}

	cpu->pstate.scaling = scaling;

	pr_debug("CPU%d: HWP-to-frequency scaling factor: %d\n", cpu->cpu, scaling);

	cpu->pstate.max_freq = rounddown(cpu->pstate.max_pstate * scaling,
					 perf_ctl_scaling);

	freq = perf_ctl_max_phys * perf_ctl_scaling;
	cpu->pstate.max_pstate_physical = DIV_ROUND_UP(freq, scaling);

	cpu->pstate.min_freq = cpu->pstate.min_pstate * perf_ctl_scaling;
	/*
	 * Cast the min P-state value retrieved via pstate_funcs.get_min() to
	 * the effective range of HWP performance levels.
	 */
	cpu->pstate.min_pstate = DIV_ROUND_UP(cpu->pstate.min_freq, scaling);
}

static inline void update_turbo_state(void)
{
	u64 misc_en;
	struct cpudata *cpu;

	cpu = all_cpu_data[0];
	rdmsrl(MSR_IA32_MISC_ENABLE, misc_en);
	global.turbo_disabled =
		(misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE ||
		 cpu->pstate.max_pstate == cpu->pstate.turbo_pstate);
}

static int min_perf_pct_min(void)
{
	struct cpudata *cpu = all_cpu_data[0];
	int turbo_pstate = cpu->pstate.turbo_pstate;

	return turbo_pstate ?
		(cpu->pstate.min_pstate * 100 / turbo_pstate) : 0;
}

static s16 intel_pstate_get_epb(struct cpudata *cpu_data)
{
	u64 epb;
	int ret;

	if (!boot_cpu_has(X86_FEATURE_EPB))
		return -ENXIO;

	ret = rdmsrl_on_cpu(cpu_data->cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb);
	if (ret)
		return (s16)ret;

	return (s16)(epb & 0x0f);
}

static s16 intel_pstate_get_epp(struct cpudata *cpu_data, u64 hwp_req_data)
{
	s16 epp;

	if (boot_cpu_has(X86_FEATURE_HWP_EPP)) {
		/*
		 * If hwp_req_data is 0, the caller has not read
		 * MSR_HWP_REQUEST, so read it here to get the EPP.
690 */ 691 if (!hwp_req_data) { 692 epp = rdmsrl_on_cpu(cpu_data->cpu, MSR_HWP_REQUEST, 693 &hwp_req_data); 694 if (epp) 695 return epp; 696 } 697 epp = (hwp_req_data >> 24) & 0xff; 698 } else { 699 /* When there is no EPP present, HWP uses EPB settings */ 700 epp = intel_pstate_get_epb(cpu_data); 701 } 702 703 return epp; 704 } 705 706 static int intel_pstate_set_epb(int cpu, s16 pref) 707 { 708 u64 epb; 709 int ret; 710 711 if (!boot_cpu_has(X86_FEATURE_EPB)) 712 return -ENXIO; 713 714 ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); 715 if (ret) 716 return ret; 717 718 epb = (epb & ~0x0f) | pref; 719 wrmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, epb); 720 721 return 0; 722 } 723 724 /* 725 * EPP/EPB display strings corresponding to EPP index in the 726 * energy_perf_strings[] 727 * index String 728 *------------------------------------- 729 * 0 default 730 * 1 performance 731 * 2 balance_performance 732 * 3 balance_power 733 * 4 power 734 */ 735 static const char * const energy_perf_strings[] = { 736 "default", 737 "performance", 738 "balance_performance", 739 "balance_power", 740 "power", 741 NULL 742 }; 743 static const unsigned int epp_values[] = { 744 HWP_EPP_PERFORMANCE, 745 HWP_EPP_BALANCE_PERFORMANCE, 746 HWP_EPP_BALANCE_POWERSAVE, 747 HWP_EPP_POWERSAVE 748 }; 749 750 static int intel_pstate_get_energy_pref_index(struct cpudata *cpu_data, int *raw_epp) 751 { 752 s16 epp; 753 int index = -EINVAL; 754 755 *raw_epp = 0; 756 epp = intel_pstate_get_epp(cpu_data, 0); 757 if (epp < 0) 758 return epp; 759 760 if (boot_cpu_has(X86_FEATURE_HWP_EPP)) { 761 if (epp == HWP_EPP_PERFORMANCE) 762 return 1; 763 if (epp == HWP_EPP_BALANCE_PERFORMANCE) 764 return 2; 765 if (epp == HWP_EPP_BALANCE_POWERSAVE) 766 return 3; 767 if (epp == HWP_EPP_POWERSAVE) 768 return 4; 769 *raw_epp = epp; 770 return 0; 771 } else if (boot_cpu_has(X86_FEATURE_EPB)) { 772 /* 773 * Range: 774 * 0x00-0x03 : Performance 775 * 0x04-0x07 : Balance performance 776 * 0x08-0x0B : Balance power 777 * 0x0C-0x0F : Power 778 * The EPB is a 4 bit value, but our ranges restrict the 779 * value which can be set. Here only using top two bits 780 * effectively. 781 */ 782 index = (epp >> 2) + 1; 783 } 784 785 return index; 786 } 787 788 static int intel_pstate_set_epp(struct cpudata *cpu, u32 epp) 789 { 790 int ret; 791 792 /* 793 * Use the cached HWP Request MSR value, because in the active mode the 794 * register itself may be updated by intel_pstate_hwp_boost_up() or 795 * intel_pstate_hwp_boost_down() at any time. 796 */ 797 u64 value = READ_ONCE(cpu->hwp_req_cached); 798 799 value &= ~GENMASK_ULL(31, 24); 800 value |= (u64)epp << 24; 801 /* 802 * The only other updater of hwp_req_cached in the active mode, 803 * intel_pstate_hwp_set(), is called under the same lock as this 804 * function, so it cannot run in parallel with the update below. 
805 */ 806 WRITE_ONCE(cpu->hwp_req_cached, value); 807 ret = wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value); 808 if (!ret) 809 cpu->epp_cached = epp; 810 811 return ret; 812 } 813 814 static int intel_pstate_set_energy_pref_index(struct cpudata *cpu_data, 815 int pref_index, bool use_raw, 816 u32 raw_epp) 817 { 818 int epp = -EINVAL; 819 int ret; 820 821 if (!pref_index) 822 epp = cpu_data->epp_default; 823 824 if (boot_cpu_has(X86_FEATURE_HWP_EPP)) { 825 if (use_raw) 826 epp = raw_epp; 827 else if (epp == -EINVAL) 828 epp = epp_values[pref_index - 1]; 829 830 /* 831 * To avoid confusion, refuse to set EPP to any values different 832 * from 0 (performance) if the current policy is "performance", 833 * because those values would be overridden. 834 */ 835 if (epp > 0 && cpu_data->policy == CPUFREQ_POLICY_PERFORMANCE) 836 return -EBUSY; 837 838 ret = intel_pstate_set_epp(cpu_data, epp); 839 } else { 840 if (epp == -EINVAL) 841 epp = (pref_index - 1) << 2; 842 ret = intel_pstate_set_epb(cpu_data->cpu, epp); 843 } 844 845 return ret; 846 } 847 848 static ssize_t show_energy_performance_available_preferences( 849 struct cpufreq_policy *policy, char *buf) 850 { 851 int i = 0; 852 int ret = 0; 853 854 while (energy_perf_strings[i] != NULL) 855 ret += sprintf(&buf[ret], "%s ", energy_perf_strings[i++]); 856 857 ret += sprintf(&buf[ret], "\n"); 858 859 return ret; 860 } 861 862 cpufreq_freq_attr_ro(energy_performance_available_preferences); 863 864 static struct cpufreq_driver intel_pstate; 865 866 static ssize_t store_energy_performance_preference( 867 struct cpufreq_policy *policy, const char *buf, size_t count) 868 { 869 struct cpudata *cpu = all_cpu_data[policy->cpu]; 870 char str_preference[21]; 871 bool raw = false; 872 ssize_t ret; 873 u32 epp = 0; 874 875 ret = sscanf(buf, "%20s", str_preference); 876 if (ret != 1) 877 return -EINVAL; 878 879 ret = match_string(energy_perf_strings, -1, str_preference); 880 if (ret < 0) { 881 if (!boot_cpu_has(X86_FEATURE_HWP_EPP)) 882 return ret; 883 884 ret = kstrtouint(buf, 10, &epp); 885 if (ret) 886 return ret; 887 888 if (epp > 255) 889 return -EINVAL; 890 891 raw = true; 892 } 893 894 /* 895 * This function runs with the policy R/W semaphore held, which 896 * guarantees that the driver pointer will not change while it is 897 * running. 898 */ 899 if (!intel_pstate_driver) 900 return -EAGAIN; 901 902 mutex_lock(&intel_pstate_limits_lock); 903 904 if (intel_pstate_driver == &intel_pstate) { 905 ret = intel_pstate_set_energy_pref_index(cpu, ret, raw, epp); 906 } else { 907 /* 908 * In the passive mode the governor needs to be stopped on the 909 * target CPU before the EPP update and restarted after it, 910 * which is super-heavy-weight, so make sure it is worth doing 911 * upfront. 912 */ 913 if (!raw) 914 epp = ret ? 
epp_values[ret - 1] : cpu->epp_default; 915 916 if (cpu->epp_cached != epp) { 917 int err; 918 919 cpufreq_stop_governor(policy); 920 ret = intel_pstate_set_epp(cpu, epp); 921 err = cpufreq_start_governor(policy); 922 if (!ret) 923 ret = err; 924 } 925 } 926 927 mutex_unlock(&intel_pstate_limits_lock); 928 929 return ret ?: count; 930 } 931 932 static ssize_t show_energy_performance_preference( 933 struct cpufreq_policy *policy, char *buf) 934 { 935 struct cpudata *cpu_data = all_cpu_data[policy->cpu]; 936 int preference, raw_epp; 937 938 preference = intel_pstate_get_energy_pref_index(cpu_data, &raw_epp); 939 if (preference < 0) 940 return preference; 941 942 if (raw_epp) 943 return sprintf(buf, "%d\n", raw_epp); 944 else 945 return sprintf(buf, "%s\n", energy_perf_strings[preference]); 946 } 947 948 cpufreq_freq_attr_rw(energy_performance_preference); 949 950 static ssize_t show_base_frequency(struct cpufreq_policy *policy, char *buf) 951 { 952 struct cpudata *cpu = all_cpu_data[policy->cpu]; 953 int ratio, freq; 954 955 ratio = intel_pstate_get_cppc_guaranteed(policy->cpu); 956 if (ratio <= 0) { 957 u64 cap; 958 959 rdmsrl_on_cpu(policy->cpu, MSR_HWP_CAPABILITIES, &cap); 960 ratio = HWP_GUARANTEED_PERF(cap); 961 } 962 963 freq = ratio * cpu->pstate.scaling; 964 if (cpu->pstate.scaling != cpu->pstate.perf_ctl_scaling) 965 freq = rounddown(freq, cpu->pstate.perf_ctl_scaling); 966 967 return sprintf(buf, "%d\n", freq); 968 } 969 970 cpufreq_freq_attr_ro(base_frequency); 971 972 static struct freq_attr *hwp_cpufreq_attrs[] = { 973 &energy_performance_preference, 974 &energy_performance_available_preferences, 975 &base_frequency, 976 NULL, 977 }; 978 979 static void __intel_pstate_get_hwp_cap(struct cpudata *cpu) 980 { 981 u64 cap; 982 983 rdmsrl_on_cpu(cpu->cpu, MSR_HWP_CAPABILITIES, &cap); 984 WRITE_ONCE(cpu->hwp_cap_cached, cap); 985 cpu->pstate.max_pstate = HWP_GUARANTEED_PERF(cap); 986 cpu->pstate.turbo_pstate = HWP_HIGHEST_PERF(cap); 987 } 988 989 static void intel_pstate_get_hwp_cap(struct cpudata *cpu) 990 { 991 int scaling = cpu->pstate.scaling; 992 993 __intel_pstate_get_hwp_cap(cpu); 994 995 cpu->pstate.max_freq = cpu->pstate.max_pstate * scaling; 996 cpu->pstate.turbo_freq = cpu->pstate.turbo_pstate * scaling; 997 if (scaling != cpu->pstate.perf_ctl_scaling) { 998 int perf_ctl_scaling = cpu->pstate.perf_ctl_scaling; 999 1000 cpu->pstate.max_freq = rounddown(cpu->pstate.max_freq, 1001 perf_ctl_scaling); 1002 cpu->pstate.turbo_freq = rounddown(cpu->pstate.turbo_freq, 1003 perf_ctl_scaling); 1004 } 1005 } 1006 1007 static void intel_pstate_hwp_set(unsigned int cpu) 1008 { 1009 struct cpudata *cpu_data = all_cpu_data[cpu]; 1010 int max, min; 1011 u64 value; 1012 s16 epp; 1013 1014 max = cpu_data->max_perf_ratio; 1015 min = cpu_data->min_perf_ratio; 1016 1017 if (cpu_data->policy == CPUFREQ_POLICY_PERFORMANCE) 1018 min = max; 1019 1020 rdmsrl_on_cpu(cpu, MSR_HWP_REQUEST, &value); 1021 1022 value &= ~HWP_MIN_PERF(~0L); 1023 value |= HWP_MIN_PERF(min); 1024 1025 value &= ~HWP_MAX_PERF(~0L); 1026 value |= HWP_MAX_PERF(max); 1027 1028 if (cpu_data->epp_policy == cpu_data->policy) 1029 goto skip_epp; 1030 1031 cpu_data->epp_policy = cpu_data->policy; 1032 1033 if (cpu_data->policy == CPUFREQ_POLICY_PERFORMANCE) { 1034 epp = intel_pstate_get_epp(cpu_data, value); 1035 cpu_data->epp_powersave = epp; 1036 /* If EPP read was failed, then don't try to write */ 1037 if (epp < 0) 1038 goto skip_epp; 1039 1040 epp = 0; 1041 } else { 1042 /* skip setting EPP, when saved value is invalid */ 1043 if 
(cpu_data->epp_powersave < 0) 1044 goto skip_epp; 1045 1046 /* 1047 * No need to restore EPP when it is not zero. This 1048 * means: 1049 * - Policy is not changed 1050 * - user has manually changed 1051 * - Error reading EPB 1052 */ 1053 epp = intel_pstate_get_epp(cpu_data, value); 1054 if (epp) 1055 goto skip_epp; 1056 1057 epp = cpu_data->epp_powersave; 1058 } 1059 if (boot_cpu_has(X86_FEATURE_HWP_EPP)) { 1060 value &= ~GENMASK_ULL(31, 24); 1061 value |= (u64)epp << 24; 1062 } else { 1063 intel_pstate_set_epb(cpu, epp); 1064 } 1065 skip_epp: 1066 WRITE_ONCE(cpu_data->hwp_req_cached, value); 1067 wrmsrl_on_cpu(cpu, MSR_HWP_REQUEST, value); 1068 } 1069 1070 static void intel_pstate_hwp_offline(struct cpudata *cpu) 1071 { 1072 u64 value = READ_ONCE(cpu->hwp_req_cached); 1073 int min_perf; 1074 1075 if (boot_cpu_has(X86_FEATURE_HWP_EPP)) { 1076 /* 1077 * In case the EPP has been set to "performance" by the 1078 * active mode "performance" scaling algorithm, replace that 1079 * temporary value with the cached EPP one. 1080 */ 1081 value &= ~GENMASK_ULL(31, 24); 1082 value |= HWP_ENERGY_PERF_PREFERENCE(cpu->epp_cached); 1083 WRITE_ONCE(cpu->hwp_req_cached, value); 1084 } 1085 1086 value &= ~GENMASK_ULL(31, 0); 1087 min_perf = HWP_LOWEST_PERF(READ_ONCE(cpu->hwp_cap_cached)); 1088 1089 /* Set hwp_max = hwp_min */ 1090 value |= HWP_MAX_PERF(min_perf); 1091 value |= HWP_MIN_PERF(min_perf); 1092 1093 /* Set EPP to min */ 1094 if (boot_cpu_has(X86_FEATURE_HWP_EPP)) 1095 value |= HWP_ENERGY_PERF_PREFERENCE(HWP_EPP_POWERSAVE); 1096 1097 wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value); 1098 } 1099 1100 #define POWER_CTL_EE_ENABLE 1 1101 #define POWER_CTL_EE_DISABLE 2 1102 1103 static int power_ctl_ee_state; 1104 1105 static void set_power_ctl_ee_state(bool input) 1106 { 1107 u64 power_ctl; 1108 1109 mutex_lock(&intel_pstate_driver_lock); 1110 rdmsrl(MSR_IA32_POWER_CTL, power_ctl); 1111 if (input) { 1112 power_ctl &= ~BIT(MSR_IA32_POWER_CTL_BIT_EE); 1113 power_ctl_ee_state = POWER_CTL_EE_ENABLE; 1114 } else { 1115 power_ctl |= BIT(MSR_IA32_POWER_CTL_BIT_EE); 1116 power_ctl_ee_state = POWER_CTL_EE_DISABLE; 1117 } 1118 wrmsrl(MSR_IA32_POWER_CTL, power_ctl); 1119 mutex_unlock(&intel_pstate_driver_lock); 1120 } 1121 1122 static void intel_pstate_hwp_enable(struct cpudata *cpudata); 1123 1124 static void intel_pstate_hwp_reenable(struct cpudata *cpu) 1125 { 1126 intel_pstate_hwp_enable(cpu); 1127 wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, READ_ONCE(cpu->hwp_req_cached)); 1128 } 1129 1130 static int intel_pstate_suspend(struct cpufreq_policy *policy) 1131 { 1132 struct cpudata *cpu = all_cpu_data[policy->cpu]; 1133 1134 pr_debug("CPU %d suspending\n", cpu->cpu); 1135 1136 cpu->suspended = true; 1137 1138 return 0; 1139 } 1140 1141 static int intel_pstate_resume(struct cpufreq_policy *policy) 1142 { 1143 struct cpudata *cpu = all_cpu_data[policy->cpu]; 1144 1145 pr_debug("CPU %d resuming\n", cpu->cpu); 1146 1147 /* Only restore if the system default is changed */ 1148 if (power_ctl_ee_state == POWER_CTL_EE_ENABLE) 1149 set_power_ctl_ee_state(true); 1150 else if (power_ctl_ee_state == POWER_CTL_EE_DISABLE) 1151 set_power_ctl_ee_state(false); 1152 1153 if (cpu->suspended && hwp_active) { 1154 mutex_lock(&intel_pstate_limits_lock); 1155 1156 /* Re-enable HWP, because "online" has not done that. 
*/ 1157 intel_pstate_hwp_reenable(cpu); 1158 1159 mutex_unlock(&intel_pstate_limits_lock); 1160 } 1161 1162 cpu->suspended = false; 1163 1164 return 0; 1165 } 1166 1167 static void intel_pstate_update_policies(void) 1168 { 1169 int cpu; 1170 1171 for_each_possible_cpu(cpu) 1172 cpufreq_update_policy(cpu); 1173 } 1174 1175 static void intel_pstate_update_max_freq(unsigned int cpu) 1176 { 1177 struct cpufreq_policy *policy = cpufreq_cpu_acquire(cpu); 1178 struct cpudata *cpudata; 1179 1180 if (!policy) 1181 return; 1182 1183 cpudata = all_cpu_data[cpu]; 1184 policy->cpuinfo.max_freq = global.turbo_disabled_mf ? 1185 cpudata->pstate.max_freq : cpudata->pstate.turbo_freq; 1186 1187 refresh_frequency_limits(policy); 1188 1189 cpufreq_cpu_release(policy); 1190 } 1191 1192 static void intel_pstate_update_limits(unsigned int cpu) 1193 { 1194 mutex_lock(&intel_pstate_driver_lock); 1195 1196 update_turbo_state(); 1197 /* 1198 * If turbo has been turned on or off globally, policy limits for 1199 * all CPUs need to be updated to reflect that. 1200 */ 1201 if (global.turbo_disabled_mf != global.turbo_disabled) { 1202 global.turbo_disabled_mf = global.turbo_disabled; 1203 arch_set_max_freq_ratio(global.turbo_disabled); 1204 for_each_possible_cpu(cpu) 1205 intel_pstate_update_max_freq(cpu); 1206 } else { 1207 cpufreq_update_policy(cpu); 1208 } 1209 1210 mutex_unlock(&intel_pstate_driver_lock); 1211 } 1212 1213 /************************** sysfs begin ************************/ 1214 #define show_one(file_name, object) \ 1215 static ssize_t show_##file_name \ 1216 (struct kobject *kobj, struct kobj_attribute *attr, char *buf) \ 1217 { \ 1218 return sprintf(buf, "%u\n", global.object); \ 1219 } 1220 1221 static ssize_t intel_pstate_show_status(char *buf); 1222 static int intel_pstate_update_status(const char *buf, size_t size); 1223 1224 static ssize_t show_status(struct kobject *kobj, 1225 struct kobj_attribute *attr, char *buf) 1226 { 1227 ssize_t ret; 1228 1229 mutex_lock(&intel_pstate_driver_lock); 1230 ret = intel_pstate_show_status(buf); 1231 mutex_unlock(&intel_pstate_driver_lock); 1232 1233 return ret; 1234 } 1235 1236 static ssize_t store_status(struct kobject *a, struct kobj_attribute *b, 1237 const char *buf, size_t count) 1238 { 1239 char *p = memchr(buf, '\n', count); 1240 int ret; 1241 1242 mutex_lock(&intel_pstate_driver_lock); 1243 ret = intel_pstate_update_status(buf, p ? p - buf : count); 1244 mutex_unlock(&intel_pstate_driver_lock); 1245 1246 return ret < 0 ? 
ret : count; 1247 } 1248 1249 static ssize_t show_turbo_pct(struct kobject *kobj, 1250 struct kobj_attribute *attr, char *buf) 1251 { 1252 struct cpudata *cpu; 1253 int total, no_turbo, turbo_pct; 1254 uint32_t turbo_fp; 1255 1256 mutex_lock(&intel_pstate_driver_lock); 1257 1258 if (!intel_pstate_driver) { 1259 mutex_unlock(&intel_pstate_driver_lock); 1260 return -EAGAIN; 1261 } 1262 1263 cpu = all_cpu_data[0]; 1264 1265 total = cpu->pstate.turbo_pstate - cpu->pstate.min_pstate + 1; 1266 no_turbo = cpu->pstate.max_pstate - cpu->pstate.min_pstate + 1; 1267 turbo_fp = div_fp(no_turbo, total); 1268 turbo_pct = 100 - fp_toint(mul_fp(turbo_fp, int_tofp(100))); 1269 1270 mutex_unlock(&intel_pstate_driver_lock); 1271 1272 return sprintf(buf, "%u\n", turbo_pct); 1273 } 1274 1275 static ssize_t show_num_pstates(struct kobject *kobj, 1276 struct kobj_attribute *attr, char *buf) 1277 { 1278 struct cpudata *cpu; 1279 int total; 1280 1281 mutex_lock(&intel_pstate_driver_lock); 1282 1283 if (!intel_pstate_driver) { 1284 mutex_unlock(&intel_pstate_driver_lock); 1285 return -EAGAIN; 1286 } 1287 1288 cpu = all_cpu_data[0]; 1289 total = cpu->pstate.turbo_pstate - cpu->pstate.min_pstate + 1; 1290 1291 mutex_unlock(&intel_pstate_driver_lock); 1292 1293 return sprintf(buf, "%u\n", total); 1294 } 1295 1296 static ssize_t show_no_turbo(struct kobject *kobj, 1297 struct kobj_attribute *attr, char *buf) 1298 { 1299 ssize_t ret; 1300 1301 mutex_lock(&intel_pstate_driver_lock); 1302 1303 if (!intel_pstate_driver) { 1304 mutex_unlock(&intel_pstate_driver_lock); 1305 return -EAGAIN; 1306 } 1307 1308 update_turbo_state(); 1309 if (global.turbo_disabled) 1310 ret = sprintf(buf, "%u\n", global.turbo_disabled); 1311 else 1312 ret = sprintf(buf, "%u\n", global.no_turbo); 1313 1314 mutex_unlock(&intel_pstate_driver_lock); 1315 1316 return ret; 1317 } 1318 1319 static ssize_t store_no_turbo(struct kobject *a, struct kobj_attribute *b, 1320 const char *buf, size_t count) 1321 { 1322 unsigned int input; 1323 int ret; 1324 1325 ret = sscanf(buf, "%u", &input); 1326 if (ret != 1) 1327 return -EINVAL; 1328 1329 mutex_lock(&intel_pstate_driver_lock); 1330 1331 if (!intel_pstate_driver) { 1332 mutex_unlock(&intel_pstate_driver_lock); 1333 return -EAGAIN; 1334 } 1335 1336 mutex_lock(&intel_pstate_limits_lock); 1337 1338 update_turbo_state(); 1339 if (global.turbo_disabled) { 1340 pr_notice_once("Turbo disabled by BIOS or unavailable on processor\n"); 1341 mutex_unlock(&intel_pstate_limits_lock); 1342 mutex_unlock(&intel_pstate_driver_lock); 1343 return -EPERM; 1344 } 1345 1346 global.no_turbo = clamp_t(int, input, 0, 1); 1347 1348 if (global.no_turbo) { 1349 struct cpudata *cpu = all_cpu_data[0]; 1350 int pct = cpu->pstate.max_pstate * 100 / cpu->pstate.turbo_pstate; 1351 1352 /* Squash the global minimum into the permitted range. 
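		 * For example (illustrative numbers only): with max_pstate = 30
		 * and turbo_pstate = 40, pct works out to 75, so a
		 * global.min_perf_pct of 90 would be lowered to 75 while
		 * no_turbo is set.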
*/ 1353 if (global.min_perf_pct > pct) 1354 global.min_perf_pct = pct; 1355 } 1356 1357 mutex_unlock(&intel_pstate_limits_lock); 1358 1359 intel_pstate_update_policies(); 1360 1361 mutex_unlock(&intel_pstate_driver_lock); 1362 1363 return count; 1364 } 1365 1366 static void update_qos_request(enum freq_qos_req_type type) 1367 { 1368 struct freq_qos_request *req; 1369 struct cpufreq_policy *policy; 1370 int i; 1371 1372 for_each_possible_cpu(i) { 1373 struct cpudata *cpu = all_cpu_data[i]; 1374 unsigned int freq, perf_pct; 1375 1376 policy = cpufreq_cpu_get(i); 1377 if (!policy) 1378 continue; 1379 1380 req = policy->driver_data; 1381 cpufreq_cpu_put(policy); 1382 1383 if (!req) 1384 continue; 1385 1386 if (hwp_active) 1387 intel_pstate_get_hwp_cap(cpu); 1388 1389 if (type == FREQ_QOS_MIN) { 1390 perf_pct = global.min_perf_pct; 1391 } else { 1392 req++; 1393 perf_pct = global.max_perf_pct; 1394 } 1395 1396 freq = DIV_ROUND_UP(cpu->pstate.turbo_freq * perf_pct, 100); 1397 1398 if (freq_qos_update_request(req, freq) < 0) 1399 pr_warn("Failed to update freq constraint: CPU%d\n", i); 1400 } 1401 } 1402 1403 static ssize_t store_max_perf_pct(struct kobject *a, struct kobj_attribute *b, 1404 const char *buf, size_t count) 1405 { 1406 unsigned int input; 1407 int ret; 1408 1409 ret = sscanf(buf, "%u", &input); 1410 if (ret != 1) 1411 return -EINVAL; 1412 1413 mutex_lock(&intel_pstate_driver_lock); 1414 1415 if (!intel_pstate_driver) { 1416 mutex_unlock(&intel_pstate_driver_lock); 1417 return -EAGAIN; 1418 } 1419 1420 mutex_lock(&intel_pstate_limits_lock); 1421 1422 global.max_perf_pct = clamp_t(int, input, global.min_perf_pct, 100); 1423 1424 mutex_unlock(&intel_pstate_limits_lock); 1425 1426 if (intel_pstate_driver == &intel_pstate) 1427 intel_pstate_update_policies(); 1428 else 1429 update_qos_request(FREQ_QOS_MAX); 1430 1431 mutex_unlock(&intel_pstate_driver_lock); 1432 1433 return count; 1434 } 1435 1436 static ssize_t store_min_perf_pct(struct kobject *a, struct kobj_attribute *b, 1437 const char *buf, size_t count) 1438 { 1439 unsigned int input; 1440 int ret; 1441 1442 ret = sscanf(buf, "%u", &input); 1443 if (ret != 1) 1444 return -EINVAL; 1445 1446 mutex_lock(&intel_pstate_driver_lock); 1447 1448 if (!intel_pstate_driver) { 1449 mutex_unlock(&intel_pstate_driver_lock); 1450 return -EAGAIN; 1451 } 1452 1453 mutex_lock(&intel_pstate_limits_lock); 1454 1455 global.min_perf_pct = clamp_t(int, input, 1456 min_perf_pct_min(), global.max_perf_pct); 1457 1458 mutex_unlock(&intel_pstate_limits_lock); 1459 1460 if (intel_pstate_driver == &intel_pstate) 1461 intel_pstate_update_policies(); 1462 else 1463 update_qos_request(FREQ_QOS_MIN); 1464 1465 mutex_unlock(&intel_pstate_driver_lock); 1466 1467 return count; 1468 } 1469 1470 static ssize_t show_hwp_dynamic_boost(struct kobject *kobj, 1471 struct kobj_attribute *attr, char *buf) 1472 { 1473 return sprintf(buf, "%u\n", hwp_boost); 1474 } 1475 1476 static ssize_t store_hwp_dynamic_boost(struct kobject *a, 1477 struct kobj_attribute *b, 1478 const char *buf, size_t count) 1479 { 1480 unsigned int input; 1481 int ret; 1482 1483 ret = kstrtouint(buf, 10, &input); 1484 if (ret) 1485 return ret; 1486 1487 mutex_lock(&intel_pstate_driver_lock); 1488 hwp_boost = !!input; 1489 intel_pstate_update_policies(); 1490 mutex_unlock(&intel_pstate_driver_lock); 1491 1492 return count; 1493 } 1494 1495 static ssize_t show_energy_efficiency(struct kobject *kobj, struct kobj_attribute *attr, 1496 char *buf) 1497 { 1498 u64 power_ctl; 1499 int enable; 1500 1501 
rdmsrl(MSR_IA32_POWER_CTL, power_ctl); 1502 enable = !!(power_ctl & BIT(MSR_IA32_POWER_CTL_BIT_EE)); 1503 return sprintf(buf, "%d\n", !enable); 1504 } 1505 1506 static ssize_t store_energy_efficiency(struct kobject *a, struct kobj_attribute *b, 1507 const char *buf, size_t count) 1508 { 1509 bool input; 1510 int ret; 1511 1512 ret = kstrtobool(buf, &input); 1513 if (ret) 1514 return ret; 1515 1516 set_power_ctl_ee_state(input); 1517 1518 return count; 1519 } 1520 1521 show_one(max_perf_pct, max_perf_pct); 1522 show_one(min_perf_pct, min_perf_pct); 1523 1524 define_one_global_rw(status); 1525 define_one_global_rw(no_turbo); 1526 define_one_global_rw(max_perf_pct); 1527 define_one_global_rw(min_perf_pct); 1528 define_one_global_ro(turbo_pct); 1529 define_one_global_ro(num_pstates); 1530 define_one_global_rw(hwp_dynamic_boost); 1531 define_one_global_rw(energy_efficiency); 1532 1533 static struct attribute *intel_pstate_attributes[] = { 1534 &status.attr, 1535 &no_turbo.attr, 1536 NULL 1537 }; 1538 1539 static const struct attribute_group intel_pstate_attr_group = { 1540 .attrs = intel_pstate_attributes, 1541 }; 1542 1543 static const struct x86_cpu_id intel_pstate_cpu_ee_disable_ids[]; 1544 1545 static struct kobject *intel_pstate_kobject; 1546 1547 static void __init intel_pstate_sysfs_expose_params(void) 1548 { 1549 int rc; 1550 1551 intel_pstate_kobject = kobject_create_and_add("intel_pstate", 1552 &cpu_subsys.dev_root->kobj); 1553 if (WARN_ON(!intel_pstate_kobject)) 1554 return; 1555 1556 rc = sysfs_create_group(intel_pstate_kobject, &intel_pstate_attr_group); 1557 if (WARN_ON(rc)) 1558 return; 1559 1560 if (!boot_cpu_has(X86_FEATURE_HYBRID_CPU)) { 1561 rc = sysfs_create_file(intel_pstate_kobject, &turbo_pct.attr); 1562 WARN_ON(rc); 1563 1564 rc = sysfs_create_file(intel_pstate_kobject, &num_pstates.attr); 1565 WARN_ON(rc); 1566 } 1567 1568 /* 1569 * If per cpu limits are enforced there are no global limits, so 1570 * return without creating max/min_perf_pct attributes 1571 */ 1572 if (per_cpu_limits) 1573 return; 1574 1575 rc = sysfs_create_file(intel_pstate_kobject, &max_perf_pct.attr); 1576 WARN_ON(rc); 1577 1578 rc = sysfs_create_file(intel_pstate_kobject, &min_perf_pct.attr); 1579 WARN_ON(rc); 1580 1581 if (x86_match_cpu(intel_pstate_cpu_ee_disable_ids)) { 1582 rc = sysfs_create_file(intel_pstate_kobject, &energy_efficiency.attr); 1583 WARN_ON(rc); 1584 } 1585 } 1586 1587 static void __init intel_pstate_sysfs_remove(void) 1588 { 1589 if (!intel_pstate_kobject) 1590 return; 1591 1592 sysfs_remove_group(intel_pstate_kobject, &intel_pstate_attr_group); 1593 1594 if (!boot_cpu_has(X86_FEATURE_HYBRID_CPU)) { 1595 sysfs_remove_file(intel_pstate_kobject, &num_pstates.attr); 1596 sysfs_remove_file(intel_pstate_kobject, &turbo_pct.attr); 1597 } 1598 1599 if (!per_cpu_limits) { 1600 sysfs_remove_file(intel_pstate_kobject, &max_perf_pct.attr); 1601 sysfs_remove_file(intel_pstate_kobject, &min_perf_pct.attr); 1602 1603 if (x86_match_cpu(intel_pstate_cpu_ee_disable_ids)) 1604 sysfs_remove_file(intel_pstate_kobject, &energy_efficiency.attr); 1605 } 1606 1607 kobject_put(intel_pstate_kobject); 1608 } 1609 1610 static void intel_pstate_sysfs_expose_hwp_dynamic_boost(void) 1611 { 1612 int rc; 1613 1614 if (!hwp_active) 1615 return; 1616 1617 rc = sysfs_create_file(intel_pstate_kobject, &hwp_dynamic_boost.attr); 1618 WARN_ON_ONCE(rc); 1619 } 1620 1621 static void intel_pstate_sysfs_hide_hwp_dynamic_boost(void) 1622 { 1623 if (!hwp_active) 1624 return; 1625 1626 sysfs_remove_file(intel_pstate_kobject, 
&hwp_dynamic_boost.attr); 1627 } 1628 1629 /************************** sysfs end ************************/ 1630 1631 static void intel_pstate_notify_work(struct work_struct *work) 1632 { 1633 mutex_lock(&intel_pstate_driver_lock); 1634 cpufreq_update_policy(smp_processor_id()); 1635 wrmsrl(MSR_HWP_STATUS, 0); 1636 mutex_unlock(&intel_pstate_driver_lock); 1637 } 1638 1639 void notify_hwp_interrupt(void) 1640 { 1641 unsigned int this_cpu = smp_processor_id(); 1642 struct cpudata *cpudata; 1643 u64 value; 1644 1645 if (!hwp_active || !boot_cpu_has(X86_FEATURE_HWP_NOTIFY)) 1646 return; 1647 1648 rdmsrl(MSR_HWP_STATUS, value); 1649 if (!(value & 0x01)) 1650 return; 1651 1652 cpudata = all_cpu_data[this_cpu]; 1653 schedule_delayed_work_on(this_cpu, &cpudata->hwp_notify_work, msecs_to_jiffies(10)); 1654 } 1655 1656 static void intel_pstate_enable_hwp_interrupt(struct cpudata *cpudata) 1657 { 1658 /* Enable HWP notification interrupt for guaranteed performance change */ 1659 if (boot_cpu_has(X86_FEATURE_HWP_NOTIFY)) { 1660 INIT_DELAYED_WORK(&cpudata->hwp_notify_work, intel_pstate_notify_work); 1661 wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x01); 1662 } 1663 } 1664 1665 static void intel_pstate_hwp_enable(struct cpudata *cpudata) 1666 { 1667 /* First disable HWP notification interrupt as we don't process them */ 1668 if (boot_cpu_has(X86_FEATURE_HWP_NOTIFY)) 1669 wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x00); 1670 1671 wrmsrl_on_cpu(cpudata->cpu, MSR_PM_ENABLE, 0x1); 1672 if (cpudata->epp_default == -EINVAL) 1673 cpudata->epp_default = intel_pstate_get_epp(cpudata, 0); 1674 1675 intel_pstate_enable_hwp_interrupt(cpudata); 1676 } 1677 1678 static int atom_get_min_pstate(void) 1679 { 1680 u64 value; 1681 1682 rdmsrl(MSR_ATOM_CORE_RATIOS, value); 1683 return (value >> 8) & 0x7F; 1684 } 1685 1686 static int atom_get_max_pstate(void) 1687 { 1688 u64 value; 1689 1690 rdmsrl(MSR_ATOM_CORE_RATIOS, value); 1691 return (value >> 16) & 0x7F; 1692 } 1693 1694 static int atom_get_turbo_pstate(void) 1695 { 1696 u64 value; 1697 1698 rdmsrl(MSR_ATOM_CORE_TURBO_RATIOS, value); 1699 return value & 0x7F; 1700 } 1701 1702 static u64 atom_get_val(struct cpudata *cpudata, int pstate) 1703 { 1704 u64 val; 1705 int32_t vid_fp; 1706 u32 vid; 1707 1708 val = (u64)pstate << 8; 1709 if (global.no_turbo && !global.turbo_disabled) 1710 val |= (u64)1 << 32; 1711 1712 vid_fp = cpudata->vid.min + mul_fp( 1713 int_tofp(pstate - cpudata->pstate.min_pstate), 1714 cpudata->vid.ratio); 1715 1716 vid_fp = clamp_t(int32_t, vid_fp, cpudata->vid.min, cpudata->vid.max); 1717 vid = ceiling_fp(vid_fp); 1718 1719 if (pstate > cpudata->pstate.max_pstate) 1720 vid = cpudata->vid.turbo; 1721 1722 return val | vid; 1723 } 1724 1725 static int silvermont_get_scaling(void) 1726 { 1727 u64 value; 1728 int i; 1729 /* Defined in Table 35-6 from SDM (Sept 2015) */ 1730 static int silvermont_freq_table[] = { 1731 83300, 100000, 133300, 116700, 80000}; 1732 1733 rdmsrl(MSR_FSB_FREQ, value); 1734 i = value & 0x7; 1735 WARN_ON(i > 4); 1736 1737 return silvermont_freq_table[i]; 1738 } 1739 1740 static int airmont_get_scaling(void) 1741 { 1742 u64 value; 1743 int i; 1744 /* Defined in Table 35-10 from SDM (Sept 2015) */ 1745 static int airmont_freq_table[] = { 1746 83300, 100000, 133300, 116700, 80000, 1747 93300, 90000, 88900, 87500}; 1748 1749 rdmsrl(MSR_FSB_FREQ, value); 1750 i = value & 0xF; 1751 WARN_ON(i > 8); 1752 1753 return airmont_freq_table[i]; 1754 } 1755 1756 static void atom_get_vid(struct cpudata *cpudata) 1757 { 1758 u64 value; 
1759 1760 rdmsrl(MSR_ATOM_CORE_VIDS, value); 1761 cpudata->vid.min = int_tofp((value >> 8) & 0x7f); 1762 cpudata->vid.max = int_tofp((value >> 16) & 0x7f); 1763 cpudata->vid.ratio = div_fp( 1764 cpudata->vid.max - cpudata->vid.min, 1765 int_tofp(cpudata->pstate.max_pstate - 1766 cpudata->pstate.min_pstate)); 1767 1768 rdmsrl(MSR_ATOM_CORE_TURBO_VIDS, value); 1769 cpudata->vid.turbo = value & 0x7f; 1770 } 1771 1772 static int core_get_min_pstate(void) 1773 { 1774 u64 value; 1775 1776 rdmsrl(MSR_PLATFORM_INFO, value); 1777 return (value >> 40) & 0xFF; 1778 } 1779 1780 static int core_get_max_pstate_physical(void) 1781 { 1782 u64 value; 1783 1784 rdmsrl(MSR_PLATFORM_INFO, value); 1785 return (value >> 8) & 0xFF; 1786 } 1787 1788 static int core_get_tdp_ratio(u64 plat_info) 1789 { 1790 /* Check how many TDP levels present */ 1791 if (plat_info & 0x600000000) { 1792 u64 tdp_ctrl; 1793 u64 tdp_ratio; 1794 int tdp_msr; 1795 int err; 1796 1797 /* Get the TDP level (0, 1, 2) to get ratios */ 1798 err = rdmsrl_safe(MSR_CONFIG_TDP_CONTROL, &tdp_ctrl); 1799 if (err) 1800 return err; 1801 1802 /* TDP MSR are continuous starting at 0x648 */ 1803 tdp_msr = MSR_CONFIG_TDP_NOMINAL + (tdp_ctrl & 0x03); 1804 err = rdmsrl_safe(tdp_msr, &tdp_ratio); 1805 if (err) 1806 return err; 1807 1808 /* For level 1 and 2, bits[23:16] contain the ratio */ 1809 if (tdp_ctrl & 0x03) 1810 tdp_ratio >>= 16; 1811 1812 tdp_ratio &= 0xff; /* ratios are only 8 bits long */ 1813 pr_debug("tdp_ratio %x\n", (int)tdp_ratio); 1814 1815 return (int)tdp_ratio; 1816 } 1817 1818 return -ENXIO; 1819 } 1820 1821 static int core_get_max_pstate(void) 1822 { 1823 u64 tar; 1824 u64 plat_info; 1825 int max_pstate; 1826 int tdp_ratio; 1827 int err; 1828 1829 rdmsrl(MSR_PLATFORM_INFO, plat_info); 1830 max_pstate = (plat_info >> 8) & 0xFF; 1831 1832 tdp_ratio = core_get_tdp_ratio(plat_info); 1833 if (tdp_ratio <= 0) 1834 return max_pstate; 1835 1836 if (hwp_active) { 1837 /* Turbo activation ratio is not used on HWP platforms */ 1838 return tdp_ratio; 1839 } 1840 1841 err = rdmsrl_safe(MSR_TURBO_ACTIVATION_RATIO, &tar); 1842 if (!err) { 1843 int tar_levels; 1844 1845 /* Do some sanity checking for safety */ 1846 tar_levels = tar & 0xff; 1847 if (tdp_ratio - 1 == tar_levels) { 1848 max_pstate = tar_levels; 1849 pr_debug("max_pstate=TAC %x\n", max_pstate); 1850 } 1851 } 1852 1853 return max_pstate; 1854 } 1855 1856 static int core_get_turbo_pstate(void) 1857 { 1858 u64 value; 1859 int nont, ret; 1860 1861 rdmsrl(MSR_TURBO_RATIO_LIMIT, value); 1862 nont = core_get_max_pstate(); 1863 ret = (value) & 255; 1864 if (ret <= nont) 1865 ret = nont; 1866 return ret; 1867 } 1868 1869 static inline int core_get_scaling(void) 1870 { 1871 return 100000; 1872 } 1873 1874 static u64 core_get_val(struct cpudata *cpudata, int pstate) 1875 { 1876 u64 val; 1877 1878 val = (u64)pstate << 8; 1879 if (global.no_turbo && !global.turbo_disabled) 1880 val |= (u64)1 << 32; 1881 1882 return val; 1883 } 1884 1885 static int knl_get_aperf_mperf_shift(void) 1886 { 1887 return 10; 1888 } 1889 1890 static int knl_get_turbo_pstate(void) 1891 { 1892 u64 value; 1893 int nont, ret; 1894 1895 rdmsrl(MSR_TURBO_RATIO_LIMIT, value); 1896 nont = core_get_max_pstate(); 1897 ret = (((value) >> 8) & 0xFF); 1898 if (ret <= nont) 1899 ret = nont; 1900 return ret; 1901 } 1902 1903 static void intel_pstate_set_pstate(struct cpudata *cpu, int pstate) 1904 { 1905 trace_cpu_frequency(pstate * cpu->pstate.scaling, cpu->cpu); 1906 cpu->pstate.current_pstate = pstate; 1907 /* 1908 * Generally, there is 
no guarantee that this code will always run on
	 * the CPU being updated, so force the register update to run on the
	 * right CPU.
	 */
	wrmsrl_on_cpu(cpu->cpu, MSR_IA32_PERF_CTL,
		      pstate_funcs.get_val(cpu, pstate));
}

static void intel_pstate_set_min_pstate(struct cpudata *cpu)
{
	intel_pstate_set_pstate(cpu, cpu->pstate.min_pstate);
}

static void intel_pstate_max_within_limits(struct cpudata *cpu)
{
	int pstate = max(cpu->pstate.min_pstate, cpu->max_perf_ratio);

	update_turbo_state();
	intel_pstate_set_pstate(cpu, pstate);
}

static void intel_pstate_get_cpu_pstates(struct cpudata *cpu)
{
	bool hybrid_cpu = boot_cpu_has(X86_FEATURE_HYBRID_CPU);
	int perf_ctl_max_phys = pstate_funcs.get_max_physical();
	int perf_ctl_scaling = hybrid_cpu ? cpu_khz / perf_ctl_max_phys :
					    pstate_funcs.get_scaling();

	cpu->pstate.min_pstate = pstate_funcs.get_min();
	cpu->pstate.max_pstate_physical = perf_ctl_max_phys;
	cpu->pstate.perf_ctl_scaling = perf_ctl_scaling;

	if (hwp_active && !hwp_mode_bdw) {
		__intel_pstate_get_hwp_cap(cpu);

		if (hybrid_cpu)
			intel_pstate_hybrid_hwp_calibrate(cpu);
		else
			cpu->pstate.scaling = perf_ctl_scaling;
	} else {
		cpu->pstate.scaling = perf_ctl_scaling;
		cpu->pstate.max_pstate = pstate_funcs.get_max();
		cpu->pstate.turbo_pstate = pstate_funcs.get_turbo();
	}

	if (cpu->pstate.scaling == perf_ctl_scaling) {
		cpu->pstate.min_freq = cpu->pstate.min_pstate * perf_ctl_scaling;
		cpu->pstate.max_freq = cpu->pstate.max_pstate * perf_ctl_scaling;
		cpu->pstate.turbo_freq = cpu->pstate.turbo_pstate * perf_ctl_scaling;
	}

	if (pstate_funcs.get_aperf_mperf_shift)
		cpu->aperf_mperf_shift = pstate_funcs.get_aperf_mperf_shift();

	if (pstate_funcs.get_vid)
		pstate_funcs.get_vid(cpu);

	intel_pstate_set_min_pstate(cpu);
}

/*
 * A long hold time keeps the high performance limits in place for a long
 * time, which hurts performance-per-watt for some workloads, such as
 * specpower.  The 3 ms value is based on experiments with some workloads.
 */
static int hwp_boost_hold_time_ns = 3 * NSEC_PER_MSEC;

static inline void intel_pstate_hwp_boost_up(struct cpudata *cpu)
{
	u64 hwp_req = READ_ONCE(cpu->hwp_req_cached);
	u64 hwp_cap = READ_ONCE(cpu->hwp_cap_cached);
	u32 max_limit = (hwp_req & 0xff00) >> 8;
	u32 min_limit = (hwp_req & 0xff);
	u32 boost_level1;

	/*
	 * Cases to consider (user changes via sysfs or at boot time):
	 * If P0 (turbo max) = P1 (guaranteed max) = min:
	 *	No boost, return.
	 * If P0 (turbo max) > P1 (guaranteed max) = min:
	 *	Should result in one level boost only for P0.
	 * If P0 (turbo max) = P1 (guaranteed max) > min:
	 *	Should result in two level boost:
	 *	(min + P1) / 2 and P1.
	 * If P0 (turbo max) > P1 (guaranteed max) > min:
	 *	Should result in three level boost:
	 *	(min + P1) / 2, P1 and P0.
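	 *
	 * For example (illustrative numbers only): with min = 10, P1 = 20 and
	 * P0 = 25, successive boosts raise hwp_boost_min from 10 to 15
	 * ((10 + 20) / 2), then to 20 (P1) and finally to 25 (P0).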
1996 */ 1997 1998 /* If max and min are equal or already at max, nothing to boost */ 1999 if (max_limit == min_limit || cpu->hwp_boost_min >= max_limit) 2000 return; 2001 2002 if (!cpu->hwp_boost_min) 2003 cpu->hwp_boost_min = min_limit; 2004 2005 /* level at the halfway mark between min and guaranteed */ 2006 boost_level1 = (HWP_GUARANTEED_PERF(hwp_cap) + min_limit) >> 1; 2007 2008 if (cpu->hwp_boost_min < boost_level1) 2009 cpu->hwp_boost_min = boost_level1; 2010 else if (cpu->hwp_boost_min < HWP_GUARANTEED_PERF(hwp_cap)) 2011 cpu->hwp_boost_min = HWP_GUARANTEED_PERF(hwp_cap); 2012 else if (cpu->hwp_boost_min == HWP_GUARANTEED_PERF(hwp_cap) && 2013 max_limit != HWP_GUARANTEED_PERF(hwp_cap)) 2014 cpu->hwp_boost_min = max_limit; 2015 else 2016 return; 2017 2018 hwp_req = (hwp_req & ~GENMASK_ULL(7, 0)) | cpu->hwp_boost_min; 2019 wrmsrl(MSR_HWP_REQUEST, hwp_req); 2020 cpu->last_update = cpu->sample.time; 2021 } 2022 2023 static inline void intel_pstate_hwp_boost_down(struct cpudata *cpu) 2024 { 2025 if (cpu->hwp_boost_min) { 2026 bool expired; 2027 2028 /* Check if we have been idle for the hold time before boosting down */ 2029 expired = time_after64(cpu->sample.time, cpu->last_update + 2030 hwp_boost_hold_time_ns); 2031 if (expired) { 2032 wrmsrl(MSR_HWP_REQUEST, cpu->hwp_req_cached); 2033 cpu->hwp_boost_min = 0; 2034 } 2035 } 2036 cpu->last_update = cpu->sample.time; 2037 } 2038 2039 static inline void intel_pstate_update_util_hwp_local(struct cpudata *cpu, 2040 u64 time) 2041 { 2042 cpu->sample.time = time; 2043 2044 if (cpu->sched_flags & SCHED_CPUFREQ_IOWAIT) { 2045 bool do_io = false; 2046 2047 cpu->sched_flags = 0; 2048 /* 2049 * Set iowait_boost flag and update time. Since the IO WAIT flag 2050 * is set all the time, we can't just conclude that some 2051 * IO bound activity is scheduled on this CPU from just 2052 * one occurrence. If we receive at least two in two 2053 * consecutive ticks, then we treat it as a boost candidate.
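	 * (For illustration only: assuming HZ=1000, TICK_NSEC is about
	 * 1 ms, so the window checked below spans roughly 2 ms.)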
2054 */ 2055 if (time_before64(time, cpu->last_io_update + 2 * TICK_NSEC)) 2056 do_io = true; 2057 2058 cpu->last_io_update = time; 2059 2060 if (do_io) 2061 intel_pstate_hwp_boost_up(cpu); 2062 2063 } else { 2064 intel_pstate_hwp_boost_down(cpu); 2065 } 2066 } 2067 2068 static inline void intel_pstate_update_util_hwp(struct update_util_data *data, 2069 u64 time, unsigned int flags) 2070 { 2071 struct cpudata *cpu = container_of(data, struct cpudata, update_util); 2072 2073 cpu->sched_flags |= flags; 2074 2075 if (smp_processor_id() == cpu->cpu) 2076 intel_pstate_update_util_hwp_local(cpu, time); 2077 } 2078 2079 static inline void intel_pstate_calc_avg_perf(struct cpudata *cpu) 2080 { 2081 struct sample *sample = &cpu->sample; 2082 2083 sample->core_avg_perf = div_ext_fp(sample->aperf, sample->mperf); 2084 } 2085 2086 static inline bool intel_pstate_sample(struct cpudata *cpu, u64 time) 2087 { 2088 u64 aperf, mperf; 2089 unsigned long flags; 2090 u64 tsc; 2091 2092 local_irq_save(flags); 2093 rdmsrl(MSR_IA32_APERF, aperf); 2094 rdmsrl(MSR_IA32_MPERF, mperf); 2095 tsc = rdtsc(); 2096 if (cpu->prev_mperf == mperf || cpu->prev_tsc == tsc) { 2097 local_irq_restore(flags); 2098 return false; 2099 } 2100 local_irq_restore(flags); 2101 2102 cpu->last_sample_time = cpu->sample.time; 2103 cpu->sample.time = time; 2104 cpu->sample.aperf = aperf; 2105 cpu->sample.mperf = mperf; 2106 cpu->sample.tsc = tsc; 2107 cpu->sample.aperf -= cpu->prev_aperf; 2108 cpu->sample.mperf -= cpu->prev_mperf; 2109 cpu->sample.tsc -= cpu->prev_tsc; 2110 2111 cpu->prev_aperf = aperf; 2112 cpu->prev_mperf = mperf; 2113 cpu->prev_tsc = tsc; 2114 /* 2115 * First time this function is invoked in a given cycle, all of the 2116 * previous sample data fields are equal to zero or stale and they must 2117 * be populated with meaningful numbers for things to work, so assume 2118 * that sample.time will always be reset before setting the utilization 2119 * update hook and make the caller skip the sample then. 2120 */ 2121 if (cpu->last_sample_time) { 2122 intel_pstate_calc_avg_perf(cpu); 2123 return true; 2124 } 2125 return false; 2126 } 2127 2128 static inline int32_t get_avg_frequency(struct cpudata *cpu) 2129 { 2130 return mul_ext_fp(cpu->sample.core_avg_perf, cpu_khz); 2131 } 2132 2133 static inline int32_t get_avg_pstate(struct cpudata *cpu) 2134 { 2135 return mul_ext_fp(cpu->pstate.max_pstate_physical, 2136 cpu->sample.core_avg_perf); 2137 } 2138 2139 static inline int32_t get_target_pstate(struct cpudata *cpu) 2140 { 2141 struct sample *sample = &cpu->sample; 2142 int32_t busy_frac; 2143 int target, avg_pstate; 2144 2145 busy_frac = div_fp(sample->mperf << cpu->aperf_mperf_shift, 2146 sample->tsc); 2147 2148 if (busy_frac < cpu->iowait_boost) 2149 busy_frac = cpu->iowait_boost; 2150 2151 sample->busy_scaled = busy_frac * 100; 2152 2153 target = global.no_turbo || global.turbo_disabled ? 2154 cpu->pstate.max_pstate : cpu->pstate.turbo_pstate; 2155 target += target >> 2; 2156 target = mul_fp(target, busy_frac); 2157 if (target < cpu->pstate.min_pstate) 2158 target = cpu->pstate.min_pstate; 2159 2160 /* 2161 * If the average P-state during the previous cycle was higher than the 2162 * current target, add 50% of the difference to the target to reduce 2163 * possible performance oscillations and offset possible performance 2164 * loss related to moving the workload from one CPU to another within 2165 * a package/module. 
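	 * For instance, if the average P-state over the last cycle was 30
	 * and the freshly computed target is 20, the request becomes
	 * 20 + (30 - 20) / 2 = 25.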
2166 */ 2167 avg_pstate = get_avg_pstate(cpu); 2168 if (avg_pstate > target) 2169 target += (avg_pstate - target) >> 1; 2170 2171 return target; 2172 } 2173 2174 static int intel_pstate_prepare_request(struct cpudata *cpu, int pstate) 2175 { 2176 int min_pstate = max(cpu->pstate.min_pstate, cpu->min_perf_ratio); 2177 int max_pstate = max(min_pstate, cpu->max_perf_ratio); 2178 2179 return clamp_t(int, pstate, min_pstate, max_pstate); 2180 } 2181 2182 static void intel_pstate_update_pstate(struct cpudata *cpu, int pstate) 2183 { 2184 if (pstate == cpu->pstate.current_pstate) 2185 return; 2186 2187 cpu->pstate.current_pstate = pstate; 2188 wrmsrl(MSR_IA32_PERF_CTL, pstate_funcs.get_val(cpu, pstate)); 2189 } 2190 2191 static void intel_pstate_adjust_pstate(struct cpudata *cpu) 2192 { 2193 int from = cpu->pstate.current_pstate; 2194 struct sample *sample; 2195 int target_pstate; 2196 2197 update_turbo_state(); 2198 2199 target_pstate = get_target_pstate(cpu); 2200 target_pstate = intel_pstate_prepare_request(cpu, target_pstate); 2201 trace_cpu_frequency(target_pstate * cpu->pstate.scaling, cpu->cpu); 2202 intel_pstate_update_pstate(cpu, target_pstate); 2203 2204 sample = &cpu->sample; 2205 trace_pstate_sample(mul_ext_fp(100, sample->core_avg_perf), 2206 fp_toint(sample->busy_scaled), 2207 from, 2208 cpu->pstate.current_pstate, 2209 sample->mperf, 2210 sample->aperf, 2211 sample->tsc, 2212 get_avg_frequency(cpu), 2213 fp_toint(cpu->iowait_boost * 100)); 2214 } 2215 2216 static void intel_pstate_update_util(struct update_util_data *data, u64 time, 2217 unsigned int flags) 2218 { 2219 struct cpudata *cpu = container_of(data, struct cpudata, update_util); 2220 u64 delta_ns; 2221 2222 /* Don't allow remote callbacks */ 2223 if (smp_processor_id() != cpu->cpu) 2224 return; 2225 2226 delta_ns = time - cpu->last_update; 2227 if (flags & SCHED_CPUFREQ_IOWAIT) { 2228 /* Start over if the CPU may have been idle. */ 2229 if (delta_ns > TICK_NSEC) { 2230 cpu->iowait_boost = ONE_EIGHTH_FP; 2231 } else if (cpu->iowait_boost >= ONE_EIGHTH_FP) { 2232 cpu->iowait_boost <<= 1; 2233 if (cpu->iowait_boost > int_tofp(1)) 2234 cpu->iowait_boost = int_tofp(1); 2235 } else { 2236 cpu->iowait_boost = ONE_EIGHTH_FP; 2237 } 2238 } else if (cpu->iowait_boost) { 2239 /* Clear iowait_boost if the CPU may have been idle. 
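	 * Otherwise halve it so that the boost decays gradually.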
*/ 2240 if (delta_ns > TICK_NSEC) 2241 cpu->iowait_boost = 0; 2242 else 2243 cpu->iowait_boost >>= 1; 2244 } 2245 cpu->last_update = time; 2246 delta_ns = time - cpu->sample.time; 2247 if ((s64)delta_ns < INTEL_PSTATE_SAMPLING_INTERVAL) 2248 return; 2249 2250 if (intel_pstate_sample(cpu, time)) 2251 intel_pstate_adjust_pstate(cpu); 2252 } 2253 2254 static struct pstate_funcs core_funcs = { 2255 .get_max = core_get_max_pstate, 2256 .get_max_physical = core_get_max_pstate_physical, 2257 .get_min = core_get_min_pstate, 2258 .get_turbo = core_get_turbo_pstate, 2259 .get_scaling = core_get_scaling, 2260 .get_val = core_get_val, 2261 }; 2262 2263 static const struct pstate_funcs silvermont_funcs = { 2264 .get_max = atom_get_max_pstate, 2265 .get_max_physical = atom_get_max_pstate, 2266 .get_min = atom_get_min_pstate, 2267 .get_turbo = atom_get_turbo_pstate, 2268 .get_val = atom_get_val, 2269 .get_scaling = silvermont_get_scaling, 2270 .get_vid = atom_get_vid, 2271 }; 2272 2273 static const struct pstate_funcs airmont_funcs = { 2274 .get_max = atom_get_max_pstate, 2275 .get_max_physical = atom_get_max_pstate, 2276 .get_min = atom_get_min_pstate, 2277 .get_turbo = atom_get_turbo_pstate, 2278 .get_val = atom_get_val, 2279 .get_scaling = airmont_get_scaling, 2280 .get_vid = atom_get_vid, 2281 }; 2282 2283 static const struct pstate_funcs knl_funcs = { 2284 .get_max = core_get_max_pstate, 2285 .get_max_physical = core_get_max_pstate_physical, 2286 .get_min = core_get_min_pstate, 2287 .get_turbo = knl_get_turbo_pstate, 2288 .get_aperf_mperf_shift = knl_get_aperf_mperf_shift, 2289 .get_scaling = core_get_scaling, 2290 .get_val = core_get_val, 2291 }; 2292 2293 #define X86_MATCH(model, policy) \ 2294 X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, INTEL_FAM6_##model, \ 2295 X86_FEATURE_APERFMPERF, &policy) 2296 2297 static const struct x86_cpu_id intel_pstate_cpu_ids[] = { 2298 X86_MATCH(SANDYBRIDGE, core_funcs), 2299 X86_MATCH(SANDYBRIDGE_X, core_funcs), 2300 X86_MATCH(ATOM_SILVERMONT, silvermont_funcs), 2301 X86_MATCH(IVYBRIDGE, core_funcs), 2302 X86_MATCH(HASWELL, core_funcs), 2303 X86_MATCH(BROADWELL, core_funcs), 2304 X86_MATCH(IVYBRIDGE_X, core_funcs), 2305 X86_MATCH(HASWELL_X, core_funcs), 2306 X86_MATCH(HASWELL_L, core_funcs), 2307 X86_MATCH(HASWELL_G, core_funcs), 2308 X86_MATCH(BROADWELL_G, core_funcs), 2309 X86_MATCH(ATOM_AIRMONT, airmont_funcs), 2310 X86_MATCH(SKYLAKE_L, core_funcs), 2311 X86_MATCH(BROADWELL_X, core_funcs), 2312 X86_MATCH(SKYLAKE, core_funcs), 2313 X86_MATCH(BROADWELL_D, core_funcs), 2314 X86_MATCH(XEON_PHI_KNL, knl_funcs), 2315 X86_MATCH(XEON_PHI_KNM, knl_funcs), 2316 X86_MATCH(ATOM_GOLDMONT, core_funcs), 2317 X86_MATCH(ATOM_GOLDMONT_PLUS, core_funcs), 2318 X86_MATCH(SKYLAKE_X, core_funcs), 2319 X86_MATCH(COMETLAKE, core_funcs), 2320 X86_MATCH(ICELAKE_X, core_funcs), 2321 {} 2322 }; 2323 MODULE_DEVICE_TABLE(x86cpu, intel_pstate_cpu_ids); 2324 2325 static const struct x86_cpu_id intel_pstate_cpu_oob_ids[] __initconst = { 2326 X86_MATCH(BROADWELL_D, core_funcs), 2327 X86_MATCH(BROADWELL_X, core_funcs), 2328 X86_MATCH(SKYLAKE_X, core_funcs), 2329 {} 2330 }; 2331 2332 static const struct x86_cpu_id intel_pstate_cpu_ee_disable_ids[] = { 2333 X86_MATCH(KABYLAKE, core_funcs), 2334 {} 2335 }; 2336 2337 static const struct x86_cpu_id intel_pstate_hwp_boost_ids[] = { 2338 X86_MATCH(SKYLAKE_X, core_funcs), 2339 X86_MATCH(SKYLAKE, core_funcs), 2340 {} 2341 }; 2342 2343 static int intel_pstate_init_cpu(unsigned int cpunum) 2344 { 2345 struct cpudata *cpu; 2346 2347 cpu = 
all_cpu_data[cpunum]; 2348 2349 if (!cpu) { 2350 cpu = kzalloc(sizeof(*cpu), GFP_KERNEL); 2351 if (!cpu) 2352 return -ENOMEM; 2353 2354 all_cpu_data[cpunum] = cpu; 2355 2356 cpu->cpu = cpunum; 2357 2358 cpu->epp_default = -EINVAL; 2359 2360 if (hwp_active) { 2361 const struct x86_cpu_id *id; 2362 2363 intel_pstate_hwp_enable(cpu); 2364 2365 id = x86_match_cpu(intel_pstate_hwp_boost_ids); 2366 if (id && intel_pstate_acpi_pm_profile_server()) 2367 hwp_boost = true; 2368 } 2369 } else if (hwp_active) { 2370 /* 2371 * Re-enable HWP in case this happens after a resume from ACPI 2372 * S3 if the CPU was offline during the whole system/resume 2373 * cycle. 2374 */ 2375 intel_pstate_hwp_reenable(cpu); 2376 } 2377 2378 cpu->epp_powersave = -EINVAL; 2379 cpu->epp_policy = 0; 2380 2381 intel_pstate_get_cpu_pstates(cpu); 2382 2383 pr_debug("controlling: cpu %d\n", cpunum); 2384 2385 return 0; 2386 } 2387 2388 static void intel_pstate_set_update_util_hook(unsigned int cpu_num) 2389 { 2390 struct cpudata *cpu = all_cpu_data[cpu_num]; 2391 2392 if (hwp_active && !hwp_boost) 2393 return; 2394 2395 if (cpu->update_util_set) 2396 return; 2397 2398 /* Prevent intel_pstate_update_util() from using stale data. */ 2399 cpu->sample.time = 0; 2400 cpufreq_add_update_util_hook(cpu_num, &cpu->update_util, 2401 (hwp_active ? 2402 intel_pstate_update_util_hwp : 2403 intel_pstate_update_util)); 2404 cpu->update_util_set = true; 2405 } 2406 2407 static void intel_pstate_clear_update_util_hook(unsigned int cpu) 2408 { 2409 struct cpudata *cpu_data = all_cpu_data[cpu]; 2410 2411 if (!cpu_data->update_util_set) 2412 return; 2413 2414 cpufreq_remove_update_util_hook(cpu); 2415 cpu_data->update_util_set = false; 2416 synchronize_rcu(); 2417 } 2418 2419 static int intel_pstate_get_max_freq(struct cpudata *cpu) 2420 { 2421 return global.turbo_disabled || global.no_turbo ? 2422 cpu->pstate.max_freq : cpu->pstate.turbo_freq; 2423 } 2424 2425 static void intel_pstate_update_perf_limits(struct cpudata *cpu, 2426 unsigned int policy_min, 2427 unsigned int policy_max) 2428 { 2429 int perf_ctl_scaling = cpu->pstate.perf_ctl_scaling; 2430 int32_t max_policy_perf, min_policy_perf; 2431 2432 max_policy_perf = policy_max / perf_ctl_scaling; 2433 if (policy_max == policy_min) { 2434 min_policy_perf = max_policy_perf; 2435 } else { 2436 min_policy_perf = policy_min / perf_ctl_scaling; 2437 min_policy_perf = clamp_t(int32_t, min_policy_perf, 2438 0, max_policy_perf); 2439 } 2440 2441 /* 2442 * HWP needs some special consideration, because HWP_REQUEST uses 2443 * abstract values to represent performance rather than pure ratios. 2444 */ 2445 if (hwp_active) { 2446 intel_pstate_get_hwp_cap(cpu); 2447 2448 if (cpu->pstate.scaling != perf_ctl_scaling) { 2449 int scaling = cpu->pstate.scaling; 2450 int freq; 2451 2452 freq = max_policy_perf * perf_ctl_scaling; 2453 max_policy_perf = DIV_ROUND_UP(freq, scaling); 2454 freq = min_policy_perf * perf_ctl_scaling; 2455 min_policy_perf = DIV_ROUND_UP(freq, scaling); 2456 } 2457 } 2458 2459 pr_debug("cpu:%d min_policy_perf:%d max_policy_perf:%d\n", 2460 cpu->cpu, min_policy_perf, max_policy_perf); 2461 2462 /* Normalize user input to [min_perf, max_perf] */ 2463 if (per_cpu_limits) { 2464 cpu->min_perf_ratio = min_policy_perf; 2465 cpu->max_perf_ratio = max_policy_perf; 2466 } else { 2467 int turbo_max = cpu->pstate.turbo_pstate; 2468 int32_t global_min, global_max; 2469 2470 /* Global limits are in percent of the maximum turbo P-state. 
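		 * As an illustration (made-up values), with turbo_pstate = 40,
		 * max_perf_pct = 75 and min_perf_pct = 25, the computations
		 * below yield global_max = DIV_ROUND_UP(40 * 75, 100) = 30 and
		 * global_min = DIV_ROUND_UP(40 * 25, 100) = 10.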
*/ 2471 global_max = DIV_ROUND_UP(turbo_max * global.max_perf_pct, 100); 2472 global_min = DIV_ROUND_UP(turbo_max * global.min_perf_pct, 100); 2473 global_min = clamp_t(int32_t, global_min, 0, global_max); 2474 2475 pr_debug("cpu:%d global_min:%d global_max:%d\n", cpu->cpu, 2476 global_min, global_max); 2477 2478 cpu->min_perf_ratio = max(min_policy_perf, global_min); 2479 cpu->min_perf_ratio = min(cpu->min_perf_ratio, max_policy_perf); 2480 cpu->max_perf_ratio = min(max_policy_perf, global_max); 2481 cpu->max_perf_ratio = max(min_policy_perf, cpu->max_perf_ratio); 2482 2483 /* Make sure min_perf <= max_perf */ 2484 cpu->min_perf_ratio = min(cpu->min_perf_ratio, 2485 cpu->max_perf_ratio); 2486 2487 } 2488 pr_debug("cpu:%d max_perf_ratio:%d min_perf_ratio:%d\n", cpu->cpu, 2489 cpu->max_perf_ratio, 2490 cpu->min_perf_ratio); 2491 } 2492 2493 static int intel_pstate_set_policy(struct cpufreq_policy *policy) 2494 { 2495 struct cpudata *cpu; 2496 2497 if (!policy->cpuinfo.max_freq) 2498 return -ENODEV; 2499 2500 pr_debug("set_policy cpuinfo.max %u policy->max %u\n", 2501 policy->cpuinfo.max_freq, policy->max); 2502 2503 cpu = all_cpu_data[policy->cpu]; 2504 cpu->policy = policy->policy; 2505 2506 mutex_lock(&intel_pstate_limits_lock); 2507 2508 intel_pstate_update_perf_limits(cpu, policy->min, policy->max); 2509 2510 if (cpu->policy == CPUFREQ_POLICY_PERFORMANCE) { 2511 /* 2512 * NOHZ_FULL CPUs need this as the governor callback may not 2513 * be invoked on them. 2514 */ 2515 intel_pstate_clear_update_util_hook(policy->cpu); 2516 intel_pstate_max_within_limits(cpu); 2517 } else { 2518 intel_pstate_set_update_util_hook(policy->cpu); 2519 } 2520 2521 if (hwp_active) { 2522 /* 2523 * If hwp_boost was active before and has been turned off 2524 * dynamically in the meantime, the update util hook needs 2525 * to be cleared here. 2526 */ 2527 if (!hwp_boost) 2528 intel_pstate_clear_update_util_hook(policy->cpu); 2529 intel_pstate_hwp_set(policy->cpu); 2530 } 2531 2532 mutex_unlock(&intel_pstate_limits_lock); 2533 2534 return 0; 2535 } 2536 2537 static void intel_pstate_adjust_policy_max(struct cpudata *cpu, 2538 struct cpufreq_policy_data *policy) 2539 { 2540 if (!hwp_active && 2541 cpu->pstate.max_pstate_physical > cpu->pstate.max_pstate && 2542 policy->max < policy->cpuinfo.max_freq && 2543 policy->max > cpu->pstate.max_freq) { 2544 pr_debug("policy->max > max non turbo frequency\n"); 2545 policy->max = policy->cpuinfo.max_freq; 2546 } 2547 } 2548 2549 static void intel_pstate_verify_cpu_policy(struct cpudata *cpu, 2550 struct cpufreq_policy_data *policy) 2551 { 2552 int max_freq; 2553 2554 update_turbo_state(); 2555 if (hwp_active) { 2556 intel_pstate_get_hwp_cap(cpu); 2557 max_freq = global.no_turbo || global.turbo_disabled ?
2558 cpu->pstate.max_freq : cpu->pstate.turbo_freq; 2559 } else { 2560 max_freq = intel_pstate_get_max_freq(cpu); 2561 } 2562 cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq, max_freq); 2563 2564 intel_pstate_adjust_policy_max(cpu, policy); 2565 } 2566 2567 static int intel_pstate_verify_policy(struct cpufreq_policy_data *policy) 2568 { 2569 intel_pstate_verify_cpu_policy(all_cpu_data[policy->cpu], policy); 2570 2571 return 0; 2572 } 2573 2574 static int intel_cpufreq_cpu_offline(struct cpufreq_policy *policy) 2575 { 2576 struct cpudata *cpu = all_cpu_data[policy->cpu]; 2577 2578 pr_debug("CPU %d going offline\n", cpu->cpu); 2579 2580 if (cpu->suspended) 2581 return 0; 2582 2583 /* 2584 * If the CPU is an SMT thread and it goes offline with the performance 2585 * settings different from the minimum, it will prevent its sibling 2586 * from getting to lower performance levels, so force the minimum 2587 * performance on CPU offline to prevent that from happening. 2588 */ 2589 if (hwp_active) 2590 intel_pstate_hwp_offline(cpu); 2591 else 2592 intel_pstate_set_min_pstate(cpu); 2593 2594 intel_pstate_exit_perf_limits(policy); 2595 2596 return 0; 2597 } 2598 2599 static int intel_pstate_cpu_online(struct cpufreq_policy *policy) 2600 { 2601 struct cpudata *cpu = all_cpu_data[policy->cpu]; 2602 2603 pr_debug("CPU %d going online\n", cpu->cpu); 2604 2605 intel_pstate_init_acpi_perf_limits(policy); 2606 2607 if (hwp_active) { 2608 /* 2609 * Re-enable HWP and clear the "suspended" flag to let "resume" 2610 * know that it need not do that. 2611 */ 2612 intel_pstate_hwp_reenable(cpu); 2613 cpu->suspended = false; 2614 } 2615 2616 return 0; 2617 } 2618 2619 static int intel_pstate_cpu_offline(struct cpufreq_policy *policy) 2620 { 2621 intel_pstate_clear_update_util_hook(policy->cpu); 2622 2623 return intel_cpufreq_cpu_offline(policy); 2624 } 2625 2626 static int intel_pstate_cpu_exit(struct cpufreq_policy *policy) 2627 { 2628 pr_debug("CPU %d exiting\n", policy->cpu); 2629 2630 policy->fast_switch_possible = false; 2631 2632 return 0; 2633 } 2634 2635 static int __intel_pstate_cpu_init(struct cpufreq_policy *policy) 2636 { 2637 struct cpudata *cpu; 2638 int rc; 2639 2640 rc = intel_pstate_init_cpu(policy->cpu); 2641 if (rc) 2642 return rc; 2643 2644 cpu = all_cpu_data[policy->cpu]; 2645 2646 cpu->max_perf_ratio = 0xFF; 2647 cpu->min_perf_ratio = 0; 2648 2649 /* cpuinfo and default policy values */ 2650 policy->cpuinfo.min_freq = cpu->pstate.min_freq; 2651 update_turbo_state(); 2652 global.turbo_disabled_mf = global.turbo_disabled; 2653 policy->cpuinfo.max_freq = global.turbo_disabled ? 2654 cpu->pstate.max_freq : cpu->pstate.turbo_freq; 2655 2656 policy->min = policy->cpuinfo.min_freq; 2657 policy->max = policy->cpuinfo.max_freq; 2658 2659 intel_pstate_init_acpi_perf_limits(policy); 2660 2661 policy->fast_switch_possible = true; 2662 2663 return 0; 2664 } 2665 2666 static int intel_pstate_cpu_init(struct cpufreq_policy *policy) 2667 { 2668 int ret = __intel_pstate_cpu_init(policy); 2669 2670 if (ret) 2671 return ret; 2672 2673 /* 2674 * Set the policy to powersave to provide a valid fallback value in case 2675 * the default cpufreq governor is neither powersave nor performance. 
2676 */ 2677 policy->policy = CPUFREQ_POLICY_POWERSAVE; 2678 2679 if (hwp_active) { 2680 struct cpudata *cpu = all_cpu_data[policy->cpu]; 2681 2682 cpu->epp_cached = intel_pstate_get_epp(cpu, 0); 2683 } 2684 2685 return 0; 2686 } 2687 2688 static struct cpufreq_driver intel_pstate = { 2689 .flags = CPUFREQ_CONST_LOOPS, 2690 .verify = intel_pstate_verify_policy, 2691 .setpolicy = intel_pstate_set_policy, 2692 .suspend = intel_pstate_suspend, 2693 .resume = intel_pstate_resume, 2694 .init = intel_pstate_cpu_init, 2695 .exit = intel_pstate_cpu_exit, 2696 .offline = intel_pstate_cpu_offline, 2697 .online = intel_pstate_cpu_online, 2698 .update_limits = intel_pstate_update_limits, 2699 .name = "intel_pstate", 2700 }; 2701 2702 static int intel_cpufreq_verify_policy(struct cpufreq_policy_data *policy) 2703 { 2704 struct cpudata *cpu = all_cpu_data[policy->cpu]; 2705 2706 intel_pstate_verify_cpu_policy(cpu, policy); 2707 intel_pstate_update_perf_limits(cpu, policy->min, policy->max); 2708 2709 return 0; 2710 } 2711 2712 /* Use of trace in passive mode: 2713 * 2714 * In passive mode the trace core_busy field (also known as the 2715 * performance field, and labelled as such on the graphs; also known as 2716 * core_avg_perf) is not needed and so is re-assigned to indicate if the 2717 * driver call was via the normal or fast switch path. Various graphs 2718 * output from the intel_pstate_tracer.py utility that include core_busy 2719 * (or performance or core_avg_perf) have a fixed y-axis from 0 to 100%, 2720 * so we use 10 to indicate the normal path through the driver, and 2721 * 90 to indicate the fast switch path through the driver. 2722 * The scaled_busy field is not used, and is set to 0. 2723 */ 2724 2725 #define INTEL_PSTATE_TRACE_TARGET 10 2726 #define INTEL_PSTATE_TRACE_FAST_SWITCH 90 2727 2728 static void intel_cpufreq_trace(struct cpudata *cpu, unsigned int trace_type, int old_pstate) 2729 { 2730 struct sample *sample; 2731 2732 if (!trace_pstate_sample_enabled()) 2733 return; 2734 2735 if (!intel_pstate_sample(cpu, ktime_get())) 2736 return; 2737 2738 sample = &cpu->sample; 2739 trace_pstate_sample(trace_type, 2740 0, 2741 old_pstate, 2742 cpu->pstate.current_pstate, 2743 sample->mperf, 2744 sample->aperf, 2745 sample->tsc, 2746 get_avg_frequency(cpu), 2747 fp_toint(cpu->iowait_boost * 100)); 2748 } 2749 2750 static void intel_cpufreq_hwp_update(struct cpudata *cpu, u32 min, u32 max, 2751 u32 desired, bool fast_switch) 2752 { 2753 u64 prev = READ_ONCE(cpu->hwp_req_cached), value = prev; 2754 2755 value &= ~HWP_MIN_PERF(~0L); 2756 value |= HWP_MIN_PERF(min); 2757 2758 value &= ~HWP_MAX_PERF(~0L); 2759 value |= HWP_MAX_PERF(max); 2760 2761 value &= ~HWP_DESIRED_PERF(~0L); 2762 value |= HWP_DESIRED_PERF(desired); 2763 2764 if (value == prev) 2765 return; 2766 2767 WRITE_ONCE(cpu->hwp_req_cached, value); 2768 if (fast_switch) 2769 wrmsrl(MSR_HWP_REQUEST, value); 2770 else 2771 wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value); 2772 } 2773 2774 static void intel_cpufreq_perf_ctl_update(struct cpudata *cpu, 2775 u32 target_pstate, bool fast_switch) 2776 { 2777 if (fast_switch) 2778 wrmsrl(MSR_IA32_PERF_CTL, 2779 pstate_funcs.get_val(cpu, target_pstate)); 2780 else 2781 wrmsrl_on_cpu(cpu->cpu, MSR_IA32_PERF_CTL, 2782 pstate_funcs.get_val(cpu, target_pstate)); 2783 } 2784 2785 static int intel_cpufreq_update_pstate(struct cpufreq_policy *policy, 2786 int target_pstate, bool fast_switch) 2787 { 2788 struct cpudata *cpu = all_cpu_data[policy->cpu]; 2789 int old_pstate = cpu->pstate.current_pstate;
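	/*
	 * The request below is first clamped to the current limits. With HWP
	 * active it is forwarded through MSR_HWP_REQUEST, with the maximum set
	 * to either the target itself or max_perf_ratio depending on
	 * policy->strict_target; otherwise PERF_CTL is only written when the
	 * target P-state actually changes.
	 */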
2790 2791 target_pstate = intel_pstate_prepare_request(cpu, target_pstate); 2792 if (hwp_active) { 2793 int max_pstate = policy->strict_target ? 2794 target_pstate : cpu->max_perf_ratio; 2795 2796 intel_cpufreq_hwp_update(cpu, target_pstate, max_pstate, 0, 2797 fast_switch); 2798 } else if (target_pstate != old_pstate) { 2799 intel_cpufreq_perf_ctl_update(cpu, target_pstate, fast_switch); 2800 } 2801 2802 cpu->pstate.current_pstate = target_pstate; 2803 2804 intel_cpufreq_trace(cpu, fast_switch ? INTEL_PSTATE_TRACE_FAST_SWITCH : 2805 INTEL_PSTATE_TRACE_TARGET, old_pstate); 2806 2807 return target_pstate; 2808 } 2809 2810 static int intel_cpufreq_target(struct cpufreq_policy *policy, 2811 unsigned int target_freq, 2812 unsigned int relation) 2813 { 2814 struct cpudata *cpu = all_cpu_data[policy->cpu]; 2815 struct cpufreq_freqs freqs; 2816 int target_pstate; 2817 2818 update_turbo_state(); 2819 2820 freqs.old = policy->cur; 2821 freqs.new = target_freq; 2822 2823 cpufreq_freq_transition_begin(policy, &freqs); 2824 2825 switch (relation) { 2826 case CPUFREQ_RELATION_L: 2827 target_pstate = DIV_ROUND_UP(freqs.new, cpu->pstate.scaling); 2828 break; 2829 case CPUFREQ_RELATION_H: 2830 target_pstate = freqs.new / cpu->pstate.scaling; 2831 break; 2832 default: 2833 target_pstate = DIV_ROUND_CLOSEST(freqs.new, cpu->pstate.scaling); 2834 break; 2835 } 2836 2837 target_pstate = intel_cpufreq_update_pstate(policy, target_pstate, false); 2838 2839 freqs.new = target_pstate * cpu->pstate.scaling; 2840 2841 cpufreq_freq_transition_end(policy, &freqs, false); 2842 2843 return 0; 2844 } 2845 2846 static unsigned int intel_cpufreq_fast_switch(struct cpufreq_policy *policy, 2847 unsigned int target_freq) 2848 { 2849 struct cpudata *cpu = all_cpu_data[policy->cpu]; 2850 int target_pstate; 2851 2852 update_turbo_state(); 2853 2854 target_pstate = DIV_ROUND_UP(target_freq, cpu->pstate.scaling); 2855 2856 target_pstate = intel_cpufreq_update_pstate(policy, target_pstate, true); 2857 2858 return target_pstate * cpu->pstate.scaling; 2859 } 2860 2861 static void intel_cpufreq_adjust_perf(unsigned int cpunum, 2862 unsigned long min_perf, 2863 unsigned long target_perf, 2864 unsigned long capacity) 2865 { 2866 struct cpudata *cpu = all_cpu_data[cpunum]; 2867 u64 hwp_cap = READ_ONCE(cpu->hwp_cap_cached); 2868 int old_pstate = cpu->pstate.current_pstate; 2869 int cap_pstate, min_pstate, max_pstate, target_pstate; 2870 2871 update_turbo_state(); 2872 cap_pstate = global.turbo_disabled ? HWP_GUARANTEED_PERF(hwp_cap) : 2873 HWP_HIGHEST_PERF(hwp_cap); 2874 2875 /* Optimization: Avoid unnecessary divisions. 
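	 * Both target_perf and min_perf are scaled from the [0, capacity]
	 * range to a P-state via DIV_ROUND_UP(cap_pstate * perf, capacity);
	 * the division is skipped whenever perf >= capacity, in which case
	 * cap_pstate is used directly.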
*/ 2876 2877 target_pstate = cap_pstate; 2878 if (target_perf < capacity) 2879 target_pstate = DIV_ROUND_UP(cap_pstate * target_perf, capacity); 2880 2881 min_pstate = cap_pstate; 2882 if (min_perf < capacity) 2883 min_pstate = DIV_ROUND_UP(cap_pstate * min_perf, capacity); 2884 2885 if (min_pstate < cpu->pstate.min_pstate) 2886 min_pstate = cpu->pstate.min_pstate; 2887 2888 if (min_pstate < cpu->min_perf_ratio) 2889 min_pstate = cpu->min_perf_ratio; 2890 2891 max_pstate = min(cap_pstate, cpu->max_perf_ratio); 2892 if (max_pstate < min_pstate) 2893 max_pstate = min_pstate; 2894 2895 target_pstate = clamp_t(int, target_pstate, min_pstate, max_pstate); 2896 2897 intel_cpufreq_hwp_update(cpu, min_pstate, max_pstate, target_pstate, true); 2898 2899 cpu->pstate.current_pstate = target_pstate; 2900 intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_FAST_SWITCH, old_pstate); 2901 } 2902 2903 static int intel_cpufreq_cpu_init(struct cpufreq_policy *policy) 2904 { 2905 struct freq_qos_request *req; 2906 struct cpudata *cpu; 2907 struct device *dev; 2908 int ret, freq; 2909 2910 dev = get_cpu_device(policy->cpu); 2911 if (!dev) 2912 return -ENODEV; 2913 2914 ret = __intel_pstate_cpu_init(policy); 2915 if (ret) 2916 return ret; 2917 2918 policy->cpuinfo.transition_latency = INTEL_CPUFREQ_TRANSITION_LATENCY; 2919 /* This reflects the intel_pstate_get_cpu_pstates() setting. */ 2920 policy->cur = policy->cpuinfo.min_freq; 2921 2922 req = kcalloc(2, sizeof(*req), GFP_KERNEL); 2923 if (!req) { 2924 ret = -ENOMEM; 2925 goto pstate_exit; 2926 } 2927 2928 cpu = all_cpu_data[policy->cpu]; 2929 2930 if (hwp_active) { 2931 u64 value; 2932 2933 policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY_HWP; 2934 2935 intel_pstate_get_hwp_cap(cpu); 2936 2937 rdmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, &value); 2938 WRITE_ONCE(cpu->hwp_req_cached, value); 2939 2940 cpu->epp_cached = intel_pstate_get_epp(cpu, value); 2941 } else { 2942 policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY; 2943 } 2944 2945 freq = DIV_ROUND_UP(cpu->pstate.turbo_freq * global.min_perf_pct, 100); 2946 2947 ret = freq_qos_add_request(&policy->constraints, req, FREQ_QOS_MIN, 2948 freq); 2949 if (ret < 0) { 2950 dev_err(dev, "Failed to add min-freq constraint (%d)\n", ret); 2951 goto free_req; 2952 } 2953 2954 freq = DIV_ROUND_UP(cpu->pstate.turbo_freq * global.max_perf_pct, 100); 2955 2956 ret = freq_qos_add_request(&policy->constraints, req + 1, FREQ_QOS_MAX, 2957 freq); 2958 if (ret < 0) { 2959 dev_err(dev, "Failed to add max-freq constraint (%d)\n", ret); 2960 goto remove_min_req; 2961 } 2962 2963 policy->driver_data = req; 2964 2965 return 0; 2966 2967 remove_min_req: 2968 freq_qos_remove_request(req); 2969 free_req: 2970 kfree(req); 2971 pstate_exit: 2972 intel_pstate_exit_perf_limits(policy); 2973 2974 return ret; 2975 } 2976 2977 static int intel_cpufreq_cpu_exit(struct cpufreq_policy *policy) 2978 { 2979 struct freq_qos_request *req; 2980 2981 req = policy->driver_data; 2982 2983 freq_qos_remove_request(req + 1); 2984 freq_qos_remove_request(req); 2985 kfree(req); 2986 2987 return intel_pstate_cpu_exit(policy); 2988 } 2989 2990 static struct cpufreq_driver intel_cpufreq = { 2991 .flags = CPUFREQ_CONST_LOOPS, 2992 .verify = intel_cpufreq_verify_policy, 2993 .target = intel_cpufreq_target, 2994 .fast_switch = intel_cpufreq_fast_switch, 2995 .init = intel_cpufreq_cpu_init, 2996 .exit = intel_cpufreq_cpu_exit, 2997 .offline = intel_cpufreq_cpu_offline, 2998 .online = intel_pstate_cpu_online, 2999 .suspend = intel_pstate_suspend, 3000 
.resume = intel_pstate_resume, 3001 .update_limits = intel_pstate_update_limits, 3002 .name = "intel_cpufreq", 3003 }; 3004 3005 static struct cpufreq_driver *default_driver; 3006 3007 static void intel_pstate_driver_cleanup(void) 3008 { 3009 unsigned int cpu; 3010 3011 cpus_read_lock(); 3012 for_each_online_cpu(cpu) { 3013 if (all_cpu_data[cpu]) { 3014 if (intel_pstate_driver == &intel_pstate) 3015 intel_pstate_clear_update_util_hook(cpu); 3016 3017 kfree(all_cpu_data[cpu]); 3018 all_cpu_data[cpu] = NULL; 3019 } 3020 } 3021 cpus_read_unlock(); 3022 3023 intel_pstate_driver = NULL; 3024 } 3025 3026 static int intel_pstate_register_driver(struct cpufreq_driver *driver) 3027 { 3028 int ret; 3029 3030 if (driver == &intel_pstate) 3031 intel_pstate_sysfs_expose_hwp_dynamic_boost(); 3032 3033 memset(&global, 0, sizeof(global)); 3034 global.max_perf_pct = 100; 3035 3036 intel_pstate_driver = driver; 3037 ret = cpufreq_register_driver(intel_pstate_driver); 3038 if (ret) { 3039 intel_pstate_driver_cleanup(); 3040 return ret; 3041 } 3042 3043 global.min_perf_pct = min_perf_pct_min(); 3044 3045 return 0; 3046 } 3047 3048 static ssize_t intel_pstate_show_status(char *buf) 3049 { 3050 if (!intel_pstate_driver) 3051 return sprintf(buf, "off\n"); 3052 3053 return sprintf(buf, "%s\n", intel_pstate_driver == &intel_pstate ? 3054 "active" : "passive"); 3055 } 3056 3057 static int intel_pstate_update_status(const char *buf, size_t size) 3058 { 3059 if (size == 3 && !strncmp(buf, "off", size)) { 3060 if (!intel_pstate_driver) 3061 return -EINVAL; 3062 3063 if (hwp_active) 3064 return -EBUSY; 3065 3066 cpufreq_unregister_driver(intel_pstate_driver); 3067 intel_pstate_driver_cleanup(); 3068 return 0; 3069 } 3070 3071 if (size == 6 && !strncmp(buf, "active", size)) { 3072 if (intel_pstate_driver) { 3073 if (intel_pstate_driver == &intel_pstate) 3074 return 0; 3075 3076 cpufreq_unregister_driver(intel_pstate_driver); 3077 } 3078 3079 return intel_pstate_register_driver(&intel_pstate); 3080 } 3081 3082 if (size == 7 && !strncmp(buf, "passive", size)) { 3083 if (intel_pstate_driver) { 3084 if (intel_pstate_driver == &intel_cpufreq) 3085 return 0; 3086 3087 cpufreq_unregister_driver(intel_pstate_driver); 3088 intel_pstate_sysfs_hide_hwp_dynamic_boost(); 3089 } 3090 3091 return intel_pstate_register_driver(&intel_cpufreq); 3092 } 3093 3094 return -EINVAL; 3095 } 3096 3097 static int no_load __initdata; 3098 static int no_hwp __initdata; 3099 static int hwp_only __initdata; 3100 static unsigned int force_load __initdata; 3101 3102 static int __init intel_pstate_msrs_not_valid(void) 3103 { 3104 if (!pstate_funcs.get_max() || 3105 !pstate_funcs.get_min() || 3106 !pstate_funcs.get_turbo()) 3107 return -ENODEV; 3108 3109 return 0; 3110 } 3111 3112 static void __init copy_cpu_funcs(struct pstate_funcs *funcs) 3113 { 3114 pstate_funcs.get_max = funcs->get_max; 3115 pstate_funcs.get_max_physical = funcs->get_max_physical; 3116 pstate_funcs.get_min = funcs->get_min; 3117 pstate_funcs.get_turbo = funcs->get_turbo; 3118 pstate_funcs.get_scaling = funcs->get_scaling; 3119 pstate_funcs.get_val = funcs->get_val; 3120 pstate_funcs.get_vid = funcs->get_vid; 3121 pstate_funcs.get_aperf_mperf_shift = funcs->get_aperf_mperf_shift; 3122 } 3123 3124 #ifdef CONFIG_ACPI 3125 3126 static bool __init intel_pstate_no_acpi_pss(void) 3127 { 3128 int i; 3129 3130 for_each_possible_cpu(i) { 3131 acpi_status status; 3132 union acpi_object *pss; 3133 struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; 3134 struct acpi_processor *pr = 
per_cpu(processors, i); 3135 3136 if (!pr) 3137 continue; 3138 3139 status = acpi_evaluate_object(pr->handle, "_PSS", NULL, &buffer); 3140 if (ACPI_FAILURE(status)) 3141 continue; 3142 3143 pss = buffer.pointer; 3144 if (pss && pss->type == ACPI_TYPE_PACKAGE) { 3145 kfree(pss); 3146 return false; 3147 } 3148 3149 kfree(pss); 3150 } 3151 3152 pr_debug("ACPI _PSS not found\n"); 3153 return true; 3154 } 3155 3156 static bool __init intel_pstate_no_acpi_pcch(void) 3157 { 3158 acpi_status status; 3159 acpi_handle handle; 3160 3161 status = acpi_get_handle(NULL, "\\_SB", &handle); 3162 if (ACPI_FAILURE(status)) 3163 goto not_found; 3164 3165 if (acpi_has_method(handle, "PCCH")) 3166 return false; 3167 3168 not_found: 3169 pr_debug("ACPI PCCH not found\n"); 3170 return true; 3171 } 3172 3173 static bool __init intel_pstate_has_acpi_ppc(void) 3174 { 3175 int i; 3176 3177 for_each_possible_cpu(i) { 3178 struct acpi_processor *pr = per_cpu(processors, i); 3179 3180 if (!pr) 3181 continue; 3182 if (acpi_has_method(pr->handle, "_PPC")) 3183 return true; 3184 } 3185 pr_debug("ACPI _PPC not found\n"); 3186 return false; 3187 } 3188 3189 enum { 3190 PSS, 3191 PPC, 3192 }; 3193 3194 /* Hardware vendor-specific info that has its own power management modes */ 3195 static struct acpi_platform_list plat_info[] __initdata = { 3196 {"HP ", "ProLiant", 0, ACPI_SIG_FADT, all_versions, NULL, PSS}, 3197 {"ORACLE", "X4-2 ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, 3198 {"ORACLE", "X4-2L ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, 3199 {"ORACLE", "X4-2B ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, 3200 {"ORACLE", "X3-2 ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, 3201 {"ORACLE", "X3-2L ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, 3202 {"ORACLE", "X3-2B ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, 3203 {"ORACLE", "X4470M2 ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, 3204 {"ORACLE", "X4270M3 ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, 3205 {"ORACLE", "X4270M2 ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, 3206 {"ORACLE", "X4170M2 ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, 3207 {"ORACLE", "X4170 M3", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, 3208 {"ORACLE", "X4275 M3", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, 3209 {"ORACLE", "X6-2 ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, 3210 {"ORACLE", "Sudbury ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, 3211 { } /* End */ 3212 }; 3213 3214 #define BITMASK_OOB (BIT(8) | BIT(18)) 3215 3216 static bool __init intel_pstate_platform_pwr_mgmt_exists(void) 3217 { 3218 const struct x86_cpu_id *id; 3219 u64 misc_pwr; 3220 int idx; 3221 3222 id = x86_match_cpu(intel_pstate_cpu_oob_ids); 3223 if (id) { 3224 rdmsrl(MSR_MISC_PWR_MGMT, misc_pwr); 3225 if (misc_pwr & BITMASK_OOB) { 3226 pr_debug("Bit 8 or 18 in the MISC_PWR_MGMT MSR set\n"); 3227 pr_debug("P states are controlled in Out of Band mode by the firmware/hardware\n"); 3228 return true; 3229 } 3230 } 3231 3232 idx = acpi_match_platform_list(plat_info); 3233 if (idx < 0) 3234 return false; 3235 3236 switch (plat_info[idx].data) { 3237 case PSS: 3238 if (!intel_pstate_no_acpi_pss()) 3239 return false; 3240 3241 return intel_pstate_no_acpi_pcch(); 3242 case PPC: 3243 return intel_pstate_has_acpi_ppc() && !force_load; 3244 } 3245 3246 return false; 3247 } 3248 3249 static void intel_pstate_request_control_from_smm(void) 3250 { 3251 /* 3252 * It may be unsafe to request P-states control from SMM if _PPC support 3253 * has not been enabled. 
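	 * (acpi_ppc is set when "support_acpi_ppc" is passed via the
	 * intel_pstate= early parameter; see intel_pstate_setup() below.)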
3254 */ 3255 if (acpi_ppc) 3256 acpi_processor_pstate_control(); 3257 } 3258 #else /* CONFIG_ACPI not enabled */ 3259 static inline bool intel_pstate_platform_pwr_mgmt_exists(void) { return false; } 3260 static inline bool intel_pstate_has_acpi_ppc(void) { return false; } 3261 static inline void intel_pstate_request_control_from_smm(void) {} 3262 #endif /* CONFIG_ACPI */ 3263 3264 #define INTEL_PSTATE_HWP_BROADWELL 0x01 3265 3266 #define X86_MATCH_HWP(model, hwp_mode) \ 3267 X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, INTEL_FAM6_##model, \ 3268 X86_FEATURE_HWP, hwp_mode) 3269 3270 static const struct x86_cpu_id hwp_support_ids[] __initconst = { 3271 X86_MATCH_HWP(BROADWELL_X, INTEL_PSTATE_HWP_BROADWELL), 3272 X86_MATCH_HWP(BROADWELL_D, INTEL_PSTATE_HWP_BROADWELL), 3273 X86_MATCH_HWP(ANY, 0), 3274 {} 3275 }; 3276 3277 static bool intel_pstate_hwp_is_enabled(void) 3278 { 3279 u64 value; 3280 3281 rdmsrl(MSR_PM_ENABLE, value); 3282 return !!(value & 0x1); 3283 } 3284 3285 static int __init intel_pstate_init(void) 3286 { 3287 const struct x86_cpu_id *id; 3288 int rc; 3289 3290 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) 3291 return -ENODEV; 3292 3293 if (no_load) 3294 return -ENODEV; 3295 3296 id = x86_match_cpu(hwp_support_ids); 3297 if (id) { 3298 copy_cpu_funcs(&core_funcs); 3299 /* 3300 * Avoid enabling HWP for processors without EPP support, 3301 * because that means incomplete HWP implementation which is a 3302 * corner case and supporting it is generally problematic. 3303 * 3304 * If HWP is enabled already, though, there is no choice but to 3305 * deal with it. 3306 */ 3307 if ((!no_hwp && boot_cpu_has(X86_FEATURE_HWP_EPP)) || 3308 intel_pstate_hwp_is_enabled()) { 3309 hwp_active++; 3310 hwp_mode_bdw = id->driver_data; 3311 intel_pstate.attr = hwp_cpufreq_attrs; 3312 intel_cpufreq.attr = hwp_cpufreq_attrs; 3313 intel_cpufreq.flags |= CPUFREQ_NEED_UPDATE_LIMITS; 3314 intel_cpufreq.adjust_perf = intel_cpufreq_adjust_perf; 3315 if (!default_driver) 3316 default_driver = &intel_pstate; 3317 3318 goto hwp_cpu_matched; 3319 } 3320 } else { 3321 id = x86_match_cpu(intel_pstate_cpu_ids); 3322 if (!id) { 3323 pr_info("CPU model not supported\n"); 3324 return -ENODEV; 3325 } 3326 3327 copy_cpu_funcs((struct pstate_funcs *)id->driver_data); 3328 } 3329 3330 if (intel_pstate_msrs_not_valid()) { 3331 pr_info("Invalid MSRs\n"); 3332 return -ENODEV; 3333 } 3334 /* Without HWP start in the passive mode. */ 3335 if (!default_driver) 3336 default_driver = &intel_cpufreq; 3337 3338 hwp_cpu_matched: 3339 /* 3340 * The Intel pstate driver will be ignored if the platform 3341 * firmware has its own power management modes. 
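	 * (See intel_pstate_platform_pwr_mgmt_exists() above: it checks the
	 * MISC_PWR_MGMT out-of-band bits and the vendor list in plat_info[].)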
3342 */ 3343 if (intel_pstate_platform_pwr_mgmt_exists()) { 3344 pr_info("P-states controlled by the platform\n"); 3345 return -ENODEV; 3346 } 3347 3348 if (!hwp_active && hwp_only) 3349 return -ENOTSUPP; 3350 3351 pr_info("Intel P-state driver initializing\n"); 3352 3353 all_cpu_data = vzalloc(array_size(sizeof(void *), num_possible_cpus())); 3354 if (!all_cpu_data) 3355 return -ENOMEM; 3356 3357 intel_pstate_request_control_from_smm(); 3358 3359 intel_pstate_sysfs_expose_params(); 3360 3361 mutex_lock(&intel_pstate_driver_lock); 3362 rc = intel_pstate_register_driver(default_driver); 3363 mutex_unlock(&intel_pstate_driver_lock); 3364 if (rc) { 3365 intel_pstate_sysfs_remove(); 3366 return rc; 3367 } 3368 3369 if (hwp_active) { 3370 const struct x86_cpu_id *id; 3371 3372 id = x86_match_cpu(intel_pstate_cpu_ee_disable_ids); 3373 if (id) { 3374 set_power_ctl_ee_state(false); 3375 pr_info("Disabling energy efficiency optimization\n"); 3376 } 3377 3378 pr_info("HWP enabled\n"); 3379 } else if (boot_cpu_has(X86_FEATURE_HYBRID_CPU)) { 3380 pr_warn("Problematic setup: Hybrid processor with disabled HWP\n"); 3381 } 3382 3383 return 0; 3384 } 3385 device_initcall(intel_pstate_init); 3386 3387 static int __init intel_pstate_setup(char *str) 3388 { 3389 if (!str) 3390 return -EINVAL; 3391 3392 if (!strcmp(str, "disable")) 3393 no_load = 1; 3394 else if (!strcmp(str, "active")) 3395 default_driver = &intel_pstate; 3396 else if (!strcmp(str, "passive")) 3397 default_driver = &intel_cpufreq; 3398 3399 if (!strcmp(str, "no_hwp")) { 3400 pr_info("HWP disabled\n"); 3401 no_hwp = 1; 3402 } 3403 if (!strcmp(str, "force")) 3404 force_load = 1; 3405 if (!strcmp(str, "hwp_only")) 3406 hwp_only = 1; 3407 if (!strcmp(str, "per_cpu_perf_limits")) 3408 per_cpu_limits = true; 3409 3410 #ifdef CONFIG_ACPI 3411 if (!strcmp(str, "support_acpi_ppc")) 3412 acpi_ppc = true; 3413 #endif 3414 3415 return 0; 3416 } 3417 early_param("intel_pstate", intel_pstate_setup); 3418 3419 MODULE_AUTHOR("Dirk Brandewie <dirk.j.brandewie@intel.com>"); 3420 MODULE_DESCRIPTION("'intel_pstate' - P state driver Intel Core processors"); 3421 MODULE_LICENSE("GPL"); 3422
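/*
 * Illustrative examples (not exhaustive) of the "intel_pstate=" early
 * parameter handled by intel_pstate_setup() above:
 *
 *   intel_pstate=disable             - do not load the driver at all
 *   intel_pstate=active              - default to the intel_pstate driver
 *   intel_pstate=passive             - default to the intel_cpufreq driver
 *   intel_pstate=no_hwp              - do not enable hardware P-states (HWP)
 *   intel_pstate=force               - load despite the ACPI _PPC check above
 *   intel_pstate=hwp_only            - load only when HWP is active
 *   intel_pstate=per_cpu_perf_limits - enable per-CPU performance limits
 *   intel_pstate=support_acpi_ppc    - set acpi_ppc so that P-states control
 *                                      is requested from SMM (CONFIG_ACPI only)
 */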