1 /* 2 * intel_pstate.c: Native P state management for Intel processors 3 * 4 * (C) Copyright 2012 Intel Corporation 5 * Author: Dirk Brandewie <dirk.j.brandewie@intel.com> 6 * 7 * This program is free software; you can redistribute it and/or 8 * modify it under the terms of the GNU General Public License 9 * as published by the Free Software Foundation; version 2 10 * of the License. 11 */ 12 13 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 14 15 #include <linux/kernel.h> 16 #include <linux/kernel_stat.h> 17 #include <linux/module.h> 18 #include <linux/ktime.h> 19 #include <linux/hrtimer.h> 20 #include <linux/tick.h> 21 #include <linux/slab.h> 22 #include <linux/sched/cpufreq.h> 23 #include <linux/list.h> 24 #include <linux/cpu.h> 25 #include <linux/cpufreq.h> 26 #include <linux/sysfs.h> 27 #include <linux/types.h> 28 #include <linux/fs.h> 29 #include <linux/debugfs.h> 30 #include <linux/acpi.h> 31 #include <linux/vmalloc.h> 32 #include <trace/events/power.h> 33 34 #include <asm/div64.h> 35 #include <asm/msr.h> 36 #include <asm/cpu_device_id.h> 37 #include <asm/cpufeature.h> 38 #include <asm/intel-family.h> 39 40 #define INTEL_PSTATE_DEFAULT_SAMPLING_INTERVAL (10 * NSEC_PER_MSEC) 41 #define INTEL_PSTATE_HWP_SAMPLING_INTERVAL (50 * NSEC_PER_MSEC) 42 43 #define INTEL_CPUFREQ_TRANSITION_LATENCY 20000 44 45 #ifdef CONFIG_ACPI 46 #include <acpi/processor.h> 47 #include <acpi/cppc_acpi.h> 48 #endif 49 50 #define FRAC_BITS 8 51 #define int_tofp(X) ((int64_t)(X) << FRAC_BITS) 52 #define fp_toint(X) ((X) >> FRAC_BITS) 53 54 #define EXT_BITS 6 55 #define EXT_FRAC_BITS (EXT_BITS + FRAC_BITS) 56 #define fp_ext_toint(X) ((X) >> EXT_FRAC_BITS) 57 #define int_ext_tofp(X) ((int64_t)(X) << EXT_FRAC_BITS) 58 59 static inline int32_t mul_fp(int32_t x, int32_t y) 60 { 61 return ((int64_t)x * (int64_t)y) >> FRAC_BITS; 62 } 63 64 static inline int32_t div_fp(s64 x, s64 y) 65 { 66 return div64_s64((int64_t)x << FRAC_BITS, y); 67 } 68 69 static inline int ceiling_fp(int32_t x) 70 { 71 int mask, ret; 72 73 ret = fp_toint(x); 74 mask = (1 << FRAC_BITS) - 1; 75 if (x & mask) 76 ret += 1; 77 return ret; 78 } 79 80 static inline int32_t percent_fp(int percent) 81 { 82 return div_fp(percent, 100); 83 } 84 85 static inline u64 mul_ext_fp(u64 x, u64 y) 86 { 87 return (x * y) >> EXT_FRAC_BITS; 88 } 89 90 static inline u64 div_ext_fp(u64 x, u64 y) 91 { 92 return div64_u64(x << EXT_FRAC_BITS, y); 93 } 94 95 static inline int32_t percent_ext_fp(int percent) 96 { 97 return div_ext_fp(percent, 100); 98 } 99 100 /** 101 * struct sample - Store performance sample 102 * @core_avg_perf: Ratio of APERF/MPERF which is the actual average 103 * performance during last sample period 104 * @busy_scaled: Scaled busy value which is used to calculate next 105 * P state. This can be different than core_avg_perf 106 * to account for cpu idle period 107 * @aperf: Difference of actual performance frequency clock count 108 * read from APERF MSR between last and current sample 109 * @mperf: Difference of maximum performance frequency clock count 110 * read from MPERF MSR between last and current sample 111 * @tsc: Difference of time stamp counter between last and 112 * current sample 113 * @time: Current time from scheduler 114 * 115 * This structure is used in the cpudata structure to store performance sample 116 * data for choosing next P State. 
117 */ 118 struct sample { 119 int32_t core_avg_perf; 120 int32_t busy_scaled; 121 u64 aperf; 122 u64 mperf; 123 u64 tsc; 124 u64 time; 125 }; 126 127 /** 128 * struct pstate_data - Store P state data 129 * @current_pstate: Current requested P state 130 * @min_pstate: Min P state possible for this platform 131 * @max_pstate: Max P state possible for this platform 132 * @max_pstate_physical:This is physical Max P state for a processor 133 * This can be higher than the max_pstate which can 134 * be limited by platform thermal design power limits 135 * @scaling: Scaling factor to convert frequency to cpufreq 136 * frequency units 137 * @turbo_pstate: Max Turbo P state possible for this platform 138 * @max_freq: @max_pstate frequency in cpufreq units 139 * @turbo_freq: @turbo_pstate frequency in cpufreq units 140 * 141 * Stores the per cpu model P state limits and current P state. 142 */ 143 struct pstate_data { 144 int current_pstate; 145 int min_pstate; 146 int max_pstate; 147 int max_pstate_physical; 148 int scaling; 149 int turbo_pstate; 150 unsigned int max_freq; 151 unsigned int turbo_freq; 152 }; 153 154 /** 155 * struct vid_data - Stores voltage information data 156 * @min: VID data for this platform corresponding to 157 * the lowest P state 158 * @max: VID data corresponding to the highest P State. 159 * @turbo: VID data for turbo P state 160 * @ratio: Ratio of (vid max - vid min) / 161 * (max P state - Min P State) 162 * 163 * Stores the voltage data for DVFS (Dynamic Voltage and Frequency Scaling) 164 * This data is used in Atom platforms, where in addition to target P state, 165 * the voltage data needs to be specified to select next P State. 166 */ 167 struct vid_data { 168 int min; 169 int max; 170 int turbo; 171 int32_t ratio; 172 }; 173 174 /** 175 * struct _pid - Stores PID data 176 * @setpoint: Target set point for busyness or performance 177 * @integral: Storage for accumulated error values 178 * @p_gain: PID proportional gain 179 * @i_gain: PID integral gain 180 * @d_gain: PID derivative gain 181 * @deadband: PID deadband 182 * @last_err: Last error storage for integral part of PID calculation 183 * 184 * Stores PID coefficients and last error for PID controller. 185 */ 186 struct _pid { 187 int setpoint; 188 int32_t integral; 189 int32_t p_gain; 190 int32_t i_gain; 191 int32_t d_gain; 192 int deadband; 193 int32_t last_err; 194 }; 195 196 /** 197 * struct global_params - Global parameters, mostly tunable via sysfs. 198 * @no_turbo: Whether or not to use turbo P-states. 199 * @turbo_disabled: Whethet or not turbo P-states are available at all, 200 * based on the MSR_IA32_MISC_ENABLE value and whether or 201 * not the maximum reported turbo P-state is different from 202 * the maximum reported non-turbo one. 203 * @min_perf_pct: Minimum capacity limit in percent of the maximum turbo 204 * P-state capacity. 205 * @max_perf_pct: Maximum capacity limit in percent of the maximum turbo 206 * P-state capacity. 207 */ 208 struct global_params { 209 bool no_turbo; 210 bool turbo_disabled; 211 int max_perf_pct; 212 int min_perf_pct; 213 }; 214 215 /** 216 * struct cpudata - Per CPU instance data storage 217 * @cpu: CPU number for this instance data 218 * @policy: CPUFreq policy value 219 * @update_util: CPUFreq utility callback information 220 * @update_util_set: CPUFreq utility callback is set 221 * @iowait_boost: iowait-related boost fraction 222 * @last_update: Time of the last update. 
223 * @pstate: Stores P state limits for this CPU 224 * @vid: Stores VID limits for this CPU 225 * @pid: Stores PID parameters for this CPU 226 * @last_sample_time: Last Sample time 227 * @prev_aperf: Last APERF value read from APERF MSR 228 * @prev_mperf: Last MPERF value read from MPERF MSR 229 * @prev_tsc: Last timestamp counter (TSC) value 230 * @prev_cummulative_iowait: IO Wait time difference from last and 231 * current sample 232 * @sample: Storage for storing last Sample data 233 * @min_perf: Minimum capacity limit as a fraction of the maximum 234 * turbo P-state capacity. 235 * @max_perf: Maximum capacity limit as a fraction of the maximum 236 * turbo P-state capacity. 237 * @acpi_perf_data: Stores ACPI perf information read from _PSS 238 * @valid_pss_table: Set to true for valid ACPI _PSS entries found 239 * @epp_powersave: Last saved HWP energy performance preference 240 * (EPP) or energy performance bias (EPB), 241 * when policy switched to performance 242 * @epp_policy: Last saved policy used to set EPP/EPB 243 * @epp_default: Power on default HWP energy performance 244 * preference/bias 245 * @epp_saved: Saved EPP/EPB during system suspend or CPU offline 246 * operation 247 * 248 * This structure stores per CPU instance data for all CPUs. 249 */ 250 struct cpudata { 251 int cpu; 252 253 unsigned int policy; 254 struct update_util_data update_util; 255 bool update_util_set; 256 257 struct pstate_data pstate; 258 struct vid_data vid; 259 struct _pid pid; 260 261 u64 last_update; 262 u64 last_sample_time; 263 u64 prev_aperf; 264 u64 prev_mperf; 265 u64 prev_tsc; 266 u64 prev_cummulative_iowait; 267 struct sample sample; 268 int32_t min_perf; 269 int32_t max_perf; 270 #ifdef CONFIG_ACPI 271 struct acpi_processor_performance acpi_perf_data; 272 bool valid_pss_table; 273 #endif 274 unsigned int iowait_boost; 275 s16 epp_powersave; 276 s16 epp_policy; 277 s16 epp_default; 278 s16 epp_saved; 279 }; 280 281 static struct cpudata **all_cpu_data; 282 283 /** 284 * struct pstate_adjust_policy - Stores static PID configuration data 285 * @sample_rate_ms: PID calculation sample rate in ms 286 * @sample_rate_ns: Sample rate calculation in ns 287 * @deadband: PID deadband 288 * @setpoint: PID Setpoint 289 * @p_gain_pct: PID proportional gain 290 * @i_gain_pct: PID integral gain 291 * @d_gain_pct: PID derivative gain 292 * 293 * Stores per CPU model static PID configuration data. 294 */ 295 struct pstate_adjust_policy { 296 int sample_rate_ms; 297 s64 sample_rate_ns; 298 int deadband; 299 int setpoint; 300 int p_gain_pct; 301 int d_gain_pct; 302 int i_gain_pct; 303 }; 304 305 /** 306 * struct pstate_funcs - Per CPU model specific callbacks 307 * @get_max: Callback to get maximum non turbo effective P state 308 * @get_max_physical: Callback to get maximum non turbo physical P state 309 * @get_min: Callback to get minimum P state 310 * @get_turbo: Callback to get turbo P state 311 * @get_scaling: Callback to get frequency scaling factor 312 * @get_val: Callback to convert P state to actual MSR write value 313 * @get_vid: Callback to get VID data for Atom platforms 314 * @get_target_pstate: Callback to a function to calculate next P state to use 315 * 316 * Core and Atom CPU models have different way to get P State limits. This 317 * structure is used to store those callbacks. 
318 */ 319 struct pstate_funcs { 320 int (*get_max)(void); 321 int (*get_max_physical)(void); 322 int (*get_min)(void); 323 int (*get_turbo)(void); 324 int (*get_scaling)(void); 325 u64 (*get_val)(struct cpudata*, int pstate); 326 void (*get_vid)(struct cpudata *); 327 int32_t (*get_target_pstate)(struct cpudata *); 328 }; 329 330 /** 331 * struct cpu_defaults- Per CPU model default config data 332 * @funcs: Callback function data 333 */ 334 struct cpu_defaults { 335 struct pstate_funcs funcs; 336 }; 337 338 static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu); 339 static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu); 340 341 static struct pstate_funcs pstate_funcs __read_mostly; 342 static struct pstate_adjust_policy pid_params __read_mostly = { 343 .sample_rate_ms = 10, 344 .sample_rate_ns = 10 * NSEC_PER_MSEC, 345 .deadband = 0, 346 .setpoint = 97, 347 .p_gain_pct = 20, 348 .d_gain_pct = 0, 349 .i_gain_pct = 0, 350 }; 351 352 static int hwp_active __read_mostly; 353 static bool per_cpu_limits __read_mostly; 354 355 static struct cpufreq_driver *intel_pstate_driver __read_mostly; 356 357 #ifdef CONFIG_ACPI 358 static bool acpi_ppc; 359 #endif 360 361 static struct global_params global; 362 363 static DEFINE_MUTEX(intel_pstate_driver_lock); 364 static DEFINE_MUTEX(intel_pstate_limits_lock); 365 366 #ifdef CONFIG_ACPI 367 368 static bool intel_pstate_get_ppc_enable_status(void) 369 { 370 if (acpi_gbl_FADT.preferred_profile == PM_ENTERPRISE_SERVER || 371 acpi_gbl_FADT.preferred_profile == PM_PERFORMANCE_SERVER) 372 return true; 373 374 return acpi_ppc; 375 } 376 377 #ifdef CONFIG_ACPI_CPPC_LIB 378 379 /* The work item is needed to avoid CPU hotplug locking issues */ 380 static void intel_pstste_sched_itmt_work_fn(struct work_struct *work) 381 { 382 sched_set_itmt_support(); 383 } 384 385 static DECLARE_WORK(sched_itmt_work, intel_pstste_sched_itmt_work_fn); 386 387 static void intel_pstate_set_itmt_prio(int cpu) 388 { 389 struct cppc_perf_caps cppc_perf; 390 static u32 max_highest_perf = 0, min_highest_perf = U32_MAX; 391 int ret; 392 393 ret = cppc_get_perf_caps(cpu, &cppc_perf); 394 if (ret) 395 return; 396 397 /* 398 * The priorities can be set regardless of whether or not 399 * sched_set_itmt_support(true) has been called and it is valid to 400 * update them at any time after it has been called. 401 */ 402 sched_set_itmt_core_prio(cppc_perf.highest_perf, cpu); 403 404 if (max_highest_perf <= min_highest_perf) { 405 if (cppc_perf.highest_perf > max_highest_perf) 406 max_highest_perf = cppc_perf.highest_perf; 407 408 if (cppc_perf.highest_perf < min_highest_perf) 409 min_highest_perf = cppc_perf.highest_perf; 410 411 if (max_highest_perf > min_highest_perf) { 412 /* 413 * This code can be run during CPU online under the 414 * CPU hotplug locks, so sched_set_itmt_support() 415 * cannot be called from here. Queue up a work item 416 * to invoke it. 
417 */ 418 schedule_work(&sched_itmt_work); 419 } 420 } 421 } 422 #else 423 static void intel_pstate_set_itmt_prio(int cpu) 424 { 425 } 426 #endif 427 428 static void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy) 429 { 430 struct cpudata *cpu; 431 int ret; 432 int i; 433 434 if (hwp_active) { 435 intel_pstate_set_itmt_prio(policy->cpu); 436 return; 437 } 438 439 if (!intel_pstate_get_ppc_enable_status()) 440 return; 441 442 cpu = all_cpu_data[policy->cpu]; 443 444 ret = acpi_processor_register_performance(&cpu->acpi_perf_data, 445 policy->cpu); 446 if (ret) 447 return; 448 449 /* 450 * Check if the control value in _PSS is for PERF_CTL MSR, which should 451 * guarantee that the states returned by it map to the states in our 452 * list directly. 453 */ 454 if (cpu->acpi_perf_data.control_register.space_id != 455 ACPI_ADR_SPACE_FIXED_HARDWARE) 456 goto err; 457 458 /* 459 * If there is only one entry _PSS, simply ignore _PSS and continue as 460 * usual without taking _PSS into account 461 */ 462 if (cpu->acpi_perf_data.state_count < 2) 463 goto err; 464 465 pr_debug("CPU%u - ACPI _PSS perf data\n", policy->cpu); 466 for (i = 0; i < cpu->acpi_perf_data.state_count; i++) { 467 pr_debug(" %cP%d: %u MHz, %u mW, 0x%x\n", 468 (i == cpu->acpi_perf_data.state ? '*' : ' '), i, 469 (u32) cpu->acpi_perf_data.states[i].core_frequency, 470 (u32) cpu->acpi_perf_data.states[i].power, 471 (u32) cpu->acpi_perf_data.states[i].control); 472 } 473 474 /* 475 * The _PSS table doesn't contain whole turbo frequency range. 476 * This just contains +1 MHZ above the max non turbo frequency, 477 * with control value corresponding to max turbo ratio. But 478 * when cpufreq set policy is called, it will call with this 479 * max frequency, which will cause a reduced performance as 480 * this driver uses real max turbo frequency as the max 481 * frequency. So correct this frequency in _PSS table to 482 * correct max turbo frequency based on the turbo state. 483 * Also need to convert to MHz as _PSS freq is in MHz. 484 */ 485 if (!global.turbo_disabled) 486 cpu->acpi_perf_data.states[0].core_frequency = 487 policy->cpuinfo.max_freq / 1000; 488 cpu->valid_pss_table = true; 489 pr_debug("_PPC limits will be enforced\n"); 490 491 return; 492 493 err: 494 cpu->valid_pss_table = false; 495 acpi_processor_unregister_performance(policy->cpu); 496 } 497 498 static void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy) 499 { 500 struct cpudata *cpu; 501 502 cpu = all_cpu_data[policy->cpu]; 503 if (!cpu->valid_pss_table) 504 return; 505 506 acpi_processor_unregister_performance(policy->cpu); 507 } 508 #else 509 static inline void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy) 510 { 511 } 512 513 static inline void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy) 514 { 515 } 516 #endif 517 518 static signed int pid_calc(struct _pid *pid, int32_t busy) 519 { 520 signed int result; 521 int32_t pterm, dterm, fp_error; 522 int32_t integral_limit; 523 524 fp_error = pid->setpoint - busy; 525 526 if (abs(fp_error) <= pid->deadband) 527 return 0; 528 529 pterm = mul_fp(pid->p_gain, fp_error); 530 531 pid->integral += fp_error; 532 533 /* 534 * We limit the integral here so that it will never 535 * get higher than 30. This prevents it from becoming 536 * too large an input over long periods of time and allows 537 * it to get factored out sooner. 538 * 539 * The value of 30 was chosen through experimentation. 
540 */ 541 integral_limit = int_tofp(30); 542 if (pid->integral > integral_limit) 543 pid->integral = integral_limit; 544 if (pid->integral < -integral_limit) 545 pid->integral = -integral_limit; 546 547 dterm = mul_fp(pid->d_gain, fp_error - pid->last_err); 548 pid->last_err = fp_error; 549 550 result = pterm + mul_fp(pid->integral, pid->i_gain) + dterm; 551 result = result + (1 << (FRAC_BITS-1)); 552 return (signed int)fp_toint(result); 553 } 554 555 static inline void intel_pstate_pid_reset(struct cpudata *cpu) 556 { 557 struct _pid *pid = &cpu->pid; 558 559 pid->p_gain = percent_fp(pid_params.p_gain_pct); 560 pid->d_gain = percent_fp(pid_params.d_gain_pct); 561 pid->i_gain = percent_fp(pid_params.i_gain_pct); 562 pid->setpoint = int_tofp(pid_params.setpoint); 563 pid->last_err = pid->setpoint - int_tofp(100); 564 pid->deadband = int_tofp(pid_params.deadband); 565 pid->integral = 0; 566 } 567 568 static inline void update_turbo_state(void) 569 { 570 u64 misc_en; 571 struct cpudata *cpu; 572 573 cpu = all_cpu_data[0]; 574 rdmsrl(MSR_IA32_MISC_ENABLE, misc_en); 575 global.turbo_disabled = 576 (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE || 577 cpu->pstate.max_pstate == cpu->pstate.turbo_pstate); 578 } 579 580 static int min_perf_pct_min(void) 581 { 582 struct cpudata *cpu = all_cpu_data[0]; 583 584 return DIV_ROUND_UP(cpu->pstate.min_pstate * 100, 585 cpu->pstate.turbo_pstate); 586 } 587 588 static s16 intel_pstate_get_epb(struct cpudata *cpu_data) 589 { 590 u64 epb; 591 int ret; 592 593 if (!static_cpu_has(X86_FEATURE_EPB)) 594 return -ENXIO; 595 596 ret = rdmsrl_on_cpu(cpu_data->cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); 597 if (ret) 598 return (s16)ret; 599 600 return (s16)(epb & 0x0f); 601 } 602 603 static s16 intel_pstate_get_epp(struct cpudata *cpu_data, u64 hwp_req_data) 604 { 605 s16 epp; 606 607 if (static_cpu_has(X86_FEATURE_HWP_EPP)) { 608 /* 609 * When hwp_req_data is 0, means that caller didn't read 610 * MSR_HWP_REQUEST, so need to read and get EPP. 
611 */ 612 if (!hwp_req_data) { 613 epp = rdmsrl_on_cpu(cpu_data->cpu, MSR_HWP_REQUEST, 614 &hwp_req_data); 615 if (epp) 616 return epp; 617 } 618 epp = (hwp_req_data >> 24) & 0xff; 619 } else { 620 /* When there is no EPP present, HWP uses EPB settings */ 621 epp = intel_pstate_get_epb(cpu_data); 622 } 623 624 return epp; 625 } 626 627 static int intel_pstate_set_epb(int cpu, s16 pref) 628 { 629 u64 epb; 630 int ret; 631 632 if (!static_cpu_has(X86_FEATURE_EPB)) 633 return -ENXIO; 634 635 ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); 636 if (ret) 637 return ret; 638 639 epb = (epb & ~0x0f) | pref; 640 wrmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, epb); 641 642 return 0; 643 } 644 645 /* 646 * EPP/EPB display strings corresponding to EPP index in the 647 * energy_perf_strings[] 648 * index String 649 *------------------------------------- 650 * 0 default 651 * 1 performance 652 * 2 balance_performance 653 * 3 balance_power 654 * 4 power 655 */ 656 static const char * const energy_perf_strings[] = { 657 "default", 658 "performance", 659 "balance_performance", 660 "balance_power", 661 "power", 662 NULL 663 }; 664 665 static int intel_pstate_get_energy_pref_index(struct cpudata *cpu_data) 666 { 667 s16 epp; 668 int index = -EINVAL; 669 670 epp = intel_pstate_get_epp(cpu_data, 0); 671 if (epp < 0) 672 return epp; 673 674 if (static_cpu_has(X86_FEATURE_HWP_EPP)) { 675 /* 676 * Range: 677 * 0x00-0x3F : Performance 678 * 0x40-0x7F : Balance performance 679 * 0x80-0xBF : Balance power 680 * 0xC0-0xFF : Power 681 * The EPP is a 8 bit value, but our ranges restrict the 682 * value which can be set. Here only using top two bits 683 * effectively. 684 */ 685 index = (epp >> 6) + 1; 686 } else if (static_cpu_has(X86_FEATURE_EPB)) { 687 /* 688 * Range: 689 * 0x00-0x03 : Performance 690 * 0x04-0x07 : Balance performance 691 * 0x08-0x0B : Balance power 692 * 0x0C-0x0F : Power 693 * The EPB is a 4 bit value, but our ranges restrict the 694 * value which can be set. Here only using top two bits 695 * effectively. 696 */ 697 index = (epp >> 2) + 1; 698 } 699 700 return index; 701 } 702 703 static int intel_pstate_set_energy_pref_index(struct cpudata *cpu_data, 704 int pref_index) 705 { 706 int epp = -EINVAL; 707 int ret; 708 709 if (!pref_index) 710 epp = cpu_data->epp_default; 711 712 mutex_lock(&intel_pstate_limits_lock); 713 714 if (static_cpu_has(X86_FEATURE_HWP_EPP)) { 715 u64 value; 716 717 ret = rdmsrl_on_cpu(cpu_data->cpu, MSR_HWP_REQUEST, &value); 718 if (ret) 719 goto return_pref; 720 721 value &= ~GENMASK_ULL(31, 24); 722 723 /* 724 * If epp is not default, convert from index into 725 * energy_perf_strings to epp value, by shifting 6 726 * bits left to use only top two bits in epp. 727 * The resultant epp need to shifted by 24 bits to 728 * epp position in MSR_HWP_REQUEST. 
729 */ 730 if (epp == -EINVAL) 731 epp = (pref_index - 1) << 6; 732 733 value |= (u64)epp << 24; 734 ret = wrmsrl_on_cpu(cpu_data->cpu, MSR_HWP_REQUEST, value); 735 } else { 736 if (epp == -EINVAL) 737 epp = (pref_index - 1) << 2; 738 ret = intel_pstate_set_epb(cpu_data->cpu, epp); 739 } 740 return_pref: 741 mutex_unlock(&intel_pstate_limits_lock); 742 743 return ret; 744 } 745 746 static ssize_t show_energy_performance_available_preferences( 747 struct cpufreq_policy *policy, char *buf) 748 { 749 int i = 0; 750 int ret = 0; 751 752 while (energy_perf_strings[i] != NULL) 753 ret += sprintf(&buf[ret], "%s ", energy_perf_strings[i++]); 754 755 ret += sprintf(&buf[ret], "\n"); 756 757 return ret; 758 } 759 760 cpufreq_freq_attr_ro(energy_performance_available_preferences); 761 762 static ssize_t store_energy_performance_preference( 763 struct cpufreq_policy *policy, const char *buf, size_t count) 764 { 765 struct cpudata *cpu_data = all_cpu_data[policy->cpu]; 766 char str_preference[21]; 767 int ret, i = 0; 768 769 ret = sscanf(buf, "%20s", str_preference); 770 if (ret != 1) 771 return -EINVAL; 772 773 while (energy_perf_strings[i] != NULL) { 774 if (!strcmp(str_preference, energy_perf_strings[i])) { 775 intel_pstate_set_energy_pref_index(cpu_data, i); 776 return count; 777 } 778 ++i; 779 } 780 781 return -EINVAL; 782 } 783 784 static ssize_t show_energy_performance_preference( 785 struct cpufreq_policy *policy, char *buf) 786 { 787 struct cpudata *cpu_data = all_cpu_data[policy->cpu]; 788 int preference; 789 790 preference = intel_pstate_get_energy_pref_index(cpu_data); 791 if (preference < 0) 792 return preference; 793 794 return sprintf(buf, "%s\n", energy_perf_strings[preference]); 795 } 796 797 cpufreq_freq_attr_rw(energy_performance_preference); 798 799 static struct freq_attr *hwp_cpufreq_attrs[] = { 800 &energy_performance_preference, 801 &energy_performance_available_preferences, 802 NULL, 803 }; 804 805 static void intel_pstate_hwp_set(struct cpufreq_policy *policy) 806 { 807 int min, hw_min, max, hw_max, cpu; 808 u64 value, cap; 809 810 for_each_cpu(cpu, policy->cpus) { 811 struct cpudata *cpu_data = all_cpu_data[cpu]; 812 s16 epp; 813 814 rdmsrl_on_cpu(cpu, MSR_HWP_CAPABILITIES, &cap); 815 hw_min = HWP_LOWEST_PERF(cap); 816 if (global.no_turbo) 817 hw_max = HWP_GUARANTEED_PERF(cap); 818 else 819 hw_max = HWP_HIGHEST_PERF(cap); 820 821 max = fp_ext_toint(hw_max * cpu_data->max_perf); 822 if (cpu_data->policy == CPUFREQ_POLICY_PERFORMANCE) 823 min = max; 824 else 825 min = fp_ext_toint(hw_max * cpu_data->min_perf); 826 827 rdmsrl_on_cpu(cpu, MSR_HWP_REQUEST, &value); 828 829 value &= ~HWP_MIN_PERF(~0L); 830 value |= HWP_MIN_PERF(min); 831 832 value &= ~HWP_MAX_PERF(~0L); 833 value |= HWP_MAX_PERF(max); 834 835 if (cpu_data->epp_policy == cpu_data->policy) 836 goto skip_epp; 837 838 cpu_data->epp_policy = cpu_data->policy; 839 840 if (cpu_data->epp_saved >= 0) { 841 epp = cpu_data->epp_saved; 842 cpu_data->epp_saved = -EINVAL; 843 goto update_epp; 844 } 845 846 if (cpu_data->policy == CPUFREQ_POLICY_PERFORMANCE) { 847 epp = intel_pstate_get_epp(cpu_data, value); 848 cpu_data->epp_powersave = epp; 849 /* If EPP read was failed, then don't try to write */ 850 if (epp < 0) 851 goto skip_epp; 852 853 854 epp = 0; 855 } else { 856 /* skip setting EPP, when saved value is invalid */ 857 if (cpu_data->epp_powersave < 0) 858 goto skip_epp; 859 860 /* 861 * No need to restore EPP when it is not zero. 
This 862 * means: 863 * - Policy is not changed 864 * - user has manually changed 865 * - Error reading EPB 866 */ 867 epp = intel_pstate_get_epp(cpu_data, value); 868 if (epp) 869 goto skip_epp; 870 871 epp = cpu_data->epp_powersave; 872 } 873 update_epp: 874 if (static_cpu_has(X86_FEATURE_HWP_EPP)) { 875 value &= ~GENMASK_ULL(31, 24); 876 value |= (u64)epp << 24; 877 } else { 878 intel_pstate_set_epb(cpu, epp); 879 } 880 skip_epp: 881 wrmsrl_on_cpu(cpu, MSR_HWP_REQUEST, value); 882 } 883 } 884 885 static int intel_pstate_hwp_save_state(struct cpufreq_policy *policy) 886 { 887 struct cpudata *cpu_data = all_cpu_data[policy->cpu]; 888 889 if (!hwp_active) 890 return 0; 891 892 cpu_data->epp_saved = intel_pstate_get_epp(cpu_data, 0); 893 894 return 0; 895 } 896 897 static int intel_pstate_resume(struct cpufreq_policy *policy) 898 { 899 if (!hwp_active) 900 return 0; 901 902 mutex_lock(&intel_pstate_limits_lock); 903 904 all_cpu_data[policy->cpu]->epp_policy = 0; 905 intel_pstate_hwp_set(policy); 906 907 mutex_unlock(&intel_pstate_limits_lock); 908 909 return 0; 910 } 911 912 static void intel_pstate_update_policies(void) 913 { 914 int cpu; 915 916 for_each_possible_cpu(cpu) 917 cpufreq_update_policy(cpu); 918 } 919 920 /************************** debugfs begin ************************/ 921 static int pid_param_set(void *data, u64 val) 922 { 923 unsigned int cpu; 924 925 *(u32 *)data = val; 926 pid_params.sample_rate_ns = pid_params.sample_rate_ms * NSEC_PER_MSEC; 927 for_each_possible_cpu(cpu) 928 if (all_cpu_data[cpu]) 929 intel_pstate_pid_reset(all_cpu_data[cpu]); 930 931 return 0; 932 } 933 934 static int pid_param_get(void *data, u64 *val) 935 { 936 *val = *(u32 *)data; 937 return 0; 938 } 939 DEFINE_SIMPLE_ATTRIBUTE(fops_pid_param, pid_param_get, pid_param_set, "%llu\n"); 940 941 static struct dentry *debugfs_parent; 942 943 struct pid_param { 944 char *name; 945 void *value; 946 struct dentry *dentry; 947 }; 948 949 static struct pid_param pid_files[] = { 950 {"sample_rate_ms", &pid_params.sample_rate_ms, }, 951 {"d_gain_pct", &pid_params.d_gain_pct, }, 952 {"i_gain_pct", &pid_params.i_gain_pct, }, 953 {"deadband", &pid_params.deadband, }, 954 {"setpoint", &pid_params.setpoint, }, 955 {"p_gain_pct", &pid_params.p_gain_pct, }, 956 {NULL, NULL, } 957 }; 958 959 static void intel_pstate_debug_expose_params(void) 960 { 961 int i; 962 963 debugfs_parent = debugfs_create_dir("pstate_snb", NULL); 964 if (IS_ERR_OR_NULL(debugfs_parent)) 965 return; 966 967 for (i = 0; pid_files[i].name; i++) { 968 struct dentry *dentry; 969 970 dentry = debugfs_create_file(pid_files[i].name, 0660, 971 debugfs_parent, pid_files[i].value, 972 &fops_pid_param); 973 if (!IS_ERR(dentry)) 974 pid_files[i].dentry = dentry; 975 } 976 } 977 978 static void intel_pstate_debug_hide_params(void) 979 { 980 int i; 981 982 if (IS_ERR_OR_NULL(debugfs_parent)) 983 return; 984 985 for (i = 0; pid_files[i].name; i++) { 986 debugfs_remove(pid_files[i].dentry); 987 pid_files[i].dentry = NULL; 988 } 989 990 debugfs_remove(debugfs_parent); 991 debugfs_parent = NULL; 992 } 993 994 /************************** debugfs end ************************/ 995 996 /************************** sysfs begin ************************/ 997 #define show_one(file_name, object) \ 998 static ssize_t show_##file_name \ 999 (struct kobject *kobj, struct attribute *attr, char *buf) \ 1000 { \ 1001 return sprintf(buf, "%u\n", global.object); \ 1002 } 1003 1004 static ssize_t intel_pstate_show_status(char *buf); 1005 static int 
intel_pstate_update_status(const char *buf, size_t size); 1006 1007 static ssize_t show_status(struct kobject *kobj, 1008 struct attribute *attr, char *buf) 1009 { 1010 ssize_t ret; 1011 1012 mutex_lock(&intel_pstate_driver_lock); 1013 ret = intel_pstate_show_status(buf); 1014 mutex_unlock(&intel_pstate_driver_lock); 1015 1016 return ret; 1017 } 1018 1019 static ssize_t store_status(struct kobject *a, struct attribute *b, 1020 const char *buf, size_t count) 1021 { 1022 char *p = memchr(buf, '\n', count); 1023 int ret; 1024 1025 mutex_lock(&intel_pstate_driver_lock); 1026 ret = intel_pstate_update_status(buf, p ? p - buf : count); 1027 mutex_unlock(&intel_pstate_driver_lock); 1028 1029 return ret < 0 ? ret : count; 1030 } 1031 1032 static ssize_t show_turbo_pct(struct kobject *kobj, 1033 struct attribute *attr, char *buf) 1034 { 1035 struct cpudata *cpu; 1036 int total, no_turbo, turbo_pct; 1037 uint32_t turbo_fp; 1038 1039 mutex_lock(&intel_pstate_driver_lock); 1040 1041 if (!intel_pstate_driver) { 1042 mutex_unlock(&intel_pstate_driver_lock); 1043 return -EAGAIN; 1044 } 1045 1046 cpu = all_cpu_data[0]; 1047 1048 total = cpu->pstate.turbo_pstate - cpu->pstate.min_pstate + 1; 1049 no_turbo = cpu->pstate.max_pstate - cpu->pstate.min_pstate + 1; 1050 turbo_fp = div_fp(no_turbo, total); 1051 turbo_pct = 100 - fp_toint(mul_fp(turbo_fp, int_tofp(100))); 1052 1053 mutex_unlock(&intel_pstate_driver_lock); 1054 1055 return sprintf(buf, "%u\n", turbo_pct); 1056 } 1057 1058 static ssize_t show_num_pstates(struct kobject *kobj, 1059 struct attribute *attr, char *buf) 1060 { 1061 struct cpudata *cpu; 1062 int total; 1063 1064 mutex_lock(&intel_pstate_driver_lock); 1065 1066 if (!intel_pstate_driver) { 1067 mutex_unlock(&intel_pstate_driver_lock); 1068 return -EAGAIN; 1069 } 1070 1071 cpu = all_cpu_data[0]; 1072 total = cpu->pstate.turbo_pstate - cpu->pstate.min_pstate + 1; 1073 1074 mutex_unlock(&intel_pstate_driver_lock); 1075 1076 return sprintf(buf, "%u\n", total); 1077 } 1078 1079 static ssize_t show_no_turbo(struct kobject *kobj, 1080 struct attribute *attr, char *buf) 1081 { 1082 ssize_t ret; 1083 1084 mutex_lock(&intel_pstate_driver_lock); 1085 1086 if (!intel_pstate_driver) { 1087 mutex_unlock(&intel_pstate_driver_lock); 1088 return -EAGAIN; 1089 } 1090 1091 update_turbo_state(); 1092 if (global.turbo_disabled) 1093 ret = sprintf(buf, "%u\n", global.turbo_disabled); 1094 else 1095 ret = sprintf(buf, "%u\n", global.no_turbo); 1096 1097 mutex_unlock(&intel_pstate_driver_lock); 1098 1099 return ret; 1100 } 1101 1102 static ssize_t store_no_turbo(struct kobject *a, struct attribute *b, 1103 const char *buf, size_t count) 1104 { 1105 unsigned int input; 1106 int ret; 1107 1108 ret = sscanf(buf, "%u", &input); 1109 if (ret != 1) 1110 return -EINVAL; 1111 1112 mutex_lock(&intel_pstate_driver_lock); 1113 1114 if (!intel_pstate_driver) { 1115 mutex_unlock(&intel_pstate_driver_lock); 1116 return -EAGAIN; 1117 } 1118 1119 mutex_lock(&intel_pstate_limits_lock); 1120 1121 update_turbo_state(); 1122 if (global.turbo_disabled) { 1123 pr_warn("Turbo disabled by BIOS or unavailable on processor\n"); 1124 mutex_unlock(&intel_pstate_limits_lock); 1125 mutex_unlock(&intel_pstate_driver_lock); 1126 return -EPERM; 1127 } 1128 1129 global.no_turbo = clamp_t(int, input, 0, 1); 1130 1131 if (global.no_turbo) { 1132 struct cpudata *cpu = all_cpu_data[0]; 1133 int pct = cpu->pstate.max_pstate * 100 / cpu->pstate.turbo_pstate; 1134 1135 /* Squash the global minimum into the permitted range. 
*/ 1136 if (global.min_perf_pct > pct) 1137 global.min_perf_pct = pct; 1138 } 1139 1140 mutex_unlock(&intel_pstate_limits_lock); 1141 1142 intel_pstate_update_policies(); 1143 1144 mutex_unlock(&intel_pstate_driver_lock); 1145 1146 return count; 1147 } 1148 1149 static ssize_t store_max_perf_pct(struct kobject *a, struct attribute *b, 1150 const char *buf, size_t count) 1151 { 1152 unsigned int input; 1153 int ret; 1154 1155 ret = sscanf(buf, "%u", &input); 1156 if (ret != 1) 1157 return -EINVAL; 1158 1159 mutex_lock(&intel_pstate_driver_lock); 1160 1161 if (!intel_pstate_driver) { 1162 mutex_unlock(&intel_pstate_driver_lock); 1163 return -EAGAIN; 1164 } 1165 1166 mutex_lock(&intel_pstate_limits_lock); 1167 1168 global.max_perf_pct = clamp_t(int, input, global.min_perf_pct, 100); 1169 1170 mutex_unlock(&intel_pstate_limits_lock); 1171 1172 intel_pstate_update_policies(); 1173 1174 mutex_unlock(&intel_pstate_driver_lock); 1175 1176 return count; 1177 } 1178 1179 static ssize_t store_min_perf_pct(struct kobject *a, struct attribute *b, 1180 const char *buf, size_t count) 1181 { 1182 unsigned int input; 1183 int ret; 1184 1185 ret = sscanf(buf, "%u", &input); 1186 if (ret != 1) 1187 return -EINVAL; 1188 1189 mutex_lock(&intel_pstate_driver_lock); 1190 1191 if (!intel_pstate_driver) { 1192 mutex_unlock(&intel_pstate_driver_lock); 1193 return -EAGAIN; 1194 } 1195 1196 mutex_lock(&intel_pstate_limits_lock); 1197 1198 global.min_perf_pct = clamp_t(int, input, 1199 min_perf_pct_min(), global.max_perf_pct); 1200 1201 mutex_unlock(&intel_pstate_limits_lock); 1202 1203 intel_pstate_update_policies(); 1204 1205 mutex_unlock(&intel_pstate_driver_lock); 1206 1207 return count; 1208 } 1209 1210 show_one(max_perf_pct, max_perf_pct); 1211 show_one(min_perf_pct, min_perf_pct); 1212 1213 define_one_global_rw(status); 1214 define_one_global_rw(no_turbo); 1215 define_one_global_rw(max_perf_pct); 1216 define_one_global_rw(min_perf_pct); 1217 define_one_global_ro(turbo_pct); 1218 define_one_global_ro(num_pstates); 1219 1220 static struct attribute *intel_pstate_attributes[] = { 1221 &status.attr, 1222 &no_turbo.attr, 1223 &turbo_pct.attr, 1224 &num_pstates.attr, 1225 NULL 1226 }; 1227 1228 static struct attribute_group intel_pstate_attr_group = { 1229 .attrs = intel_pstate_attributes, 1230 }; 1231 1232 static void __init intel_pstate_sysfs_expose_params(void) 1233 { 1234 struct kobject *intel_pstate_kobject; 1235 int rc; 1236 1237 intel_pstate_kobject = kobject_create_and_add("intel_pstate", 1238 &cpu_subsys.dev_root->kobj); 1239 if (WARN_ON(!intel_pstate_kobject)) 1240 return; 1241 1242 rc = sysfs_create_group(intel_pstate_kobject, &intel_pstate_attr_group); 1243 if (WARN_ON(rc)) 1244 return; 1245 1246 /* 1247 * If per cpu limits are enforced there are no global limits, so 1248 * return without creating max/min_perf_pct attributes 1249 */ 1250 if (per_cpu_limits) 1251 return; 1252 1253 rc = sysfs_create_file(intel_pstate_kobject, &max_perf_pct.attr); 1254 WARN_ON(rc); 1255 1256 rc = sysfs_create_file(intel_pstate_kobject, &min_perf_pct.attr); 1257 WARN_ON(rc); 1258 1259 } 1260 /************************** sysfs end ************************/ 1261 1262 static void intel_pstate_hwp_enable(struct cpudata *cpudata) 1263 { 1264 /* First disable HWP notification interrupt as we don't process them */ 1265 if (static_cpu_has(X86_FEATURE_HWP_NOTIFY)) 1266 wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x00); 1267 1268 wrmsrl_on_cpu(cpudata->cpu, MSR_PM_ENABLE, 0x1); 1269 cpudata->epp_policy = 0; 1270 if 
(cpudata->epp_default == -EINVAL) 1271 cpudata->epp_default = intel_pstate_get_epp(cpudata, 0); 1272 } 1273 1274 #define MSR_IA32_POWER_CTL_BIT_EE 19 1275 1276 /* Disable energy efficiency optimization */ 1277 static void intel_pstate_disable_ee(int cpu) 1278 { 1279 u64 power_ctl; 1280 int ret; 1281 1282 ret = rdmsrl_on_cpu(cpu, MSR_IA32_POWER_CTL, &power_ctl); 1283 if (ret) 1284 return; 1285 1286 if (!(power_ctl & BIT(MSR_IA32_POWER_CTL_BIT_EE))) { 1287 pr_info("Disabling energy efficiency optimization\n"); 1288 power_ctl |= BIT(MSR_IA32_POWER_CTL_BIT_EE); 1289 wrmsrl_on_cpu(cpu, MSR_IA32_POWER_CTL, power_ctl); 1290 } 1291 } 1292 1293 static int atom_get_min_pstate(void) 1294 { 1295 u64 value; 1296 1297 rdmsrl(MSR_ATOM_CORE_RATIOS, value); 1298 return (value >> 8) & 0x7F; 1299 } 1300 1301 static int atom_get_max_pstate(void) 1302 { 1303 u64 value; 1304 1305 rdmsrl(MSR_ATOM_CORE_RATIOS, value); 1306 return (value >> 16) & 0x7F; 1307 } 1308 1309 static int atom_get_turbo_pstate(void) 1310 { 1311 u64 value; 1312 1313 rdmsrl(MSR_ATOM_CORE_TURBO_RATIOS, value); 1314 return value & 0x7F; 1315 } 1316 1317 static u64 atom_get_val(struct cpudata *cpudata, int pstate) 1318 { 1319 u64 val; 1320 int32_t vid_fp; 1321 u32 vid; 1322 1323 val = (u64)pstate << 8; 1324 if (global.no_turbo && !global.turbo_disabled) 1325 val |= (u64)1 << 32; 1326 1327 vid_fp = cpudata->vid.min + mul_fp( 1328 int_tofp(pstate - cpudata->pstate.min_pstate), 1329 cpudata->vid.ratio); 1330 1331 vid_fp = clamp_t(int32_t, vid_fp, cpudata->vid.min, cpudata->vid.max); 1332 vid = ceiling_fp(vid_fp); 1333 1334 if (pstate > cpudata->pstate.max_pstate) 1335 vid = cpudata->vid.turbo; 1336 1337 return val | vid; 1338 } 1339 1340 static int silvermont_get_scaling(void) 1341 { 1342 u64 value; 1343 int i; 1344 /* Defined in Table 35-6 from SDM (Sept 2015) */ 1345 static int silvermont_freq_table[] = { 1346 83300, 100000, 133300, 116700, 80000}; 1347 1348 rdmsrl(MSR_FSB_FREQ, value); 1349 i = value & 0x7; 1350 WARN_ON(i > 4); 1351 1352 return silvermont_freq_table[i]; 1353 } 1354 1355 static int airmont_get_scaling(void) 1356 { 1357 u64 value; 1358 int i; 1359 /* Defined in Table 35-10 from SDM (Sept 2015) */ 1360 static int airmont_freq_table[] = { 1361 83300, 100000, 133300, 116700, 80000, 1362 93300, 90000, 88900, 87500}; 1363 1364 rdmsrl(MSR_FSB_FREQ, value); 1365 i = value & 0xF; 1366 WARN_ON(i > 8); 1367 1368 return airmont_freq_table[i]; 1369 } 1370 1371 static void atom_get_vid(struct cpudata *cpudata) 1372 { 1373 u64 value; 1374 1375 rdmsrl(MSR_ATOM_CORE_VIDS, value); 1376 cpudata->vid.min = int_tofp((value >> 8) & 0x7f); 1377 cpudata->vid.max = int_tofp((value >> 16) & 0x7f); 1378 cpudata->vid.ratio = div_fp( 1379 cpudata->vid.max - cpudata->vid.min, 1380 int_tofp(cpudata->pstate.max_pstate - 1381 cpudata->pstate.min_pstate)); 1382 1383 rdmsrl(MSR_ATOM_CORE_TURBO_VIDS, value); 1384 cpudata->vid.turbo = value & 0x7f; 1385 } 1386 1387 static int core_get_min_pstate(void) 1388 { 1389 u64 value; 1390 1391 rdmsrl(MSR_PLATFORM_INFO, value); 1392 return (value >> 40) & 0xFF; 1393 } 1394 1395 static int core_get_max_pstate_physical(void) 1396 { 1397 u64 value; 1398 1399 rdmsrl(MSR_PLATFORM_INFO, value); 1400 return (value >> 8) & 0xFF; 1401 } 1402 1403 static int core_get_tdp_ratio(u64 plat_info) 1404 { 1405 /* Check how many TDP levels present */ 1406 if (plat_info & 0x600000000) { 1407 u64 tdp_ctrl; 1408 u64 tdp_ratio; 1409 int tdp_msr; 1410 int err; 1411 1412 /* Get the TDP level (0, 1, 2) to get ratios */ 1413 err = 
rdmsrl_safe(MSR_CONFIG_TDP_CONTROL, &tdp_ctrl); 1414 if (err) 1415 return err; 1416 1417 /* TDP MSR are continuous starting at 0x648 */ 1418 tdp_msr = MSR_CONFIG_TDP_NOMINAL + (tdp_ctrl & 0x03); 1419 err = rdmsrl_safe(tdp_msr, &tdp_ratio); 1420 if (err) 1421 return err; 1422 1423 /* For level 1 and 2, bits[23:16] contain the ratio */ 1424 if (tdp_ctrl & 0x03) 1425 tdp_ratio >>= 16; 1426 1427 tdp_ratio &= 0xff; /* ratios are only 8 bits long */ 1428 pr_debug("tdp_ratio %x\n", (int)tdp_ratio); 1429 1430 return (int)tdp_ratio; 1431 } 1432 1433 return -ENXIO; 1434 } 1435 1436 static int core_get_max_pstate(void) 1437 { 1438 u64 tar; 1439 u64 plat_info; 1440 int max_pstate; 1441 int tdp_ratio; 1442 int err; 1443 1444 rdmsrl(MSR_PLATFORM_INFO, plat_info); 1445 max_pstate = (plat_info >> 8) & 0xFF; 1446 1447 tdp_ratio = core_get_tdp_ratio(plat_info); 1448 if (tdp_ratio <= 0) 1449 return max_pstate; 1450 1451 if (hwp_active) { 1452 /* Turbo activation ratio is not used on HWP platforms */ 1453 return tdp_ratio; 1454 } 1455 1456 err = rdmsrl_safe(MSR_TURBO_ACTIVATION_RATIO, &tar); 1457 if (!err) { 1458 int tar_levels; 1459 1460 /* Do some sanity checking for safety */ 1461 tar_levels = tar & 0xff; 1462 if (tdp_ratio - 1 == tar_levels) { 1463 max_pstate = tar_levels; 1464 pr_debug("max_pstate=TAC %x\n", max_pstate); 1465 } 1466 } 1467 1468 return max_pstate; 1469 } 1470 1471 static int core_get_turbo_pstate(void) 1472 { 1473 u64 value; 1474 int nont, ret; 1475 1476 rdmsrl(MSR_TURBO_RATIO_LIMIT, value); 1477 nont = core_get_max_pstate(); 1478 ret = (value) & 255; 1479 if (ret <= nont) 1480 ret = nont; 1481 return ret; 1482 } 1483 1484 static inline int core_get_scaling(void) 1485 { 1486 return 100000; 1487 } 1488 1489 static u64 core_get_val(struct cpudata *cpudata, int pstate) 1490 { 1491 u64 val; 1492 1493 val = (u64)pstate << 8; 1494 if (global.no_turbo && !global.turbo_disabled) 1495 val |= (u64)1 << 32; 1496 1497 return val; 1498 } 1499 1500 static int knl_get_turbo_pstate(void) 1501 { 1502 u64 value; 1503 int nont, ret; 1504 1505 rdmsrl(MSR_TURBO_RATIO_LIMIT, value); 1506 nont = core_get_max_pstate(); 1507 ret = (((value) >> 8) & 0xFF); 1508 if (ret <= nont) 1509 ret = nont; 1510 return ret; 1511 } 1512 1513 static struct cpu_defaults core_params = { 1514 .funcs = { 1515 .get_max = core_get_max_pstate, 1516 .get_max_physical = core_get_max_pstate_physical, 1517 .get_min = core_get_min_pstate, 1518 .get_turbo = core_get_turbo_pstate, 1519 .get_scaling = core_get_scaling, 1520 .get_val = core_get_val, 1521 .get_target_pstate = get_target_pstate_use_performance, 1522 }, 1523 }; 1524 1525 static const struct cpu_defaults silvermont_params = { 1526 .funcs = { 1527 .get_max = atom_get_max_pstate, 1528 .get_max_physical = atom_get_max_pstate, 1529 .get_min = atom_get_min_pstate, 1530 .get_turbo = atom_get_turbo_pstate, 1531 .get_val = atom_get_val, 1532 .get_scaling = silvermont_get_scaling, 1533 .get_vid = atom_get_vid, 1534 .get_target_pstate = get_target_pstate_use_cpu_load, 1535 }, 1536 }; 1537 1538 static const struct cpu_defaults airmont_params = { 1539 .funcs = { 1540 .get_max = atom_get_max_pstate, 1541 .get_max_physical = atom_get_max_pstate, 1542 .get_min = atom_get_min_pstate, 1543 .get_turbo = atom_get_turbo_pstate, 1544 .get_val = atom_get_val, 1545 .get_scaling = airmont_get_scaling, 1546 .get_vid = atom_get_vid, 1547 .get_target_pstate = get_target_pstate_use_cpu_load, 1548 }, 1549 }; 1550 1551 static const struct cpu_defaults knl_params = { 1552 .funcs = { 1553 .get_max = 
core_get_max_pstate, 1554 .get_max_physical = core_get_max_pstate_physical, 1555 .get_min = core_get_min_pstate, 1556 .get_turbo = knl_get_turbo_pstate, 1557 .get_scaling = core_get_scaling, 1558 .get_val = core_get_val, 1559 .get_target_pstate = get_target_pstate_use_performance, 1560 }, 1561 }; 1562 1563 static const struct cpu_defaults bxt_params = { 1564 .funcs = { 1565 .get_max = core_get_max_pstate, 1566 .get_max_physical = core_get_max_pstate_physical, 1567 .get_min = core_get_min_pstate, 1568 .get_turbo = core_get_turbo_pstate, 1569 .get_scaling = core_get_scaling, 1570 .get_val = core_get_val, 1571 .get_target_pstate = get_target_pstate_use_cpu_load, 1572 }, 1573 }; 1574 1575 static void intel_pstate_get_min_max(struct cpudata *cpu, int *min, int *max) 1576 { 1577 int max_perf = cpu->pstate.turbo_pstate; 1578 int max_perf_adj; 1579 int min_perf; 1580 1581 if (global.no_turbo || global.turbo_disabled) 1582 max_perf = cpu->pstate.max_pstate; 1583 1584 /* 1585 * performance can be limited by user through sysfs, by cpufreq 1586 * policy, or by cpu specific default values determined through 1587 * experimentation. 1588 */ 1589 max_perf_adj = fp_ext_toint(max_perf * cpu->max_perf); 1590 *max = clamp_t(int, max_perf_adj, 1591 cpu->pstate.min_pstate, cpu->pstate.turbo_pstate); 1592 1593 min_perf = fp_ext_toint(max_perf * cpu->min_perf); 1594 *min = clamp_t(int, min_perf, cpu->pstate.min_pstate, max_perf); 1595 } 1596 1597 static void intel_pstate_set_pstate(struct cpudata *cpu, int pstate) 1598 { 1599 trace_cpu_frequency(pstate * cpu->pstate.scaling, cpu->cpu); 1600 cpu->pstate.current_pstate = pstate; 1601 /* 1602 * Generally, there is no guarantee that this code will always run on 1603 * the CPU being updated, so force the register update to run on the 1604 * right CPU. 
1605 */ 1606 wrmsrl_on_cpu(cpu->cpu, MSR_IA32_PERF_CTL, 1607 pstate_funcs.get_val(cpu, pstate)); 1608 } 1609 1610 static void intel_pstate_set_min_pstate(struct cpudata *cpu) 1611 { 1612 intel_pstate_set_pstate(cpu, cpu->pstate.min_pstate); 1613 } 1614 1615 static void intel_pstate_max_within_limits(struct cpudata *cpu) 1616 { 1617 int min_pstate, max_pstate; 1618 1619 update_turbo_state(); 1620 intel_pstate_get_min_max(cpu, &min_pstate, &max_pstate); 1621 intel_pstate_set_pstate(cpu, max_pstate); 1622 } 1623 1624 static void intel_pstate_get_cpu_pstates(struct cpudata *cpu) 1625 { 1626 cpu->pstate.min_pstate = pstate_funcs.get_min(); 1627 cpu->pstate.max_pstate = pstate_funcs.get_max(); 1628 cpu->pstate.max_pstate_physical = pstate_funcs.get_max_physical(); 1629 cpu->pstate.turbo_pstate = pstate_funcs.get_turbo(); 1630 cpu->pstate.scaling = pstate_funcs.get_scaling(); 1631 cpu->pstate.max_freq = cpu->pstate.max_pstate * cpu->pstate.scaling; 1632 cpu->pstate.turbo_freq = cpu->pstate.turbo_pstate * cpu->pstate.scaling; 1633 1634 if (pstate_funcs.get_vid) 1635 pstate_funcs.get_vid(cpu); 1636 1637 intel_pstate_set_min_pstate(cpu); 1638 } 1639 1640 static inline void intel_pstate_calc_avg_perf(struct cpudata *cpu) 1641 { 1642 struct sample *sample = &cpu->sample; 1643 1644 sample->core_avg_perf = div_ext_fp(sample->aperf, sample->mperf); 1645 } 1646 1647 static inline bool intel_pstate_sample(struct cpudata *cpu, u64 time) 1648 { 1649 u64 aperf, mperf; 1650 unsigned long flags; 1651 u64 tsc; 1652 1653 local_irq_save(flags); 1654 rdmsrl(MSR_IA32_APERF, aperf); 1655 rdmsrl(MSR_IA32_MPERF, mperf); 1656 tsc = rdtsc(); 1657 if (cpu->prev_mperf == mperf || cpu->prev_tsc == tsc) { 1658 local_irq_restore(flags); 1659 return false; 1660 } 1661 local_irq_restore(flags); 1662 1663 cpu->last_sample_time = cpu->sample.time; 1664 cpu->sample.time = time; 1665 cpu->sample.aperf = aperf; 1666 cpu->sample.mperf = mperf; 1667 cpu->sample.tsc = tsc; 1668 cpu->sample.aperf -= cpu->prev_aperf; 1669 cpu->sample.mperf -= cpu->prev_mperf; 1670 cpu->sample.tsc -= cpu->prev_tsc; 1671 1672 cpu->prev_aperf = aperf; 1673 cpu->prev_mperf = mperf; 1674 cpu->prev_tsc = tsc; 1675 /* 1676 * First time this function is invoked in a given cycle, all of the 1677 * previous sample data fields are equal to zero or stale and they must 1678 * be populated with meaningful numbers for things to work, so assume 1679 * that sample.time will always be reset before setting the utilization 1680 * update hook and make the caller skip the sample then. 1681 */ 1682 if (cpu->last_sample_time) { 1683 intel_pstate_calc_avg_perf(cpu); 1684 return true; 1685 } 1686 return false; 1687 } 1688 1689 static inline int32_t get_avg_frequency(struct cpudata *cpu) 1690 { 1691 return mul_ext_fp(cpu->sample.core_avg_perf, 1692 cpu->pstate.max_pstate_physical * cpu->pstate.scaling); 1693 } 1694 1695 static inline int32_t get_avg_pstate(struct cpudata *cpu) 1696 { 1697 return mul_ext_fp(cpu->pstate.max_pstate_physical, 1698 cpu->sample.core_avg_perf); 1699 } 1700 1701 static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu) 1702 { 1703 struct sample *sample = &cpu->sample; 1704 int32_t busy_frac, boost; 1705 int target, avg_pstate; 1706 1707 busy_frac = div_fp(sample->mperf, sample->tsc); 1708 1709 boost = cpu->iowait_boost; 1710 cpu->iowait_boost >>= 1; 1711 1712 if (busy_frac < boost) 1713 busy_frac = boost; 1714 1715 sample->busy_scaled = busy_frac * 100; 1716 1717 target = global.no_turbo || global.turbo_disabled ? 
1718 cpu->pstate.max_pstate : cpu->pstate.turbo_pstate; 1719 target += target >> 2; 1720 target = mul_fp(target, busy_frac); 1721 if (target < cpu->pstate.min_pstate) 1722 target = cpu->pstate.min_pstate; 1723 1724 /* 1725 * If the average P-state during the previous cycle was higher than the 1726 * current target, add 50% of the difference to the target to reduce 1727 * possible performance oscillations and offset possible performance 1728 * loss related to moving the workload from one CPU to another within 1729 * a package/module. 1730 */ 1731 avg_pstate = get_avg_pstate(cpu); 1732 if (avg_pstate > target) 1733 target += (avg_pstate - target) >> 1; 1734 1735 return target; 1736 } 1737 1738 static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu) 1739 { 1740 int32_t perf_scaled, max_pstate, current_pstate, sample_ratio; 1741 u64 duration_ns; 1742 1743 /* 1744 * perf_scaled is the ratio of the average P-state during the last 1745 * sampling period to the P-state requested last time (in percent). 1746 * 1747 * That measures the system's response to the previous P-state 1748 * selection. 1749 */ 1750 max_pstate = cpu->pstate.max_pstate_physical; 1751 current_pstate = cpu->pstate.current_pstate; 1752 perf_scaled = mul_ext_fp(cpu->sample.core_avg_perf, 1753 div_fp(100 * max_pstate, current_pstate)); 1754 1755 /* 1756 * Since our utilization update callback will not run unless we are 1757 * in C0, check if the actual elapsed time is significantly greater (3x) 1758 * than our sample interval. If it is, then we were idle for a long 1759 * enough period of time to adjust our performance metric. 1760 */ 1761 duration_ns = cpu->sample.time - cpu->last_sample_time; 1762 if ((s64)duration_ns > pid_params.sample_rate_ns * 3) { 1763 sample_ratio = div_fp(pid_params.sample_rate_ns, duration_ns); 1764 perf_scaled = mul_fp(perf_scaled, sample_ratio); 1765 } else { 1766 sample_ratio = div_fp(100 * cpu->sample.mperf, cpu->sample.tsc); 1767 if (sample_ratio < int_tofp(1)) 1768 perf_scaled = 0; 1769 } 1770 1771 cpu->sample.busy_scaled = perf_scaled; 1772 return cpu->pstate.current_pstate - pid_calc(&cpu->pid, perf_scaled); 1773 } 1774 1775 static int intel_pstate_prepare_request(struct cpudata *cpu, int pstate) 1776 { 1777 int max_perf, min_perf; 1778 1779 intel_pstate_get_min_max(cpu, &min_perf, &max_perf); 1780 pstate = clamp_t(int, pstate, min_perf, max_perf); 1781 return pstate; 1782 } 1783 1784 static void intel_pstate_update_pstate(struct cpudata *cpu, int pstate) 1785 { 1786 if (pstate == cpu->pstate.current_pstate) 1787 return; 1788 1789 cpu->pstate.current_pstate = pstate; 1790 wrmsrl(MSR_IA32_PERF_CTL, pstate_funcs.get_val(cpu, pstate)); 1791 } 1792 1793 static void intel_pstate_adjust_busy_pstate(struct cpudata *cpu) 1794 { 1795 int from, target_pstate; 1796 struct sample *sample; 1797 1798 from = cpu->pstate.current_pstate; 1799 1800 target_pstate = cpu->policy == CPUFREQ_POLICY_PERFORMANCE ? 
1801 cpu->pstate.turbo_pstate : pstate_funcs.get_target_pstate(cpu); 1802 1803 update_turbo_state(); 1804 1805 target_pstate = intel_pstate_prepare_request(cpu, target_pstate); 1806 trace_cpu_frequency(target_pstate * cpu->pstate.scaling, cpu->cpu); 1807 intel_pstate_update_pstate(cpu, target_pstate); 1808 1809 sample = &cpu->sample; 1810 trace_pstate_sample(mul_ext_fp(100, sample->core_avg_perf), 1811 fp_toint(sample->busy_scaled), 1812 from, 1813 cpu->pstate.current_pstate, 1814 sample->mperf, 1815 sample->aperf, 1816 sample->tsc, 1817 get_avg_frequency(cpu), 1818 fp_toint(cpu->iowait_boost * 100)); 1819 } 1820 1821 static void intel_pstate_update_util_hwp(struct update_util_data *data, 1822 u64 time, unsigned int flags) 1823 { 1824 struct cpudata *cpu = container_of(data, struct cpudata, update_util); 1825 u64 delta_ns = time - cpu->sample.time; 1826 1827 if ((s64)delta_ns >= INTEL_PSTATE_HWP_SAMPLING_INTERVAL) 1828 intel_pstate_sample(cpu, time); 1829 } 1830 1831 static void intel_pstate_update_util_pid(struct update_util_data *data, 1832 u64 time, unsigned int flags) 1833 { 1834 struct cpudata *cpu = container_of(data, struct cpudata, update_util); 1835 u64 delta_ns = time - cpu->sample.time; 1836 1837 if ((s64)delta_ns < pid_params.sample_rate_ns) 1838 return; 1839 1840 if (intel_pstate_sample(cpu, time)) 1841 intel_pstate_adjust_busy_pstate(cpu); 1842 } 1843 1844 static void intel_pstate_update_util(struct update_util_data *data, u64 time, 1845 unsigned int flags) 1846 { 1847 struct cpudata *cpu = container_of(data, struct cpudata, update_util); 1848 u64 delta_ns; 1849 1850 if (flags & SCHED_CPUFREQ_IOWAIT) { 1851 cpu->iowait_boost = int_tofp(1); 1852 } else if (cpu->iowait_boost) { 1853 /* Clear iowait_boost if the CPU may have been idle. */ 1854 delta_ns = time - cpu->last_update; 1855 if (delta_ns > TICK_NSEC) 1856 cpu->iowait_boost = 0; 1857 } 1858 cpu->last_update = time; 1859 delta_ns = time - cpu->sample.time; 1860 if ((s64)delta_ns < INTEL_PSTATE_DEFAULT_SAMPLING_INTERVAL) 1861 return; 1862 1863 if (intel_pstate_sample(cpu, time)) 1864 intel_pstate_adjust_busy_pstate(cpu); 1865 } 1866 1867 /* Utilization update callback to register in the active mode. 
*/ 1868 static void (*update_util_cb)(struct update_util_data *data, u64 time, 1869 unsigned int flags) = intel_pstate_update_util; 1870 1871 #define ICPU(model, policy) \ 1872 { X86_VENDOR_INTEL, 6, model, X86_FEATURE_APERFMPERF,\ 1873 (unsigned long)&policy } 1874 1875 static const struct x86_cpu_id intel_pstate_cpu_ids[] = { 1876 ICPU(INTEL_FAM6_SANDYBRIDGE, core_params), 1877 ICPU(INTEL_FAM6_SANDYBRIDGE_X, core_params), 1878 ICPU(INTEL_FAM6_ATOM_SILVERMONT1, silvermont_params), 1879 ICPU(INTEL_FAM6_IVYBRIDGE, core_params), 1880 ICPU(INTEL_FAM6_HASWELL_CORE, core_params), 1881 ICPU(INTEL_FAM6_BROADWELL_CORE, core_params), 1882 ICPU(INTEL_FAM6_IVYBRIDGE_X, core_params), 1883 ICPU(INTEL_FAM6_HASWELL_X, core_params), 1884 ICPU(INTEL_FAM6_HASWELL_ULT, core_params), 1885 ICPU(INTEL_FAM6_HASWELL_GT3E, core_params), 1886 ICPU(INTEL_FAM6_BROADWELL_GT3E, core_params), 1887 ICPU(INTEL_FAM6_ATOM_AIRMONT, airmont_params), 1888 ICPU(INTEL_FAM6_SKYLAKE_MOBILE, core_params), 1889 ICPU(INTEL_FAM6_BROADWELL_X, core_params), 1890 ICPU(INTEL_FAM6_SKYLAKE_DESKTOP, core_params), 1891 ICPU(INTEL_FAM6_BROADWELL_XEON_D, core_params), 1892 ICPU(INTEL_FAM6_XEON_PHI_KNL, knl_params), 1893 ICPU(INTEL_FAM6_XEON_PHI_KNM, knl_params), 1894 ICPU(INTEL_FAM6_ATOM_GOLDMONT, bxt_params), 1895 {} 1896 }; 1897 MODULE_DEVICE_TABLE(x86cpu, intel_pstate_cpu_ids); 1898 1899 static const struct x86_cpu_id intel_pstate_cpu_oob_ids[] __initconst = { 1900 ICPU(INTEL_FAM6_BROADWELL_XEON_D, core_params), 1901 ICPU(INTEL_FAM6_BROADWELL_X, core_params), 1902 ICPU(INTEL_FAM6_SKYLAKE_X, core_params), 1903 {} 1904 }; 1905 1906 static const struct x86_cpu_id intel_pstate_cpu_ee_disable_ids[] = { 1907 ICPU(INTEL_FAM6_KABYLAKE_DESKTOP, core_params), 1908 {} 1909 }; 1910 1911 static int intel_pstate_init_cpu(unsigned int cpunum) 1912 { 1913 struct cpudata *cpu; 1914 1915 cpu = all_cpu_data[cpunum]; 1916 1917 if (!cpu) { 1918 cpu = kzalloc(sizeof(*cpu), GFP_KERNEL); 1919 if (!cpu) 1920 return -ENOMEM; 1921 1922 all_cpu_data[cpunum] = cpu; 1923 1924 cpu->epp_default = -EINVAL; 1925 cpu->epp_powersave = -EINVAL; 1926 cpu->epp_saved = -EINVAL; 1927 } 1928 1929 cpu = all_cpu_data[cpunum]; 1930 1931 cpu->cpu = cpunum; 1932 1933 if (hwp_active) { 1934 const struct x86_cpu_id *id; 1935 1936 id = x86_match_cpu(intel_pstate_cpu_ee_disable_ids); 1937 if (id) 1938 intel_pstate_disable_ee(cpunum); 1939 1940 intel_pstate_hwp_enable(cpu); 1941 } else if (pstate_funcs.get_target_pstate == get_target_pstate_use_performance) { 1942 intel_pstate_pid_reset(cpu); 1943 } 1944 1945 intel_pstate_get_cpu_pstates(cpu); 1946 1947 pr_debug("controlling: cpu %d\n", cpunum); 1948 1949 return 0; 1950 } 1951 1952 static unsigned int intel_pstate_get(unsigned int cpu_num) 1953 { 1954 struct cpudata *cpu = all_cpu_data[cpu_num]; 1955 1956 return cpu ? get_avg_frequency(cpu) : 0; 1957 } 1958 1959 static void intel_pstate_set_update_util_hook(unsigned int cpu_num) 1960 { 1961 struct cpudata *cpu = all_cpu_data[cpu_num]; 1962 1963 if (cpu->update_util_set) 1964 return; 1965 1966 /* Prevent intel_pstate_update_util() from using stale data. 
*/ 1967 cpu->sample.time = 0; 1968 cpufreq_add_update_util_hook(cpu_num, &cpu->update_util, update_util_cb); 1969 cpu->update_util_set = true; 1970 } 1971 1972 static void intel_pstate_clear_update_util_hook(unsigned int cpu) 1973 { 1974 struct cpudata *cpu_data = all_cpu_data[cpu]; 1975 1976 if (!cpu_data->update_util_set) 1977 return; 1978 1979 cpufreq_remove_update_util_hook(cpu); 1980 cpu_data->update_util_set = false; 1981 synchronize_sched(); 1982 } 1983 1984 static int intel_pstate_get_max_freq(struct cpudata *cpu) 1985 { 1986 return global.turbo_disabled || global.no_turbo ? 1987 cpu->pstate.max_freq : cpu->pstate.turbo_freq; 1988 } 1989 1990 static void intel_pstate_update_perf_limits(struct cpufreq_policy *policy, 1991 struct cpudata *cpu) 1992 { 1993 int max_freq = intel_pstate_get_max_freq(cpu); 1994 int32_t max_policy_perf, min_policy_perf; 1995 1996 max_policy_perf = div_ext_fp(policy->max, max_freq); 1997 max_policy_perf = clamp_t(int32_t, max_policy_perf, 0, int_ext_tofp(1)); 1998 if (policy->max == policy->min) { 1999 min_policy_perf = max_policy_perf; 2000 } else { 2001 min_policy_perf = div_ext_fp(policy->min, max_freq); 2002 min_policy_perf = clamp_t(int32_t, min_policy_perf, 2003 0, max_policy_perf); 2004 } 2005 2006 /* Normalize user input to [min_perf, max_perf] */ 2007 if (per_cpu_limits) { 2008 cpu->min_perf = min_policy_perf; 2009 cpu->max_perf = max_policy_perf; 2010 } else { 2011 int32_t global_min, global_max; 2012 2013 /* Global limits are in percent of the maximum turbo P-state. */ 2014 global_max = percent_ext_fp(global.max_perf_pct); 2015 global_min = percent_ext_fp(global.min_perf_pct); 2016 if (max_freq != cpu->pstate.turbo_freq) { 2017 int32_t turbo_factor; 2018 2019 turbo_factor = div_ext_fp(cpu->pstate.turbo_pstate, 2020 cpu->pstate.max_pstate); 2021 global_min = mul_ext_fp(global_min, turbo_factor); 2022 global_max = mul_ext_fp(global_max, turbo_factor); 2023 } 2024 global_min = clamp_t(int32_t, global_min, 0, global_max); 2025 2026 cpu->min_perf = max(min_policy_perf, global_min); 2027 cpu->min_perf = min(cpu->min_perf, max_policy_perf); 2028 cpu->max_perf = min(max_policy_perf, global_max); 2029 cpu->max_perf = max(min_policy_perf, cpu->max_perf); 2030 2031 /* Make sure min_perf <= max_perf */ 2032 cpu->min_perf = min(cpu->min_perf, cpu->max_perf); 2033 } 2034 2035 cpu->max_perf = round_up(cpu->max_perf, EXT_FRAC_BITS); 2036 cpu->min_perf = round_up(cpu->min_perf, EXT_FRAC_BITS); 2037 2038 pr_debug("cpu:%d max_perf_pct:%d min_perf_pct:%d\n", policy->cpu, 2039 fp_ext_toint(cpu->max_perf * 100), 2040 fp_ext_toint(cpu->min_perf * 100)); 2041 } 2042 2043 static int intel_pstate_set_policy(struct cpufreq_policy *policy) 2044 { 2045 struct cpudata *cpu; 2046 2047 if (!policy->cpuinfo.max_freq) 2048 return -ENODEV; 2049 2050 pr_debug("set_policy cpuinfo.max %u policy->max %u\n", 2051 policy->cpuinfo.max_freq, policy->max); 2052 2053 cpu = all_cpu_data[policy->cpu]; 2054 cpu->policy = policy->policy; 2055 2056 mutex_lock(&intel_pstate_limits_lock); 2057 2058 intel_pstate_update_perf_limits(policy, cpu); 2059 2060 if (cpu->policy == CPUFREQ_POLICY_PERFORMANCE) { 2061 /* 2062 * NOHZ_FULL CPUs need this as the governor callback may not 2063 * be invoked on them. 
2064 */ 2065 intel_pstate_clear_update_util_hook(policy->cpu); 2066 intel_pstate_max_within_limits(cpu); 2067 } 2068 2069 intel_pstate_set_update_util_hook(policy->cpu); 2070 2071 if (hwp_active) 2072 intel_pstate_hwp_set(policy); 2073 2074 mutex_unlock(&intel_pstate_limits_lock); 2075 2076 return 0; 2077 } 2078 2079 static void intel_pstate_adjust_policy_max(struct cpufreq_policy *policy, 2080 struct cpudata *cpu) 2081 { 2082 if (cpu->pstate.max_pstate_physical > cpu->pstate.max_pstate && 2083 policy->max < policy->cpuinfo.max_freq && 2084 policy->max > cpu->pstate.max_freq) { 2085 pr_debug("policy->max > max non turbo frequency\n"); 2086 policy->max = policy->cpuinfo.max_freq; 2087 } 2088 } 2089 2090 static int intel_pstate_verify_policy(struct cpufreq_policy *policy) 2091 { 2092 struct cpudata *cpu = all_cpu_data[policy->cpu]; 2093 2094 update_turbo_state(); 2095 cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq, 2096 intel_pstate_get_max_freq(cpu)); 2097 2098 if (policy->policy != CPUFREQ_POLICY_POWERSAVE && 2099 policy->policy != CPUFREQ_POLICY_PERFORMANCE) 2100 return -EINVAL; 2101 2102 intel_pstate_adjust_policy_max(policy, cpu); 2103 2104 return 0; 2105 } 2106 2107 static void intel_cpufreq_stop_cpu(struct cpufreq_policy *policy) 2108 { 2109 intel_pstate_set_min_pstate(all_cpu_data[policy->cpu]); 2110 } 2111 2112 static void intel_pstate_stop_cpu(struct cpufreq_policy *policy) 2113 { 2114 pr_debug("CPU %d exiting\n", policy->cpu); 2115 2116 intel_pstate_clear_update_util_hook(policy->cpu); 2117 if (hwp_active) 2118 intel_pstate_hwp_save_state(policy); 2119 else 2120 intel_cpufreq_stop_cpu(policy); 2121 } 2122 2123 static int intel_pstate_cpu_exit(struct cpufreq_policy *policy) 2124 { 2125 intel_pstate_exit_perf_limits(policy); 2126 2127 policy->fast_switch_possible = false; 2128 2129 return 0; 2130 } 2131 2132 static int __intel_pstate_cpu_init(struct cpufreq_policy *policy) 2133 { 2134 struct cpudata *cpu; 2135 int rc; 2136 2137 rc = intel_pstate_init_cpu(policy->cpu); 2138 if (rc) 2139 return rc; 2140 2141 cpu = all_cpu_data[policy->cpu]; 2142 2143 cpu->max_perf = int_ext_tofp(1); 2144 cpu->min_perf = 0; 2145 2146 policy->min = cpu->pstate.min_pstate * cpu->pstate.scaling; 2147 policy->max = cpu->pstate.turbo_pstate * cpu->pstate.scaling; 2148 2149 /* cpuinfo and default policy values */ 2150 policy->cpuinfo.min_freq = cpu->pstate.min_pstate * cpu->pstate.scaling; 2151 update_turbo_state(); 2152 policy->cpuinfo.max_freq = global.turbo_disabled ? 
2153 cpu->pstate.max_pstate : cpu->pstate.turbo_pstate; 2154 policy->cpuinfo.max_freq *= cpu->pstate.scaling; 2155 2156 intel_pstate_init_acpi_perf_limits(policy); 2157 cpumask_set_cpu(policy->cpu, policy->cpus); 2158 2159 policy->fast_switch_possible = true; 2160 2161 return 0; 2162 } 2163 2164 static int intel_pstate_cpu_init(struct cpufreq_policy *policy) 2165 { 2166 int ret = __intel_pstate_cpu_init(policy); 2167 2168 if (ret) 2169 return ret; 2170 2171 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; 2172 if (IS_ENABLED(CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE)) 2173 policy->policy = CPUFREQ_POLICY_PERFORMANCE; 2174 else 2175 policy->policy = CPUFREQ_POLICY_POWERSAVE; 2176 2177 return 0; 2178 } 2179 2180 static struct cpufreq_driver intel_pstate = { 2181 .flags = CPUFREQ_CONST_LOOPS, 2182 .verify = intel_pstate_verify_policy, 2183 .setpolicy = intel_pstate_set_policy, 2184 .suspend = intel_pstate_hwp_save_state, 2185 .resume = intel_pstate_resume, 2186 .get = intel_pstate_get, 2187 .init = intel_pstate_cpu_init, 2188 .exit = intel_pstate_cpu_exit, 2189 .stop_cpu = intel_pstate_stop_cpu, 2190 .name = "intel_pstate", 2191 }; 2192 2193 static int intel_cpufreq_verify_policy(struct cpufreq_policy *policy) 2194 { 2195 struct cpudata *cpu = all_cpu_data[policy->cpu]; 2196 2197 update_turbo_state(); 2198 cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq, 2199 intel_pstate_get_max_freq(cpu)); 2200 2201 intel_pstate_adjust_policy_max(policy, cpu); 2202 2203 intel_pstate_update_perf_limits(policy, cpu); 2204 2205 return 0; 2206 } 2207 2208 static int intel_cpufreq_target(struct cpufreq_policy *policy, 2209 unsigned int target_freq, 2210 unsigned int relation) 2211 { 2212 struct cpudata *cpu = all_cpu_data[policy->cpu]; 2213 struct cpufreq_freqs freqs; 2214 int target_pstate; 2215 2216 update_turbo_state(); 2217 2218 freqs.old = policy->cur; 2219 freqs.new = target_freq; 2220 2221 cpufreq_freq_transition_begin(policy, &freqs); 2222 switch (relation) { 2223 case CPUFREQ_RELATION_L: 2224 target_pstate = DIV_ROUND_UP(freqs.new, cpu->pstate.scaling); 2225 break; 2226 case CPUFREQ_RELATION_H: 2227 target_pstate = freqs.new / cpu->pstate.scaling; 2228 break; 2229 default: 2230 target_pstate = DIV_ROUND_CLOSEST(freqs.new, cpu->pstate.scaling); 2231 break; 2232 } 2233 target_pstate = intel_pstate_prepare_request(cpu, target_pstate); 2234 if (target_pstate != cpu->pstate.current_pstate) { 2235 cpu->pstate.current_pstate = target_pstate; 2236 wrmsrl_on_cpu(policy->cpu, MSR_IA32_PERF_CTL, 2237 pstate_funcs.get_val(cpu, target_pstate)); 2238 } 2239 freqs.new = target_pstate * cpu->pstate.scaling; 2240 cpufreq_freq_transition_end(policy, &freqs, false); 2241 2242 return 0; 2243 } 2244 2245 static unsigned int intel_cpufreq_fast_switch(struct cpufreq_policy *policy, 2246 unsigned int target_freq) 2247 { 2248 struct cpudata *cpu = all_cpu_data[policy->cpu]; 2249 int target_pstate; 2250 2251 update_turbo_state(); 2252 2253 target_pstate = DIV_ROUND_UP(target_freq, cpu->pstate.scaling); 2254 target_pstate = intel_pstate_prepare_request(cpu, target_pstate); 2255 intel_pstate_update_pstate(cpu, target_pstate); 2256 return target_pstate * cpu->pstate.scaling; 2257 } 2258 2259 static int intel_cpufreq_cpu_init(struct cpufreq_policy *policy) 2260 { 2261 int ret = __intel_pstate_cpu_init(policy); 2262 2263 if (ret) 2264 return ret; 2265 2266 policy->cpuinfo.transition_latency = INTEL_CPUFREQ_TRANSITION_LATENCY; 2267 /* This reflects the intel_pstate_get_cpu_pstates() setting. 
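 * That helper ends by selecting the minimum P-state, so starting
 * policy->cur at cpuinfo.min_freq matches the hardware state until the
 * first target() or fast_switch() request arrives.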
*/ 2268 policy->cur = policy->cpuinfo.min_freq; 2269 2270 return 0; 2271 } 2272 2273 static struct cpufreq_driver intel_cpufreq = { 2274 .flags = CPUFREQ_CONST_LOOPS, 2275 .verify = intel_cpufreq_verify_policy, 2276 .target = intel_cpufreq_target, 2277 .fast_switch = intel_cpufreq_fast_switch, 2278 .init = intel_cpufreq_cpu_init, 2279 .exit = intel_pstate_cpu_exit, 2280 .stop_cpu = intel_cpufreq_stop_cpu, 2281 .name = "intel_cpufreq", 2282 }; 2283 2284 static struct cpufreq_driver *default_driver = &intel_pstate; 2285 2286 static void intel_pstate_driver_cleanup(void) 2287 { 2288 unsigned int cpu; 2289 2290 get_online_cpus(); 2291 for_each_online_cpu(cpu) { 2292 if (all_cpu_data[cpu]) { 2293 if (intel_pstate_driver == &intel_pstate) 2294 intel_pstate_clear_update_util_hook(cpu); 2295 2296 kfree(all_cpu_data[cpu]); 2297 all_cpu_data[cpu] = NULL; 2298 } 2299 } 2300 put_online_cpus(); 2301 intel_pstate_driver = NULL; 2302 } 2303 2304 static int intel_pstate_register_driver(struct cpufreq_driver *driver) 2305 { 2306 int ret; 2307 2308 memset(&global, 0, sizeof(global)); 2309 global.max_perf_pct = 100; 2310 2311 intel_pstate_driver = driver; 2312 ret = cpufreq_register_driver(intel_pstate_driver); 2313 if (ret) { 2314 intel_pstate_driver_cleanup(); 2315 return ret; 2316 } 2317 2318 global.min_perf_pct = min_perf_pct_min(); 2319 2320 if (intel_pstate_driver == &intel_pstate && !hwp_active && 2321 pstate_funcs.get_target_pstate != get_target_pstate_use_cpu_load) 2322 intel_pstate_debug_expose_params(); 2323 2324 return 0; 2325 } 2326 2327 static int intel_pstate_unregister_driver(void) 2328 { 2329 if (hwp_active) 2330 return -EBUSY; 2331 2332 if (intel_pstate_driver == &intel_pstate && !hwp_active && 2333 pstate_funcs.get_target_pstate != get_target_pstate_use_cpu_load) 2334 intel_pstate_debug_hide_params(); 2335 2336 cpufreq_unregister_driver(intel_pstate_driver); 2337 intel_pstate_driver_cleanup(); 2338 2339 return 0; 2340 } 2341 2342 static ssize_t intel_pstate_show_status(char *buf) 2343 { 2344 if (!intel_pstate_driver) 2345 return sprintf(buf, "off\n"); 2346 2347 return sprintf(buf, "%s\n", intel_pstate_driver == &intel_pstate ? 2348 "active" : "passive"); 2349 } 2350 2351 static int intel_pstate_update_status(const char *buf, size_t size) 2352 { 2353 int ret; 2354 2355 if (size == 3 && !strncmp(buf, "off", size)) 2356 return intel_pstate_driver ? 
2357 intel_pstate_unregister_driver() : -EINVAL; 2358 2359 if (size == 6 && !strncmp(buf, "active", size)) { 2360 if (intel_pstate_driver) { 2361 if (intel_pstate_driver == &intel_pstate) 2362 return 0; 2363 2364 ret = intel_pstate_unregister_driver(); 2365 if (ret) 2366 return ret; 2367 } 2368 2369 return intel_pstate_register_driver(&intel_pstate); 2370 } 2371 2372 if (size == 7 && !strncmp(buf, "passive", size)) { 2373 if (intel_pstate_driver) { 2374 if (intel_pstate_driver == &intel_cpufreq) 2375 return 0; 2376 2377 ret = intel_pstate_unregister_driver(); 2378 if (ret) 2379 return ret; 2380 } 2381 2382 return intel_pstate_register_driver(&intel_cpufreq); 2383 } 2384 2385 return -EINVAL; 2386 } 2387 2388 static int no_load __initdata; 2389 static int no_hwp __initdata; 2390 static int hwp_only __initdata; 2391 static unsigned int force_load __initdata; 2392 2393 static int __init intel_pstate_msrs_not_valid(void) 2394 { 2395 if (!pstate_funcs.get_max() || 2396 !pstate_funcs.get_min() || 2397 !pstate_funcs.get_turbo()) 2398 return -ENODEV; 2399 2400 return 0; 2401 } 2402 2403 #ifdef CONFIG_ACPI 2404 static void intel_pstate_use_acpi_profile(void) 2405 { 2406 switch (acpi_gbl_FADT.preferred_profile) { 2407 case PM_MOBILE: 2408 case PM_TABLET: 2409 case PM_APPLIANCE_PC: 2410 case PM_DESKTOP: 2411 case PM_WORKSTATION: 2412 pstate_funcs.get_target_pstate = 2413 get_target_pstate_use_cpu_load; 2414 } 2415 } 2416 #else 2417 static void intel_pstate_use_acpi_profile(void) 2418 { 2419 } 2420 #endif 2421 2422 static void __init copy_cpu_funcs(struct pstate_funcs *funcs) 2423 { 2424 pstate_funcs.get_max = funcs->get_max; 2425 pstate_funcs.get_max_physical = funcs->get_max_physical; 2426 pstate_funcs.get_min = funcs->get_min; 2427 pstate_funcs.get_turbo = funcs->get_turbo; 2428 pstate_funcs.get_scaling = funcs->get_scaling; 2429 pstate_funcs.get_val = funcs->get_val; 2430 pstate_funcs.get_vid = funcs->get_vid; 2431 pstate_funcs.get_target_pstate = funcs->get_target_pstate; 2432 2433 intel_pstate_use_acpi_profile(); 2434 2435 if (pstate_funcs.get_target_pstate == get_target_pstate_use_performance) 2436 update_util_cb = intel_pstate_update_util_pid; 2437 } 2438 2439 #ifdef CONFIG_ACPI 2440 2441 static bool __init intel_pstate_no_acpi_pss(void) 2442 { 2443 int i; 2444 2445 for_each_possible_cpu(i) { 2446 acpi_status status; 2447 union acpi_object *pss; 2448 struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; 2449 struct acpi_processor *pr = per_cpu(processors, i); 2450 2451 if (!pr) 2452 continue; 2453 2454 status = acpi_evaluate_object(pr->handle, "_PSS", NULL, &buffer); 2455 if (ACPI_FAILURE(status)) 2456 continue; 2457 2458 pss = buffer.pointer; 2459 if (pss && pss->type == ACPI_TYPE_PACKAGE) { 2460 kfree(pss); 2461 return false; 2462 } 2463 2464 kfree(pss); 2465 } 2466 2467 return true; 2468 } 2469 2470 static bool __init intel_pstate_has_acpi_ppc(void) 2471 { 2472 int i; 2473 2474 for_each_possible_cpu(i) { 2475 struct acpi_processor *pr = per_cpu(processors, i); 2476 2477 if (!pr) 2478 continue; 2479 if (acpi_has_method(pr->handle, "_PPC")) 2480 return true; 2481 } 2482 return false; 2483 } 2484 2485 enum { 2486 PSS, 2487 PPC, 2488 }; 2489 2490 struct hw_vendor_info { 2491 u16 valid; 2492 char oem_id[ACPI_OEM_ID_SIZE]; 2493 char oem_table_id[ACPI_OEM_TABLE_ID_SIZE]; 2494 int oem_pwr_table; 2495 }; 2496 2497 /* Hardware vendor-specific info that has its own power management modes */ 2498 static struct hw_vendor_info vendor_info[] __initdata = { 2499 {1, "HP ", "ProLiant", PSS}, 2500 {1, 
"ORACLE", "X4-2 ", PPC}, 2501 {1, "ORACLE", "X4-2L ", PPC}, 2502 {1, "ORACLE", "X4-2B ", PPC}, 2503 {1, "ORACLE", "X3-2 ", PPC}, 2504 {1, "ORACLE", "X3-2L ", PPC}, 2505 {1, "ORACLE", "X3-2B ", PPC}, 2506 {1, "ORACLE", "X4470M2 ", PPC}, 2507 {1, "ORACLE", "X4270M3 ", PPC}, 2508 {1, "ORACLE", "X4270M2 ", PPC}, 2509 {1, "ORACLE", "X4170M2 ", PPC}, 2510 {1, "ORACLE", "X4170 M3", PPC}, 2511 {1, "ORACLE", "X4275 M3", PPC}, 2512 {1, "ORACLE", "X6-2 ", PPC}, 2513 {1, "ORACLE", "Sudbury ", PPC}, 2514 {0, "", ""}, 2515 }; 2516 2517 static bool __init intel_pstate_platform_pwr_mgmt_exists(void) 2518 { 2519 struct acpi_table_header hdr; 2520 struct hw_vendor_info *v_info; 2521 const struct x86_cpu_id *id; 2522 u64 misc_pwr; 2523 2524 id = x86_match_cpu(intel_pstate_cpu_oob_ids); 2525 if (id) { 2526 rdmsrl(MSR_MISC_PWR_MGMT, misc_pwr); 2527 if ( misc_pwr & (1 << 8)) 2528 return true; 2529 } 2530 2531 if (acpi_disabled || 2532 ACPI_FAILURE(acpi_get_table_header(ACPI_SIG_FADT, 0, &hdr))) 2533 return false; 2534 2535 for (v_info = vendor_info; v_info->valid; v_info++) { 2536 if (!strncmp(hdr.oem_id, v_info->oem_id, ACPI_OEM_ID_SIZE) && 2537 !strncmp(hdr.oem_table_id, v_info->oem_table_id, 2538 ACPI_OEM_TABLE_ID_SIZE)) 2539 switch (v_info->oem_pwr_table) { 2540 case PSS: 2541 return intel_pstate_no_acpi_pss(); 2542 case PPC: 2543 return intel_pstate_has_acpi_ppc() && 2544 (!force_load); 2545 } 2546 } 2547 2548 return false; 2549 } 2550 2551 static void intel_pstate_request_control_from_smm(void) 2552 { 2553 /* 2554 * It may be unsafe to request P-states control from SMM if _PPC support 2555 * has not been enabled. 2556 */ 2557 if (acpi_ppc) 2558 acpi_processor_pstate_control(); 2559 } 2560 #else /* CONFIG_ACPI not enabled */ 2561 static inline bool intel_pstate_platform_pwr_mgmt_exists(void) { return false; } 2562 static inline bool intel_pstate_has_acpi_ppc(void) { return false; } 2563 static inline void intel_pstate_request_control_from_smm(void) {} 2564 #endif /* CONFIG_ACPI */ 2565 2566 static const struct x86_cpu_id hwp_support_ids[] __initconst = { 2567 { X86_VENDOR_INTEL, 6, X86_MODEL_ANY, X86_FEATURE_HWP }, 2568 {} 2569 }; 2570 2571 static int __init intel_pstate_init(void) 2572 { 2573 int rc; 2574 2575 if (no_load) 2576 return -ENODEV; 2577 2578 if (x86_match_cpu(hwp_support_ids)) { 2579 copy_cpu_funcs(&core_params.funcs); 2580 if (no_hwp) { 2581 update_util_cb = intel_pstate_update_util; 2582 } else { 2583 hwp_active++; 2584 intel_pstate.attr = hwp_cpufreq_attrs; 2585 update_util_cb = intel_pstate_update_util_hwp; 2586 goto hwp_cpu_matched; 2587 } 2588 } else { 2589 const struct x86_cpu_id *id; 2590 struct cpu_defaults *cpu_def; 2591 2592 id = x86_match_cpu(intel_pstate_cpu_ids); 2593 if (!id) 2594 return -ENODEV; 2595 2596 cpu_def = (struct cpu_defaults *)id->driver_data; 2597 copy_cpu_funcs(&cpu_def->funcs); 2598 } 2599 2600 if (intel_pstate_msrs_not_valid()) 2601 return -ENODEV; 2602 2603 hwp_cpu_matched: 2604 /* 2605 * The Intel pstate driver will be ignored if the platform 2606 * firmware has its own power management modes. 
2607 */ 2608 if (intel_pstate_platform_pwr_mgmt_exists()) 2609 return -ENODEV; 2610 2611 if (!hwp_active && hwp_only) 2612 return -ENOTSUPP; 2613 2614 pr_info("Intel P-state driver initializing\n"); 2615 2616 all_cpu_data = vzalloc(sizeof(void *) * num_possible_cpus()); 2617 if (!all_cpu_data) 2618 return -ENOMEM; 2619 2620 intel_pstate_request_control_from_smm(); 2621 2622 intel_pstate_sysfs_expose_params(); 2623 2624 mutex_lock(&intel_pstate_driver_lock); 2625 rc = intel_pstate_register_driver(default_driver); 2626 mutex_unlock(&intel_pstate_driver_lock); 2627 if (rc) 2628 return rc; 2629 2630 if (hwp_active) 2631 pr_info("HWP enabled\n"); 2632 2633 return 0; 2634 } 2635 device_initcall(intel_pstate_init); 2636 2637 static int __init intel_pstate_setup(char *str) 2638 { 2639 if (!str) 2640 return -EINVAL; 2641 2642 if (!strcmp(str, "disable")) { 2643 no_load = 1; 2644 } else if (!strcmp(str, "passive")) { 2645 pr_info("Passive mode enabled\n"); 2646 default_driver = &intel_cpufreq; 2647 no_hwp = 1; 2648 } 2649 if (!strcmp(str, "no_hwp")) { 2650 pr_info("HWP disabled\n"); 2651 no_hwp = 1; 2652 } 2653 if (!strcmp(str, "force")) 2654 force_load = 1; 2655 if (!strcmp(str, "hwp_only")) 2656 hwp_only = 1; 2657 if (!strcmp(str, "per_cpu_perf_limits")) 2658 per_cpu_limits = true; 2659 2660 #ifdef CONFIG_ACPI 2661 if (!strcmp(str, "support_acpi_ppc")) 2662 acpi_ppc = true; 2663 #endif 2664 2665 return 0; 2666 } 2667 early_param("intel_pstate", intel_pstate_setup); 2668 2669 MODULE_AUTHOR("Dirk Brandewie <dirk.j.brandewie@intel.com>"); 2670 MODULE_DESCRIPTION("'intel_pstate' - P state driver Intel Core processors"); 2671 MODULE_LICENSE("GPL"); 2672