1 /* 2 * intel_pstate.c: Native P state management for Intel processors 3 * 4 * (C) Copyright 2012 Intel Corporation 5 * Author: Dirk Brandewie <dirk.j.brandewie@intel.com> 6 * 7 * This program is free software; you can redistribute it and/or 8 * modify it under the terms of the GNU General Public License 9 * as published by the Free Software Foundation; version 2 10 * of the License. 11 */ 12 13 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 14 15 #include <linux/kernel.h> 16 #include <linux/kernel_stat.h> 17 #include <linux/module.h> 18 #include <linux/ktime.h> 19 #include <linux/hrtimer.h> 20 #include <linux/tick.h> 21 #include <linux/slab.h> 22 #include <linux/sched.h> 23 #include <linux/list.h> 24 #include <linux/cpu.h> 25 #include <linux/cpufreq.h> 26 #include <linux/sysfs.h> 27 #include <linux/types.h> 28 #include <linux/fs.h> 29 #include <linux/debugfs.h> 30 #include <linux/acpi.h> 31 #include <linux/vmalloc.h> 32 #include <trace/events/power.h> 33 34 #include <asm/div64.h> 35 #include <asm/msr.h> 36 #include <asm/cpu_device_id.h> 37 #include <asm/cpufeature.h> 38 39 #define ATOM_RATIOS 0x66a 40 #define ATOM_VIDS 0x66b 41 #define ATOM_TURBO_RATIOS 0x66c 42 #define ATOM_TURBO_VIDS 0x66d 43 44 #ifdef CONFIG_ACPI 45 #include <acpi/processor.h> 46 #endif 47 48 #define FRAC_BITS 8 49 #define int_tofp(X) ((int64_t)(X) << FRAC_BITS) 50 #define fp_toint(X) ((X) >> FRAC_BITS) 51 52 static inline int32_t mul_fp(int32_t x, int32_t y) 53 { 54 return ((int64_t)x * (int64_t)y) >> FRAC_BITS; 55 } 56 57 static inline int32_t div_fp(s64 x, s64 y) 58 { 59 return div64_s64((int64_t)x << FRAC_BITS, y); 60 } 61 62 static inline int ceiling_fp(int32_t x) 63 { 64 int mask, ret; 65 66 ret = fp_toint(x); 67 mask = (1 << FRAC_BITS) - 1; 68 if (x & mask) 69 ret += 1; 70 return ret; 71 } 72 73 /** 74 * struct sample - Store performance sample 75 * @core_pct_busy: Ratio of APERF/MPERF in percent, which is actual 76 * performance during last sample period 77 * @busy_scaled: Scaled busy value which is used to calculate next 78 * P state. This can be different than core_pct_busy 79 * to account for cpu idle period 80 * @aperf: Difference of actual performance frequency clock count 81 * read from APERF MSR between last and current sample 82 * @mperf: Difference of maximum performance frequency clock count 83 * read from MPERF MSR between last and current sample 84 * @tsc: Difference of time stamp counter between last and 85 * current sample 86 * @freq: Effective frequency calculated from APERF/MPERF 87 * @time: Current time from scheduler 88 * 89 * This structure is used in the cpudata structure to store performance sample 90 * data for choosing next P State. 91 */ 92 struct sample { 93 int32_t core_pct_busy; 94 int32_t busy_scaled; 95 u64 aperf; 96 u64 mperf; 97 u64 tsc; 98 int freq; 99 u64 time; 100 }; 101 102 /** 103 * struct pstate_data - Store P state data 104 * @current_pstate: Current requested P state 105 * @min_pstate: Min P state possible for this platform 106 * @max_pstate: Max P state possible for this platform 107 * @max_pstate_physical:This is physical Max P state for a processor 108 * This can be higher than the max_pstate which can 109 * be limited by platform thermal design power limits 110 * @scaling: Scaling factor to convert frequency to cpufreq 111 * frequency units 112 * @turbo_pstate: Max Turbo P state possible for this platform 113 * 114 * Stores the per cpu model P state limits and current P state. 115 */ 116 struct pstate_data { 117 int current_pstate; 118 int min_pstate; 119 int max_pstate; 120 int max_pstate_physical; 121 int scaling; 122 int turbo_pstate; 123 }; 124 125 /** 126 * struct vid_data - Stores voltage information data 127 * @min: VID data for this platform corresponding to 128 * the lowest P state 129 * @max: VID data corresponding to the highest P State. 130 * @turbo: VID data for turbo P state 131 * @ratio: Ratio of (vid max - vid min) / 132 * (max P state - Min P State) 133 * 134 * Stores the voltage data for DVFS (Dynamic Voltage and Frequency Scaling) 135 * This data is used in Atom platforms, where in addition to target P state, 136 * the voltage data needs to be specified to select next P State. 137 */ 138 struct vid_data { 139 int min; 140 int max; 141 int turbo; 142 int32_t ratio; 143 }; 144 145 /** 146 * struct _pid - Stores PID data 147 * @setpoint: Target set point for busyness or performance 148 * @integral: Storage for accumulated error values 149 * @p_gain: PID proportional gain 150 * @i_gain: PID integral gain 151 * @d_gain: PID derivative gain 152 * @deadband: PID deadband 153 * @last_err: Last error storage for integral part of PID calculation 154 * 155 * Stores PID coefficients and last error for PID controller. 156 */ 157 struct _pid { 158 int setpoint; 159 int32_t integral; 160 int32_t p_gain; 161 int32_t i_gain; 162 int32_t d_gain; 163 int deadband; 164 int32_t last_err; 165 }; 166 167 /** 168 * struct cpudata - Per CPU instance data storage 169 * @cpu: CPU number for this instance data 170 * @update_util: CPUFreq utility callback information 171 * @pstate: Stores P state limits for this CPU 172 * @vid: Stores VID limits for this CPU 173 * @pid: Stores PID parameters for this CPU 174 * @last_sample_time: Last Sample time 175 * @prev_aperf: Last APERF value read from APERF MSR 176 * @prev_mperf: Last MPERF value read from MPERF MSR 177 * @prev_tsc: Last timestamp counter (TSC) value 178 * @prev_cummulative_iowait: IO Wait time difference from last and 179 * current sample 180 * @sample: Storage for storing last Sample data 181 * @acpi_perf_data: Stores ACPI perf information read from _PSS 182 * @valid_pss_table: Set to true for valid ACPI _PSS entries found 183 * 184 * This structure stores per CPU instance data for all CPUs. 185 */ 186 struct cpudata { 187 int cpu; 188 189 struct update_util_data update_util; 190 191 struct pstate_data pstate; 192 struct vid_data vid; 193 struct _pid pid; 194 195 u64 last_sample_time; 196 u64 prev_aperf; 197 u64 prev_mperf; 198 u64 prev_tsc; 199 u64 prev_cummulative_iowait; 200 struct sample sample; 201 #ifdef CONFIG_ACPI 202 struct acpi_processor_performance acpi_perf_data; 203 bool valid_pss_table; 204 #endif 205 }; 206 207 static struct cpudata **all_cpu_data; 208 209 /** 210 * struct pid_adjust_policy - Stores static PID configuration data 211 * @sample_rate_ms: PID calculation sample rate in ms 212 * @sample_rate_ns: Sample rate calculation in ns 213 * @deadband: PID deadband 214 * @setpoint: PID Setpoint 215 * @p_gain_pct: PID proportional gain 216 * @i_gain_pct: PID integral gain 217 * @d_gain_pct: PID derivative gain 218 * 219 * Stores per CPU model static PID configuration data. 220 */ 221 struct pstate_adjust_policy { 222 int sample_rate_ms; 223 s64 sample_rate_ns; 224 int deadband; 225 int setpoint; 226 int p_gain_pct; 227 int d_gain_pct; 228 int i_gain_pct; 229 }; 230 231 /** 232 * struct pstate_funcs - Per CPU model specific callbacks 233 * @get_max: Callback to get maximum non turbo effective P state 234 * @get_max_physical: Callback to get maximum non turbo physical P state 235 * @get_min: Callback to get minimum P state 236 * @get_turbo: Callback to get turbo P state 237 * @get_scaling: Callback to get frequency scaling factor 238 * @get_val: Callback to convert P state to actual MSR write value 239 * @get_vid: Callback to get VID data for Atom platforms 240 * @get_target_pstate: Callback to a function to calculate next P state to use 241 * 242 * Core and Atom CPU models have different way to get P State limits. This 243 * structure is used to store those callbacks. 244 */ 245 struct pstate_funcs { 246 int (*get_max)(void); 247 int (*get_max_physical)(void); 248 int (*get_min)(void); 249 int (*get_turbo)(void); 250 int (*get_scaling)(void); 251 u64 (*get_val)(struct cpudata*, int pstate); 252 void (*get_vid)(struct cpudata *); 253 int32_t (*get_target_pstate)(struct cpudata *); 254 }; 255 256 /** 257 * struct cpu_defaults- Per CPU model default config data 258 * @pid_policy: PID config data 259 * @funcs: Callback function data 260 */ 261 struct cpu_defaults { 262 struct pstate_adjust_policy pid_policy; 263 struct pstate_funcs funcs; 264 }; 265 266 static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu); 267 static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu); 268 269 static struct pstate_adjust_policy pid_params; 270 static struct pstate_funcs pstate_funcs; 271 static int hwp_active; 272 273 #ifdef CONFIG_ACPI 274 static bool acpi_ppc; 275 #endif 276 277 /** 278 * struct perf_limits - Store user and policy limits 279 * @no_turbo: User requested turbo state from intel_pstate sysfs 280 * @turbo_disabled: Platform turbo status either from msr 281 * MSR_IA32_MISC_ENABLE or when maximum available pstate 282 * matches the maximum turbo pstate 283 * @max_perf_pct: Effective maximum performance limit in percentage, this 284 * is minimum of either limits enforced by cpufreq policy 285 * or limits from user set limits via intel_pstate sysfs 286 * @min_perf_pct: Effective minimum performance limit in percentage, this 287 * is maximum of either limits enforced by cpufreq policy 288 * or limits from user set limits via intel_pstate sysfs 289 * @max_perf: This is a scaled value between 0 to 255 for max_perf_pct 290 * This value is used to limit max pstate 291 * @min_perf: This is a scaled value between 0 to 255 for min_perf_pct 292 * This value is used to limit min pstate 293 * @max_policy_pct: The maximum performance in percentage enforced by 294 * cpufreq setpolicy interface 295 * @max_sysfs_pct: The maximum performance in percentage enforced by 296 * intel pstate sysfs interface 297 * @min_policy_pct: The minimum performance in percentage enforced by 298 * cpufreq setpolicy interface 299 * @min_sysfs_pct: The minimum performance in percentage enforced by 300 * intel pstate sysfs interface 301 * 302 * Storage for user and policy defined limits. 303 */ 304 struct perf_limits { 305 int no_turbo; 306 int turbo_disabled; 307 int max_perf_pct; 308 int min_perf_pct; 309 int32_t max_perf; 310 int32_t min_perf; 311 int max_policy_pct; 312 int max_sysfs_pct; 313 int min_policy_pct; 314 int min_sysfs_pct; 315 }; 316 317 static struct perf_limits performance_limits = { 318 .no_turbo = 0, 319 .turbo_disabled = 0, 320 .max_perf_pct = 100, 321 .max_perf = int_tofp(1), 322 .min_perf_pct = 100, 323 .min_perf = int_tofp(1), 324 .max_policy_pct = 100, 325 .max_sysfs_pct = 100, 326 .min_policy_pct = 0, 327 .min_sysfs_pct = 0, 328 }; 329 330 static struct perf_limits powersave_limits = { 331 .no_turbo = 0, 332 .turbo_disabled = 0, 333 .max_perf_pct = 100, 334 .max_perf = int_tofp(1), 335 .min_perf_pct = 0, 336 .min_perf = 0, 337 .max_policy_pct = 100, 338 .max_sysfs_pct = 100, 339 .min_policy_pct = 0, 340 .min_sysfs_pct = 0, 341 }; 342 343 #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE 344 static struct perf_limits *limits = &performance_limits; 345 #else 346 static struct perf_limits *limits = &powersave_limits; 347 #endif 348 349 #ifdef CONFIG_ACPI 350 /* 351 * The max target pstate ratio is a 8 bit value in both PLATFORM_INFO MSR and 352 * in TURBO_RATIO_LIMIT MSR, which pstate driver stores in max_pstate and 353 * max_turbo_pstate fields. The PERF_CTL MSR contains 16 bit value for P state 354 * ratio, out of it only high 8 bits are used. For example 0x1700 is setting 355 * target ratio 0x17. The _PSS control value stores in a format which can be 356 * directly written to PERF_CTL MSR. But in intel_pstate driver this shift 357 * occurs during write to PERF_CTL (E.g. for cores core_set_pstate()). 358 * This function converts the _PSS control value to intel pstate driver format 359 * for comparison and assignment. 360 */ 361 static int convert_to_native_pstate_format(struct cpudata *cpu, int index) 362 { 363 return cpu->acpi_perf_data.states[index].control >> 8; 364 } 365 366 static void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy) 367 { 368 struct cpudata *cpu; 369 int turbo_pss_ctl; 370 int ret; 371 int i; 372 373 if (!acpi_ppc) 374 return; 375 376 cpu = all_cpu_data[policy->cpu]; 377 378 ret = acpi_processor_register_performance(&cpu->acpi_perf_data, 379 policy->cpu); 380 if (ret) 381 return; 382 383 /* 384 * Check if the control value in _PSS is for PERF_CTL MSR, which should 385 * guarantee that the states returned by it map to the states in our 386 * list directly. 387 */ 388 if (cpu->acpi_perf_data.control_register.space_id != 389 ACPI_ADR_SPACE_FIXED_HARDWARE) 390 goto err; 391 392 /* 393 * If there is only one entry _PSS, simply ignore _PSS and continue as 394 * usual without taking _PSS into account 395 */ 396 if (cpu->acpi_perf_data.state_count < 2) 397 goto err; 398 399 pr_debug("CPU%u - ACPI _PSS perf data\n", policy->cpu); 400 for (i = 0; i < cpu->acpi_perf_data.state_count; i++) { 401 pr_debug(" %cP%d: %u MHz, %u mW, 0x%x\n", 402 (i == cpu->acpi_perf_data.state ? '*' : ' '), i, 403 (u32) cpu->acpi_perf_data.states[i].core_frequency, 404 (u32) cpu->acpi_perf_data.states[i].power, 405 (u32) cpu->acpi_perf_data.states[i].control); 406 } 407 408 /* 409 * The _PSS table doesn't contain whole turbo frequency range. 410 * This just contains +1 MHZ above the max non turbo frequency, 411 * with control value corresponding to max turbo ratio. But 412 * when cpufreq set policy is called, it will call with this 413 * max frequency, which will cause a reduced performance as 414 * this driver uses real max turbo frequency as the max 415 * frequency. So correct this frequency in _PSS table to 416 * correct max turbo frequency based on the turbo ratio. 417 * Also need to convert to MHz as _PSS freq is in MHz. 418 */ 419 turbo_pss_ctl = convert_to_native_pstate_format(cpu, 0); 420 if (turbo_pss_ctl > cpu->pstate.max_pstate) 421 cpu->acpi_perf_data.states[0].core_frequency = 422 policy->cpuinfo.max_freq / 1000; 423 cpu->valid_pss_table = true; 424 pr_info("_PPC limits will be enforced\n"); 425 426 return; 427 428 err: 429 cpu->valid_pss_table = false; 430 acpi_processor_unregister_performance(policy->cpu); 431 } 432 433 static void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy) 434 { 435 struct cpudata *cpu; 436 437 cpu = all_cpu_data[policy->cpu]; 438 if (!cpu->valid_pss_table) 439 return; 440 441 acpi_processor_unregister_performance(policy->cpu); 442 } 443 444 #else 445 static void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy) 446 { 447 } 448 449 static void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy) 450 { 451 } 452 #endif 453 454 static inline void pid_reset(struct _pid *pid, int setpoint, int busy, 455 int deadband, int integral) { 456 pid->setpoint = int_tofp(setpoint); 457 pid->deadband = int_tofp(deadband); 458 pid->integral = int_tofp(integral); 459 pid->last_err = int_tofp(setpoint) - int_tofp(busy); 460 } 461 462 static inline void pid_p_gain_set(struct _pid *pid, int percent) 463 { 464 pid->p_gain = div_fp(percent, 100); 465 } 466 467 static inline void pid_i_gain_set(struct _pid *pid, int percent) 468 { 469 pid->i_gain = div_fp(percent, 100); 470 } 471 472 static inline void pid_d_gain_set(struct _pid *pid, int percent) 473 { 474 pid->d_gain = div_fp(percent, 100); 475 } 476 477 static signed int pid_calc(struct _pid *pid, int32_t busy) 478 { 479 signed int result; 480 int32_t pterm, dterm, fp_error; 481 int32_t integral_limit; 482 483 fp_error = pid->setpoint - busy; 484 485 if (abs(fp_error) <= pid->deadband) 486 return 0; 487 488 pterm = mul_fp(pid->p_gain, fp_error); 489 490 pid->integral += fp_error; 491 492 /* 493 * We limit the integral here so that it will never 494 * get higher than 30. This prevents it from becoming 495 * too large an input over long periods of time and allows 496 * it to get factored out sooner. 497 * 498 * The value of 30 was chosen through experimentation. 499 */ 500 integral_limit = int_tofp(30); 501 if (pid->integral > integral_limit) 502 pid->integral = integral_limit; 503 if (pid->integral < -integral_limit) 504 pid->integral = -integral_limit; 505 506 dterm = mul_fp(pid->d_gain, fp_error - pid->last_err); 507 pid->last_err = fp_error; 508 509 result = pterm + mul_fp(pid->integral, pid->i_gain) + dterm; 510 result = result + (1 << (FRAC_BITS-1)); 511 return (signed int)fp_toint(result); 512 } 513 514 static inline void intel_pstate_busy_pid_reset(struct cpudata *cpu) 515 { 516 pid_p_gain_set(&cpu->pid, pid_params.p_gain_pct); 517 pid_d_gain_set(&cpu->pid, pid_params.d_gain_pct); 518 pid_i_gain_set(&cpu->pid, pid_params.i_gain_pct); 519 520 pid_reset(&cpu->pid, pid_params.setpoint, 100, pid_params.deadband, 0); 521 } 522 523 static inline void intel_pstate_reset_all_pid(void) 524 { 525 unsigned int cpu; 526 527 for_each_online_cpu(cpu) { 528 if (all_cpu_data[cpu]) 529 intel_pstate_busy_pid_reset(all_cpu_data[cpu]); 530 } 531 } 532 533 static inline void update_turbo_state(void) 534 { 535 u64 misc_en; 536 struct cpudata *cpu; 537 538 cpu = all_cpu_data[0]; 539 rdmsrl(MSR_IA32_MISC_ENABLE, misc_en); 540 limits->turbo_disabled = 541 (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE || 542 cpu->pstate.max_pstate == cpu->pstate.turbo_pstate); 543 } 544 545 static void intel_pstate_hwp_set(const struct cpumask *cpumask) 546 { 547 int min, hw_min, max, hw_max, cpu, range, adj_range; 548 u64 value, cap; 549 550 rdmsrl(MSR_HWP_CAPABILITIES, cap); 551 hw_min = HWP_LOWEST_PERF(cap); 552 hw_max = HWP_HIGHEST_PERF(cap); 553 range = hw_max - hw_min; 554 555 for_each_cpu(cpu, cpumask) { 556 rdmsrl_on_cpu(cpu, MSR_HWP_REQUEST, &value); 557 adj_range = limits->min_perf_pct * range / 100; 558 min = hw_min + adj_range; 559 value &= ~HWP_MIN_PERF(~0L); 560 value |= HWP_MIN_PERF(min); 561 562 adj_range = limits->max_perf_pct * range / 100; 563 max = hw_min + adj_range; 564 if (limits->no_turbo) { 565 hw_max = HWP_GUARANTEED_PERF(cap); 566 if (hw_max < max) 567 max = hw_max; 568 } 569 570 value &= ~HWP_MAX_PERF(~0L); 571 value |= HWP_MAX_PERF(max); 572 wrmsrl_on_cpu(cpu, MSR_HWP_REQUEST, value); 573 } 574 } 575 576 static void intel_pstate_hwp_set_online_cpus(void) 577 { 578 get_online_cpus(); 579 intel_pstate_hwp_set(cpu_online_mask); 580 put_online_cpus(); 581 } 582 583 /************************** debugfs begin ************************/ 584 static int pid_param_set(void *data, u64 val) 585 { 586 *(u32 *)data = val; 587 intel_pstate_reset_all_pid(); 588 return 0; 589 } 590 591 static int pid_param_get(void *data, u64 *val) 592 { 593 *val = *(u32 *)data; 594 return 0; 595 } 596 DEFINE_SIMPLE_ATTRIBUTE(fops_pid_param, pid_param_get, pid_param_set, "%llu\n"); 597 598 struct pid_param { 599 char *name; 600 void *value; 601 }; 602 603 static struct pid_param pid_files[] = { 604 {"sample_rate_ms", &pid_params.sample_rate_ms}, 605 {"d_gain_pct", &pid_params.d_gain_pct}, 606 {"i_gain_pct", &pid_params.i_gain_pct}, 607 {"deadband", &pid_params.deadband}, 608 {"setpoint", &pid_params.setpoint}, 609 {"p_gain_pct", &pid_params.p_gain_pct}, 610 {NULL, NULL} 611 }; 612 613 static void __init intel_pstate_debug_expose_params(void) 614 { 615 struct dentry *debugfs_parent; 616 int i = 0; 617 618 if (hwp_active) 619 return; 620 debugfs_parent = debugfs_create_dir("pstate_snb", NULL); 621 if (IS_ERR_OR_NULL(debugfs_parent)) 622 return; 623 while (pid_files[i].name) { 624 debugfs_create_file(pid_files[i].name, 0660, 625 debugfs_parent, pid_files[i].value, 626 &fops_pid_param); 627 i++; 628 } 629 } 630 631 /************************** debugfs end ************************/ 632 633 /************************** sysfs begin ************************/ 634 #define show_one(file_name, object) \ 635 static ssize_t show_##file_name \ 636 (struct kobject *kobj, struct attribute *attr, char *buf) \ 637 { \ 638 return sprintf(buf, "%u\n", limits->object); \ 639 } 640 641 static ssize_t show_turbo_pct(struct kobject *kobj, 642 struct attribute *attr, char *buf) 643 { 644 struct cpudata *cpu; 645 int total, no_turbo, turbo_pct; 646 uint32_t turbo_fp; 647 648 cpu = all_cpu_data[0]; 649 650 total = cpu->pstate.turbo_pstate - cpu->pstate.min_pstate + 1; 651 no_turbo = cpu->pstate.max_pstate - cpu->pstate.min_pstate + 1; 652 turbo_fp = div_fp(no_turbo, total); 653 turbo_pct = 100 - fp_toint(mul_fp(turbo_fp, int_tofp(100))); 654 return sprintf(buf, "%u\n", turbo_pct); 655 } 656 657 static ssize_t show_num_pstates(struct kobject *kobj, 658 struct attribute *attr, char *buf) 659 { 660 struct cpudata *cpu; 661 int total; 662 663 cpu = all_cpu_data[0]; 664 total = cpu->pstate.turbo_pstate - cpu->pstate.min_pstate + 1; 665 return sprintf(buf, "%u\n", total); 666 } 667 668 static ssize_t show_no_turbo(struct kobject *kobj, 669 struct attribute *attr, char *buf) 670 { 671 ssize_t ret; 672 673 update_turbo_state(); 674 if (limits->turbo_disabled) 675 ret = sprintf(buf, "%u\n", limits->turbo_disabled); 676 else 677 ret = sprintf(buf, "%u\n", limits->no_turbo); 678 679 return ret; 680 } 681 682 static ssize_t store_no_turbo(struct kobject *a, struct attribute *b, 683 const char *buf, size_t count) 684 { 685 unsigned int input; 686 int ret; 687 688 ret = sscanf(buf, "%u", &input); 689 if (ret != 1) 690 return -EINVAL; 691 692 update_turbo_state(); 693 if (limits->turbo_disabled) { 694 pr_warn("Turbo disabled by BIOS or unavailable on processor\n"); 695 return -EPERM; 696 } 697 698 limits->no_turbo = clamp_t(int, input, 0, 1); 699 700 if (hwp_active) 701 intel_pstate_hwp_set_online_cpus(); 702 703 return count; 704 } 705 706 static ssize_t store_max_perf_pct(struct kobject *a, struct attribute *b, 707 const char *buf, size_t count) 708 { 709 unsigned int input; 710 int ret; 711 712 ret = sscanf(buf, "%u", &input); 713 if (ret != 1) 714 return -EINVAL; 715 716 limits->max_sysfs_pct = clamp_t(int, input, 0 , 100); 717 limits->max_perf_pct = min(limits->max_policy_pct, 718 limits->max_sysfs_pct); 719 limits->max_perf_pct = max(limits->min_policy_pct, 720 limits->max_perf_pct); 721 limits->max_perf_pct = max(limits->min_perf_pct, 722 limits->max_perf_pct); 723 limits->max_perf = div_fp(limits->max_perf_pct, 100); 724 725 if (hwp_active) 726 intel_pstate_hwp_set_online_cpus(); 727 return count; 728 } 729 730 static ssize_t store_min_perf_pct(struct kobject *a, struct attribute *b, 731 const char *buf, size_t count) 732 { 733 unsigned int input; 734 int ret; 735 736 ret = sscanf(buf, "%u", &input); 737 if (ret != 1) 738 return -EINVAL; 739 740 limits->min_sysfs_pct = clamp_t(int, input, 0 , 100); 741 limits->min_perf_pct = max(limits->min_policy_pct, 742 limits->min_sysfs_pct); 743 limits->min_perf_pct = min(limits->max_policy_pct, 744 limits->min_perf_pct); 745 limits->min_perf_pct = min(limits->max_perf_pct, 746 limits->min_perf_pct); 747 limits->min_perf = div_fp(limits->min_perf_pct, 100); 748 749 if (hwp_active) 750 intel_pstate_hwp_set_online_cpus(); 751 return count; 752 } 753 754 show_one(max_perf_pct, max_perf_pct); 755 show_one(min_perf_pct, min_perf_pct); 756 757 define_one_global_rw(no_turbo); 758 define_one_global_rw(max_perf_pct); 759 define_one_global_rw(min_perf_pct); 760 define_one_global_ro(turbo_pct); 761 define_one_global_ro(num_pstates); 762 763 static struct attribute *intel_pstate_attributes[] = { 764 &no_turbo.attr, 765 &max_perf_pct.attr, 766 &min_perf_pct.attr, 767 &turbo_pct.attr, 768 &num_pstates.attr, 769 NULL 770 }; 771 772 static struct attribute_group intel_pstate_attr_group = { 773 .attrs = intel_pstate_attributes, 774 }; 775 776 static void __init intel_pstate_sysfs_expose_params(void) 777 { 778 struct kobject *intel_pstate_kobject; 779 int rc; 780 781 intel_pstate_kobject = kobject_create_and_add("intel_pstate", 782 &cpu_subsys.dev_root->kobj); 783 BUG_ON(!intel_pstate_kobject); 784 rc = sysfs_create_group(intel_pstate_kobject, &intel_pstate_attr_group); 785 BUG_ON(rc); 786 } 787 /************************** sysfs end ************************/ 788 789 static void intel_pstate_hwp_enable(struct cpudata *cpudata) 790 { 791 /* First disable HWP notification interrupt as we don't process them */ 792 wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x00); 793 794 wrmsrl_on_cpu(cpudata->cpu, MSR_PM_ENABLE, 0x1); 795 } 796 797 static int atom_get_min_pstate(void) 798 { 799 u64 value; 800 801 rdmsrl(ATOM_RATIOS, value); 802 return (value >> 8) & 0x7F; 803 } 804 805 static int atom_get_max_pstate(void) 806 { 807 u64 value; 808 809 rdmsrl(ATOM_RATIOS, value); 810 return (value >> 16) & 0x7F; 811 } 812 813 static int atom_get_turbo_pstate(void) 814 { 815 u64 value; 816 817 rdmsrl(ATOM_TURBO_RATIOS, value); 818 return value & 0x7F; 819 } 820 821 static u64 atom_get_val(struct cpudata *cpudata, int pstate) 822 { 823 u64 val; 824 int32_t vid_fp; 825 u32 vid; 826 827 val = (u64)pstate << 8; 828 if (limits->no_turbo && !limits->turbo_disabled) 829 val |= (u64)1 << 32; 830 831 vid_fp = cpudata->vid.min + mul_fp( 832 int_tofp(pstate - cpudata->pstate.min_pstate), 833 cpudata->vid.ratio); 834 835 vid_fp = clamp_t(int32_t, vid_fp, cpudata->vid.min, cpudata->vid.max); 836 vid = ceiling_fp(vid_fp); 837 838 if (pstate > cpudata->pstate.max_pstate) 839 vid = cpudata->vid.turbo; 840 841 return val | vid; 842 } 843 844 static int silvermont_get_scaling(void) 845 { 846 u64 value; 847 int i; 848 /* Defined in Table 35-6 from SDM (Sept 2015) */ 849 static int silvermont_freq_table[] = { 850 83300, 100000, 133300, 116700, 80000}; 851 852 rdmsrl(MSR_FSB_FREQ, value); 853 i = value & 0x7; 854 WARN_ON(i > 4); 855 856 return silvermont_freq_table[i]; 857 } 858 859 static int airmont_get_scaling(void) 860 { 861 u64 value; 862 int i; 863 /* Defined in Table 35-10 from SDM (Sept 2015) */ 864 static int airmont_freq_table[] = { 865 83300, 100000, 133300, 116700, 80000, 866 93300, 90000, 88900, 87500}; 867 868 rdmsrl(MSR_FSB_FREQ, value); 869 i = value & 0xF; 870 WARN_ON(i > 8); 871 872 return airmont_freq_table[i]; 873 } 874 875 static void atom_get_vid(struct cpudata *cpudata) 876 { 877 u64 value; 878 879 rdmsrl(ATOM_VIDS, value); 880 cpudata->vid.min = int_tofp((value >> 8) & 0x7f); 881 cpudata->vid.max = int_tofp((value >> 16) & 0x7f); 882 cpudata->vid.ratio = div_fp( 883 cpudata->vid.max - cpudata->vid.min, 884 int_tofp(cpudata->pstate.max_pstate - 885 cpudata->pstate.min_pstate)); 886 887 rdmsrl(ATOM_TURBO_VIDS, value); 888 cpudata->vid.turbo = value & 0x7f; 889 } 890 891 static int core_get_min_pstate(void) 892 { 893 u64 value; 894 895 rdmsrl(MSR_PLATFORM_INFO, value); 896 return (value >> 40) & 0xFF; 897 } 898 899 static int core_get_max_pstate_physical(void) 900 { 901 u64 value; 902 903 rdmsrl(MSR_PLATFORM_INFO, value); 904 return (value >> 8) & 0xFF; 905 } 906 907 static int core_get_max_pstate(void) 908 { 909 u64 tar; 910 u64 plat_info; 911 int max_pstate; 912 int err; 913 914 rdmsrl(MSR_PLATFORM_INFO, plat_info); 915 max_pstate = (plat_info >> 8) & 0xFF; 916 917 err = rdmsrl_safe(MSR_TURBO_ACTIVATION_RATIO, &tar); 918 if (!err) { 919 /* Do some sanity checking for safety */ 920 if (plat_info & 0x600000000) { 921 u64 tdp_ctrl; 922 u64 tdp_ratio; 923 int tdp_msr; 924 925 err = rdmsrl_safe(MSR_CONFIG_TDP_CONTROL, &tdp_ctrl); 926 if (err) 927 goto skip_tar; 928 929 tdp_msr = MSR_CONFIG_TDP_NOMINAL + tdp_ctrl; 930 err = rdmsrl_safe(tdp_msr, &tdp_ratio); 931 if (err) 932 goto skip_tar; 933 934 if (tdp_ratio - 1 == tar) { 935 max_pstate = tar; 936 pr_debug("max_pstate=TAC %x\n", max_pstate); 937 } else { 938 goto skip_tar; 939 } 940 } 941 } 942 943 skip_tar: 944 return max_pstate; 945 } 946 947 static int core_get_turbo_pstate(void) 948 { 949 u64 value; 950 int nont, ret; 951 952 rdmsrl(MSR_NHM_TURBO_RATIO_LIMIT, value); 953 nont = core_get_max_pstate(); 954 ret = (value) & 255; 955 if (ret <= nont) 956 ret = nont; 957 return ret; 958 } 959 960 static inline int core_get_scaling(void) 961 { 962 return 100000; 963 } 964 965 static u64 core_get_val(struct cpudata *cpudata, int pstate) 966 { 967 u64 val; 968 969 val = (u64)pstate << 8; 970 if (limits->no_turbo && !limits->turbo_disabled) 971 val |= (u64)1 << 32; 972 973 return val; 974 } 975 976 static int knl_get_turbo_pstate(void) 977 { 978 u64 value; 979 int nont, ret; 980 981 rdmsrl(MSR_NHM_TURBO_RATIO_LIMIT, value); 982 nont = core_get_max_pstate(); 983 ret = (((value) >> 8) & 0xFF); 984 if (ret <= nont) 985 ret = nont; 986 return ret; 987 } 988 989 static struct cpu_defaults core_params = { 990 .pid_policy = { 991 .sample_rate_ms = 10, 992 .deadband = 0, 993 .setpoint = 97, 994 .p_gain_pct = 20, 995 .d_gain_pct = 0, 996 .i_gain_pct = 0, 997 }, 998 .funcs = { 999 .get_max = core_get_max_pstate, 1000 .get_max_physical = core_get_max_pstate_physical, 1001 .get_min = core_get_min_pstate, 1002 .get_turbo = core_get_turbo_pstate, 1003 .get_scaling = core_get_scaling, 1004 .get_val = core_get_val, 1005 .get_target_pstate = get_target_pstate_use_performance, 1006 }, 1007 }; 1008 1009 static struct cpu_defaults silvermont_params = { 1010 .pid_policy = { 1011 .sample_rate_ms = 10, 1012 .deadband = 0, 1013 .setpoint = 60, 1014 .p_gain_pct = 14, 1015 .d_gain_pct = 0, 1016 .i_gain_pct = 4, 1017 }, 1018 .funcs = { 1019 .get_max = atom_get_max_pstate, 1020 .get_max_physical = atom_get_max_pstate, 1021 .get_min = atom_get_min_pstate, 1022 .get_turbo = atom_get_turbo_pstate, 1023 .get_val = atom_get_val, 1024 .get_scaling = silvermont_get_scaling, 1025 .get_vid = atom_get_vid, 1026 .get_target_pstate = get_target_pstate_use_cpu_load, 1027 }, 1028 }; 1029 1030 static struct cpu_defaults airmont_params = { 1031 .pid_policy = { 1032 .sample_rate_ms = 10, 1033 .deadband = 0, 1034 .setpoint = 60, 1035 .p_gain_pct = 14, 1036 .d_gain_pct = 0, 1037 .i_gain_pct = 4, 1038 }, 1039 .funcs = { 1040 .get_max = atom_get_max_pstate, 1041 .get_max_physical = atom_get_max_pstate, 1042 .get_min = atom_get_min_pstate, 1043 .get_turbo = atom_get_turbo_pstate, 1044 .get_val = atom_get_val, 1045 .get_scaling = airmont_get_scaling, 1046 .get_vid = atom_get_vid, 1047 .get_target_pstate = get_target_pstate_use_cpu_load, 1048 }, 1049 }; 1050 1051 static struct cpu_defaults knl_params = { 1052 .pid_policy = { 1053 .sample_rate_ms = 10, 1054 .deadband = 0, 1055 .setpoint = 97, 1056 .p_gain_pct = 20, 1057 .d_gain_pct = 0, 1058 .i_gain_pct = 0, 1059 }, 1060 .funcs = { 1061 .get_max = core_get_max_pstate, 1062 .get_max_physical = core_get_max_pstate_physical, 1063 .get_min = core_get_min_pstate, 1064 .get_turbo = knl_get_turbo_pstate, 1065 .get_scaling = core_get_scaling, 1066 .get_val = core_get_val, 1067 .get_target_pstate = get_target_pstate_use_performance, 1068 }, 1069 }; 1070 1071 static void intel_pstate_get_min_max(struct cpudata *cpu, int *min, int *max) 1072 { 1073 int max_perf = cpu->pstate.turbo_pstate; 1074 int max_perf_adj; 1075 int min_perf; 1076 1077 if (limits->no_turbo || limits->turbo_disabled) 1078 max_perf = cpu->pstate.max_pstate; 1079 1080 /* 1081 * performance can be limited by user through sysfs, by cpufreq 1082 * policy, or by cpu specific default values determined through 1083 * experimentation. 1084 */ 1085 max_perf_adj = fp_toint(max_perf * limits->max_perf); 1086 *max = clamp_t(int, max_perf_adj, 1087 cpu->pstate.min_pstate, cpu->pstate.turbo_pstate); 1088 1089 min_perf = fp_toint(max_perf * limits->min_perf); 1090 *min = clamp_t(int, min_perf, cpu->pstate.min_pstate, max_perf); 1091 } 1092 1093 static inline void intel_pstate_record_pstate(struct cpudata *cpu, int pstate) 1094 { 1095 trace_cpu_frequency(pstate * cpu->pstate.scaling, cpu->cpu); 1096 cpu->pstate.current_pstate = pstate; 1097 } 1098 1099 static void intel_pstate_set_min_pstate(struct cpudata *cpu) 1100 { 1101 int pstate = cpu->pstate.min_pstate; 1102 1103 intel_pstate_record_pstate(cpu, pstate); 1104 /* 1105 * Generally, there is no guarantee that this code will always run on 1106 * the CPU being updated, so force the register update to run on the 1107 * right CPU. 1108 */ 1109 wrmsrl_on_cpu(cpu->cpu, MSR_IA32_PERF_CTL, 1110 pstate_funcs.get_val(cpu, pstate)); 1111 } 1112 1113 static void intel_pstate_get_cpu_pstates(struct cpudata *cpu) 1114 { 1115 cpu->pstate.min_pstate = pstate_funcs.get_min(); 1116 cpu->pstate.max_pstate = pstate_funcs.get_max(); 1117 cpu->pstate.max_pstate_physical = pstate_funcs.get_max_physical(); 1118 cpu->pstate.turbo_pstate = pstate_funcs.get_turbo(); 1119 cpu->pstate.scaling = pstate_funcs.get_scaling(); 1120 1121 if (pstate_funcs.get_vid) 1122 pstate_funcs.get_vid(cpu); 1123 1124 intel_pstate_set_min_pstate(cpu); 1125 } 1126 1127 static inline void intel_pstate_calc_busy(struct cpudata *cpu) 1128 { 1129 struct sample *sample = &cpu->sample; 1130 int64_t core_pct; 1131 1132 core_pct = sample->aperf * int_tofp(100); 1133 core_pct = div64_u64(core_pct, sample->mperf); 1134 1135 sample->core_pct_busy = (int32_t)core_pct; 1136 } 1137 1138 static inline bool intel_pstate_sample(struct cpudata *cpu, u64 time) 1139 { 1140 u64 aperf, mperf; 1141 unsigned long flags; 1142 u64 tsc; 1143 1144 local_irq_save(flags); 1145 rdmsrl(MSR_IA32_APERF, aperf); 1146 rdmsrl(MSR_IA32_MPERF, mperf); 1147 tsc = rdtsc(); 1148 if (cpu->prev_mperf == mperf || cpu->prev_tsc == tsc) { 1149 local_irq_restore(flags); 1150 return false; 1151 } 1152 local_irq_restore(flags); 1153 1154 cpu->last_sample_time = cpu->sample.time; 1155 cpu->sample.time = time; 1156 cpu->sample.aperf = aperf; 1157 cpu->sample.mperf = mperf; 1158 cpu->sample.tsc = tsc; 1159 cpu->sample.aperf -= cpu->prev_aperf; 1160 cpu->sample.mperf -= cpu->prev_mperf; 1161 cpu->sample.tsc -= cpu->prev_tsc; 1162 1163 cpu->prev_aperf = aperf; 1164 cpu->prev_mperf = mperf; 1165 cpu->prev_tsc = tsc; 1166 /* 1167 * First time this function is invoked in a given cycle, all of the 1168 * previous sample data fields are equal to zero or stale and they must 1169 * be populated with meaningful numbers for things to work, so assume 1170 * that sample.time will always be reset before setting the utilization 1171 * update hook and make the caller skip the sample then. 1172 */ 1173 return !!cpu->last_sample_time; 1174 } 1175 1176 static inline int32_t get_avg_frequency(struct cpudata *cpu) 1177 { 1178 return div64_u64(cpu->pstate.max_pstate_physical * cpu->sample.aperf * 1179 cpu->pstate.scaling, cpu->sample.mperf); 1180 } 1181 1182 static inline int32_t get_avg_pstate(struct cpudata *cpu) 1183 { 1184 return div64_u64(cpu->pstate.max_pstate_physical * cpu->sample.aperf, 1185 cpu->sample.mperf); 1186 } 1187 1188 static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu) 1189 { 1190 struct sample *sample = &cpu->sample; 1191 u64 cummulative_iowait, delta_iowait_us; 1192 u64 delta_iowait_mperf; 1193 u64 mperf, now; 1194 int32_t cpu_load; 1195 1196 cummulative_iowait = get_cpu_iowait_time_us(cpu->cpu, &now); 1197 1198 /* 1199 * Convert iowait time into number of IO cycles spent at max_freq. 1200 * IO is considered as busy only for the cpu_load algorithm. For 1201 * performance this is not needed since we always try to reach the 1202 * maximum P-State, so we are already boosting the IOs. 1203 */ 1204 delta_iowait_us = cummulative_iowait - cpu->prev_cummulative_iowait; 1205 delta_iowait_mperf = div64_u64(delta_iowait_us * cpu->pstate.scaling * 1206 cpu->pstate.max_pstate, MSEC_PER_SEC); 1207 1208 mperf = cpu->sample.mperf + delta_iowait_mperf; 1209 cpu->prev_cummulative_iowait = cummulative_iowait; 1210 1211 /* 1212 * The load can be estimated as the ratio of the mperf counter 1213 * running at a constant frequency during active periods 1214 * (C0) and the time stamp counter running at the same frequency 1215 * also during C-states. 1216 */ 1217 cpu_load = div64_u64(int_tofp(100) * mperf, sample->tsc); 1218 cpu->sample.busy_scaled = cpu_load; 1219 1220 return get_avg_pstate(cpu) - pid_calc(&cpu->pid, cpu_load); 1221 } 1222 1223 static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu) 1224 { 1225 int32_t core_busy, max_pstate, current_pstate, sample_ratio; 1226 u64 duration_ns; 1227 1228 intel_pstate_calc_busy(cpu); 1229 1230 /* 1231 * core_busy is the ratio of actual performance to max 1232 * max_pstate is the max non turbo pstate available 1233 * current_pstate was the pstate that was requested during 1234 * the last sample period. 1235 * 1236 * We normalize core_busy, which was our actual percent 1237 * performance to what we requested during the last sample 1238 * period. The result will be a percentage of busy at a 1239 * specified pstate. 1240 */ 1241 core_busy = cpu->sample.core_pct_busy; 1242 max_pstate = cpu->pstate.max_pstate_physical; 1243 current_pstate = cpu->pstate.current_pstate; 1244 core_busy = mul_fp(core_busy, div_fp(max_pstate, current_pstate)); 1245 1246 /* 1247 * Since our utilization update callback will not run unless we are 1248 * in C0, check if the actual elapsed time is significantly greater (3x) 1249 * than our sample interval. If it is, then we were idle for a long 1250 * enough period of time to adjust our busyness. 1251 */ 1252 duration_ns = cpu->sample.time - cpu->last_sample_time; 1253 if ((s64)duration_ns > pid_params.sample_rate_ns * 3) { 1254 sample_ratio = div_fp(pid_params.sample_rate_ns, duration_ns); 1255 core_busy = mul_fp(core_busy, sample_ratio); 1256 } else { 1257 sample_ratio = div_fp(100 * cpu->sample.mperf, cpu->sample.tsc); 1258 if (sample_ratio < int_tofp(1)) 1259 core_busy = 0; 1260 } 1261 1262 cpu->sample.busy_scaled = core_busy; 1263 return cpu->pstate.current_pstate - pid_calc(&cpu->pid, core_busy); 1264 } 1265 1266 static inline void intel_pstate_update_pstate(struct cpudata *cpu, int pstate) 1267 { 1268 int max_perf, min_perf; 1269 1270 update_turbo_state(); 1271 1272 intel_pstate_get_min_max(cpu, &min_perf, &max_perf); 1273 pstate = clamp_t(int, pstate, min_perf, max_perf); 1274 if (pstate == cpu->pstate.current_pstate) 1275 return; 1276 1277 intel_pstate_record_pstate(cpu, pstate); 1278 wrmsrl(MSR_IA32_PERF_CTL, pstate_funcs.get_val(cpu, pstate)); 1279 } 1280 1281 static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu) 1282 { 1283 int from, target_pstate; 1284 struct sample *sample; 1285 1286 from = cpu->pstate.current_pstate; 1287 1288 target_pstate = pstate_funcs.get_target_pstate(cpu); 1289 1290 intel_pstate_update_pstate(cpu, target_pstate); 1291 1292 sample = &cpu->sample; 1293 trace_pstate_sample(fp_toint(sample->core_pct_busy), 1294 fp_toint(sample->busy_scaled), 1295 from, 1296 cpu->pstate.current_pstate, 1297 sample->mperf, 1298 sample->aperf, 1299 sample->tsc, 1300 get_avg_frequency(cpu)); 1301 } 1302 1303 static void intel_pstate_update_util(struct update_util_data *data, u64 time, 1304 unsigned long util, unsigned long max) 1305 { 1306 struct cpudata *cpu = container_of(data, struct cpudata, update_util); 1307 u64 delta_ns = time - cpu->sample.time; 1308 1309 if ((s64)delta_ns >= pid_params.sample_rate_ns) { 1310 bool sample_taken = intel_pstate_sample(cpu, time); 1311 1312 if (sample_taken && !hwp_active) 1313 intel_pstate_adjust_busy_pstate(cpu); 1314 } 1315 } 1316 1317 #define ICPU(model, policy) \ 1318 { X86_VENDOR_INTEL, 6, model, X86_FEATURE_APERFMPERF,\ 1319 (unsigned long)&policy } 1320 1321 static const struct x86_cpu_id intel_pstate_cpu_ids[] = { 1322 ICPU(0x2a, core_params), 1323 ICPU(0x2d, core_params), 1324 ICPU(0x37, silvermont_params), 1325 ICPU(0x3a, core_params), 1326 ICPU(0x3c, core_params), 1327 ICPU(0x3d, core_params), 1328 ICPU(0x3e, core_params), 1329 ICPU(0x3f, core_params), 1330 ICPU(0x45, core_params), 1331 ICPU(0x46, core_params), 1332 ICPU(0x47, core_params), 1333 ICPU(0x4c, airmont_params), 1334 ICPU(0x4e, core_params), 1335 ICPU(0x4f, core_params), 1336 ICPU(0x5e, core_params), 1337 ICPU(0x56, core_params), 1338 ICPU(0x57, knl_params), 1339 {} 1340 }; 1341 MODULE_DEVICE_TABLE(x86cpu, intel_pstate_cpu_ids); 1342 1343 static const struct x86_cpu_id intel_pstate_cpu_oob_ids[] = { 1344 ICPU(0x56, core_params), 1345 {} 1346 }; 1347 1348 static int intel_pstate_init_cpu(unsigned int cpunum) 1349 { 1350 struct cpudata *cpu; 1351 1352 if (!all_cpu_data[cpunum]) 1353 all_cpu_data[cpunum] = kzalloc(sizeof(struct cpudata), 1354 GFP_KERNEL); 1355 if (!all_cpu_data[cpunum]) 1356 return -ENOMEM; 1357 1358 cpu = all_cpu_data[cpunum]; 1359 1360 cpu->cpu = cpunum; 1361 1362 if (hwp_active) { 1363 intel_pstate_hwp_enable(cpu); 1364 pid_params.sample_rate_ms = 50; 1365 pid_params.sample_rate_ns = 50 * NSEC_PER_MSEC; 1366 } 1367 1368 intel_pstate_get_cpu_pstates(cpu); 1369 1370 intel_pstate_busy_pid_reset(cpu); 1371 1372 pr_debug("controlling: cpu %d\n", cpunum); 1373 1374 return 0; 1375 } 1376 1377 static unsigned int intel_pstate_get(unsigned int cpu_num) 1378 { 1379 struct sample *sample; 1380 struct cpudata *cpu; 1381 1382 cpu = all_cpu_data[cpu_num]; 1383 if (!cpu) 1384 return 0; 1385 sample = &cpu->sample; 1386 return get_avg_frequency(cpu); 1387 } 1388 1389 static void intel_pstate_set_update_util_hook(unsigned int cpu_num) 1390 { 1391 struct cpudata *cpu = all_cpu_data[cpu_num]; 1392 1393 /* Prevent intel_pstate_update_util() from using stale data. */ 1394 cpu->sample.time = 0; 1395 cpufreq_add_update_util_hook(cpu_num, &cpu->update_util, 1396 intel_pstate_update_util); 1397 } 1398 1399 static void intel_pstate_clear_update_util_hook(unsigned int cpu) 1400 { 1401 cpufreq_remove_update_util_hook(cpu); 1402 synchronize_sched(); 1403 } 1404 1405 static void intel_pstate_set_performance_limits(struct perf_limits *limits) 1406 { 1407 limits->no_turbo = 0; 1408 limits->turbo_disabled = 0; 1409 limits->max_perf_pct = 100; 1410 limits->max_perf = int_tofp(1); 1411 limits->min_perf_pct = 100; 1412 limits->min_perf = int_tofp(1); 1413 limits->max_policy_pct = 100; 1414 limits->max_sysfs_pct = 100; 1415 limits->min_policy_pct = 0; 1416 limits->min_sysfs_pct = 0; 1417 } 1418 1419 static int intel_pstate_set_policy(struct cpufreq_policy *policy) 1420 { 1421 if (!policy->cpuinfo.max_freq) 1422 return -ENODEV; 1423 1424 intel_pstate_clear_update_util_hook(policy->cpu); 1425 1426 if (policy->policy == CPUFREQ_POLICY_PERFORMANCE) { 1427 limits = &performance_limits; 1428 if (policy->max >= policy->cpuinfo.max_freq) { 1429 pr_debug("set performance\n"); 1430 intel_pstate_set_performance_limits(limits); 1431 goto out; 1432 } 1433 } else { 1434 pr_debug("set powersave\n"); 1435 limits = &powersave_limits; 1436 } 1437 1438 limits->min_policy_pct = (policy->min * 100) / policy->cpuinfo.max_freq; 1439 limits->min_policy_pct = clamp_t(int, limits->min_policy_pct, 0 , 100); 1440 limits->max_policy_pct = DIV_ROUND_UP(policy->max * 100, 1441 policy->cpuinfo.max_freq); 1442 limits->max_policy_pct = clamp_t(int, limits->max_policy_pct, 0 , 100); 1443 1444 /* Normalize user input to [min_policy_pct, max_policy_pct] */ 1445 limits->min_perf_pct = max(limits->min_policy_pct, 1446 limits->min_sysfs_pct); 1447 limits->min_perf_pct = min(limits->max_policy_pct, 1448 limits->min_perf_pct); 1449 limits->max_perf_pct = min(limits->max_policy_pct, 1450 limits->max_sysfs_pct); 1451 limits->max_perf_pct = max(limits->min_policy_pct, 1452 limits->max_perf_pct); 1453 limits->max_perf = round_up(limits->max_perf, FRAC_BITS); 1454 1455 /* Make sure min_perf_pct <= max_perf_pct */ 1456 limits->min_perf_pct = min(limits->max_perf_pct, limits->min_perf_pct); 1457 1458 limits->min_perf = div_fp(limits->min_perf_pct, 100); 1459 limits->max_perf = div_fp(limits->max_perf_pct, 100); 1460 1461 out: 1462 intel_pstate_set_update_util_hook(policy->cpu); 1463 1464 if (hwp_active) 1465 intel_pstate_hwp_set(policy->cpus); 1466 1467 return 0; 1468 } 1469 1470 static int intel_pstate_verify_policy(struct cpufreq_policy *policy) 1471 { 1472 cpufreq_verify_within_cpu_limits(policy); 1473 1474 if (policy->policy != CPUFREQ_POLICY_POWERSAVE && 1475 policy->policy != CPUFREQ_POLICY_PERFORMANCE) 1476 return -EINVAL; 1477 1478 return 0; 1479 } 1480 1481 static void intel_pstate_stop_cpu(struct cpufreq_policy *policy) 1482 { 1483 int cpu_num = policy->cpu; 1484 struct cpudata *cpu = all_cpu_data[cpu_num]; 1485 1486 pr_debug("CPU %d exiting\n", cpu_num); 1487 1488 intel_pstate_clear_update_util_hook(cpu_num); 1489 1490 if (hwp_active) 1491 return; 1492 1493 intel_pstate_set_min_pstate(cpu); 1494 } 1495 1496 static int intel_pstate_cpu_init(struct cpufreq_policy *policy) 1497 { 1498 struct cpudata *cpu; 1499 int rc; 1500 1501 rc = intel_pstate_init_cpu(policy->cpu); 1502 if (rc) 1503 return rc; 1504 1505 cpu = all_cpu_data[policy->cpu]; 1506 1507 if (limits->min_perf_pct == 100 && limits->max_perf_pct == 100) 1508 policy->policy = CPUFREQ_POLICY_PERFORMANCE; 1509 else 1510 policy->policy = CPUFREQ_POLICY_POWERSAVE; 1511 1512 policy->min = cpu->pstate.min_pstate * cpu->pstate.scaling; 1513 policy->max = cpu->pstate.turbo_pstate * cpu->pstate.scaling; 1514 1515 /* cpuinfo and default policy values */ 1516 policy->cpuinfo.min_freq = cpu->pstate.min_pstate * cpu->pstate.scaling; 1517 policy->cpuinfo.max_freq = 1518 cpu->pstate.turbo_pstate * cpu->pstate.scaling; 1519 intel_pstate_init_acpi_perf_limits(policy); 1520 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; 1521 cpumask_set_cpu(policy->cpu, policy->cpus); 1522 1523 return 0; 1524 } 1525 1526 static int intel_pstate_cpu_exit(struct cpufreq_policy *policy) 1527 { 1528 intel_pstate_exit_perf_limits(policy); 1529 1530 return 0; 1531 } 1532 1533 static struct cpufreq_driver intel_pstate_driver = { 1534 .flags = CPUFREQ_CONST_LOOPS, 1535 .verify = intel_pstate_verify_policy, 1536 .setpolicy = intel_pstate_set_policy, 1537 .get = intel_pstate_get, 1538 .init = intel_pstate_cpu_init, 1539 .exit = intel_pstate_cpu_exit, 1540 .stop_cpu = intel_pstate_stop_cpu, 1541 .name = "intel_pstate", 1542 }; 1543 1544 static int __initdata no_load; 1545 static int __initdata no_hwp; 1546 static int __initdata hwp_only; 1547 static unsigned int force_load; 1548 1549 static int intel_pstate_msrs_not_valid(void) 1550 { 1551 if (!pstate_funcs.get_max() || 1552 !pstate_funcs.get_min() || 1553 !pstate_funcs.get_turbo()) 1554 return -ENODEV; 1555 1556 return 0; 1557 } 1558 1559 static void copy_pid_params(struct pstate_adjust_policy *policy) 1560 { 1561 pid_params.sample_rate_ms = policy->sample_rate_ms; 1562 pid_params.sample_rate_ns = pid_params.sample_rate_ms * NSEC_PER_MSEC; 1563 pid_params.p_gain_pct = policy->p_gain_pct; 1564 pid_params.i_gain_pct = policy->i_gain_pct; 1565 pid_params.d_gain_pct = policy->d_gain_pct; 1566 pid_params.deadband = policy->deadband; 1567 pid_params.setpoint = policy->setpoint; 1568 } 1569 1570 static void copy_cpu_funcs(struct pstate_funcs *funcs) 1571 { 1572 pstate_funcs.get_max = funcs->get_max; 1573 pstate_funcs.get_max_physical = funcs->get_max_physical; 1574 pstate_funcs.get_min = funcs->get_min; 1575 pstate_funcs.get_turbo = funcs->get_turbo; 1576 pstate_funcs.get_scaling = funcs->get_scaling; 1577 pstate_funcs.get_val = funcs->get_val; 1578 pstate_funcs.get_vid = funcs->get_vid; 1579 pstate_funcs.get_target_pstate = funcs->get_target_pstate; 1580 1581 } 1582 1583 #ifdef CONFIG_ACPI 1584 1585 static bool intel_pstate_no_acpi_pss(void) 1586 { 1587 int i; 1588 1589 for_each_possible_cpu(i) { 1590 acpi_status status; 1591 union acpi_object *pss; 1592 struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; 1593 struct acpi_processor *pr = per_cpu(processors, i); 1594 1595 if (!pr) 1596 continue; 1597 1598 status = acpi_evaluate_object(pr->handle, "_PSS", NULL, &buffer); 1599 if (ACPI_FAILURE(status)) 1600 continue; 1601 1602 pss = buffer.pointer; 1603 if (pss && pss->type == ACPI_TYPE_PACKAGE) { 1604 kfree(pss); 1605 return false; 1606 } 1607 1608 kfree(pss); 1609 } 1610 1611 return true; 1612 } 1613 1614 static bool intel_pstate_has_acpi_ppc(void) 1615 { 1616 int i; 1617 1618 for_each_possible_cpu(i) { 1619 struct acpi_processor *pr = per_cpu(processors, i); 1620 1621 if (!pr) 1622 continue; 1623 if (acpi_has_method(pr->handle, "_PPC")) 1624 return true; 1625 } 1626 return false; 1627 } 1628 1629 enum { 1630 PSS, 1631 PPC, 1632 }; 1633 1634 struct hw_vendor_info { 1635 u16 valid; 1636 char oem_id[ACPI_OEM_ID_SIZE]; 1637 char oem_table_id[ACPI_OEM_TABLE_ID_SIZE]; 1638 int oem_pwr_table; 1639 }; 1640 1641 /* Hardware vendor-specific info that has its own power management modes */ 1642 static struct hw_vendor_info vendor_info[] = { 1643 {1, "HP ", "ProLiant", PSS}, 1644 {1, "ORACLE", "X4-2 ", PPC}, 1645 {1, "ORACLE", "X4-2L ", PPC}, 1646 {1, "ORACLE", "X4-2B ", PPC}, 1647 {1, "ORACLE", "X3-2 ", PPC}, 1648 {1, "ORACLE", "X3-2L ", PPC}, 1649 {1, "ORACLE", "X3-2B ", PPC}, 1650 {1, "ORACLE", "X4470M2 ", PPC}, 1651 {1, "ORACLE", "X4270M3 ", PPC}, 1652 {1, "ORACLE", "X4270M2 ", PPC}, 1653 {1, "ORACLE", "X4170M2 ", PPC}, 1654 {1, "ORACLE", "X4170 M3", PPC}, 1655 {1, "ORACLE", "X4275 M3", PPC}, 1656 {1, "ORACLE", "X6-2 ", PPC}, 1657 {1, "ORACLE", "Sudbury ", PPC}, 1658 {0, "", ""}, 1659 }; 1660 1661 static bool intel_pstate_platform_pwr_mgmt_exists(void) 1662 { 1663 struct acpi_table_header hdr; 1664 struct hw_vendor_info *v_info; 1665 const struct x86_cpu_id *id; 1666 u64 misc_pwr; 1667 1668 id = x86_match_cpu(intel_pstate_cpu_oob_ids); 1669 if (id) { 1670 rdmsrl(MSR_MISC_PWR_MGMT, misc_pwr); 1671 if ( misc_pwr & (1 << 8)) 1672 return true; 1673 } 1674 1675 if (acpi_disabled || 1676 ACPI_FAILURE(acpi_get_table_header(ACPI_SIG_FADT, 0, &hdr))) 1677 return false; 1678 1679 for (v_info = vendor_info; v_info->valid; v_info++) { 1680 if (!strncmp(hdr.oem_id, v_info->oem_id, ACPI_OEM_ID_SIZE) && 1681 !strncmp(hdr.oem_table_id, v_info->oem_table_id, 1682 ACPI_OEM_TABLE_ID_SIZE)) 1683 switch (v_info->oem_pwr_table) { 1684 case PSS: 1685 return intel_pstate_no_acpi_pss(); 1686 case PPC: 1687 return intel_pstate_has_acpi_ppc() && 1688 (!force_load); 1689 } 1690 } 1691 1692 return false; 1693 } 1694 #else /* CONFIG_ACPI not enabled */ 1695 static inline bool intel_pstate_platform_pwr_mgmt_exists(void) { return false; } 1696 static inline bool intel_pstate_has_acpi_ppc(void) { return false; } 1697 #endif /* CONFIG_ACPI */ 1698 1699 static const struct x86_cpu_id hwp_support_ids[] __initconst = { 1700 { X86_VENDOR_INTEL, 6, X86_MODEL_ANY, X86_FEATURE_HWP }, 1701 {} 1702 }; 1703 1704 static int __init intel_pstate_init(void) 1705 { 1706 int cpu, rc = 0; 1707 const struct x86_cpu_id *id; 1708 struct cpu_defaults *cpu_def; 1709 1710 if (no_load) 1711 return -ENODEV; 1712 1713 if (x86_match_cpu(hwp_support_ids) && !no_hwp) { 1714 copy_cpu_funcs(&core_params.funcs); 1715 hwp_active++; 1716 goto hwp_cpu_matched; 1717 } 1718 1719 id = x86_match_cpu(intel_pstate_cpu_ids); 1720 if (!id) 1721 return -ENODEV; 1722 1723 cpu_def = (struct cpu_defaults *)id->driver_data; 1724 1725 copy_pid_params(&cpu_def->pid_policy); 1726 copy_cpu_funcs(&cpu_def->funcs); 1727 1728 if (intel_pstate_msrs_not_valid()) 1729 return -ENODEV; 1730 1731 hwp_cpu_matched: 1732 /* 1733 * The Intel pstate driver will be ignored if the platform 1734 * firmware has its own power management modes. 1735 */ 1736 if (intel_pstate_platform_pwr_mgmt_exists()) 1737 return -ENODEV; 1738 1739 pr_info("Intel P-state driver initializing\n"); 1740 1741 all_cpu_data = vzalloc(sizeof(void *) * num_possible_cpus()); 1742 if (!all_cpu_data) 1743 return -ENOMEM; 1744 1745 if (!hwp_active && hwp_only) 1746 goto out; 1747 1748 rc = cpufreq_register_driver(&intel_pstate_driver); 1749 if (rc) 1750 goto out; 1751 1752 intel_pstate_debug_expose_params(); 1753 intel_pstate_sysfs_expose_params(); 1754 1755 if (hwp_active) 1756 pr_info("HWP enabled\n"); 1757 1758 return rc; 1759 out: 1760 get_online_cpus(); 1761 for_each_online_cpu(cpu) { 1762 if (all_cpu_data[cpu]) { 1763 intel_pstate_clear_update_util_hook(cpu); 1764 kfree(all_cpu_data[cpu]); 1765 } 1766 } 1767 1768 put_online_cpus(); 1769 vfree(all_cpu_data); 1770 return -ENODEV; 1771 } 1772 device_initcall(intel_pstate_init); 1773 1774 static int __init intel_pstate_setup(char *str) 1775 { 1776 if (!str) 1777 return -EINVAL; 1778 1779 if (!strcmp(str, "disable")) 1780 no_load = 1; 1781 if (!strcmp(str, "no_hwp")) { 1782 pr_info("HWP disabled\n"); 1783 no_hwp = 1; 1784 } 1785 if (!strcmp(str, "force")) 1786 force_load = 1; 1787 if (!strcmp(str, "hwp_only")) 1788 hwp_only = 1; 1789 1790 #ifdef CONFIG_ACPI 1791 if (!strcmp(str, "support_acpi_ppc")) 1792 acpi_ppc = true; 1793 #endif 1794 1795 return 0; 1796 } 1797 early_param("intel_pstate", intel_pstate_setup); 1798 1799 MODULE_AUTHOR("Dirk Brandewie <dirk.j.brandewie@intel.com>"); 1800 MODULE_DESCRIPTION("'intel_pstate' - P state driver Intel Core processors"); 1801 MODULE_LICENSE("GPL"); 1802