// SPDX-License-Identifier: GPL-2.0-only
/*
 * x86 APERF/MPERF KHz calculation for
 * /sys/.../cpufreq/scaling_cur_freq
 *
 * Copyright (C) 2017 Intel Corp.
 * Author: Len Brown <len.brown@intel.com>
 */
#include <linux/cpufreq.h>
#include <linux/delay.h>
#include <linux/ktime.h>
#include <linux/math64.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/sched/isolation.h>
#include <linux/sched/topology.h>
#include <linux/smp.h>
#include <linux/syscore_ops.h>

#include <asm/cpu.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>

#include "cpu.h"

struct aperfmperf {
        seqcount_t      seq;
        unsigned long   last_update;
        u64             acnt;
        u64             mcnt;
        u64             aperf;
        u64             mperf;
};

static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples) = {
        .seq = SEQCNT_ZERO(cpu_samples.seq)
};

static void init_counter_refs(void)
{
        u64 aperf, mperf;

        rdmsrl(MSR_IA32_APERF, aperf);
        rdmsrl(MSR_IA32_MPERF, mperf);

        this_cpu_write(cpu_samples.aperf, aperf);
        this_cpu_write(cpu_samples.mperf, mperf);
}

#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
/*
 * APERF/MPERF frequency ratio computation.
 *
 * The scheduler wants to do frequency invariant accounting and needs a <1
 * ratio to account for the 'current' frequency, corresponding to
 * freq_curr / freq_max.
 *
 * Since the frequency freq_curr on x86 is controlled by a micro-controller
 * and our P-state setting is little more than a request/hint, we need to
 * observe the effective frequency 'BusyMHz', i.e. the average frequency over
 * a time interval after discarding idle time. This is given by:
 *
 *            BusyMHz = delta_APERF / delta_MPERF * freq_base
 *
 * where freq_base is the max non-turbo P-state.
 *
 * The freq_max term has to be set to a somewhat arbitrary value, because we
 * can't know which turbo states will be available at a given point in time:
 * it all depends on the thermal headroom of the entire package. We set it to
 * the turbo level with 4 cores active.
 *
 * Benchmarks show that's a good compromise between the 1C turbo ratio
 * (freq_curr/freq_max would rarely reach 1) and something close to freq_base,
 * which would ignore the entire turbo range (a conspicuous part, making
 * freq_curr/freq_max always maxed out).
 *
 * An exception to the heuristic above is the Atom uarch, where we choose the
 * highest turbo level for freq_max since Atoms are generally oriented towards
 * power efficiency.
 *
 * Setting freq_max to anything less than the 1C turbo ratio makes the ratio
 * freq_curr / freq_max eventually grow >1, in which case we clip it to 1.
 */
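
/*
 * Worked example with illustrative numbers (not read from any real MSR):
 * a base ratio of 20 (2.0 GHz) and a 4C turbo ratio of 30 (3.0 GHz) give
 *
 *            arch_turbo_freq_ratio = 30 * SCHED_CAPACITY_SCALE / 20 = 1536
 *
 * With turbo enabled (arch_max_freq_ratio == arch_turbo_freq_ratio), a tick
 * observing delta_APERF = 1200000 and delta_MPERF = 1000000 (BusyMHz = 2400)
 * then yields in scale_freq_tick():
 *
 *            freq_scale = (1200000 << 20) / (1000000 * 1536) = 819
 *
 * i.e. roughly 2.4 GHz / 3.0 GHz of SCHED_CAPACITY_SCALE (1024).
 */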

DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key);

static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE;
static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE;

void arch_set_max_freq_ratio(bool turbo_disabled)
{
        arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE :
                                               arch_turbo_freq_ratio;
}
EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio);

static bool __init turbo_disabled(void)
{
        u64 misc_en;
        int err;

        err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en);
        if (err)
                return false;

        return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
}

static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
        int err;

        err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq);
        if (err)
                return false;

        err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq);
        if (err)
                return false;

        *base_freq = (*base_freq >> 16) & 0x3F; /* max P state */
        *turbo_freq = *turbo_freq & 0x3F;       /* 1C turbo */

        return true;
}

#define X86_MATCH(model)                                        \
        X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6,            \
                INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL)

static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = {
        X86_MATCH(XEON_PHI_KNL),
        X86_MATCH(XEON_PHI_KNM),
        {}
};

static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = {
        X86_MATCH(SKYLAKE_X),
        {}
};

static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = {
        X86_MATCH(ATOM_GOLDMONT),
        X86_MATCH(ATOM_GOLDMONT_D),
        X86_MATCH(ATOM_GOLDMONT_PLUS),
        {}
};

static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
                                          int num_delta_fratio)
{
        int fratio, delta_fratio, found;
        int err, i;
        u64 msr;

        err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
        if (err)
                return false;

        *base_freq = (*base_freq >> 8) & 0xFF;  /* max P state */

        err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
        if (err)
                return false;

        fratio = (msr >> 8) & 0xFF;
        i = 16;
        found = 0;
        do {
                if (found >= num_delta_fratio) {
                        *turbo_freq = fratio;
                        return true;
                }

                delta_fratio = (msr >> (i + 5)) & 0x7;

                if (delta_fratio) {
                        found += 1;
                        fratio -= delta_fratio;
                }

                i += 8;
        } while (i < 64);

        return true;
}

static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size)
{
        u64 ratios, counts;
        u32 group_size;
        int err, i;

        err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
        if (err)
                return false;

        *base_freq = (*base_freq >> 8) & 0xFF;  /* max P state */

        err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios);
        if (err)
                return false;

        err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts);
        if (err)
                return false;

        for (i = 0; i < 64; i += 8) {
                group_size = (counts >> i) & 0xFF;
                if (group_size >= size) {
                        *turbo_freq = (ratios >> i) & 0xFF;
                        return true;
                }
        }

        return false;
}

static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
        u64 msr;
        int err;

        err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
        if (err)
                return false;

        err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
        if (err)
                return false;

        *base_freq = (*base_freq >> 8) & 0xFF;  /* max P state */
        *turbo_freq = (msr >> 24) & 0xFF;       /* 4C turbo */

        /* The CPU may have fewer than 4 cores */
        if (!*turbo_freq)
                *turbo_freq = msr & 0xFF;       /* 1C turbo */

        return true;
}
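
/*
 * Determine base_freq (max non-turbo ratio) and turbo_freq (the freq_max
 * heuristic described above) by trying the known model-specific turbo ratio
 * layouts before falling back to the common core layout.
 * slv_set_max_freq_ratio() needs no model match: rdmsrl_safe() simply fails
 * on parts that lack the Atom ratio MSRs.
 */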
static bool __init intel_set_max_freq_ratio(void)
{
        u64 base_freq, turbo_freq;
        u64 turbo_ratio;

        if (slv_set_max_freq_ratio(&base_freq, &turbo_freq))
                goto out;

        if (x86_match_cpu(has_glm_turbo_ratio_limits) &&
            skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
                goto out;

        if (x86_match_cpu(has_knl_turbo_ratio_limits) &&
            knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
                goto out;

        if (x86_match_cpu(has_skx_turbo_ratio_limits) &&
            skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4))
                goto out;

        if (core_set_max_freq_ratio(&base_freq, &turbo_freq))
                goto out;

        return false;

out:
        /*
         * Some hypervisors advertise X86_FEATURE_APERFMPERF
         * but then fill all MSRs with zeroes.
         * Some CPUs have turbo boost but don't declare any turbo ratio
         * in MSR_TURBO_RATIO_LIMIT.
         */
        if (!base_freq || !turbo_freq) {
                pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n");
                return false;
        }

        turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq);
        if (!turbo_ratio) {
                pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n");
                return false;
        }

        arch_turbo_freq_ratio = turbo_ratio;
        arch_set_max_freq_ratio(turbo_disabled());

        return true;
}

#ifdef CONFIG_PM_SLEEP
static struct syscore_ops freq_invariance_syscore_ops = {
        .resume = init_counter_refs,
};

static void register_freq_invariance_syscore_ops(void)
{
        register_syscore_ops(&freq_invariance_syscore_ops);
}
#else
static inline void register_freq_invariance_syscore_ops(void) {}
#endif

static void freq_invariance_enable(void)
{
        if (static_branch_unlikely(&arch_scale_freq_key)) {
                WARN_ON_ONCE(1);
                return;
        }
        static_branch_enable(&arch_scale_freq_key);
        register_freq_invariance_syscore_ops();
        pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
}

void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled)
{
        arch_turbo_freq_ratio = ratio;
        arch_set_max_freq_ratio(turbo_disabled);
        freq_invariance_enable();
}

static void __init bp_init_freq_invariance(void)
{
        if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
                return;

        if (intel_set_max_freq_ratio())
                freq_invariance_enable();
}

static void disable_freq_invariance_workfn(struct work_struct *work)
{
        static_branch_disable(&arch_scale_freq_key);
}

static DECLARE_WORK(disable_freq_invariance_work,
                    disable_freq_invariance_workfn);

DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;

static void scale_freq_tick(u64 acnt, u64 mcnt)
{
        u64 freq_scale;

        if (!arch_scale_freq_invariant())
                return;

        if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
                goto error;

        if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt)
                goto error;

        freq_scale = div64_u64(acnt, mcnt);
        if (!freq_scale)
                goto error;

        if (freq_scale > SCHED_CAPACITY_SCALE)
                freq_scale = SCHED_CAPACITY_SCALE;

        this_cpu_write(arch_freq_scale, freq_scale);
        return;

error:
        pr_warn("Scheduler frequency invariance went wobbly, disabling!\n");
        schedule_work(&disable_freq_invariance_work);
}
#else
static inline void bp_init_freq_invariance(void) { }
static inline void scale_freq_tick(u64 acnt, u64 mcnt) { }
#endif /* CONFIG_X86_64 && CONFIG_SMP */
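
/*
 * Called from the scheduler tick: sample APERF/MPERF on the local CPU,
 * publish the deltas under the seqcount for arch_freq_get_on_cpu() and
 * update the frequency invariance scale factor.
 */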
void arch_scale_freq_tick(void)
{
        struct aperfmperf *s = this_cpu_ptr(&cpu_samples);
        u64 acnt, mcnt, aperf, mperf;

        if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
                return;

        rdmsrl(MSR_IA32_APERF, aperf);
        rdmsrl(MSR_IA32_MPERF, mperf);
        acnt = aperf - s->aperf;
        mcnt = mperf - s->mperf;

        s->aperf = aperf;
        s->mperf = mperf;

        raw_write_seqcount_begin(&s->seq);
        s->last_update = jiffies;
        s->acnt = acnt;
        s->mcnt = mcnt;
        raw_write_seqcount_end(&s->seq);

        scale_freq_tick(acnt, mcnt);
}

/*
 * Discard samples older than the defined maximum sample age of 20ms. There
 * is no point in sending IPIs in such a case. If the scheduler tick was
 * not running then the CPU is either idle or isolated.
 */
#define MAX_SAMPLE_AGE  ((unsigned long)HZ / 50)

unsigned int arch_freq_get_on_cpu(int cpu)
{
        struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu);
        unsigned int seq, freq;
        unsigned long last;
        u64 acnt, mcnt;

        if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
                goto fallback;

        do {
                seq = raw_read_seqcount_begin(&s->seq);
                last = s->last_update;
                acnt = s->acnt;
                mcnt = s->mcnt;
        } while (read_seqcount_retry(&s->seq, seq));

        /*
         * Bail on invalid count and when the last update was too long ago,
         * which covers idle and NOHZ full CPUs.
         */
        if (!mcnt || (jiffies - last) > MAX_SAMPLE_AGE)
                goto fallback;

        return div64_u64((cpu_khz * acnt), mcnt);

fallback:
        freq = cpufreq_quick_get(cpu);
        return freq ? freq : cpu_khz;
}

static int __init bp_init_aperfmperf(void)
{
        if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
                return 0;

        init_counter_refs();
        bp_init_freq_invariance();
        return 0;
}
early_initcall(bp_init_aperfmperf);

void ap_init_aperfmperf(void)
{
        if (cpu_feature_enabled(X86_FEATURE_APERFMPERF))
                init_counter_refs();
}