1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * (C) 2010,2011 Thomas Renninger <trenn@suse.de>, Novell Inc. 4 */ 5 6 #if defined(__i386__) || defined(__x86_64__) 7 8 #include <stdio.h> 9 #include <stdint.h> 10 #include <stdlib.h> 11 #include <string.h> 12 #include <limits.h> 13 14 #include <cpufreq.h> 15 16 #include "helpers/helpers.h" 17 #include "idle_monitor/cpupower-monitor.h" 18 19 #define MSR_APERF 0xE8 20 #define MSR_MPERF 0xE7 21 22 #define MSR_TSC 0x10 23 24 #define MSR_AMD_HWCR 0xc0010015 25 26 enum mperf_id { C0 = 0, Cx, AVG_FREQ, MPERF_CSTATE_COUNT }; 27 28 static int mperf_get_count_percent(unsigned int self_id, double *percent, 29 unsigned int cpu); 30 static int mperf_get_count_freq(unsigned int id, unsigned long long *count, 31 unsigned int cpu); 32 static struct timespec time_start, time_end; 33 34 static cstate_t mperf_cstates[MPERF_CSTATE_COUNT] = { 35 { 36 .name = "C0", 37 .desc = N_("Processor Core not idle"), 38 .id = C0, 39 .range = RANGE_THREAD, 40 .get_count_percent = mperf_get_count_percent, 41 }, 42 { 43 .name = "Cx", 44 .desc = N_("Processor Core in an idle state"), 45 .id = Cx, 46 .range = RANGE_THREAD, 47 .get_count_percent = mperf_get_count_percent, 48 }, 49 50 { 51 .name = "Freq", 52 .desc = N_("Average Frequency (including boost) in MHz"), 53 .id = AVG_FREQ, 54 .range = RANGE_THREAD, 55 .get_count = mperf_get_count_freq, 56 }, 57 }; 58 59 enum MAX_FREQ_MODE { MAX_FREQ_SYSFS, MAX_FREQ_TSC_REF }; 60 static int max_freq_mode; 61 /* 62 * The max frequency mperf is ticking at (in C0), either retrieved via: 63 * 1) calculated after measurements if we know TSC ticks at mperf/P0 frequency 64 * 2) cpufreq /sys/devices/.../cpu0/cpufreq/cpuinfo_max_freq at init time 65 * 1. Is preferred as it also works without cpufreq subsystem (e.g. on Xen) 66 */ 67 static unsigned long max_frequency; 68 69 static unsigned long long tsc_at_measure_start; 70 static unsigned long long tsc_at_measure_end; 71 static unsigned long long *mperf_previous_count; 72 static unsigned long long *aperf_previous_count; 73 static unsigned long long *mperf_current_count; 74 static unsigned long long *aperf_current_count; 75 76 /* valid flag for all CPUs. If a MSR read failed it will be zero */ 77 static int *is_valid; 78 79 static int mperf_get_tsc(unsigned long long *tsc) 80 { 81 int ret; 82 83 ret = read_msr(base_cpu, MSR_TSC, tsc); 84 if (ret) 85 dprint("Reading TSC MSR failed, returning %llu\n", *tsc); 86 return ret; 87 } 88 89 static int get_aperf_mperf(int cpu, unsigned long long *aval, 90 unsigned long long *mval) 91 { 92 int ret; 93 94 /* 95 * Running on the cpu from which we read the registers will 96 * prevent APERF/MPERF from going out of sync because of IPI 97 * latency introduced by read_msr()s. 98 */ 99 if (mperf_monitor.flags.per_cpu_schedule) { 100 if (bind_cpu(cpu)) 101 return 1; 102 } 103 104 ret = read_msr(cpu, MSR_APERF, aval); 105 ret |= read_msr(cpu, MSR_MPERF, mval); 106 107 return ret; 108 } 109 110 static int mperf_init_stats(unsigned int cpu) 111 { 112 unsigned long long aval, mval; 113 int ret; 114 115 ret = get_aperf_mperf(cpu, &aval, &mval); 116 aperf_previous_count[cpu] = aval; 117 mperf_previous_count[cpu] = mval; 118 is_valid[cpu] = !ret; 119 120 return 0; 121 } 122 123 static int mperf_measure_stats(unsigned int cpu) 124 { 125 unsigned long long aval, mval; 126 int ret; 127 128 ret = get_aperf_mperf(cpu, &aval, &mval); 129 aperf_current_count[cpu] = aval; 130 mperf_current_count[cpu] = mval; 131 is_valid[cpu] = !ret; 132 133 return 0; 134 } 135 136 static int mperf_get_count_percent(unsigned int id, double *percent, 137 unsigned int cpu) 138 { 139 unsigned long long aperf_diff, mperf_diff, tsc_diff; 140 unsigned long long timediff; 141 142 if (!is_valid[cpu]) 143 return -1; 144 145 if (id != C0 && id != Cx) 146 return -1; 147 148 mperf_diff = mperf_current_count[cpu] - mperf_previous_count[cpu]; 149 aperf_diff = aperf_current_count[cpu] - aperf_previous_count[cpu]; 150 151 if (max_freq_mode == MAX_FREQ_TSC_REF) { 152 tsc_diff = tsc_at_measure_end - tsc_at_measure_start; 153 *percent = 100.0 * mperf_diff / tsc_diff; 154 dprint("%s: TSC Ref - mperf_diff: %llu, tsc_diff: %llu\n", 155 mperf_cstates[id].name, mperf_diff, tsc_diff); 156 } else if (max_freq_mode == MAX_FREQ_SYSFS) { 157 timediff = max_frequency * timespec_diff_us(time_start, time_end); 158 *percent = 100.0 * mperf_diff / timediff; 159 dprint("%s: MAXFREQ - mperf_diff: %llu, time_diff: %llu\n", 160 mperf_cstates[id].name, mperf_diff, timediff); 161 } else 162 return -1; 163 164 if (id == Cx) 165 *percent = 100.0 - *percent; 166 167 dprint("%s: previous: %llu - current: %llu - (%u)\n", 168 mperf_cstates[id].name, mperf_diff, aperf_diff, cpu); 169 dprint("%s: %f\n", mperf_cstates[id].name, *percent); 170 return 0; 171 } 172 173 static int mperf_get_count_freq(unsigned int id, unsigned long long *count, 174 unsigned int cpu) 175 { 176 unsigned long long aperf_diff, mperf_diff, time_diff, tsc_diff; 177 178 if (id != AVG_FREQ) 179 return 1; 180 181 if (!is_valid[cpu]) 182 return -1; 183 184 mperf_diff = mperf_current_count[cpu] - mperf_previous_count[cpu]; 185 aperf_diff = aperf_current_count[cpu] - aperf_previous_count[cpu]; 186 187 if (max_freq_mode == MAX_FREQ_TSC_REF) { 188 /* Calculate max_freq from TSC count */ 189 tsc_diff = tsc_at_measure_end - tsc_at_measure_start; 190 time_diff = timespec_diff_us(time_start, time_end); 191 max_frequency = tsc_diff / time_diff; 192 } 193 194 *count = max_frequency * ((double)aperf_diff / mperf_diff); 195 dprint("%s: Average freq based on %s maximum frequency:\n", 196 mperf_cstates[id].name, 197 (max_freq_mode == MAX_FREQ_TSC_REF) ? "TSC calculated" : "sysfs read"); 198 dprint("max_frequency: %lu\n", max_frequency); 199 dprint("aperf_diff: %llu\n", aperf_diff); 200 dprint("mperf_diff: %llu\n", mperf_diff); 201 dprint("avg freq: %llu\n", *count); 202 return 0; 203 } 204 205 static int mperf_start(void) 206 { 207 int cpu; 208 unsigned long long dbg; 209 210 clock_gettime(CLOCK_REALTIME, &time_start); 211 mperf_get_tsc(&tsc_at_measure_start); 212 213 for (cpu = 0; cpu < cpu_count; cpu++) 214 mperf_init_stats(cpu); 215 216 mperf_get_tsc(&dbg); 217 dprint("TSC diff: %llu\n", dbg - tsc_at_measure_start); 218 return 0; 219 } 220 221 static int mperf_stop(void) 222 { 223 unsigned long long dbg; 224 int cpu; 225 226 for (cpu = 0; cpu < cpu_count; cpu++) 227 mperf_measure_stats(cpu); 228 229 mperf_get_tsc(&tsc_at_measure_end); 230 clock_gettime(CLOCK_REALTIME, &time_end); 231 232 mperf_get_tsc(&dbg); 233 dprint("TSC diff: %llu\n", dbg - tsc_at_measure_end); 234 235 return 0; 236 } 237 238 /* 239 * Mperf register is defined to tick at P0 (maximum) frequency 240 * 241 * Instead of reading out P0 which can be tricky to read out from HW, 242 * we use TSC counter if it reliably ticks at P0/mperf frequency. 243 * 244 * Still try to fall back to: 245 * /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq 246 * on older Intel HW without invariant TSC feature. 247 * Or on AMD machines where TSC does not tick at P0 (do not exist yet, but 248 * it's still double checked (MSR_AMD_HWCR)). 249 * 250 * On these machines the user would still get useful mperf 251 * stats when acpi-cpufreq driver is loaded. 252 */ 253 static int init_maxfreq_mode(void) 254 { 255 int ret; 256 unsigned long long hwcr; 257 unsigned long min; 258 259 if (!(cpupower_cpu_info.caps & CPUPOWER_CAP_INV_TSC)) 260 goto use_sysfs; 261 262 if (cpupower_cpu_info.vendor == X86_VENDOR_AMD || 263 cpupower_cpu_info.vendor == X86_VENDOR_HYGON) { 264 /* MSR_AMD_HWCR tells us whether TSC runs at P0/mperf 265 * freq. 266 * A test whether hwcr is accessable/available would be: 267 * (cpupower_cpu_info.family > 0x10 || 268 * cpupower_cpu_info.family == 0x10 && 269 * cpupower_cpu_info.model >= 0x2)) 270 * This should be the case for all aperf/mperf 271 * capable AMD machines and is therefore safe to test here. 272 * Compare with Linus kernel git commit: acf01734b1747b1ec4 273 */ 274 ret = read_msr(0, MSR_AMD_HWCR, &hwcr); 275 /* 276 * If the MSR read failed, assume a Xen system that did 277 * not explicitly provide access to it and assume TSC works 278 */ 279 if (ret != 0) { 280 dprint("TSC read 0x%x failed - assume TSC working\n", 281 MSR_AMD_HWCR); 282 return 0; 283 } else if (1 & (hwcr >> 24)) { 284 max_freq_mode = MAX_FREQ_TSC_REF; 285 return 0; 286 } else { /* Use sysfs max frequency if available */ } 287 } else if (cpupower_cpu_info.vendor == X86_VENDOR_INTEL) { 288 /* 289 * On Intel we assume mperf (in C0) is ticking at same 290 * rate than TSC 291 */ 292 max_freq_mode = MAX_FREQ_TSC_REF; 293 return 0; 294 } 295 use_sysfs: 296 if (cpufreq_get_hardware_limits(0, &min, &max_frequency)) { 297 dprint("Cannot retrieve max freq from cpufreq kernel " 298 "subsystem\n"); 299 return -1; 300 } 301 max_freq_mode = MAX_FREQ_SYSFS; 302 max_frequency /= 1000; /* Default automatically to MHz value */ 303 return 0; 304 } 305 306 /* 307 * This monitor provides: 308 * 309 * 1) Average frequency a CPU resided in 310 * This always works if the CPU has aperf/mperf capabilities 311 * 312 * 2) C0 and Cx (any sleep state) time a CPU resided in 313 * Works if mperf timer stops ticking in sleep states which 314 * seem to be the case on all current HW. 315 * Both is directly retrieved from HW registers and is independent 316 * from kernel statistics. 317 */ 318 struct cpuidle_monitor mperf_monitor; 319 struct cpuidle_monitor *mperf_register(void) 320 { 321 if (!(cpupower_cpu_info.caps & CPUPOWER_CAP_APERF)) 322 return NULL; 323 324 if (init_maxfreq_mode()) 325 return NULL; 326 327 if (cpupower_cpu_info.vendor == X86_VENDOR_AMD) 328 mperf_monitor.flags.per_cpu_schedule = 1; 329 330 /* Free this at program termination */ 331 is_valid = calloc(cpu_count, sizeof(int)); 332 mperf_previous_count = calloc(cpu_count, sizeof(unsigned long long)); 333 aperf_previous_count = calloc(cpu_count, sizeof(unsigned long long)); 334 mperf_current_count = calloc(cpu_count, sizeof(unsigned long long)); 335 aperf_current_count = calloc(cpu_count, sizeof(unsigned long long)); 336 337 mperf_monitor.name_len = strlen(mperf_monitor.name); 338 return &mperf_monitor; 339 } 340 341 void mperf_unregister(void) 342 { 343 free(mperf_previous_count); 344 free(aperf_previous_count); 345 free(mperf_current_count); 346 free(aperf_current_count); 347 free(is_valid); 348 } 349 350 struct cpuidle_monitor mperf_monitor = { 351 .name = "Mperf", 352 .hw_states_num = MPERF_CSTATE_COUNT, 353 .hw_states = mperf_cstates, 354 .start = mperf_start, 355 .stop = mperf_stop, 356 .do_register = mperf_register, 357 .unregister = mperf_unregister, 358 .flags.needs_root = 1, 359 .overflow_s = 922000000 /* 922337203 seconds TSC overflow 360 at 20GHz */ 361 }; 362 #endif /* #if defined(__i386__) || defined(__x86_64__) */ 363