1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * (C) 2010,2011 Thomas Renninger <trenn@suse.de>, Novell Inc. 4 */ 5 6 #if defined(__i386__) || defined(__x86_64__) 7 8 #include <stdio.h> 9 #include <stdint.h> 10 #include <stdlib.h> 11 #include <string.h> 12 #include <limits.h> 13 14 #include <cpufreq.h> 15 16 #include "helpers/helpers.h" 17 #include "idle_monitor/cpupower-monitor.h" 18 19 #define MSR_APERF 0xE8 20 #define MSR_MPERF 0xE7 21 22 #define MSR_TSC 0x10 23 24 #define MSR_AMD_HWCR 0xc0010015 25 26 enum mperf_id { C0 = 0, Cx, AVG_FREQ, MPERF_CSTATE_COUNT }; 27 28 static int mperf_get_count_percent(unsigned int self_id, double *percent, 29 unsigned int cpu); 30 static int mperf_get_count_freq(unsigned int id, unsigned long long *count, 31 unsigned int cpu); 32 static struct timespec time_start, time_end; 33 34 static cstate_t mperf_cstates[MPERF_CSTATE_COUNT] = { 35 { 36 .name = "C0", 37 .desc = N_("Processor Core not idle"), 38 .id = C0, 39 .range = RANGE_THREAD, 40 .get_count_percent = mperf_get_count_percent, 41 }, 42 { 43 .name = "Cx", 44 .desc = N_("Processor Core in an idle state"), 45 .id = Cx, 46 .range = RANGE_THREAD, 47 .get_count_percent = mperf_get_count_percent, 48 }, 49 50 { 51 .name = "Freq", 52 .desc = N_("Average Frequency (including boost) in MHz"), 53 .id = AVG_FREQ, 54 .range = RANGE_THREAD, 55 .get_count = mperf_get_count_freq, 56 }, 57 }; 58 59 enum MAX_FREQ_MODE { MAX_FREQ_SYSFS, MAX_FREQ_TSC_REF }; 60 static int max_freq_mode; 61 /* 62 * The max frequency mperf is ticking at (in C0), either retrieved via: 63 * 1) calculated after measurements if we know TSC ticks at mperf/P0 frequency 64 * 2) cpufreq /sys/devices/.../cpu0/cpufreq/cpuinfo_max_freq at init time 65 * 1. Is preferred as it also works without cpufreq subsystem (e.g. on Xen) 66 */ 67 static unsigned long max_frequency; 68 69 static unsigned long long tsc_at_measure_start; 70 static unsigned long long tsc_at_measure_end; 71 static unsigned long long *mperf_previous_count; 72 static unsigned long long *aperf_previous_count; 73 static unsigned long long *mperf_current_count; 74 static unsigned long long *aperf_current_count; 75 76 /* valid flag for all CPUs. If a MSR read failed it will be zero */ 77 static int *is_valid; 78 79 static int mperf_get_tsc(unsigned long long *tsc) 80 { 81 int ret; 82 83 ret = read_msr(base_cpu, MSR_TSC, tsc); 84 if (ret) 85 dprint("Reading TSC MSR failed, returning %llu\n", *tsc); 86 return ret; 87 } 88 89 static int mperf_init_stats(unsigned int cpu) 90 { 91 unsigned long long val; 92 int ret; 93 94 ret = read_msr(cpu, MSR_APERF, &val); 95 aperf_previous_count[cpu] = val; 96 ret |= read_msr(cpu, MSR_MPERF, &val); 97 mperf_previous_count[cpu] = val; 98 is_valid[cpu] = !ret; 99 100 return 0; 101 } 102 103 static int mperf_measure_stats(unsigned int cpu) 104 { 105 unsigned long long val; 106 int ret; 107 108 ret = read_msr(cpu, MSR_APERF, &val); 109 aperf_current_count[cpu] = val; 110 ret |= read_msr(cpu, MSR_MPERF, &val); 111 mperf_current_count[cpu] = val; 112 is_valid[cpu] = !ret; 113 114 return 0; 115 } 116 117 static int mperf_get_count_percent(unsigned int id, double *percent, 118 unsigned int cpu) 119 { 120 unsigned long long aperf_diff, mperf_diff, tsc_diff; 121 unsigned long long timediff; 122 123 if (!is_valid[cpu]) 124 return -1; 125 126 if (id != C0 && id != Cx) 127 return -1; 128 129 mperf_diff = mperf_current_count[cpu] - mperf_previous_count[cpu]; 130 aperf_diff = aperf_current_count[cpu] - aperf_previous_count[cpu]; 131 132 if (max_freq_mode == MAX_FREQ_TSC_REF) { 133 tsc_diff = tsc_at_measure_end - tsc_at_measure_start; 134 *percent = 100.0 * mperf_diff / tsc_diff; 135 dprint("%s: TSC Ref - mperf_diff: %llu, tsc_diff: %llu\n", 136 mperf_cstates[id].name, mperf_diff, tsc_diff); 137 } else if (max_freq_mode == MAX_FREQ_SYSFS) { 138 timediff = max_frequency * timespec_diff_us(time_start, time_end); 139 *percent = 100.0 * mperf_diff / timediff; 140 dprint("%s: MAXFREQ - mperf_diff: %llu, time_diff: %llu\n", 141 mperf_cstates[id].name, mperf_diff, timediff); 142 } else 143 return -1; 144 145 if (id == Cx) 146 *percent = 100.0 - *percent; 147 148 dprint("%s: previous: %llu - current: %llu - (%u)\n", 149 mperf_cstates[id].name, mperf_diff, aperf_diff, cpu); 150 dprint("%s: %f\n", mperf_cstates[id].name, *percent); 151 return 0; 152 } 153 154 static int mperf_get_count_freq(unsigned int id, unsigned long long *count, 155 unsigned int cpu) 156 { 157 unsigned long long aperf_diff, mperf_diff, time_diff, tsc_diff; 158 159 if (id != AVG_FREQ) 160 return 1; 161 162 if (!is_valid[cpu]) 163 return -1; 164 165 mperf_diff = mperf_current_count[cpu] - mperf_previous_count[cpu]; 166 aperf_diff = aperf_current_count[cpu] - aperf_previous_count[cpu]; 167 168 if (max_freq_mode == MAX_FREQ_TSC_REF) { 169 /* Calculate max_freq from TSC count */ 170 tsc_diff = tsc_at_measure_end - tsc_at_measure_start; 171 time_diff = timespec_diff_us(time_start, time_end); 172 max_frequency = tsc_diff / time_diff; 173 } 174 175 *count = max_frequency * ((double)aperf_diff / mperf_diff); 176 dprint("%s: Average freq based on %s maximum frequency:\n", 177 mperf_cstates[id].name, 178 (max_freq_mode == MAX_FREQ_TSC_REF) ? "TSC calculated" : "sysfs read"); 179 dprint("max_frequency: %lu\n", max_frequency); 180 dprint("aperf_diff: %llu\n", aperf_diff); 181 dprint("mperf_diff: %llu\n", mperf_diff); 182 dprint("avg freq: %llu\n", *count); 183 return 0; 184 } 185 186 static int mperf_start(void) 187 { 188 int cpu; 189 unsigned long long dbg; 190 191 clock_gettime(CLOCK_REALTIME, &time_start); 192 mperf_get_tsc(&tsc_at_measure_start); 193 194 for (cpu = 0; cpu < cpu_count; cpu++) 195 mperf_init_stats(cpu); 196 197 mperf_get_tsc(&dbg); 198 dprint("TSC diff: %llu\n", dbg - tsc_at_measure_start); 199 return 0; 200 } 201 202 static int mperf_stop(void) 203 { 204 unsigned long long dbg; 205 int cpu; 206 207 for (cpu = 0; cpu < cpu_count; cpu++) 208 mperf_measure_stats(cpu); 209 210 mperf_get_tsc(&tsc_at_measure_end); 211 clock_gettime(CLOCK_REALTIME, &time_end); 212 213 mperf_get_tsc(&dbg); 214 dprint("TSC diff: %llu\n", dbg - tsc_at_measure_end); 215 216 return 0; 217 } 218 219 /* 220 * Mperf register is defined to tick at P0 (maximum) frequency 221 * 222 * Instead of reading out P0 which can be tricky to read out from HW, 223 * we use TSC counter if it reliably ticks at P0/mperf frequency. 224 * 225 * Still try to fall back to: 226 * /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq 227 * on older Intel HW without invariant TSC feature. 228 * Or on AMD machines where TSC does not tick at P0 (do not exist yet, but 229 * it's still double checked (MSR_AMD_HWCR)). 230 * 231 * On these machines the user would still get useful mperf 232 * stats when acpi-cpufreq driver is loaded. 233 */ 234 static int init_maxfreq_mode(void) 235 { 236 int ret; 237 unsigned long long hwcr; 238 unsigned long min; 239 240 if (!(cpupower_cpu_info.caps & CPUPOWER_CAP_INV_TSC)) 241 goto use_sysfs; 242 243 if (cpupower_cpu_info.vendor == X86_VENDOR_AMD || 244 cpupower_cpu_info.vendor == X86_VENDOR_HYGON) { 245 /* MSR_AMD_HWCR tells us whether TSC runs at P0/mperf 246 * freq. 247 * A test whether hwcr is accessable/available would be: 248 * (cpupower_cpu_info.family > 0x10 || 249 * cpupower_cpu_info.family == 0x10 && 250 * cpupower_cpu_info.model >= 0x2)) 251 * This should be the case for all aperf/mperf 252 * capable AMD machines and is therefore safe to test here. 253 * Compare with Linus kernel git commit: acf01734b1747b1ec4 254 */ 255 ret = read_msr(0, MSR_AMD_HWCR, &hwcr); 256 /* 257 * If the MSR read failed, assume a Xen system that did 258 * not explicitly provide access to it and assume TSC works 259 */ 260 if (ret != 0) { 261 dprint("TSC read 0x%x failed - assume TSC working\n", 262 MSR_AMD_HWCR); 263 return 0; 264 } else if (1 & (hwcr >> 24)) { 265 max_freq_mode = MAX_FREQ_TSC_REF; 266 return 0; 267 } else { /* Use sysfs max frequency if available */ } 268 } else if (cpupower_cpu_info.vendor == X86_VENDOR_INTEL) { 269 /* 270 * On Intel we assume mperf (in C0) is ticking at same 271 * rate than TSC 272 */ 273 max_freq_mode = MAX_FREQ_TSC_REF; 274 return 0; 275 } 276 use_sysfs: 277 if (cpufreq_get_hardware_limits(0, &min, &max_frequency)) { 278 dprint("Cannot retrieve max freq from cpufreq kernel " 279 "subsystem\n"); 280 return -1; 281 } 282 max_freq_mode = MAX_FREQ_SYSFS; 283 max_frequency /= 1000; /* Default automatically to MHz value */ 284 return 0; 285 } 286 287 /* 288 * This monitor provides: 289 * 290 * 1) Average frequency a CPU resided in 291 * This always works if the CPU has aperf/mperf capabilities 292 * 293 * 2) C0 and Cx (any sleep state) time a CPU resided in 294 * Works if mperf timer stops ticking in sleep states which 295 * seem to be the case on all current HW. 296 * Both is directly retrieved from HW registers and is independent 297 * from kernel statistics. 298 */ 299 struct cpuidle_monitor mperf_monitor; 300 struct cpuidle_monitor *mperf_register(void) 301 { 302 if (!(cpupower_cpu_info.caps & CPUPOWER_CAP_APERF)) 303 return NULL; 304 305 if (init_maxfreq_mode()) 306 return NULL; 307 308 /* Free this at program termination */ 309 is_valid = calloc(cpu_count, sizeof(int)); 310 mperf_previous_count = calloc(cpu_count, sizeof(unsigned long long)); 311 aperf_previous_count = calloc(cpu_count, sizeof(unsigned long long)); 312 mperf_current_count = calloc(cpu_count, sizeof(unsigned long long)); 313 aperf_current_count = calloc(cpu_count, sizeof(unsigned long long)); 314 315 mperf_monitor.name_len = strlen(mperf_monitor.name); 316 return &mperf_monitor; 317 } 318 319 void mperf_unregister(void) 320 { 321 free(mperf_previous_count); 322 free(aperf_previous_count); 323 free(mperf_current_count); 324 free(aperf_current_count); 325 free(is_valid); 326 } 327 328 struct cpuidle_monitor mperf_monitor = { 329 .name = "Mperf", 330 .hw_states_num = MPERF_CSTATE_COUNT, 331 .hw_states = mperf_cstates, 332 .start = mperf_start, 333 .stop = mperf_stop, 334 .do_register = mperf_register, 335 .unregister = mperf_unregister, 336 .needs_root = 1, 337 .overflow_s = 922000000 /* 922337203 seconds TSC overflow 338 at 20GHz */ 339 }; 340 #endif /* #if defined(__i386__) || defined(__x86_64__) */ 341