// SPDX-License-Identifier: GPL-2.0-only
/*
 *  (C) 2010,2011       Thomas Renninger <trenn@suse.de>, Novell Inc.
 */

#if defined(__i386__) || defined(__x86_64__)

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>

#include <cpufreq.h>

#include "helpers/helpers.h"
#include "idle_monitor/cpupower-monitor.h"

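/*
 * MSR addresses: the architectural IA32_APERF/IA32_MPERF pair, the TSC,
 * and AMD's Hardware Configuration Register (HWCR).
 */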
#define MSR_APERF	0xE8
#define MSR_MPERF	0xE7

#define MSR_TSC	0x10

#define MSR_AMD_HWCR 0xc0010015

enum mperf_id { C0 = 0, Cx, AVG_FREQ, MPERF_CSTATE_COUNT };

static int mperf_get_count_percent(unsigned int id, double *percent,
				   unsigned int cpu);
static int mperf_get_count_freq(unsigned int id, unsigned long long *count,
				unsigned int cpu);
static struct timespec time_start, time_end;

static cstate_t mperf_cstates[MPERF_CSTATE_COUNT] = {
	{
		.name			= "C0",
		.desc			= N_("Processor Core not idle"),
		.id			= C0,
		.range			= RANGE_THREAD,
		.get_count_percent	= mperf_get_count_percent,
	},
	{
		.name			= "Cx",
		.desc			= N_("Processor Core in an idle state"),
		.id			= Cx,
		.range			= RANGE_THREAD,
		.get_count_percent	= mperf_get_count_percent,
	},
	{
		.name			= "Freq",
		.desc			= N_("Average Frequency (including boost) in MHz"),
		.id			= AVG_FREQ,
		.range			= RANGE_THREAD,
		.get_count		= mperf_get_count_freq,
	},
};

enum MAX_FREQ_MODE { MAX_FREQ_SYSFS, MAX_FREQ_TSC_REF };
static int max_freq_mode;
/*
 * The max frequency mperf is ticking at (in C0), retrieved either:
 *   1) calculated after the measurement, if we know the TSC ticks at the
 *      mperf/P0 frequency, or
 *   2) from cpufreq's /sys/devices/.../cpu0/cpufreq/cpuinfo_max_freq at
 *      init time.
 * 1) is preferred as it also works without the cpufreq subsystem (e.g. on Xen).
 */
static unsigned long max_frequency;

static unsigned long long tsc_at_measure_start;
static unsigned long long tsc_at_measure_end;
static unsigned long long *mperf_previous_count;
static unsigned long long *aperf_previous_count;
static unsigned long long *mperf_current_count;
static unsigned long long *aperf_current_count;

/* Valid flag for each CPU. If an MSR read failed it will be zero */
static int *is_valid;

static int mperf_get_tsc(unsigned long long *tsc)
{
	int ret;

	ret = read_msr(base_cpu, MSR_TSC, tsc);
	if (ret)
		dprint("Reading TSC MSR failed, returning %llu\n", *tsc);
	return ret;
}

static int get_aperf_mperf(int cpu, unsigned long long *aval,
			   unsigned long long *mval)
{
	int ret;

	/*
	 * Running on the cpu from which we read the registers will
	 * prevent APERF/MPERF from going out of sync because of IPI
	 * latency introduced by read_msr()s.
	 */
	if (mperf_monitor.flags.per_cpu_schedule) {
		if (bind_cpu(cpu))
			return 1;
	}

	ret  = read_msr(cpu, MSR_APERF, aval);
	ret |= read_msr(cpu, MSR_MPERF, mval);

	return ret;
}

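/*
 * Snapshot the per-CPU APERF/MPERF counters at measurement start. A failed
 * MSR read marks the CPU as invalid for this measurement round.
 */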
static int mperf_init_stats(unsigned int cpu)
{
	unsigned long long aval, mval;
	int ret;

	ret = get_aperf_mperf(cpu, &aval, &mval);
	aperf_previous_count[cpu] = aval;
	mperf_previous_count[cpu] = mval;
	is_valid[cpu] = !ret;

	return 0;
}

static int mperf_measure_stats(unsigned int cpu)
{
	unsigned long long aval, mval;
	int ret;

	ret = get_aperf_mperf(cpu, &aval, &mval);
	aperf_current_count[cpu] = aval;
	mperf_current_count[cpu] = mval;
	is_valid[cpu] = !ret;

	return 0;
}

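/*
 * C0 (busy) residency is the mperf delta relative to the reference tick
 * count over the measurement interval:
 *   C0% = 100 * mperf_diff / tsc_diff                 (MAX_FREQ_TSC_REF)
 *   C0% = 100 * mperf_diff / (max_freq_MHz * t_us)    (MAX_FREQ_SYSFS)
 * Cx is the remainder: 100 - C0%.
 */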
static int mperf_get_count_percent(unsigned int id, double *percent,
				   unsigned int cpu)
{
	unsigned long long aperf_diff, mperf_diff, tsc_diff;
	unsigned long long timediff;

	if (!is_valid[cpu])
		return -1;

	if (id != C0 && id != Cx)
		return -1;

	mperf_diff = mperf_current_count[cpu] - mperf_previous_count[cpu];
	aperf_diff = aperf_current_count[cpu] - aperf_previous_count[cpu];

	if (max_freq_mode == MAX_FREQ_TSC_REF) {
		tsc_diff = tsc_at_measure_end - tsc_at_measure_start;
		*percent = 100.0 * mperf_diff / tsc_diff;
		dprint("%s: TSC Ref - mperf_diff: %llu, tsc_diff: %llu\n",
		       mperf_cstates[id].name, mperf_diff, tsc_diff);
	} else if (max_freq_mode == MAX_FREQ_SYSFS) {
		timediff = max_frequency * timespec_diff_us(time_start, time_end);
		*percent = 100.0 * mperf_diff / timediff;
		dprint("%s: MAXFREQ - mperf_diff: %llu, time_diff: %llu\n",
		       mperf_cstates[id].name, mperf_diff, timediff);
	} else
		return -1;

	if (id == Cx)
		*percent = 100.0 - *percent;

	dprint("%s: mperf_diff: %llu - aperf_diff: %llu - (CPU %u)\n",
		mperf_cstates[id].name, mperf_diff, aperf_diff, cpu);
	dprint("%s: %f\n", mperf_cstates[id].name, *percent);
	return 0;
}

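/*
 * APERF counts at the actual frequency while MPERF counts at the P0
 * (maximum) frequency, so the average frequency over the interval is
 * max_frequency * aperf_diff / mperf_diff.
 */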
static int mperf_get_count_freq(unsigned int id, unsigned long long *count,
				unsigned int cpu)
{
	unsigned long long aperf_diff, mperf_diff, time_diff, tsc_diff;

	if (id != AVG_FREQ)
		return 1;

	if (!is_valid[cpu])
		return -1;

	mperf_diff = mperf_current_count[cpu] - mperf_previous_count[cpu];
	aperf_diff = aperf_current_count[cpu] - aperf_previous_count[cpu];

	if (max_freq_mode == MAX_FREQ_TSC_REF) {
		/* Calculate max_freq from TSC count */
		tsc_diff = tsc_at_measure_end - tsc_at_measure_start;
		time_diff = timespec_diff_us(time_start, time_end);
		max_frequency = tsc_diff / time_diff;
	}

	*count = max_frequency * ((double)aperf_diff / mperf_diff);
	dprint("%s: Average freq based on %s maximum frequency:\n",
	       mperf_cstates[id].name,
	       (max_freq_mode == MAX_FREQ_TSC_REF) ? "TSC calculated" : "sysfs read");
	dprint("max_frequency: %lu\n", max_frequency);
	dprint("aperf_diff: %llu\n", aperf_diff);
	dprint("mperf_diff: %llu\n", mperf_diff);
	dprint("avg freq:   %llu\n", *count);
	return 0;
}

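/*
 * Take the start snapshots: wall-clock time, the TSC and each CPU's
 * APERF/MPERF pair. The second TSC read only serves to dprint how long
 * the per-CPU init loop itself took.
 */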
static int mperf_start(void)
{
	int cpu;
	unsigned long long dbg;

	clock_gettime(CLOCK_REALTIME, &time_start);
	mperf_get_tsc(&tsc_at_measure_start);

	for (cpu = 0; cpu < cpu_count; cpu++)
		mperf_init_stats(cpu);

	mperf_get_tsc(&dbg);
	dprint("TSC diff: %llu\n", dbg - tsc_at_measure_start);
	return 0;
}

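/*
 * Take the end snapshots, mirroring mperf_start() in reverse order:
 * per-CPU counters first, then the TSC, then wall-clock time.
 */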
static int mperf_stop(void)
{
	unsigned long long dbg;
	int cpu;

	for (cpu = 0; cpu < cpu_count; cpu++)
		mperf_measure_stats(cpu);

	mperf_get_tsc(&tsc_at_measure_end);
	clock_gettime(CLOCK_REALTIME, &time_end);

	mperf_get_tsc(&dbg);
	dprint("TSC diff: %llu\n", dbg - tsc_at_measure_end);

	return 0;
}

/*
 * The mperf register is defined to tick at the P0 (maximum) frequency.
 *
 * Instead of reading out P0, which can be tricky to retrieve from HW,
 * we use the TSC counter if it reliably ticks at the P0/mperf frequency.
 *
 * Otherwise fall back to:
 * /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq
 * on older Intel HW without the invariant TSC feature, or on AMD machines
 * where the TSC does not tick at P0 (none exist yet, but it is still
 * double-checked via MSR_AMD_HWCR).
 *
 * On such machines the user still gets useful mperf stats as long as the
 * acpi-cpufreq driver is loaded.
 */
static int init_maxfreq_mode(void)
{
	int ret;
	unsigned long long hwcr;
	unsigned long min;

	if (!(cpupower_cpu_info.caps & CPUPOWER_CAP_INV_TSC))
		goto use_sysfs;

	if (cpupower_cpu_info.vendor == X86_VENDOR_AMD ||
	    cpupower_cpu_info.vendor == X86_VENDOR_HYGON) {
		/*
		 * MSR_AMD_HWCR tells us whether the TSC runs at the P0/mperf
		 * frequency.
		 * A test whether hwcr is accessible/available would be:
		 * (cpupower_cpu_info.family > 0x10 ||
		 *  (cpupower_cpu_info.family == 0x10 &&
		 *   cpupower_cpu_info.model >= 0x2))
		 * This should be the case on all aperf/mperf capable AMD
		 * machines and is therefore safe to test here.
		 * Compare with Linux kernel git commit acf01734b1747b1ec4.
		 */
		ret = read_msr(0, MSR_AMD_HWCR, &hwcr);
		/*
		 * If the MSR read failed, assume a Xen system that did
		 * not explicitly provide access to it and that the TSC works.
		 */
		if (ret != 0) {
			dprint("HWCR read 0x%x failed - assume TSC working\n",
			       MSR_AMD_HWCR);
			return 0;
		} else if (1 & (hwcr >> 24)) {
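			/* TscFreqSel (HWCR bit 24) is set: the TSC ticks at P0 */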
			max_freq_mode = MAX_FREQ_TSC_REF;
			return 0;
		} else { /* Use sysfs max frequency if available */ }
	} else if (cpupower_cpu_info.vendor == X86_VENDOR_INTEL) {
		/*
		 * On Intel we assume mperf (in C0) is ticking at the same
		 * rate as the TSC.
		 */
		max_freq_mode = MAX_FREQ_TSC_REF;
		return 0;
	}
use_sysfs:
	if (cpufreq_get_hardware_limits(0, &min, &max_frequency)) {
		dprint("Cannot retrieve max freq from cpufreq kernel "
		       "subsystem\n");
		return -1;
	}
	max_freq_mode = MAX_FREQ_SYSFS;
	max_frequency /= 1000; /* sysfs reports kHz, convert to MHz */
	return 0;
}

/*
 * This monitor provides:
 *
 * 1) The average frequency a CPU resided in
 *    This always works if the CPU has aperf/mperf capabilities.
 *
 * 2) The C0 and Cx (any sleep state) time a CPU resided in
 *    Works if the mperf timer stops ticking in sleep states, which
 *    seems to be the case on all current HW.
 *
 * Both are retrieved directly from HW registers and are independent
 * of kernel statistics.
 */
struct cpuidle_monitor mperf_monitor;
struct cpuidle_monitor *mperf_register(void)
{
	if (!(cpupower_cpu_info.caps & CPUPOWER_CAP_APERF))
		return NULL;

	if (init_maxfreq_mode())
		return NULL;

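	/*
	 * On AMD, pin each MSR read to the measured CPU so APERF and MPERF
	 * stay in sync (see get_aperf_mperf()).
	 */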
	if (cpupower_cpu_info.vendor == X86_VENDOR_AMD)
		mperf_monitor.flags.per_cpu_schedule = 1;

	/* Freed at program termination in mperf_unregister() */
	is_valid = calloc(cpu_count, sizeof(int));
	mperf_previous_count = calloc(cpu_count, sizeof(unsigned long long));
	aperf_previous_count = calloc(cpu_count, sizeof(unsigned long long));
	mperf_current_count = calloc(cpu_count, sizeof(unsigned long long));
	aperf_current_count = calloc(cpu_count, sizeof(unsigned long long));

	mperf_monitor.name_len = strlen(mperf_monitor.name);
	return &mperf_monitor;
}

void mperf_unregister(void)
{
	free(mperf_previous_count);
	free(aperf_previous_count);
	free(mperf_current_count);
	free(aperf_current_count);
	free(is_valid);
}

struct cpuidle_monitor mperf_monitor = {
	.name			= "Mperf",
	.hw_states_num		= MPERF_CSTATE_COUNT,
	.hw_states		= mperf_cstates,
	.start			= mperf_start,
	.stop			= mperf_stop,
	.do_register		= mperf_register,
	.unregister		= mperf_unregister,
	.flags.needs_root	= 1,
	.overflow_s		= 922000000 /* 2^64 TSC ticks overflow after
					       922337203 s at 20GHz */
};
#endif /* #if defined(__i386__) || defined(__x86_64__) */