1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * turbostat -- show CPU frequency and C-state residency
4  * on modern Intel and AMD processors.
5  *
6  * Copyright (c) 2021 Intel Corporation.
7  * Len Brown <len.brown@intel.com>
8  */
9 
10 #define _GNU_SOURCE
11 #include MSRHEADER
12 #include INTEL_FAMILY_HEADER
13 #include <stdarg.h>
14 #include <stdio.h>
15 #include <err.h>
16 #include <unistd.h>
17 #include <sys/types.h>
18 #include <sys/wait.h>
19 #include <sys/stat.h>
20 #include <sys/select.h>
21 #include <sys/resource.h>
22 #include <fcntl.h>
23 #include <signal.h>
24 #include <sys/time.h>
25 #include <stdlib.h>
26 #include <getopt.h>
27 #include <dirent.h>
28 #include <string.h>
29 #include <ctype.h>
30 #include <sched.h>
31 #include <time.h>
32 #include <cpuid.h>
33 #include <sys/capability.h>
34 #include <errno.h>
35 #include <math.h>
36 #include <linux/perf_event.h>
37 #include <asm/unistd.h>
38 #include <stdbool.h>
39 
40 /*
41  * This list matches the column headers, except
42  * 1. built-in counters only; the sysfs counters are not here -- we learn of those at run-time
43  * 2. Core and CPU are moved to the end, so that --show and --hide do not
44  *    match columns whose names merely contain those strings.
45  */
46 
47 /*
48  * buffer size used by sscanf() for added column names;
49  * names are usually truncated to 7 characters, but the 18-character-wide columns used for raw 64-bit counters are also handled
50  */
51 #define	NAME_BYTES 20
52 #define PATH_BYTES 128
53 
54 enum counter_scope { SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE };
55 enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC };
56 enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT };
57 
58 struct msr_counter {
59 	unsigned int msr_num;
60 	char name[NAME_BYTES];
61 	char path[PATH_BYTES];
62 	unsigned int width;
63 	enum counter_type type;
64 	enum counter_format format;
65 	struct msr_counter *next;
66 	unsigned int flags;
67 #define	FLAGS_HIDE	(1 << 0)
68 #define	FLAGS_SHOW	(1 << 1)
69 #define	SYSFS_PERCPU	(1 << 1)
70 };
71 
72 struct msr_counter bic[] = {
73 	{ 0x0, "usec" },
74 	{ 0x0, "Time_Of_Day_Seconds" },
75 	{ 0x0, "Package" },
76 	{ 0x0, "Node" },
77 	{ 0x0, "Avg_MHz" },
78 	{ 0x0, "Busy%" },
79 	{ 0x0, "Bzy_MHz" },
80 	{ 0x0, "TSC_MHz" },
81 	{ 0x0, "IRQ" },
82 	{ 0x0, "SMI", "", 32, 0, FORMAT_DELTA, NULL },
83 	{ 0x0, "sysfs" },
84 	{ 0x0, "CPU%c1" },
85 	{ 0x0, "CPU%c3" },
86 	{ 0x0, "CPU%c6" },
87 	{ 0x0, "CPU%c7" },
88 	{ 0x0, "ThreadC" },
89 	{ 0x0, "CoreTmp" },
90 	{ 0x0, "CoreCnt" },
91 	{ 0x0, "PkgTmp" },
92 	{ 0x0, "GFX%rc6" },
93 	{ 0x0, "GFXMHz" },
94 	{ 0x0, "Pkg%pc2" },
95 	{ 0x0, "Pkg%pc3" },
96 	{ 0x0, "Pkg%pc6" },
97 	{ 0x0, "Pkg%pc7" },
98 	{ 0x0, "Pkg%pc8" },
99 	{ 0x0, "Pkg%pc9" },
100 	{ 0x0, "Pk%pc10" },
101 	{ 0x0, "CPU%LPI" },
102 	{ 0x0, "SYS%LPI" },
103 	{ 0x0, "PkgWatt" },
104 	{ 0x0, "CorWatt" },
105 	{ 0x0, "GFXWatt" },
106 	{ 0x0, "PkgCnt" },
107 	{ 0x0, "RAMWatt" },
108 	{ 0x0, "PKG_%" },
109 	{ 0x0, "RAM_%" },
110 	{ 0x0, "Pkg_J" },
111 	{ 0x0, "Cor_J" },
112 	{ 0x0, "GFX_J" },
113 	{ 0x0, "RAM_J" },
114 	{ 0x0, "Mod%c6" },
115 	{ 0x0, "Totl%C0" },
116 	{ 0x0, "Any%C0" },
117 	{ 0x0, "GFX%C0" },
118 	{ 0x0, "CPUGFX%" },
119 	{ 0x0, "Core" },
120 	{ 0x0, "CPU" },
121 	{ 0x0, "APIC" },
122 	{ 0x0, "X2APIC" },
123 	{ 0x0, "Die" },
124 	{ 0x0, "GFXAMHz" },
125 	{ 0x0, "IPC" },
126 	{ 0x0, "CoreThr" },
127 };
128 
129 #define MAX_BIC (sizeof(bic) / sizeof(struct msr_counter))
130 #define	BIC_USEC	(1ULL << 0)
131 #define	BIC_TOD		(1ULL << 1)
132 #define	BIC_Package	(1ULL << 2)
133 #define	BIC_Node	(1ULL << 3)
134 #define	BIC_Avg_MHz	(1ULL << 4)
135 #define	BIC_Busy	(1ULL << 5)
136 #define	BIC_Bzy_MHz	(1ULL << 6)
137 #define	BIC_TSC_MHz	(1ULL << 7)
138 #define	BIC_IRQ		(1ULL << 8)
139 #define	BIC_SMI		(1ULL << 9)
140 #define	BIC_sysfs	(1ULL << 10)
141 #define	BIC_CPU_c1	(1ULL << 11)
142 #define	BIC_CPU_c3	(1ULL << 12)
143 #define	BIC_CPU_c6	(1ULL << 13)
144 #define	BIC_CPU_c7	(1ULL << 14)
145 #define	BIC_ThreadC	(1ULL << 15)
146 #define	BIC_CoreTmp	(1ULL << 16)
147 #define	BIC_CoreCnt	(1ULL << 17)
148 #define	BIC_PkgTmp	(1ULL << 18)
149 #define	BIC_GFX_rc6	(1ULL << 19)
150 #define	BIC_GFXMHz	(1ULL << 20)
151 #define	BIC_Pkgpc2	(1ULL << 21)
152 #define	BIC_Pkgpc3	(1ULL << 22)
153 #define	BIC_Pkgpc6	(1ULL << 23)
154 #define	BIC_Pkgpc7	(1ULL << 24)
155 #define	BIC_Pkgpc8	(1ULL << 25)
156 #define	BIC_Pkgpc9	(1ULL << 26)
157 #define	BIC_Pkgpc10	(1ULL << 27)
158 #define BIC_CPU_LPI	(1ULL << 28)
159 #define BIC_SYS_LPI	(1ULL << 29)
160 #define	BIC_PkgWatt	(1ULL << 30)
161 #define	BIC_CorWatt	(1ULL << 31)
162 #define	BIC_GFXWatt	(1ULL << 32)
163 #define	BIC_PkgCnt	(1ULL << 33)
164 #define	BIC_RAMWatt	(1ULL << 34)
165 #define	BIC_PKG__	(1ULL << 35)
166 #define	BIC_RAM__	(1ULL << 36)
167 #define	BIC_Pkg_J	(1ULL << 37)
168 #define	BIC_Cor_J	(1ULL << 38)
169 #define	BIC_GFX_J	(1ULL << 39)
170 #define	BIC_RAM_J	(1ULL << 40)
171 #define	BIC_Mod_c6	(1ULL << 41)
172 #define	BIC_Totl_c0	(1ULL << 42)
173 #define	BIC_Any_c0	(1ULL << 43)
174 #define	BIC_GFX_c0	(1ULL << 44)
175 #define	BIC_CPUGFX	(1ULL << 45)
176 #define	BIC_Core	(1ULL << 46)
177 #define	BIC_CPU		(1ULL << 47)
178 #define	BIC_APIC	(1ULL << 48)
179 #define	BIC_X2APIC	(1ULL << 49)
180 #define	BIC_Die		(1ULL << 50)
181 #define	BIC_GFXACTMHz	(1ULL << 51)
182 #define	BIC_IPC		(1ULL << 52)
183 #define	BIC_CORE_THROT_CNT	(1ULL << 53)
184 
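/*
 * Named groups of columns accepted by --show and --hide
 * (see bic_lookup(): "topology", "power", "idle", "frequency", "other").
 */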
185 #define BIC_TOPOLOGY (BIC_Package | BIC_Node | BIC_CoreCnt | BIC_PkgCnt | BIC_Core | BIC_CPU | BIC_Die )
186 #define BIC_THERMAL_PWR ( BIC_CoreTmp | BIC_PkgTmp | BIC_PkgWatt | BIC_CorWatt | BIC_GFXWatt | BIC_RAMWatt | BIC_PKG__ | BIC_RAM__)
187 #define BIC_FREQUENCY ( BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz )
188 #define BIC_IDLE ( BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX)
189 #define BIC_OTHER ( BIC_IRQ | BIC_SMI | BIC_ThreadC | BIC_CoreTmp | BIC_IPC)
190 
191 #define BIC_DISABLED_BY_DEFAULT	(BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC)
192 
193 unsigned long long bic_enabled = (0xFFFFFFFFFFFFFFFFULL & ~BIC_DISABLED_BY_DEFAULT);
194 unsigned long long bic_present = BIC_USEC | BIC_TOD | BIC_sysfs | BIC_APIC | BIC_X2APIC;
195 
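/*
 * A column is printed only when it is both enabled (on by default, or
 * requested via --show/--enable) and present (detected on this platform).
 */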
196 #define DO_BIC(COUNTER_NAME) (bic_enabled & bic_present & COUNTER_NAME)
197 #define DO_BIC_READ(COUNTER_NAME) (bic_present & COUNTER_NAME)
198 #define ENABLE_BIC(COUNTER_NAME) (bic_enabled |= COUNTER_NAME)
199 #define BIC_PRESENT(COUNTER_BIT) (bic_present |= COUNTER_BIT)
200 #define BIC_NOT_PRESENT(COUNTER_BIT) (bic_present &= ~COUNTER_BIT)
201 #define BIC_IS_ENABLED(COUNTER_BIT) (bic_enabled & COUNTER_BIT)
202 
203 char *proc_stat = "/proc/stat";
204 FILE *outf;
205 int *fd_percpu;
206 int *fd_instr_count_percpu;
207 struct timeval interval_tv = { 5, 0 };
208 struct timespec interval_ts = { 5, 0 };
209 
210 /* Save original CPU model */
211 unsigned int model_orig;
212 
213 unsigned int num_iterations;
214 unsigned int header_iterations;
215 unsigned int debug;
216 unsigned int quiet;
217 unsigned int shown;
218 unsigned int sums_need_wide_columns;
219 unsigned int rapl_joules;
220 unsigned int summary_only;
221 unsigned int list_header_only;
222 unsigned int dump_only;
223 unsigned int do_snb_cstates;
224 unsigned int do_knl_cstates;
225 unsigned int do_slm_cstates;
226 unsigned int use_c1_residency_msr;
227 unsigned int has_aperf;
228 unsigned int has_epb;
229 unsigned int do_irtl_snb;
230 unsigned int do_irtl_hsw;
231 unsigned int units = 1000000;	/* MHz etc */
232 unsigned int genuine_intel;
233 unsigned int authentic_amd;
234 unsigned int hygon_genuine;
235 unsigned int max_level, max_extended_level;
236 unsigned int has_invariant_tsc;
237 unsigned int do_nhm_platform_info;
238 unsigned int no_MSR_MISC_PWR_MGMT;
239 unsigned int aperf_mperf_multiplier = 1;
240 double bclk;
241 double base_hz;
242 unsigned int has_base_hz;
243 double tsc_tweak = 1.0;
244 unsigned int show_pkg_only;
245 unsigned int show_core_only;
246 char *output_buffer, *outp;
247 unsigned int do_rapl;
248 unsigned int do_dts;
249 unsigned int do_ptm;
250 unsigned int do_ipc;
251 unsigned long long gfx_cur_rc6_ms;
252 unsigned long long cpuidle_cur_cpu_lpi_us;
253 unsigned long long cpuidle_cur_sys_lpi_us;
254 unsigned int gfx_cur_mhz;
255 unsigned int gfx_act_mhz;
256 unsigned int tj_max;
257 unsigned int tj_max_override;
258 int tcc_offset_bits;
259 double rapl_power_units, rapl_time_units;
260 double rapl_dram_energy_units, rapl_energy_units;
261 double rapl_joule_counter_range;
262 unsigned int do_core_perf_limit_reasons;
263 unsigned int has_automatic_cstate_conversion;
264 unsigned int dis_cstate_prewake;
265 unsigned int do_gfx_perf_limit_reasons;
266 unsigned int do_ring_perf_limit_reasons;
267 unsigned int crystal_hz;
268 unsigned long long tsc_hz;
269 int base_cpu;
270 double discover_bclk(unsigned int family, unsigned int model);
271 unsigned int has_hwp;		/* IA32_PM_ENABLE, IA32_HWP_CAPABILITIES */
272 			/* IA32_HWP_REQUEST, IA32_HWP_STATUS */
273 unsigned int has_hwp_notify;	/* IA32_HWP_INTERRUPT */
274 unsigned int has_hwp_activity_window;	/* IA32_HWP_REQUEST[bits 41:32] */
275 unsigned int has_hwp_epp;	/* IA32_HWP_REQUEST[bits 31:24] */
276 unsigned int has_hwp_pkg;	/* IA32_HWP_REQUEST_PKG */
277 unsigned int has_misc_feature_control;
278 unsigned int first_counter_read = 1;
279 int ignore_stdin;
280 
281 #define RAPL_PKG		(1 << 0)
282 					/* 0x610 MSR_PKG_POWER_LIMIT */
283 					/* 0x611 MSR_PKG_ENERGY_STATUS */
284 #define RAPL_PKG_PERF_STATUS	(1 << 1)
285 					/* 0x613 MSR_PKG_PERF_STATUS */
286 #define RAPL_PKG_POWER_INFO	(1 << 2)
287 					/* 0x614 MSR_PKG_POWER_INFO */
288 
289 #define RAPL_DRAM		(1 << 3)
290 					/* 0x618 MSR_DRAM_POWER_LIMIT */
291 					/* 0x619 MSR_DRAM_ENERGY_STATUS */
292 #define RAPL_DRAM_PERF_STATUS	(1 << 4)
293 					/* 0x61b MSR_DRAM_PERF_STATUS */
294 #define RAPL_DRAM_POWER_INFO	(1 << 5)
295 					/* 0x61c MSR_DRAM_POWER_INFO */
296 
297 #define RAPL_CORES_POWER_LIMIT	(1 << 6)
298 					/* 0x638 MSR_PP0_POWER_LIMIT */
299 #define RAPL_CORE_POLICY	(1 << 7)
300 					/* 0x63a MSR_PP0_POLICY */
301 
302 #define RAPL_GFX		(1 << 8)
303 					/* 0x640 MSR_PP1_POWER_LIMIT */
304 					/* 0x641 MSR_PP1_ENERGY_STATUS */
305 					/* 0x642 MSR_PP1_POLICY */
306 
307 #define RAPL_CORES_ENERGY_STATUS	(1 << 9)
308 					/* 0x639 MSR_PP0_ENERGY_STATUS */
309 #define RAPL_PER_CORE_ENERGY	(1 << 10)
310 					/* Indicates cores energy collection is per-core,
311 					 * not per-package. */
312 #define RAPL_AMD_F17H		(1 << 11)
313 					/* 0xc0010299 MSR_RAPL_PWR_UNIT */
314 					/* 0xc001029a MSR_CORE_ENERGY_STAT */
315 					/* 0xc001029b MSR_PKG_ENERGY_STAT */
316 #define RAPL_CORES (RAPL_CORES_ENERGY_STATUS | RAPL_CORES_POWER_LIMIT)
317 #define	TJMAX_DEFAULT	100
318 
319 /* MSRs that are not yet in the kernel-provided header. */
320 #define MSR_RAPL_PWR_UNIT	0xc0010299
321 #define MSR_CORE_ENERGY_STAT	0xc001029a
322 #define MSR_PKG_ENERGY_STAT	0xc001029b
323 
324 #define MAX(a, b) ((a) > (b) ? (a) : (b))
325 
326 int backwards_count;
327 char *progname;
328 
329 #define CPU_SUBSET_MAXCPUS	1024	/* need to use before probe... */
330 cpu_set_t *cpu_present_set, *cpu_affinity_set, *cpu_subset;
331 size_t cpu_present_setsize, cpu_affinity_setsize, cpu_subset_size;
332 #define MAX_ADDED_COUNTERS 8
333 #define MAX_ADDED_THREAD_COUNTERS 24
334 #define BITMASK_SIZE 32
335 
336 struct thread_data {
337 	struct timeval tv_begin;
338 	struct timeval tv_end;
339 	struct timeval tv_delta;
340 	unsigned long long tsc;
341 	unsigned long long aperf;
342 	unsigned long long mperf;
343 	unsigned long long c1;
344 	unsigned long long instr_count;
345 	unsigned long long irq_count;
346 	unsigned int smi_count;
347 	unsigned int cpu_id;
348 	unsigned int apic_id;
349 	unsigned int x2apic_id;
350 	unsigned int flags;
351 	bool is_atom;
352 #define CPU_IS_FIRST_THREAD_IN_CORE	0x2
353 #define CPU_IS_FIRST_CORE_IN_PACKAGE	0x4
354 	unsigned long long counter[MAX_ADDED_THREAD_COUNTERS];
355 } *thread_even, *thread_odd;
356 
357 struct core_data {
358 	unsigned long long c3;
359 	unsigned long long c6;
360 	unsigned long long c7;
361 	unsigned long long mc6_us;	/* duplicate as per-core for now, even though per module */
362 	unsigned int core_temp_c;
363 	unsigned int core_energy;	/* MSR_CORE_ENERGY_STAT */
364 	unsigned int core_id;
365 	unsigned long long core_throt_cnt;
366 	unsigned long long counter[MAX_ADDED_COUNTERS];
367 } *core_even, *core_odd;
368 
369 struct pkg_data {
370 	unsigned long long pc2;
371 	unsigned long long pc3;
372 	unsigned long long pc6;
373 	unsigned long long pc7;
374 	unsigned long long pc8;
375 	unsigned long long pc9;
376 	unsigned long long pc10;
377 	unsigned long long cpu_lpi;
378 	unsigned long long sys_lpi;
379 	unsigned long long pkg_wtd_core_c0;
380 	unsigned long long pkg_any_core_c0;
381 	unsigned long long pkg_any_gfxe_c0;
382 	unsigned long long pkg_both_core_gfxe_c0;
383 	long long gfx_rc6_ms;
384 	unsigned int gfx_mhz;
385 	unsigned int gfx_act_mhz;
386 	unsigned int package_id;
387 	unsigned long long energy_pkg;	/* MSR_PKG_ENERGY_STATUS */
388 	unsigned long long energy_dram;	/* MSR_DRAM_ENERGY_STATUS */
389 	unsigned long long energy_cores;	/* MSR_PP0_ENERGY_STATUS */
390 	unsigned long long energy_gfx;	/* MSR_PP1_ENERGY_STATUS */
391 	unsigned long long rapl_pkg_perf_status;	/* MSR_PKG_PERF_STATUS */
392 	unsigned long long rapl_dram_perf_status;	/* MSR_DRAM_PERF_STATUS */
393 	unsigned int pkg_temp_c;
394 	unsigned long long counter[MAX_ADDED_COUNTERS];
395 } *package_even, *package_odd;
396 
397 #define ODD_COUNTERS thread_odd, core_odd, package_odd
398 #define EVEN_COUNTERS thread_even, core_even, package_even
399 
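/*
 * Index into the flat counter arrays by topology position:
 * threads are laid out within cores, cores within nodes, nodes within packages.
 */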
400 #define GET_THREAD(thread_base, thread_no, core_no, node_no, pkg_no)	      \
401 	((thread_base) +						      \
402 	 ((pkg_no) *							      \
403 	  topo.nodes_per_pkg * topo.cores_per_node * topo.threads_per_core) + \
404 	 ((node_no) * topo.cores_per_node * topo.threads_per_core) +	      \
405 	 ((core_no) * topo.threads_per_core) +				      \
406 	 (thread_no))
407 
408 #define GET_CORE(core_base, core_no, node_no, pkg_no)			\
409 	((core_base) +							\
410 	 ((pkg_no) *  topo.nodes_per_pkg * topo.cores_per_node) +	\
411 	 ((node_no) * topo.cores_per_node) +				\
412 	 (core_no))
413 
414 #define GET_PKG(pkg_base, pkg_no) (pkg_base + pkg_no)
415 
416 /*
417  * Each MSR accumulated here is assumed to be monotonically increasing;
418  * it is sampled and summed periodically so that no counts are lost
419  * when the register's limited bit width causes it to wrap.
420  */
421 enum {
422 	IDX_PKG_ENERGY,
423 	IDX_DRAM_ENERGY,
424 	IDX_PP0_ENERGY,
425 	IDX_PP1_ENERGY,
426 	IDX_PKG_PERF,
427 	IDX_DRAM_PERF,
428 	IDX_COUNT,
429 };
430 
431 int get_msr_sum(int cpu, off_t offset, unsigned long long *msr);
432 
433 struct msr_sum_array {
434 	/* get_msr_sum() = sum + (get_msr() - last) */
435 	struct {
436 		/* accumulated MSR value, updated by the periodic timer */
437 		unsigned long long sum;
438 		/* raw MSR value recorded at the last timer tick */
439 		unsigned long long last;
440 	} entries[IDX_COUNT];
441 };
442 
443 /* The per-CPU MSR sum array. */
444 struct msr_sum_array *per_cpu_msr_sum;
445 
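/* map an accumulation-array index to the MSR address to read */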
446 off_t idx_to_offset(int idx)
447 {
448 	off_t offset;
449 
450 	switch (idx) {
451 	case IDX_PKG_ENERGY:
452 		if (do_rapl & RAPL_AMD_F17H)
453 			offset = MSR_PKG_ENERGY_STAT;
454 		else
455 			offset = MSR_PKG_ENERGY_STATUS;
456 		break;
457 	case IDX_DRAM_ENERGY:
458 		offset = MSR_DRAM_ENERGY_STATUS;
459 		break;
460 	case IDX_PP0_ENERGY:
461 		offset = MSR_PP0_ENERGY_STATUS;
462 		break;
463 	case IDX_PP1_ENERGY:
464 		offset = MSR_PP1_ENERGY_STATUS;
465 		break;
466 	case IDX_PKG_PERF:
467 		offset = MSR_PKG_PERF_STATUS;
468 		break;
469 	case IDX_DRAM_PERF:
470 		offset = MSR_DRAM_PERF_STATUS;
471 		break;
472 	default:
473 		offset = -1;
474 	}
475 	return offset;
476 }
477 
478 int offset_to_idx(off_t offset)
479 {
480 	int idx;
481 
482 	switch (offset) {
483 	case MSR_PKG_ENERGY_STATUS:
484 	case MSR_PKG_ENERGY_STAT:
485 		idx = IDX_PKG_ENERGY;
486 		break;
487 	case MSR_DRAM_ENERGY_STATUS:
488 		idx = IDX_DRAM_ENERGY;
489 		break;
490 	case MSR_PP0_ENERGY_STATUS:
491 		idx = IDX_PP0_ENERGY;
492 		break;
493 	case MSR_PP1_ENERGY_STATUS:
494 		idx = IDX_PP1_ENERGY;
495 		break;
496 	case MSR_PKG_PERF_STATUS:
497 		idx = IDX_PKG_PERF;
498 		break;
499 	case MSR_DRAM_PERF_STATUS:
500 		idx = IDX_DRAM_PERF;
501 		break;
502 	default:
503 		idx = -1;
504 	}
505 	return idx;
506 }
507 
508 int idx_valid(int idx)
509 {
510 	switch (idx) {
511 	case IDX_PKG_ENERGY:
512 		return do_rapl & (RAPL_PKG | RAPL_AMD_F17H);
513 	case IDX_DRAM_ENERGY:
514 		return do_rapl & RAPL_DRAM;
515 	case IDX_PP0_ENERGY:
516 		return do_rapl & RAPL_CORES_ENERGY_STATUS;
517 	case IDX_PP1_ENERGY:
518 		return do_rapl & RAPL_GFX;
519 	case IDX_PKG_PERF:
520 		return do_rapl & RAPL_PKG_PERF_STATUS;
521 	case IDX_DRAM_PERF:
522 		return do_rapl & RAPL_DRAM_PERF_STATUS;
523 	default:
524 		return 0;
525 	}
526 }
527 
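/* counters added at run time (--add, sysfs), one list per scope: thread, core, package */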
528 struct sys_counters {
529 	unsigned int added_thread_counters;
530 	unsigned int added_core_counters;
531 	unsigned int added_package_counters;
532 	struct msr_counter *tp;
533 	struct msr_counter *cp;
534 	struct msr_counter *pp;
535 } sys;
536 
537 struct system_summary {
538 	struct thread_data threads;
539 	struct core_data cores;
540 	struct pkg_data packages;
541 } average;
542 
543 struct cpu_topology {
544 	int physical_package_id;
545 	int die_id;
546 	int logical_cpu_id;
547 	int physical_node_id;
548 	int logical_node_id;	/* 0-based count within the package */
549 	int physical_core_id;
550 	int thread_id;
551 	cpu_set_t *put_ids;	/* Processing Unit/Thread IDs */
552 } *cpus;
553 
554 struct topo_params {
555 	int num_packages;
556 	int num_die;
557 	int num_cpus;
558 	int num_cores;
559 	int max_cpu_num;
560 	int max_node_num;
561 	int nodes_per_pkg;
562 	int cores_per_node;
563 	int threads_per_core;
564 } topo;
565 
566 struct timeval tv_even, tv_odd, tv_delta;
567 
568 int *irq_column_2_cpu;		/* /proc/interrupts column numbers */
569 int *irqs_per_cpu;		/* indexed by cpu_num */
570 
571 void setup_all_buffers(void);
572 
573 char *sys_lpi_file;
574 char *sys_lpi_file_sysfs = "/sys/devices/system/cpu/cpuidle/low_power_idle_system_residency_us";
575 char *sys_lpi_file_debugfs = "/sys/kernel/debug/pmc_core/slp_s0_residency_usec";
576 
577 int cpu_is_not_present(int cpu)
578 {
579 	return !CPU_ISSET_S(cpu, cpu_present_setsize, cpu_present_set);
580 }
581 
582 /*
583  * run func(thread, core, package) in topology order
584  * skip non-present cpus
585  */
586 
587 int for_all_cpus(int (func) (struct thread_data *, struct core_data *, struct pkg_data *),
588 		 struct thread_data *thread_base, struct core_data *core_base, struct pkg_data *pkg_base)
589 {
590 	int retval, pkg_no, core_no, thread_no, node_no;
591 
592 	for (pkg_no = 0; pkg_no < topo.num_packages; ++pkg_no) {
593 		for (node_no = 0; node_no < topo.nodes_per_pkg; node_no++) {
594 			for (core_no = 0; core_no < topo.cores_per_node; ++core_no) {
595 				for (thread_no = 0; thread_no < topo.threads_per_core; ++thread_no) {
596 					struct thread_data *t;
597 					struct core_data *c;
598 					struct pkg_data *p;
599 
600 					t = GET_THREAD(thread_base, thread_no, core_no, node_no, pkg_no);
601 
602 					if (cpu_is_not_present(t->cpu_id))
603 						continue;
604 
605 					c = GET_CORE(core_base, core_no, node_no, pkg_no);
606 					p = GET_PKG(pkg_base, pkg_no);
607 
608 					retval = func(t, c, p);
609 					if (retval)
610 						return retval;
611 				}
612 			}
613 		}
614 	}
615 	return 0;
616 }
617 
618 int cpu_migrate(int cpu)
619 {
620 	CPU_ZERO_S(cpu_affinity_setsize, cpu_affinity_set);
621 	CPU_SET_S(cpu, cpu_affinity_setsize, cpu_affinity_set);
622 	if (sched_setaffinity(0, cpu_affinity_setsize, cpu_affinity_set) == -1)
623 		return -1;
624 	else
625 		return 0;
626 }
627 
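/* return a cached (or newly opened) file descriptor for this CPU's /dev/cpu/N/msr */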
628 int get_msr_fd(int cpu)
629 {
630 	char pathname[32];
631 	int fd;
632 
633 	fd = fd_percpu[cpu];
634 
635 	if (fd)
636 		return fd;
637 
638 	sprintf(pathname, "/dev/cpu/%d/msr", cpu);
639 	fd = open(pathname, O_RDONLY);
640 	if (fd < 0)
641 		err(-1, "%s open failed, try chown or chmod +r /dev/cpu/*/msr, or run as root", pathname);
642 
643 	fd_percpu[cpu] = fd;
644 
645 	return fd;
646 }
647 
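/* glibc provides no wrapper for perf_event_open(2), so invoke the syscall directly */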
648 static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags)
649 {
650 	return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
651 }
652 
653 static int perf_instr_count_open(int cpu_num)
654 {
655 	struct perf_event_attr pea;
656 	int fd;
657 
658 	memset(&pea, 0, sizeof(struct perf_event_attr));
659 	pea.type = PERF_TYPE_HARDWARE;
660 	pea.size = sizeof(struct perf_event_attr);
661 	pea.config = PERF_COUNT_HW_INSTRUCTIONS;
662 
663 	/* counter for cpu_num, including user + kernel and all processes */
664 	fd = perf_event_open(&pea, -1, cpu_num, -1, 0);
665 	if (fd == -1) {
666 		warn("cpu%d: perf instruction counter", cpu_num);
667 		BIC_NOT_PRESENT(BIC_IPC);
668 	}
669 
670 	return fd;
671 }
672 
673 int get_instr_count_fd(int cpu)
674 {
675 	if (fd_instr_count_percpu[cpu])
676 		return fd_instr_count_percpu[cpu];
677 
678 	fd_instr_count_percpu[cpu] = perf_instr_count_open(cpu);
679 
680 	return fd_instr_count_percpu[cpu];
681 }
682 
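/* read one 64-bit MSR at 'offset' on 'cpu'; exits on read failure */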
683 int get_msr(int cpu, off_t offset, unsigned long long *msr)
684 {
685 	ssize_t retval;
686 
687 	retval = pread(get_msr_fd(cpu), msr, sizeof(*msr), offset);
688 
689 	if (retval != sizeof *msr)
690 		err(-1, "cpu%d: msr offset 0x%llx read failed", cpu, (unsigned long long)offset);
691 
692 	return 0;
693 }
694 
695 #define MAX_DEFERRED 16
696 char *deferred_add_names[MAX_DEFERRED];
697 char *deferred_skip_names[MAX_DEFERRED];
698 int deferred_add_index;
699 int deferred_skip_index;
700 
701 /*
702  * HIDE_LIST - hide this list of counters, show the rest [default]
703  * SHOW_LIST - show this list of counters, hide the rest
704  */
705 enum show_hide_mode { SHOW_LIST, HIDE_LIST } global_show_hide_mode = HIDE_LIST;
706 
707 void help(void)
708 {
709 	fprintf(outf,
710 		"Usage: turbostat [OPTIONS][(--interval seconds) | COMMAND ...]\n"
711 		"\n"
712 		"Turbostat forks the specified COMMAND and prints statistics\n"
713 		"when COMMAND completes.\n"
714 		"If no COMMAND is specified, turbostat wakes every 5 seconds\n"
715 		"to print statistics, until interrupted.\n"
716 		"  -a, --add	add a counter\n"
717 		"		  e.g. --add msr0x10,u64,cpu,delta,MY_TSC\n"
718 		"  -c, --cpu	cpu-set	limit output to summary plus cpu-set:\n"
719 		"		  {core | package | j,k,l..m,n-p }\n"
720 		"  -d, --debug	displays usec, Time_Of_Day_Seconds and more debugging\n"
721 		"  -D, --Dump	displays the raw counter values\n"
722 		"  -e, --enable	[all | column]\n"
723 		"		show all disabled columns, or only the specified column(s)\n"
724 		"  -H, --hide [column|column,column,...]\n"
725 		"		hide the specified column(s)\n"
726 		"  -i, --interval sec.subsec\n"
727 		"		Override default 5-second measurement interval\n"
728 		"  -J, --Joules	displays energy in Joules instead of Watts\n"
729 		"  -l, --list	list column headers only\n"
730 		"  -n, --num_iterations num\n"
731 		"		number of measurement iterations\n"
732 		"  -N, --header_iterations num\n"
733 		"		print header every num iterations\n"
734 		"  -o, --out file\n"
735 		"		create or truncate \"file\" for all output\n"
736 		"  -q, --quiet	skip decoding system configuration header\n"
737 		"  -s, --show [column|column,column,...]\n"
738 		"		show only the specified column(s)\n"
739 		"  -S, --Summary\n"
740 		"		limits output to 1-line system summary per interval\n"
741 		"  -T, --TCC temperature\n"
742 		"		sets the Thermal Control Circuit temperature in\n"
743 		"		  degrees Celsius\n"
744 		"  -h, --help	print this help message\n"
745 		"  -v, --version	print version information\n" "\n" "For more help, run \"man turbostat\"\n");
746 }
747 
748 /*
749  * bic_lookup
750  * for each string in the comma-separated name_list,
751  * set the appropriate bit in the return value.
752  */
753 unsigned long long bic_lookup(char *name_list, enum show_hide_mode mode)
754 {
755 	int i;
756 	unsigned long long retval = 0;
757 
758 	while (name_list) {
759 		char *comma;
760 
761 		comma = strchr(name_list, ',');
762 
763 		if (comma)
764 			*comma = '\0';
765 
766 		for (i = 0; i < MAX_BIC; ++i) {
767 			if (!strcmp(name_list, bic[i].name)) {
768 				retval |= (1ULL << i);
769 				break;
770 			}
771 			if (!strcmp(name_list, "all")) {
772 				retval |= ~0;
773 				break;
774 			} else if (!strcmp(name_list, "topology")) {
775 				retval |= BIC_TOPOLOGY;
776 				break;
777 			} else if (!strcmp(name_list, "power")) {
778 				retval |= BIC_THERMAL_PWR;
779 				break;
780 			} else if (!strcmp(name_list, "idle")) {
781 				retval |= BIC_IDLE;
782 				break;
783 			} else if (!strcmp(name_list, "frequency")) {
784 				retval |= BIC_FREQUENCY;
785 				break;
786 			} else if (!strcmp(name_list, "other")) {
787 				retval |= BIC_OTHER;
788 				break;
789 			}
790 
791 		}
792 		if (i == MAX_BIC) {
793 			if (mode == SHOW_LIST) {
794 				deferred_add_names[deferred_add_index++] = name_list;
795 				if (deferred_add_index >= MAX_DEFERRED) {
796 					fprintf(stderr, "More than max %d un-recognized --add options '%s'\n",
797 						MAX_DEFERRED, name_list);
798 					help();
799 					exit(1);
800 				}
801 			} else {
802 				deferred_skip_names[deferred_skip_index++] = name_list;
803 				if (debug)
804 					fprintf(stderr, "deferred \"%s\"\n", name_list);
805 				if (deferred_skip_index >= MAX_DEFERRED) {
806 					fprintf(stderr, "More than max %d un-recognized --skip options '%s'\n",
807 						MAX_DEFERRED, name_list);
808 					help();
809 					exit(1);
810 				}
811 			}
812 		}
813 
814 		name_list = comma;
815 		if (name_list)
816 			name_list++;
817 
818 	}
819 	return retval;
820 }
821 
822 void print_header(char *delim)
823 {
824 	struct msr_counter *mp;
825 	int printed = 0;
826 
827 	if (DO_BIC(BIC_USEC))
828 		outp += sprintf(outp, "%susec", (printed++ ? delim : ""));
829 	if (DO_BIC(BIC_TOD))
830 		outp += sprintf(outp, "%sTime_Of_Day_Seconds", (printed++ ? delim : ""));
831 	if (DO_BIC(BIC_Package))
832 		outp += sprintf(outp, "%sPackage", (printed++ ? delim : ""));
833 	if (DO_BIC(BIC_Die))
834 		outp += sprintf(outp, "%sDie", (printed++ ? delim : ""));
835 	if (DO_BIC(BIC_Node))
836 		outp += sprintf(outp, "%sNode", (printed++ ? delim : ""));
837 	if (DO_BIC(BIC_Core))
838 		outp += sprintf(outp, "%sCore", (printed++ ? delim : ""));
839 	if (DO_BIC(BIC_CPU))
840 		outp += sprintf(outp, "%sCPU", (printed++ ? delim : ""));
841 	if (DO_BIC(BIC_APIC))
842 		outp += sprintf(outp, "%sAPIC", (printed++ ? delim : ""));
843 	if (DO_BIC(BIC_X2APIC))
844 		outp += sprintf(outp, "%sX2APIC", (printed++ ? delim : ""));
845 	if (DO_BIC(BIC_Avg_MHz))
846 		outp += sprintf(outp, "%sAvg_MHz", (printed++ ? delim : ""));
847 	if (DO_BIC(BIC_Busy))
848 		outp += sprintf(outp, "%sBusy%%", (printed++ ? delim : ""));
849 	if (DO_BIC(BIC_Bzy_MHz))
850 		outp += sprintf(outp, "%sBzy_MHz", (printed++ ? delim : ""));
851 	if (DO_BIC(BIC_TSC_MHz))
852 		outp += sprintf(outp, "%sTSC_MHz", (printed++ ? delim : ""));
853 
854 	if (DO_BIC(BIC_IPC))
855 		outp += sprintf(outp, "%sIPC", (printed++ ? delim : ""));
856 
857 	if (DO_BIC(BIC_IRQ)) {
858 		if (sums_need_wide_columns)
859 			outp += sprintf(outp, "%s     IRQ", (printed++ ? delim : ""));
860 		else
861 			outp += sprintf(outp, "%sIRQ", (printed++ ? delim : ""));
862 	}
863 
864 	if (DO_BIC(BIC_SMI))
865 		outp += sprintf(outp, "%sSMI", (printed++ ? delim : ""));
866 
867 	for (mp = sys.tp; mp; mp = mp->next) {
868 
869 		if (mp->format == FORMAT_RAW) {
870 			if (mp->width == 64)
871 				outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), mp->name);
872 			else
873 				outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), mp->name);
874 		} else {
875 			if ((mp->type == COUNTER_ITEMS) && sums_need_wide_columns)
876 				outp += sprintf(outp, "%s%8s", (printed++ ? delim : ""), mp->name);
877 			else
878 				outp += sprintf(outp, "%s%s", (printed++ ? delim : ""), mp->name);
879 		}
880 	}
881 
882 	if (DO_BIC(BIC_CPU_c1))
883 		outp += sprintf(outp, "%sCPU%%c1", (printed++ ? delim : ""));
884 	if (DO_BIC(BIC_CPU_c3))
885 		outp += sprintf(outp, "%sCPU%%c3", (printed++ ? delim : ""));
886 	if (DO_BIC(BIC_CPU_c6))
887 		outp += sprintf(outp, "%sCPU%%c6", (printed++ ? delim : ""));
888 	if (DO_BIC(BIC_CPU_c7))
889 		outp += sprintf(outp, "%sCPU%%c7", (printed++ ? delim : ""));
890 
891 	if (DO_BIC(BIC_Mod_c6))
892 		outp += sprintf(outp, "%sMod%%c6", (printed++ ? delim : ""));
893 
894 	if (DO_BIC(BIC_CoreTmp))
895 		outp += sprintf(outp, "%sCoreTmp", (printed++ ? delim : ""));
896 
897 	if (DO_BIC(BIC_CORE_THROT_CNT))
898 		outp += sprintf(outp, "%sCoreThr", (printed++ ? delim : ""));
899 
900 	if (do_rapl && !rapl_joules) {
901 		if (DO_BIC(BIC_CorWatt) && (do_rapl & RAPL_PER_CORE_ENERGY))
902 			outp += sprintf(outp, "%sCorWatt", (printed++ ? delim : ""));
903 	} else if (do_rapl && rapl_joules) {
904 		if (DO_BIC(BIC_Cor_J) && (do_rapl & RAPL_PER_CORE_ENERGY))
905 			outp += sprintf(outp, "%sCor_J", (printed++ ? delim : ""));
906 	}
907 
908 	for (mp = sys.cp; mp; mp = mp->next) {
909 		if (mp->format == FORMAT_RAW) {
910 			if (mp->width == 64)
911 				outp += sprintf(outp, "%s%18.18s", delim, mp->name);
912 			else
913 				outp += sprintf(outp, "%s%10.10s", delim, mp->name);
914 		} else {
915 			if ((mp->type == COUNTER_ITEMS) && sums_need_wide_columns)
916 				outp += sprintf(outp, "%s%8s", delim, mp->name);
917 			else
918 				outp += sprintf(outp, "%s%s", delim, mp->name);
919 		}
920 	}
921 
922 	if (DO_BIC(BIC_PkgTmp))
923 		outp += sprintf(outp, "%sPkgTmp", (printed++ ? delim : ""));
924 
925 	if (DO_BIC(BIC_GFX_rc6))
926 		outp += sprintf(outp, "%sGFX%%rc6", (printed++ ? delim : ""));
927 
928 	if (DO_BIC(BIC_GFXMHz))
929 		outp += sprintf(outp, "%sGFXMHz", (printed++ ? delim : ""));
930 
931 	if (DO_BIC(BIC_GFXACTMHz))
932 		outp += sprintf(outp, "%sGFXAMHz", (printed++ ? delim : ""));
933 
934 	if (DO_BIC(BIC_Totl_c0))
935 		outp += sprintf(outp, "%sTotl%%C0", (printed++ ? delim : ""));
936 	if (DO_BIC(BIC_Any_c0))
937 		outp += sprintf(outp, "%sAny%%C0", (printed++ ? delim : ""));
938 	if (DO_BIC(BIC_GFX_c0))
939 		outp += sprintf(outp, "%sGFX%%C0", (printed++ ? delim : ""));
940 	if (DO_BIC(BIC_CPUGFX))
941 		outp += sprintf(outp, "%sCPUGFX%%", (printed++ ? delim : ""));
942 
943 	if (DO_BIC(BIC_Pkgpc2))
944 		outp += sprintf(outp, "%sPkg%%pc2", (printed++ ? delim : ""));
945 	if (DO_BIC(BIC_Pkgpc3))
946 		outp += sprintf(outp, "%sPkg%%pc3", (printed++ ? delim : ""));
947 	if (DO_BIC(BIC_Pkgpc6))
948 		outp += sprintf(outp, "%sPkg%%pc6", (printed++ ? delim : ""));
949 	if (DO_BIC(BIC_Pkgpc7))
950 		outp += sprintf(outp, "%sPkg%%pc7", (printed++ ? delim : ""));
951 	if (DO_BIC(BIC_Pkgpc8))
952 		outp += sprintf(outp, "%sPkg%%pc8", (printed++ ? delim : ""));
953 	if (DO_BIC(BIC_Pkgpc9))
954 		outp += sprintf(outp, "%sPkg%%pc9", (printed++ ? delim : ""));
955 	if (DO_BIC(BIC_Pkgpc10))
956 		outp += sprintf(outp, "%sPk%%pc10", (printed++ ? delim : ""));
957 	if (DO_BIC(BIC_CPU_LPI))
958 		outp += sprintf(outp, "%sCPU%%LPI", (printed++ ? delim : ""));
959 	if (DO_BIC(BIC_SYS_LPI))
960 		outp += sprintf(outp, "%sSYS%%LPI", (printed++ ? delim : ""));
961 
962 	if (do_rapl && !rapl_joules) {
963 		if (DO_BIC(BIC_PkgWatt))
964 			outp += sprintf(outp, "%sPkgWatt", (printed++ ? delim : ""));
965 		if (DO_BIC(BIC_CorWatt) && !(do_rapl & RAPL_PER_CORE_ENERGY))
966 			outp += sprintf(outp, "%sCorWatt", (printed++ ? delim : ""));
967 		if (DO_BIC(BIC_GFXWatt))
968 			outp += sprintf(outp, "%sGFXWatt", (printed++ ? delim : ""));
969 		if (DO_BIC(BIC_RAMWatt))
970 			outp += sprintf(outp, "%sRAMWatt", (printed++ ? delim : ""));
971 		if (DO_BIC(BIC_PKG__))
972 			outp += sprintf(outp, "%sPKG_%%", (printed++ ? delim : ""));
973 		if (DO_BIC(BIC_RAM__))
974 			outp += sprintf(outp, "%sRAM_%%", (printed++ ? delim : ""));
975 	} else if (do_rapl && rapl_joules) {
976 		if (DO_BIC(BIC_Pkg_J))
977 			outp += sprintf(outp, "%sPkg_J", (printed++ ? delim : ""));
978 		if (DO_BIC(BIC_Cor_J) && !(do_rapl & RAPL_PER_CORE_ENERGY))
979 			outp += sprintf(outp, "%sCor_J", (printed++ ? delim : ""));
980 		if (DO_BIC(BIC_GFX_J))
981 			outp += sprintf(outp, "%sGFX_J", (printed++ ? delim : ""));
982 		if (DO_BIC(BIC_RAM_J))
983 			outp += sprintf(outp, "%sRAM_J", (printed++ ? delim : ""));
984 		if (DO_BIC(BIC_PKG__))
985 			outp += sprintf(outp, "%sPKG_%%", (printed++ ? delim : ""));
986 		if (DO_BIC(BIC_RAM__))
987 			outp += sprintf(outp, "%sRAM_%%", (printed++ ? delim : ""));
988 	}
989 	for (mp = sys.pp; mp; mp = mp->next) {
990 		if (mp->format == FORMAT_RAW) {
991 			if (mp->width == 64)
992 				outp += sprintf(outp, "%s%18.18s", delim, mp->name);
993 			else
994 				outp += sprintf(outp, "%s%10.10s", delim, mp->name);
995 		} else {
996 			if ((mp->type == COUNTER_ITEMS) && sums_need_wide_columns)
997 				outp += sprintf(outp, "%s%8s", delim, mp->name);
998 			else
999 				outp += sprintf(outp, "%s%s", delim, mp->name);
1000 		}
1001 	}
1002 
1003 	outp += sprintf(outp, "\n");
1004 }
1005 
1006 int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
1007 {
1008 	int i;
1009 	struct msr_counter *mp;
1010 
1011 	outp += sprintf(outp, "t %p, c %p, p %p\n", t, c, p);
1012 
1013 	if (t) {
1014 		outp += sprintf(outp, "CPU: %d flags 0x%x\n", t->cpu_id, t->flags);
1015 		outp += sprintf(outp, "TSC: %016llX\n", t->tsc);
1016 		outp += sprintf(outp, "aperf: %016llX\n", t->aperf);
1017 		outp += sprintf(outp, "mperf: %016llX\n", t->mperf);
1018 		outp += sprintf(outp, "c1: %016llX\n", t->c1);
1019 
1020 		if (DO_BIC(BIC_IPC))
1021 			outp += sprintf(outp, "IPC: %lld\n", t->instr_count);
1022 
1023 		if (DO_BIC(BIC_IRQ))
1024 			outp += sprintf(outp, "IRQ: %lld\n", t->irq_count);
1025 		if (DO_BIC(BIC_SMI))
1026 			outp += sprintf(outp, "SMI: %d\n", t->smi_count);
1027 
1028 		for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
1029 			outp += sprintf(outp, "tADDED [%d] msr0x%x: %08llX\n", i, mp->msr_num, t->counter[i]);
1030 		}
1031 	}
1032 
1033 	if (c) {
1034 		outp += sprintf(outp, "core: %d\n", c->core_id);
1035 		outp += sprintf(outp, "c3: %016llX\n", c->c3);
1036 		outp += sprintf(outp, "c6: %016llX\n", c->c6);
1037 		outp += sprintf(outp, "c7: %016llX\n", c->c7);
1038 		outp += sprintf(outp, "DTS: %dC\n", c->core_temp_c);
1039 		outp += sprintf(outp, "cpu_throt_count: %016llX\n", c->core_throt_cnt);
1040 		outp += sprintf(outp, "Joules: %0X\n", c->core_energy);
1041 
1042 		for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) {
1043 			outp += sprintf(outp, "cADDED [%d] msr0x%x: %08llX\n", i, mp->msr_num, c->counter[i]);
1044 		}
1045 		outp += sprintf(outp, "mc6_us: %016llX\n", c->mc6_us);
1046 	}
1047 
1048 	if (p) {
1049 		outp += sprintf(outp, "package: %d\n", p->package_id);
1050 
1051 		outp += sprintf(outp, "Weighted cores: %016llX\n", p->pkg_wtd_core_c0);
1052 		outp += sprintf(outp, "Any cores: %016llX\n", p->pkg_any_core_c0);
1053 		outp += sprintf(outp, "Any GFX: %016llX\n", p->pkg_any_gfxe_c0);
1054 		outp += sprintf(outp, "CPU + GFX: %016llX\n", p->pkg_both_core_gfxe_c0);
1055 
1056 		outp += sprintf(outp, "pc2: %016llX\n", p->pc2);
1057 		if (DO_BIC(BIC_Pkgpc3))
1058 			outp += sprintf(outp, "pc3: %016llX\n", p->pc3);
1059 		if (DO_BIC(BIC_Pkgpc6))
1060 			outp += sprintf(outp, "pc6: %016llX\n", p->pc6);
1061 		if (DO_BIC(BIC_Pkgpc7))
1062 			outp += sprintf(outp, "pc7: %016llX\n", p->pc7);
1063 		outp += sprintf(outp, "pc8: %016llX\n", p->pc8);
1064 		outp += sprintf(outp, "pc9: %016llX\n", p->pc9);
1065 		outp += sprintf(outp, "pc10: %016llX\n", p->pc10);
1066 		outp += sprintf(outp, "cpu_lpi: %016llX\n", p->cpu_lpi);
1067 		outp += sprintf(outp, "sys_lpi: %016llX\n", p->sys_lpi);
1068 		outp += sprintf(outp, "Joules PKG: %0llX\n", p->energy_pkg);
1069 		outp += sprintf(outp, "Joules COR: %0llX\n", p->energy_cores);
1070 		outp += sprintf(outp, "Joules GFX: %0llX\n", p->energy_gfx);
1071 		outp += sprintf(outp, "Joules RAM: %0llX\n", p->energy_dram);
1072 		outp += sprintf(outp, "Throttle PKG: %0llX\n", p->rapl_pkg_perf_status);
1073 		outp += sprintf(outp, "Throttle RAM: %0llX\n", p->rapl_dram_perf_status);
1074 		outp += sprintf(outp, "PTM: %dC\n", p->pkg_temp_c);
1075 
1076 		for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) {
1077 			outp += sprintf(outp, "pADDED [%d] msr0x%x: %08llX\n", i, mp->msr_num, p->counter[i]);
1078 		}
1079 	}
1080 
1081 	outp += sprintf(outp, "\n");
1082 
1083 	return 0;
1084 }
1085 
1086 /*
1087  * column formatting convention & formats: emit one tab-delimited output row per entry
1088  */
1089 int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
1090 {
1091 	double interval_float, tsc;
1092 	char *fmt8;
1093 	int i;
1094 	struct msr_counter *mp;
1095 	char *delim = "\t";
1096 	int printed = 0;
1097 
1098 	/* if showing only 1st thread in core and this isn't one, bail out */
1099 	if (show_core_only && !(t->flags & CPU_IS_FIRST_THREAD_IN_CORE))
1100 		return 0;
1101 
1102 	/* if showing only 1st thread in pkg and this isn't one, bail out */
1103 	if (show_pkg_only && !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE))
1104 		return 0;
1105 
1106 	/* if not the summary line and --cpu is in use, skip CPUs outside the subset */
1107 	if ((t != &average.threads) && (cpu_subset && !CPU_ISSET_S(t->cpu_id, cpu_subset_size, cpu_subset)))
1108 		return 0;
1109 
1110 	if (DO_BIC(BIC_USEC)) {
1111 		/* on each row, print how many usec this row's counters took to gather */
1112 		struct timeval tv;
1113 
1114 		timersub(&t->tv_end, &t->tv_begin, &tv);
1115 		outp += sprintf(outp, "%5ld\t", tv.tv_sec * 1000000 + tv.tv_usec);
1116 	}
1117 
1118 	/* Time_Of_Day_Seconds: on each row, print the sec.usec of the last timestamp taken */
1119 	if (DO_BIC(BIC_TOD))
1120 		outp += sprintf(outp, "%10ld.%06ld\t", t->tv_end.tv_sec, t->tv_end.tv_usec);
1121 
1122 	interval_float = t->tv_delta.tv_sec + t->tv_delta.tv_usec / 1000000.0;
1123 
1124 	tsc = t->tsc * tsc_tweak;
1125 
1126 	/* topo columns, print blanks on 1st (average) line */
1127 	if (t == &average.threads) {
1128 		if (DO_BIC(BIC_Package))
1129 			outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
1130 		if (DO_BIC(BIC_Die))
1131 			outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
1132 		if (DO_BIC(BIC_Node))
1133 			outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
1134 		if (DO_BIC(BIC_Core))
1135 			outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
1136 		if (DO_BIC(BIC_CPU))
1137 			outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
1138 		if (DO_BIC(BIC_APIC))
1139 			outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
1140 		if (DO_BIC(BIC_X2APIC))
1141 			outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
1142 	} else {
1143 		if (DO_BIC(BIC_Package)) {
1144 			if (p)
1145 				outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->package_id);
1146 			else
1147 				outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
1148 		}
1149 		if (DO_BIC(BIC_Die)) {
1150 			if (c)
1151 				outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), cpus[t->cpu_id].die_id);
1152 			else
1153 				outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
1154 		}
1155 		if (DO_BIC(BIC_Node)) {
1156 			if (t)
1157 				outp += sprintf(outp, "%s%d",
1158 						(printed++ ? delim : ""), cpus[t->cpu_id].physical_node_id);
1159 			else
1160 				outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
1161 		}
1162 		if (DO_BIC(BIC_Core)) {
1163 			if (c)
1164 				outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), c->core_id);
1165 			else
1166 				outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
1167 		}
1168 		if (DO_BIC(BIC_CPU))
1169 			outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), t->cpu_id);
1170 		if (DO_BIC(BIC_APIC))
1171 			outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), t->apic_id);
1172 		if (DO_BIC(BIC_X2APIC))
1173 			outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), t->x2apic_id);
1174 	}
1175 
1176 	if (DO_BIC(BIC_Avg_MHz))
1177 		outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""), 1.0 / units * t->aperf / interval_float);
1178 
1179 	if (DO_BIC(BIC_Busy))
1180 		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->mperf / tsc);
1181 
1182 	if (DO_BIC(BIC_Bzy_MHz)) {
1183 		if (has_base_hz)
1184 			outp +=
1185 			    sprintf(outp, "%s%.0f", (printed++ ? delim : ""), base_hz / units * t->aperf / t->mperf);
1186 		else
1187 			outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""),
1188 					tsc / units * t->aperf / t->mperf / interval_float);
1189 	}
1190 
1191 	if (DO_BIC(BIC_TSC_MHz))
1192 		outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""), 1.0 * t->tsc / units / interval_float);
1193 
1194 	if (DO_BIC(BIC_IPC))
1195 		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 1.0 * t->instr_count / t->aperf);
1196 
1197 	/* IRQ */
1198 	if (DO_BIC(BIC_IRQ)) {
1199 		if (sums_need_wide_columns)
1200 			outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), t->irq_count);
1201 		else
1202 			outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), t->irq_count);
1203 	}
1204 
1205 	/* SMI */
1206 	if (DO_BIC(BIC_SMI))
1207 		outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), t->smi_count);
1208 
1209 	/* Added counters */
1210 	for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
1211 		if (mp->format == FORMAT_RAW) {
1212 			if (mp->width == 32)
1213 				outp +=
1214 				    sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), (unsigned int)t->counter[i]);
1215 			else
1216 				outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), t->counter[i]);
1217 		} else if (mp->format == FORMAT_DELTA) {
1218 			if ((mp->type == COUNTER_ITEMS) && sums_need_wide_columns)
1219 				outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), t->counter[i]);
1220 			else
1221 				outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), t->counter[i]);
1222 		} else if (mp->format == FORMAT_PERCENT) {
1223 			if (mp->type == COUNTER_USEC)
1224 				outp +=
1225 				    sprintf(outp, "%s%.2f", (printed++ ? delim : ""),
1226 					    t->counter[i] / interval_float / 10000);
1227 			else
1228 				outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->counter[i] / tsc);
1229 		}
1230 	}
1231 
1232 	/* C1 */
1233 	if (DO_BIC(BIC_CPU_c1))
1234 		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->c1 / tsc);
1235 
1236 	/* print per-core data only for 1st thread in core */
1237 	if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE))
1238 		goto done;
1239 
1240 	if (DO_BIC(BIC_CPU_c3))
1241 		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->c3 / tsc);
1242 	if (DO_BIC(BIC_CPU_c6))
1243 		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->c6 / tsc);
1244 	if (DO_BIC(BIC_CPU_c7))
1245 		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->c7 / tsc);
1246 
1247 	/* Mod%c6 */
1248 	if (DO_BIC(BIC_Mod_c6))
1249 		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->mc6_us / tsc);
1250 
1251 	if (DO_BIC(BIC_CoreTmp))
1252 		outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), c->core_temp_c);
1253 
1254 	/* Core throttle count */
1255 	if (DO_BIC(BIC_CORE_THROT_CNT))
1256 		outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), c->core_throt_cnt);
1257 
1258 	for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) {
1259 		if (mp->format == FORMAT_RAW) {
1260 			if (mp->width == 32)
1261 				outp +=
1262 				    sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), (unsigned int)c->counter[i]);
1263 			else
1264 				outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), c->counter[i]);
1265 		} else if (mp->format == FORMAT_DELTA) {
1266 			if ((mp->type == COUNTER_ITEMS) && sums_need_wide_columns)
1267 				outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), c->counter[i]);
1268 			else
1269 				outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), c->counter[i]);
1270 		} else if (mp->format == FORMAT_PERCENT) {
1271 			outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->counter[i] / tsc);
1272 		}
1273 	}
1274 
1275 	fmt8 = "%s%.2f";
1276 
1277 	if (DO_BIC(BIC_CorWatt) && (do_rapl & RAPL_PER_CORE_ENERGY))
1278 		outp +=
1279 		    sprintf(outp, fmt8, (printed++ ? delim : ""), c->core_energy * rapl_energy_units / interval_float);
1280 	if (DO_BIC(BIC_Cor_J) && (do_rapl & RAPL_PER_CORE_ENERGY))
1281 		outp += sprintf(outp, fmt8, (printed++ ? delim : ""), c->core_energy * rapl_energy_units);
1282 
1283 	/* print per-package data only for 1st core in package */
1284 	if (!(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE))
1285 		goto done;
1286 
1287 	/* PkgTmp */
1288 	if (DO_BIC(BIC_PkgTmp))
1289 		outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->pkg_temp_c);
1290 
1291 	/* GFXrc6 */
1292 	if (DO_BIC(BIC_GFX_rc6)) {
1293 		if (p->gfx_rc6_ms == -1) {	/* detect GFX counter reset */
1294 			outp += sprintf(outp, "%s**.**", (printed++ ? delim : ""));
1295 		} else {
1296 			outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""),
1297 					p->gfx_rc6_ms / 10.0 / interval_float);
1298 		}
1299 	}
1300 
1301 	/* GFXMHz */
1302 	if (DO_BIC(BIC_GFXMHz))
1303 		outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->gfx_mhz);
1304 
1305 	/* GFXACTMHz */
1306 	if (DO_BIC(BIC_GFXACTMHz))
1307 		outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->gfx_act_mhz);
1308 
1309 	/* Totl%C0, Any%C0 GFX%C0 CPUGFX% */
1310 	if (DO_BIC(BIC_Totl_c0))
1311 		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_wtd_core_c0 / tsc);
1312 	if (DO_BIC(BIC_Any_c0))
1313 		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_any_core_c0 / tsc);
1314 	if (DO_BIC(BIC_GFX_c0))
1315 		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_any_gfxe_c0 / tsc);
1316 	if (DO_BIC(BIC_CPUGFX))
1317 		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_both_core_gfxe_c0 / tsc);
1318 
1319 	if (DO_BIC(BIC_Pkgpc2))
1320 		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc2 / tsc);
1321 	if (DO_BIC(BIC_Pkgpc3))
1322 		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc3 / tsc);
1323 	if (DO_BIC(BIC_Pkgpc6))
1324 		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc6 / tsc);
1325 	if (DO_BIC(BIC_Pkgpc7))
1326 		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc7 / tsc);
1327 	if (DO_BIC(BIC_Pkgpc8))
1328 		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc8 / tsc);
1329 	if (DO_BIC(BIC_Pkgpc9))
1330 		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc9 / tsc);
1331 	if (DO_BIC(BIC_Pkgpc10))
1332 		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc10 / tsc);
1333 
1334 	if (DO_BIC(BIC_CPU_LPI))
1335 		outp +=
1336 		    sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->cpu_lpi / 1000000.0 / interval_float);
1337 	if (DO_BIC(BIC_SYS_LPI))
1338 		outp +=
1339 		    sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->sys_lpi / 1000000.0 / interval_float);
1340 
1341 	if (DO_BIC(BIC_PkgWatt))
1342 		outp +=
1343 		    sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_pkg * rapl_energy_units / interval_float);
1344 
1345 	if (DO_BIC(BIC_CorWatt) && !(do_rapl & RAPL_PER_CORE_ENERGY))
1346 		outp +=
1347 		    sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_cores * rapl_energy_units / interval_float);
1348 	if (DO_BIC(BIC_GFXWatt))
1349 		outp +=
1350 		    sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_gfx * rapl_energy_units / interval_float);
1351 	if (DO_BIC(BIC_RAMWatt))
1352 		outp +=
1353 		    sprintf(outp, fmt8, (printed++ ? delim : ""),
1354 			    p->energy_dram * rapl_dram_energy_units / interval_float);
1355 	if (DO_BIC(BIC_Pkg_J))
1356 		outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_pkg * rapl_energy_units);
1357 	if (DO_BIC(BIC_Cor_J) && !(do_rapl & RAPL_PER_CORE_ENERGY))
1358 		outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_cores * rapl_energy_units);
1359 	if (DO_BIC(BIC_GFX_J))
1360 		outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_gfx * rapl_energy_units);
1361 	if (DO_BIC(BIC_RAM_J))
1362 		outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_dram * rapl_dram_energy_units);
1363 	if (DO_BIC(BIC_PKG__))
1364 		outp +=
1365 		    sprintf(outp, fmt8, (printed++ ? delim : ""),
1366 			    100.0 * p->rapl_pkg_perf_status * rapl_time_units / interval_float);
1367 	if (DO_BIC(BIC_RAM__))
1368 		outp +=
1369 		    sprintf(outp, fmt8, (printed++ ? delim : ""),
1370 			    100.0 * p->rapl_dram_perf_status * rapl_time_units / interval_float);
1371 
1372 	for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) {
1373 		if (mp->format == FORMAT_RAW) {
1374 			if (mp->width == 32)
1375 				outp +=
1376 				    sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), (unsigned int)p->counter[i]);
1377 			else
1378 				outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), p->counter[i]);
1379 		} else if (mp->format == FORMAT_DELTA) {
1380 			if ((mp->type == COUNTER_ITEMS) && sums_need_wide_columns)
1381 				outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), p->counter[i]);
1382 			else
1383 				outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), p->counter[i]);
1384 		} else if (mp->format == FORMAT_PERCENT) {
1385 			outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->counter[i] / tsc);
1386 		}
1387 	}
1388 
1389 done:
1390 	if (*(outp - 1) != '\n')
1391 		outp += sprintf(outp, "\n");
1392 
1393 	return 0;
1394 }
1395 
1396 void flush_output_stdout(void)
1397 {
1398 	FILE *filep;
1399 
1400 	if (outf == stderr)
1401 		filep = stdout;
1402 	else
1403 		filep = outf;
1404 
1405 	fputs(output_buffer, filep);
1406 	fflush(filep);
1407 
1408 	outp = output_buffer;
1409 }
1410 
1411 void flush_output_stderr(void)
1412 {
1413 	fputs(output_buffer, outf);
1414 	fflush(outf);
1415 	outp = output_buffer;
1416 }
1417 
1418 void format_all_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
1419 {
1420 	static int count;
1421 
1422 	if ((!count || (header_iterations && !(count % header_iterations))) || !summary_only)
1423 		print_header("\t");
1424 
1425 	format_counters(&average.threads, &average.cores, &average.packages);
1426 
1427 	count++;
1428 
1429 	if (summary_only)
1430 		return;
1431 
1432 	for_all_cpus(format_counters, t, c, p);
1433 }
1434 
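/*
 * Compute (new - old) for a counter that wraps at 32 bits:
 * the shifts discard the upper bits, so the subtraction is done modulo 2^32.
 */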
1435 #define DELTA_WRAP32(new, old)			\
1436 	old = ((((unsigned long long)new << 32) - ((unsigned long long)old << 32)) >> 32);
1437 
1438 int delta_package(struct pkg_data *new, struct pkg_data *old)
1439 {
1440 	int i;
1441 	struct msr_counter *mp;
1442 
1443 	if (DO_BIC(BIC_Totl_c0))
1444 		old->pkg_wtd_core_c0 = new->pkg_wtd_core_c0 - old->pkg_wtd_core_c0;
1445 	if (DO_BIC(BIC_Any_c0))
1446 		old->pkg_any_core_c0 = new->pkg_any_core_c0 - old->pkg_any_core_c0;
1447 	if (DO_BIC(BIC_GFX_c0))
1448 		old->pkg_any_gfxe_c0 = new->pkg_any_gfxe_c0 - old->pkg_any_gfxe_c0;
1449 	if (DO_BIC(BIC_CPUGFX))
1450 		old->pkg_both_core_gfxe_c0 = new->pkg_both_core_gfxe_c0 - old->pkg_both_core_gfxe_c0;
1451 
1452 	old->pc2 = new->pc2 - old->pc2;
1453 	if (DO_BIC(BIC_Pkgpc3))
1454 		old->pc3 = new->pc3 - old->pc3;
1455 	if (DO_BIC(BIC_Pkgpc6))
1456 		old->pc6 = new->pc6 - old->pc6;
1457 	if (DO_BIC(BIC_Pkgpc7))
1458 		old->pc7 = new->pc7 - old->pc7;
1459 	old->pc8 = new->pc8 - old->pc8;
1460 	old->pc9 = new->pc9 - old->pc9;
1461 	old->pc10 = new->pc10 - old->pc10;
1462 	old->cpu_lpi = new->cpu_lpi - old->cpu_lpi;
1463 	old->sys_lpi = new->sys_lpi - old->sys_lpi;
1464 	old->pkg_temp_c = new->pkg_temp_c;
1465 
1466 	/* flag an error when rc6 counter resets/wraps */
1467 	if (old->gfx_rc6_ms > new->gfx_rc6_ms)
1468 		old->gfx_rc6_ms = -1;
1469 	else
1470 		old->gfx_rc6_ms = new->gfx_rc6_ms - old->gfx_rc6_ms;
1471 
1472 	old->gfx_mhz = new->gfx_mhz;
1473 	old->gfx_act_mhz = new->gfx_act_mhz;
1474 
1475 	old->energy_pkg = new->energy_pkg - old->energy_pkg;
1476 	old->energy_cores = new->energy_cores - old->energy_cores;
1477 	old->energy_gfx = new->energy_gfx - old->energy_gfx;
1478 	old->energy_dram = new->energy_dram - old->energy_dram;
1479 	old->rapl_pkg_perf_status = new->rapl_pkg_perf_status - old->rapl_pkg_perf_status;
1480 	old->rapl_dram_perf_status = new->rapl_dram_perf_status - old->rapl_dram_perf_status;
1481 
1482 	for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) {
1483 		if (mp->format == FORMAT_RAW)
1484 			old->counter[i] = new->counter[i];
1485 		else
1486 			old->counter[i] = new->counter[i] - old->counter[i];
1487 	}
1488 
1489 	return 0;
1490 }
1491 
1492 void delta_core(struct core_data *new, struct core_data *old)
1493 {
1494 	int i;
1495 	struct msr_counter *mp;
1496 
1497 	old->c3 = new->c3 - old->c3;
1498 	old->c6 = new->c6 - old->c6;
1499 	old->c7 = new->c7 - old->c7;
1500 	old->core_temp_c = new->core_temp_c;
1501 	old->core_throt_cnt = new->core_throt_cnt;
1502 	old->mc6_us = new->mc6_us - old->mc6_us;
1503 
1504 	DELTA_WRAP32(new->core_energy, old->core_energy);
1505 
1506 	for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) {
1507 		if (mp->format == FORMAT_RAW)
1508 			old->counter[i] = new->counter[i];
1509 		else
1510 			old->counter[i] = new->counter[i] - old->counter[i];
1511 	}
1512 }
1513 
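/*
 * Without a dedicated C1 residency MSR, CPU%c1 is derived from TSC, MPERF
 * and the deeper C-state counters, so those must still be read even when
 * their own columns are not being displayed.
 */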
1514 int soft_c1_residency_display(int bic)
1515 {
1516 	if (!DO_BIC(BIC_CPU_c1) || use_c1_residency_msr)
1517 		return 0;
1518 
1519 	return DO_BIC_READ(bic);
1520 }
1521 
1522 /*
1523  * old = new - old
1524  */
1525 int delta_thread(struct thread_data *new, struct thread_data *old, struct core_data *core_delta)
1526 {
1527 	int i;
1528 	struct msr_counter *mp;
1529 
1530 	/* we run cpuid just the 1st time, copy the results */
1531 	if (DO_BIC(BIC_APIC))
1532 		new->apic_id = old->apic_id;
1533 	if (DO_BIC(BIC_X2APIC))
1534 		new->x2apic_id = old->x2apic_id;
1535 
1536 	/*
1537 	 * the timestamps from the start of the measurement interval are in "old",
1538 	 * the timestamps from the end of the measurement interval are in "new";
1539 	 * overwrite old with new so we can print end-of-interval values
1540 	 */
1541 
1542 	timersub(&new->tv_begin, &old->tv_begin, &old->tv_delta);
1543 	old->tv_begin = new->tv_begin;
1544 	old->tv_end = new->tv_end;
1545 
1546 	old->tsc = new->tsc - old->tsc;
1547 
1548 	/* check for TSC < 1 Mcycles over interval */
1549 	if (old->tsc < (1000 * 1000))
1550 		errx(-3, "Insanely slow TSC rate, TSC stops in idle?\n"
1551 		     "You can disable all c-states by booting with \"idle=poll\"\n"
1552 		     "or just the deep ones with \"processor.max_cstate=1\"");
1553 
1554 	old->c1 = new->c1 - old->c1;
1555 
1556 	if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || soft_c1_residency_display(BIC_Avg_MHz)) {
1557 		if ((new->aperf > old->aperf) && (new->mperf > old->mperf)) {
1558 			old->aperf = new->aperf - old->aperf;
1559 			old->mperf = new->mperf - old->mperf;
1560 		} else {
1561 			return -1;
1562 		}
1563 	}
1564 
1565 	if (use_c1_residency_msr) {
1566 		/*
1567 		 * Some models have a dedicated C1 residency MSR,
1568 		 * which should be more accurate than the derivation below.
1569 		 */
1570 	} else {
1571 		/*
1572 		 * As counter collection is not atomic,
1573 		 * it is possible for mperf's non-halted cycles + idle states
1574 		 * to exceed TSC's all cycles: show c1 = 0% in that case.
1575 		 */
1576 		if ((old->mperf + core_delta->c3 + core_delta->c6 + core_delta->c7) > (old->tsc * tsc_tweak))
1577 			old->c1 = 0;
1578 		else {
1579 			/* normal case, derive c1 */
1580 			old->c1 = (old->tsc * tsc_tweak) - old->mperf - core_delta->c3
1581 			    - core_delta->c6 - core_delta->c7;
1582 		}
1583 	}
1584 
1585 	if (old->mperf == 0) {
1586 		if (debug > 1)
1587 			fprintf(outf, "cpu%d MPERF 0!\n", old->cpu_id);
1588 		old->mperf = 1;	/* divide by 0 protection */
1589 	}
1590 
1591 	if (DO_BIC(BIC_IPC))
1592 		old->instr_count = new->instr_count - old->instr_count;
1593 
1594 	if (DO_BIC(BIC_IRQ))
1595 		old->irq_count = new->irq_count - old->irq_count;
1596 
1597 	if (DO_BIC(BIC_SMI))
1598 		old->smi_count = new->smi_count - old->smi_count;
1599 
1600 	for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
1601 		if (mp->format == FORMAT_RAW)
1602 			old->counter[i] = new->counter[i];
1603 		else
1604 			old->counter[i] = new->counter[i] - old->counter[i];
1605 	}
1606 	return 0;
1607 }
1608 
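/*
 * compute new - old for one CPU: thread counters always, core counters
 * only on the 1st thread in a core, package counters only on the 1st
 * core in a package
 */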
1609 int delta_cpu(struct thread_data *t, struct core_data *c,
1610 	      struct pkg_data *p, struct thread_data *t2, struct core_data *c2, struct pkg_data *p2)
1611 {
1612 	int retval = 0;
1613 
1614 	/* calculate core delta only for 1st thread in core */
1615 	if (t->flags & CPU_IS_FIRST_THREAD_IN_CORE)
1616 		delta_core(c, c2);
1617 
1618 	/* always calculate thread delta */
1619 	retval = delta_thread(t, t2, c2);	/* c2 is core delta */
1620 	if (retval)
1621 		return retval;
1622 
1623 	/* calculate package delta only for 1st core in package */
1624 	if (t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)
1625 		retval = delta_package(p, p2);
1626 
1627 	return retval;
1628 }
1629 
1630 void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
1631 {
1632 	int i;
1633 	struct msr_counter *mp;
1634 
1635 	t->tv_begin.tv_sec = 0;
1636 	t->tv_begin.tv_usec = 0;
1637 	t->tv_end.tv_sec = 0;
1638 	t->tv_end.tv_usec = 0;
1639 	t->tv_delta.tv_sec = 0;
1640 	t->tv_delta.tv_usec = 0;
1641 
1642 	t->tsc = 0;
1643 	t->aperf = 0;
1644 	t->mperf = 0;
1645 	t->c1 = 0;
1646 
1647 	t->instr_count = 0;
1648 
1649 	t->irq_count = 0;
1650 	t->smi_count = 0;
1651 
1652 	/* tells format_counters to dump all fields from this set */
1653 	t->flags = CPU_IS_FIRST_THREAD_IN_CORE | CPU_IS_FIRST_CORE_IN_PACKAGE;
1654 
1655 	c->c3 = 0;
1656 	c->c6 = 0;
1657 	c->c7 = 0;
1658 	c->mc6_us = 0;
1659 	c->core_temp_c = 0;
1660 	c->core_energy = 0;
1661 	c->core_throt_cnt = 0;
1662 
1663 	p->pkg_wtd_core_c0 = 0;
1664 	p->pkg_any_core_c0 = 0;
1665 	p->pkg_any_gfxe_c0 = 0;
1666 	p->pkg_both_core_gfxe_c0 = 0;
1667 
1668 	p->pc2 = 0;
1669 	if (DO_BIC(BIC_Pkgpc3))
1670 		p->pc3 = 0;
1671 	if (DO_BIC(BIC_Pkgpc6))
1672 		p->pc6 = 0;
1673 	if (DO_BIC(BIC_Pkgpc7))
1674 		p->pc7 = 0;
1675 	p->pc8 = 0;
1676 	p->pc9 = 0;
1677 	p->pc10 = 0;
1678 	p->cpu_lpi = 0;
1679 	p->sys_lpi = 0;
1680 
1681 	p->energy_pkg = 0;
1682 	p->energy_dram = 0;
1683 	p->energy_cores = 0;
1684 	p->energy_gfx = 0;
1685 	p->rapl_pkg_perf_status = 0;
1686 	p->rapl_dram_perf_status = 0;
1687 	p->pkg_temp_c = 0;
1688 
1689 	p->gfx_rc6_ms = 0;
1690 	p->gfx_mhz = 0;
1691 	p->gfx_act_mhz = 0;
1692 	for (i = 0, mp = sys.tp; mp; i++, mp = mp->next)
1693 		t->counter[i] = 0;
1694 
1695 	for (i = 0, mp = sys.cp; mp; i++, mp = mp->next)
1696 		c->counter[i] = 0;
1697 
1698 	for (i = 0, mp = sys.pp; mp; i++, mp = mp->next)
1699 		p->counter[i] = 0;
1700 }
1701 
1702 int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
1703 {
1704 	int i;
1705 	struct msr_counter *mp;
1706 
1707 	/* copy un-changing apic_id's */
1708 	if (DO_BIC(BIC_APIC))
1709 		average.threads.apic_id = t->apic_id;
1710 	if (DO_BIC(BIC_X2APIC))
1711 		average.threads.x2apic_id = t->x2apic_id;
1712 
1713 	/* remember first tv_begin */
1714 	if (average.threads.tv_begin.tv_sec == 0)
1715 		average.threads.tv_begin = t->tv_begin;
1716 
1717 	/* remember last tv_end */
1718 	average.threads.tv_end = t->tv_end;
1719 
1720 	average.threads.tsc += t->tsc;
1721 	average.threads.aperf += t->aperf;
1722 	average.threads.mperf += t->mperf;
1723 	average.threads.c1 += t->c1;
1724 
1725 	average.threads.instr_count += t->instr_count;
1726 
1727 	average.threads.irq_count += t->irq_count;
1728 	average.threads.smi_count += t->smi_count;
1729 
1730 	for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
1731 		if (mp->format == FORMAT_RAW)
1732 			continue;
1733 		average.threads.counter[i] += t->counter[i];
1734 	}
1735 
1736 	/* sum per-core values only for 1st thread in core */
1737 	if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE))
1738 		return 0;
1739 
1740 	average.cores.c3 += c->c3;
1741 	average.cores.c6 += c->c6;
1742 	average.cores.c7 += c->c7;
1743 	average.cores.mc6_us += c->mc6_us;
1744 
1745 	average.cores.core_temp_c = MAX(average.cores.core_temp_c, c->core_temp_c);
1746 	average.cores.core_throt_cnt = MAX(average.cores.core_throt_cnt, c->core_throt_cnt);
1747 
1748 	average.cores.core_energy += c->core_energy;
1749 
1750 	for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) {
1751 		if (mp->format == FORMAT_RAW)
1752 			continue;
1753 		average.cores.counter[i] += c->counter[i];
1754 	}
1755 
1756 	/* sum per-pkg values only for 1st core in pkg */
1757 	if (!(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE))
1758 		return 0;
1759 
1760 	if (DO_BIC(BIC_Totl_c0))
1761 		average.packages.pkg_wtd_core_c0 += p->pkg_wtd_core_c0;
1762 	if (DO_BIC(BIC_Any_c0))
1763 		average.packages.pkg_any_core_c0 += p->pkg_any_core_c0;
1764 	if (DO_BIC(BIC_GFX_c0))
1765 		average.packages.pkg_any_gfxe_c0 += p->pkg_any_gfxe_c0;
1766 	if (DO_BIC(BIC_CPUGFX))
1767 		average.packages.pkg_both_core_gfxe_c0 += p->pkg_both_core_gfxe_c0;
1768 
1769 	average.packages.pc2 += p->pc2;
1770 	if (DO_BIC(BIC_Pkgpc3))
1771 		average.packages.pc3 += p->pc3;
1772 	if (DO_BIC(BIC_Pkgpc6))
1773 		average.packages.pc6 += p->pc6;
1774 	if (DO_BIC(BIC_Pkgpc7))
1775 		average.packages.pc7 += p->pc7;
1776 	average.packages.pc8 += p->pc8;
1777 	average.packages.pc9 += p->pc9;
1778 	average.packages.pc10 += p->pc10;
1779 
1780 	average.packages.cpu_lpi = p->cpu_lpi;
1781 	average.packages.sys_lpi = p->sys_lpi;
1782 
1783 	average.packages.energy_pkg += p->energy_pkg;
1784 	average.packages.energy_dram += p->energy_dram;
1785 	average.packages.energy_cores += p->energy_cores;
1786 	average.packages.energy_gfx += p->energy_gfx;
1787 
1788 	average.packages.gfx_rc6_ms = p->gfx_rc6_ms;
1789 	average.packages.gfx_mhz = p->gfx_mhz;
1790 	average.packages.gfx_act_mhz = p->gfx_act_mhz;
1791 
1792 	average.packages.pkg_temp_c = MAX(average.packages.pkg_temp_c, p->pkg_temp_c);
1793 
1794 	average.packages.rapl_pkg_perf_status += p->rapl_pkg_perf_status;
1795 	average.packages.rapl_dram_perf_status += p->rapl_dram_perf_status;
1796 
1797 	for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) {
1798 		if (mp->format == FORMAT_RAW)
1799 			continue;
1800 		average.packages.counter[i] += p->counter[i];
1801 	}
1802 	return 0;
1803 }
1804 
1805 /*
1806  * sum the counters for all cpus in the system
1807  * compute the average for each counter
1808  */
1809 void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data *p)
1810 {
1811 	int i;
1812 	struct msr_counter *mp;
1813 
1814 	clear_counters(&average.threads, &average.cores, &average.packages);
1815 
1816 	for_all_cpus(sum_counters, t, c, p);
1817 
1818 	/* Use the global time delta for the average. */
1819 	average.threads.tv_delta = tv_delta;
1820 
1821 	average.threads.tsc /= topo.num_cpus;
1822 	average.threads.aperf /= topo.num_cpus;
1823 	average.threads.mperf /= topo.num_cpus;
1824 	average.threads.instr_count /= topo.num_cpus;
1825 	average.threads.c1 /= topo.num_cpus;
1826 
1827 	if (average.threads.irq_count > 9999999)
1828 		sums_need_wide_columns = 1;
1829 
1830 	average.cores.c3 /= topo.num_cores;
1831 	average.cores.c6 /= topo.num_cores;
1832 	average.cores.c7 /= topo.num_cores;
1833 	average.cores.mc6_us /= topo.num_cores;
1834 
1835 	if (DO_BIC(BIC_Totl_c0))
1836 		average.packages.pkg_wtd_core_c0 /= topo.num_packages;
1837 	if (DO_BIC(BIC_Any_c0))
1838 		average.packages.pkg_any_core_c0 /= topo.num_packages;
1839 	if (DO_BIC(BIC_GFX_c0))
1840 		average.packages.pkg_any_gfxe_c0 /= topo.num_packages;
1841 	if (DO_BIC(BIC_CPUGFX))
1842 		average.packages.pkg_both_core_gfxe_c0 /= topo.num_packages;
1843 
1844 	average.packages.pc2 /= topo.num_packages;
1845 	if (DO_BIC(BIC_Pkgpc3))
1846 		average.packages.pc3 /= topo.num_packages;
1847 	if (DO_BIC(BIC_Pkgpc6))
1848 		average.packages.pc6 /= topo.num_packages;
1849 	if (DO_BIC(BIC_Pkgpc7))
1850 		average.packages.pc7 /= topo.num_packages;
1851 
1852 	average.packages.pc8 /= topo.num_packages;
1853 	average.packages.pc9 /= topo.num_packages;
1854 	average.packages.pc10 /= topo.num_packages;
1855 
1856 	for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
1857 		if (mp->format == FORMAT_RAW)
1858 			continue;
1859 		if (mp->type == COUNTER_ITEMS) {
1860 			if (average.threads.counter[i] > 9999999)
1861 				sums_need_wide_columns = 1;
1862 			continue;
1863 		}
1864 		average.threads.counter[i] /= topo.num_cpus;
1865 	}
1866 	for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) {
1867 		if (mp->format == FORMAT_RAW)
1868 			continue;
1869 		if (mp->type == COUNTER_ITEMS) {
1870 			if (average.cores.counter[i] > 9999999)
1871 				sums_need_wide_columns = 1;
1872 		}
1873 		average.cores.counter[i] /= topo.num_cores;
1874 	}
1875 	for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) {
1876 		if (mp->format == FORMAT_RAW)
1877 			continue;
1878 		if (mp->type == COUNTER_ITEMS) {
1879 			if (average.packages.counter[i] > 9999999)
1880 				sums_need_wide_columns = 1;
1881 		}
1882 		average.packages.counter[i] /= topo.num_packages;
1883 	}
1884 }
1885 
1886 static unsigned long long rdtsc(void)
1887 {
1888 	unsigned int low, high;
1889 
1890 	asm volatile ("rdtsc":"=a" (low), "=d"(high));
1891 
1892 	return low | ((unsigned long long)high) << 32;
1893 }
1894 
1895 /*
1896  * Open a file, and exit on failure
1897  */
1898 FILE *fopen_or_die(const char *path, const char *mode)
1899 {
1900 	FILE *filep = fopen(path, mode);
1901 
1902 	if (!filep)
1903 		err(1, "%s: open failed", path);
1904 	return filep;
1905 }
1906 
1907 /*
1908  * snapshot_sysfs_counter()
1909  *
1910  * return snapshot of given counter
1911  */
1912 unsigned long long snapshot_sysfs_counter(char *path)
1913 {
1914 	FILE *fp;
1915 	int retval;
1916 	unsigned long long counter;
1917 
1918 	fp = fopen_or_die(path, "r");
1919 
1920 	retval = fscanf(fp, "%llu", &counter);
1921 	if (retval != 1)
1922 		err(1, "snapshot_sysfs_counter(%s)", path);
1923 
1924 	fclose(fp);
1925 
1926 	return counter;
1927 }
1928 
1929 int get_mp(int cpu, struct msr_counter *mp, unsigned long long *counterp)
1930 {
1931 	if (mp->msr_num != 0) {
1932 		if (get_msr(cpu, mp->msr_num, counterp))
1933 			return -1;
1934 	} else {
1935 		char path[128 + PATH_BYTES];
1936 
1937 		if (mp->flags & SYSFS_PERCPU) {
1938 			sprintf(path, "/sys/devices/system/cpu/cpu%d/%s", cpu, mp->path);
1939 
1940 			*counterp = snapshot_sysfs_counter(path);
1941 		} else {
1942 			*counterp = snapshot_sysfs_counter(mp->path);
1943 		}
1944 	}
1945 
1946 	return 0;
1947 }
1948 
1949 int get_epb(int cpu)
1950 {
1951 	char path[128 + PATH_BYTES];
1952 	unsigned long long msr;
1953 	int ret, epb = -1;
1954 	FILE *fp;
1955 
1956 	sprintf(path, "/sys/devices/system/cpu/cpu%d/power/energy_perf_bias", cpu);
1957 
1958 	fp = fopen(path, "r");
1959 	if (!fp)
1960 		goto msr_fallback;
1961 
1962 	ret = fscanf(fp, "%d", &epb);
1963 	if (ret != 1)
1964 		err(1, "%s(%s)", __func__, path);
1965 
1966 	fclose(fp);
1967 
1968 	return epb;
1969 
1970 msr_fallback:
1971 	get_msr(cpu, MSR_IA32_ENERGY_PERF_BIAS, &msr);
1972 
1973 	return msr & 0xf;
1974 }
1975 
1976 void get_apic_id(struct thread_data *t)
1977 {
1978 	unsigned int eax, ebx, ecx, edx;
1979 
1980 	if (DO_BIC(BIC_APIC)) {
1981 		eax = ebx = ecx = edx = 0;
1982 		__cpuid(1, eax, ebx, ecx, edx);
1983 
1984 		t->apic_id = (ebx >> 24) & 0xff;
1985 	}
1986 
1987 	if (!DO_BIC(BIC_X2APIC))
1988 		return;
1989 
1990 	if (authentic_amd || hygon_genuine) {
1991 		unsigned int topology_extensions;
1992 
1993 		if (max_extended_level < 0x8000001e)
1994 			return;
1995 
1996 		eax = ebx = ecx = edx = 0;
1997 		__cpuid(0x80000001, eax, ebx, ecx, edx);
1998 		topology_extensions = ecx & (1 << 22);
1999 
2000 		if (topology_extensions == 0)
2001 			return;
2002 
2003 		eax = ebx = ecx = edx = 0;
2004 		__cpuid(0x8000001e, eax, ebx, ecx, edx);
2005 
2006 		t->x2apic_id = eax;
2007 		return;
2008 	}
2009 
2010 	if (!genuine_intel)
2011 		return;
2012 
2013 	if (max_level < 0xb)
2014 		return;
2015 
2016 	ecx = 0;
2017 	__cpuid(0xb, eax, ebx, ecx, edx);
2018 	t->x2apic_id = edx;
2019 
2020 	if (debug && (t->apic_id != (t->x2apic_id & 0xff)))
2021 		fprintf(outf, "cpu%d: BIOS BUG: apic 0x%x x2apic 0x%x\n", t->cpu_id, t->apic_id, t->x2apic_id);
2022 }
2023 
2024 int get_core_throt_cnt(int cpu, unsigned long long *cnt)
2025 {
2026 	char path[128 + PATH_BYTES];
2027 	unsigned long long tmp;
2028 	FILE *fp;
2029 	int ret;
2030 
2031 	sprintf(path, "/sys/devices/system/cpu/cpu%d/thermal_throttle/core_throttle_count", cpu);
2032 	fp = fopen(path, "r");
2033 	if (!fp)
2034 		return -1;
2035 	ret = fscanf(fp, "%llu", &tmp);
2036 	fclose(fp);
2037 	if (ret != 1)
2038 		return -1;
2039 	*cnt = tmp;
2040 
2041 	return 0;
2042 }
2043 
2044 /*
2045  * get_counters(...)
2046  * migrate to cpu
2047  * acquire and record local counters for that cpu
2048  */
2049 int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
2050 {
2051 	int cpu = t->cpu_id;
2052 	unsigned long long msr;
2053 	int aperf_mperf_retry_count = 0;
2054 	struct msr_counter *mp;
2055 	int i;
2056 
2057 	if (cpu_migrate(cpu)) {
2058 		fprintf(outf, "get_counters: Could not migrate to CPU %d\n", cpu);
2059 		return -1;
2060 	}
2061 
2062 	gettimeofday(&t->tv_begin, (struct timezone *)NULL);
2063 
2064 	if (first_counter_read)
2065 		get_apic_id(t);
2066 retry:
2067 	t->tsc = rdtsc();	/* we are running on local CPU of interest */
2068 
2069 	if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || soft_c1_residency_display(BIC_Avg_MHz)) {
2070 		unsigned long long tsc_before, tsc_between, tsc_after, aperf_time, mperf_time;
2071 
2072 		/*
2073 		 * The TSC, APERF and MPERF must be read together for
2074 		 * APERF/MPERF and MPERF/TSC to give accurate results.
2075 		 *
2076 		 * Unfortunately, APERF and MPERF are read by
2077 		 * individual system calls, so delays may occur
2078 		 * between them.  If the time to read them
2079 		 * varies by a large amount, we re-read them.
2080 		 */
2081 
2082 		/*
2083 		 * This initial dummy APERF read has been seen to
2084 		 * reduce jitter in the subsequent reads.
2085 		 */
2086 
2087 		if (get_msr(cpu, MSR_IA32_APERF, &t->aperf))
2088 			return -3;
2089 
2090 		t->tsc = rdtsc();	/* re-read close to APERF */
2091 
2092 		tsc_before = t->tsc;
2093 
2094 		if (get_msr(cpu, MSR_IA32_APERF, &t->aperf))
2095 			return -3;
2096 
2097 		tsc_between = rdtsc();
2098 
2099 		if (get_msr(cpu, MSR_IA32_MPERF, &t->mperf))
2100 			return -4;
2101 
2102 		tsc_after = rdtsc();
2103 
2104 		aperf_time = tsc_between - tsc_before;
2105 		mperf_time = tsc_after - tsc_between;
2106 
2107 		/*
2108 		 * If the system call latencies to read APERF and MPERF
2109 		 * differ by more than 2x, then try again.
2110 		 */
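		/*
		 * Example (hypothetical timings): an APERF read costing
		 * 1,000 TSC cycles paired with an MPERF read costing 2,500
		 * fails the 2x check below and triggers a re-read of the
		 * TSC/APERF/MPERF group.
		 */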
2111 		if ((aperf_time > (2 * mperf_time)) || (mperf_time > (2 * aperf_time))) {
2112 			aperf_mperf_retry_count++;
2113 			if (aperf_mperf_retry_count < 5)
2114 				goto retry;
2115 			else
2116 				warnx("cpu%d jitter %lld %lld", cpu, aperf_time, mperf_time);
2117 		}
2118 		aperf_mperf_retry_count = 0;
2119 
2120 		t->aperf = t->aperf * aperf_mperf_multiplier;
2121 		t->mperf = t->mperf * aperf_mperf_multiplier;
2122 	}
2123 
2124 	if (DO_BIC(BIC_IPC))
2125 		if (read(get_instr_count_fd(cpu), &t->instr_count, sizeof(long long)) != sizeof(long long))
2126 			return -4;
2127 
2128 	if (DO_BIC(BIC_IRQ))
2129 		t->irq_count = irqs_per_cpu[cpu];
2130 	if (DO_BIC(BIC_SMI)) {
2131 		if (get_msr(cpu, MSR_SMI_COUNT, &msr))
2132 			return -5;
2133 		t->smi_count = msr & 0xFFFFFFFF;
2134 	}
2135 	if (DO_BIC(BIC_CPU_c1) && use_c1_residency_msr) {
2136 		if (get_msr(cpu, MSR_CORE_C1_RES, &t->c1))
2137 			return -6;
2138 	}
2139 
2140 	for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
2141 		if (get_mp(cpu, mp, &t->counter[i]))
2142 			return -10;
2143 	}
2144 
2145 	/* collect core counters only for 1st thread in core */
2146 	if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE))
2147 		goto done;
2148 
2149 	if (DO_BIC(BIC_CPU_c3) || soft_c1_residency_display(BIC_CPU_c3)) {
2150 		if (get_msr(cpu, MSR_CORE_C3_RESIDENCY, &c->c3))
2151 			return -6;
2152 	}
2153 
2154 	if ((DO_BIC(BIC_CPU_c6) || soft_c1_residency_display(BIC_CPU_c6)) && !do_knl_cstates) {
2155 		if (get_msr(cpu, MSR_CORE_C6_RESIDENCY, &c->c6))
2156 			return -7;
2157 	} else if (do_knl_cstates || soft_c1_residency_display(BIC_CPU_c6)) {
2158 		if (get_msr(cpu, MSR_KNL_CORE_C6_RESIDENCY, &c->c6))
2159 			return -7;
2160 	}
2161 
2162 	if (DO_BIC(BIC_CPU_c7) || soft_c1_residency_display(BIC_CPU_c7)) {
2163 		if (get_msr(cpu, MSR_CORE_C7_RESIDENCY, &c->c7))
2164 			return -8;
2165 		else if (t->is_atom) {
2166 			/*
2167 			 * For Atom CPUs that have a core cstate deeper than c6,
2168 			 * MSR_CORE_C6_RESIDENCY returns residency of cc6 and deeper.
2169 			 * Subtract CC7 (and deeper cstate) residency to get
2170 			 * accurate cc6 residency.
2171 			 */
2172 			c->c6 -= c->c7;
2173 		}
2174 	}
2175 
2176 	if (DO_BIC(BIC_Mod_c6))
2177 		if (get_msr(cpu, MSR_MODULE_C6_RES_MS, &c->mc6_us))
2178 			return -8;
2179 
2180 	if (DO_BIC(BIC_CoreTmp)) {
2181 		if (get_msr(cpu, MSR_IA32_THERM_STATUS, &msr))
2182 			return -9;
2183 		c->core_temp_c = tj_max - ((msr >> 16) & 0x7F);
2184 	}
2185 
2186 	if (DO_BIC(BIC_CORE_THROT_CNT))
2187 		get_core_throt_cnt(cpu, &c->core_throt_cnt);
2188 
2189 	if (do_rapl & RAPL_AMD_F17H) {
2190 		if (get_msr(cpu, MSR_CORE_ENERGY_STAT, &msr))
2191 			return -14;
2192 		c->core_energy = msr & 0xFFFFFFFF;
2193 	}
2194 
2195 	for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) {
2196 		if (get_mp(cpu, mp, &c->counter[i]))
2197 			return -10;
2198 	}
2199 
2200 	/* collect package counters only for 1st core in package */
2201 	if (!(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE))
2202 		goto done;
2203 
2204 	if (DO_BIC(BIC_Totl_c0)) {
2205 		if (get_msr(cpu, MSR_PKG_WEIGHTED_CORE_C0_RES, &p->pkg_wtd_core_c0))
2206 			return -10;
2207 	}
2208 	if (DO_BIC(BIC_Any_c0)) {
2209 		if (get_msr(cpu, MSR_PKG_ANY_CORE_C0_RES, &p->pkg_any_core_c0))
2210 			return -11;
2211 	}
2212 	if (DO_BIC(BIC_GFX_c0)) {
2213 		if (get_msr(cpu, MSR_PKG_ANY_GFXE_C0_RES, &p->pkg_any_gfxe_c0))
2214 			return -12;
2215 	}
2216 	if (DO_BIC(BIC_CPUGFX)) {
2217 		if (get_msr(cpu, MSR_PKG_BOTH_CORE_GFXE_C0_RES, &p->pkg_both_core_gfxe_c0))
2218 			return -13;
2219 	}
2220 	if (DO_BIC(BIC_Pkgpc3))
2221 		if (get_msr(cpu, MSR_PKG_C3_RESIDENCY, &p->pc3))
2222 			return -9;
2223 	if (DO_BIC(BIC_Pkgpc6)) {
2224 		if (do_slm_cstates) {
2225 			if (get_msr(cpu, MSR_ATOM_PKG_C6_RESIDENCY, &p->pc6))
2226 				return -10;
2227 		} else {
2228 			if (get_msr(cpu, MSR_PKG_C6_RESIDENCY, &p->pc6))
2229 				return -10;
2230 		}
2231 	}
2232 
2233 	if (DO_BIC(BIC_Pkgpc2))
2234 		if (get_msr(cpu, MSR_PKG_C2_RESIDENCY, &p->pc2))
2235 			return -11;
2236 	if (DO_BIC(BIC_Pkgpc7))
2237 		if (get_msr(cpu, MSR_PKG_C7_RESIDENCY, &p->pc7))
2238 			return -12;
2239 	if (DO_BIC(BIC_Pkgpc8))
2240 		if (get_msr(cpu, MSR_PKG_C8_RESIDENCY, &p->pc8))
2241 			return -13;
2242 	if (DO_BIC(BIC_Pkgpc9))
2243 		if (get_msr(cpu, MSR_PKG_C9_RESIDENCY, &p->pc9))
2244 			return -13;
2245 	if (DO_BIC(BIC_Pkgpc10))
2246 		if (get_msr(cpu, MSR_PKG_C10_RESIDENCY, &p->pc10))
2247 			return -13;
2248 
2249 	if (DO_BIC(BIC_CPU_LPI))
2250 		p->cpu_lpi = cpuidle_cur_cpu_lpi_us;
2251 	if (DO_BIC(BIC_SYS_LPI))
2252 		p->sys_lpi = cpuidle_cur_sys_lpi_us;
2253 
2254 	if (do_rapl & RAPL_PKG) {
2255 		if (get_msr_sum(cpu, MSR_PKG_ENERGY_STATUS, &msr))
2256 			return -13;
2257 		p->energy_pkg = msr;
2258 	}
2259 	if (do_rapl & RAPL_CORES_ENERGY_STATUS) {
2260 		if (get_msr_sum(cpu, MSR_PP0_ENERGY_STATUS, &msr))
2261 			return -14;
2262 		p->energy_cores = msr;
2263 	}
2264 	if (do_rapl & RAPL_DRAM) {
2265 		if (get_msr_sum(cpu, MSR_DRAM_ENERGY_STATUS, &msr))
2266 			return -15;
2267 		p->energy_dram = msr;
2268 	}
2269 	if (do_rapl & RAPL_GFX) {
2270 		if (get_msr_sum(cpu, MSR_PP1_ENERGY_STATUS, &msr))
2271 			return -16;
2272 		p->energy_gfx = msr;
2273 	}
2274 	if (do_rapl & RAPL_PKG_PERF_STATUS) {
2275 		if (get_msr_sum(cpu, MSR_PKG_PERF_STATUS, &msr))
2276 			return -16;
2277 		p->rapl_pkg_perf_status = msr;
2278 	}
2279 	if (do_rapl & RAPL_DRAM_PERF_STATUS) {
2280 		if (get_msr_sum(cpu, MSR_DRAM_PERF_STATUS, &msr))
2281 			return -16;
2282 		p->rapl_dram_perf_status = msr;
2283 	}
2284 	if (do_rapl & RAPL_AMD_F17H) {
2285 		if (get_msr_sum(cpu, MSR_PKG_ENERGY_STAT, &msr))
2286 			return -13;
2287 		p->energy_pkg = msr;
2288 	}
2289 	if (DO_BIC(BIC_PkgTmp)) {
2290 		if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_STATUS, &msr))
2291 			return -17;
2292 		p->pkg_temp_c = tj_max - ((msr >> 16) & 0x7F);
2293 	}
2294 
2295 	if (DO_BIC(BIC_GFX_rc6))
2296 		p->gfx_rc6_ms = gfx_cur_rc6_ms;
2297 
2298 	if (DO_BIC(BIC_GFXMHz))
2299 		p->gfx_mhz = gfx_cur_mhz;
2300 
2301 	if (DO_BIC(BIC_GFXACTMHz))
2302 		p->gfx_act_mhz = gfx_act_mhz;
2303 
2304 	for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) {
2305 		if (get_mp(cpu, mp, &p->counter[i]))
2306 			return -10;
2307 	}
2308 done:
2309 	gettimeofday(&t->tv_end, (struct timezone *)NULL);
2310 
2311 	return 0;
2312 }
2313 
2314 /*
2315  * MSR_PKG_CST_CONFIG_CONTROL decoding for pkg_cstate_limit:
2316  * If you change the values, note they are used both in comparisons
2317  * (>= PCL__7) and to index pkg_cstate_limit_strings[].
2318  */
2319 
2320 #define PCLUKN 0		/* Unknown */
2321 #define PCLRSV 1		/* Reserved */
2322 #define PCL__0 2		/* PC0 */
2323 #define PCL__1 3		/* PC1 */
2324 #define PCL__2 4		/* PC2 */
2325 #define PCL__3 5		/* PC3 */
2326 #define PCL__4 6		/* PC4 */
2327 #define PCL__6 7		/* PC6 */
2328 #define PCL_6N 8		/* PC6 No Retention */
2329 #define PCL_6R 9		/* PC6 Retention */
2330 #define PCL__7 10		/* PC7 */
2331 #define PCL_7S 11		/* PC7 Shrink */
2332 #define PCL__8 12		/* PC8 */
2333 #define PCL__9 13		/* PC9 */
2334 #define PCL_10 14		/* PC10 */
2335 #define PCLUNL 15		/* Unlimited */
2336 
2337 int pkg_cstate_limit = PCLUKN;
2338 char *pkg_cstate_limit_strings[] = { "unknown", "reserved", "pc0", "pc1", "pc2",
2339 	"pc3", "pc4", "pc6", "pc6n", "pc6r", "pc7", "pc7s", "pc8", "pc9", "pc10", "unlimited"
2340 };
2341 
2342 int nhm_pkg_cstate_limits[16] =
2343     { PCL__0, PCL__1, PCL__3, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
2344 	PCLRSV, PCLRSV
2345 };
2346 
2347 int snb_pkg_cstate_limits[16] =
2348     { PCL__0, PCL__2, PCL_6N, PCL_6R, PCL__7, PCL_7S, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
2349 	PCLRSV, PCLRSV
2350 };
2351 
2352 int hsw_pkg_cstate_limits[16] =
2353     { PCL__0, PCL__2, PCL__3, PCL__6, PCL__7, PCL_7S, PCL__8, PCL__9, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
2354 	PCLRSV, PCLRSV
2355 };
2356 
2357 int slv_pkg_cstate_limits[16] =
2358     { PCL__0, PCL__1, PCLRSV, PCLRSV, PCL__4, PCLRSV, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
2359 	PCL__6, PCL__7
2360 };
2361 
2362 int amt_pkg_cstate_limits[16] =
2363     { PCLUNL, PCL__1, PCL__2, PCLRSV, PCLRSV, PCLRSV, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
2364 	PCLRSV, PCLRSV
2365 };
2366 
2367 int phi_pkg_cstate_limits[16] =
2368     { PCL__0, PCL__2, PCL_6N, PCL_6R, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
2369 	PCLRSV, PCLRSV
2370 };
2371 
2372 int glm_pkg_cstate_limits[16] =
2373     { PCLUNL, PCL__1, PCL__3, PCL__6, PCL__7, PCL_7S, PCL__8, PCL__9, PCL_10, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
2374 	PCLRSV, PCLRSV
2375 };
2376 
2377 int skx_pkg_cstate_limits[16] =
2378     { PCL__0, PCL__2, PCL_6N, PCL_6R, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
2379 	PCLRSV, PCLRSV
2380 };
2381 
2382 int icx_pkg_cstate_limits[16] =
2383     { PCL__0, PCL__2, PCL__6, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
2384 	PCLRSV, PCLRSV
2385 };
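/*
 * Illustrative example: on a SNB-family part, a pkg-cstate-limit field of
 * 5 indexes snb_pkg_cstate_limits[5] == PCL_7S, which dump_nhm_cst_cfg()
 * prints as "pc7s".
 */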
2386 
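/*
 * tsc_tweak converts TSC cycle counts into base-clock terms; it is used
 * by delta_thread() when deriving C1 residency in software.
 */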
2387 static void calculate_tsc_tweak()
2388 {
2389 	tsc_tweak = base_hz / tsc_hz;
2390 }
2391 
2392 void prewake_cstate_probe(unsigned int family, unsigned int model);
2393 
2394 static void dump_nhm_platform_info(void)
2395 {
2396 	unsigned long long msr;
2397 	unsigned int ratio;
2398 
2399 	get_msr(base_cpu, MSR_PLATFORM_INFO, &msr);
2400 
2401 	fprintf(outf, "cpu%d: MSR_PLATFORM_INFO: 0x%08llx\n", base_cpu, msr);
2402 
2403 	ratio = (msr >> 40) & 0xFF;
2404 	fprintf(outf, "%d * %.1f = %.1f MHz max efficiency frequency\n", ratio, bclk, ratio * bclk);
2405 
2406 	ratio = (msr >> 8) & 0xFF;
2407 	fprintf(outf, "%d * %.1f = %.1f MHz base frequency\n", ratio, bclk, ratio * bclk);
2408 
2409 	get_msr(base_cpu, MSR_IA32_POWER_CTL, &msr);
2410 	fprintf(outf, "cpu%d: MSR_IA32_POWER_CTL: 0x%08llx (C1E auto-promotion: %sabled)\n",
2411 		base_cpu, msr, msr & 0x2 ? "EN" : "DIS");
2412 
2413 	/* C-state Pre-wake Disable (CSTATE_PREWAKE_DISABLE) */
2414 	if (dis_cstate_prewake)
2415 		fprintf(outf, "C-state Pre-wake: %sabled\n", msr & 0x40000000 ? "DIS" : "EN");
2416 
2417 	return;
2418 }
2419 
2420 static void dump_hsw_turbo_ratio_limits(void)
2421 {
2422 	unsigned long long msr;
2423 	unsigned int ratio;
2424 
2425 	get_msr(base_cpu, MSR_TURBO_RATIO_LIMIT2, &msr);
2426 
2427 	fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT2: 0x%08llx\n", base_cpu, msr);
2428 
2429 	ratio = (msr >> 8) & 0xFF;
2430 	if (ratio)
2431 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo 18 active cores\n", ratio, bclk, ratio * bclk);
2432 
2433 	ratio = (msr >> 0) & 0xFF;
2434 	if (ratio)
2435 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo 17 active cores\n", ratio, bclk, ratio * bclk);
2436 	return;
2437 }
2438 
2439 static void dump_ivt_turbo_ratio_limits(void)
2440 {
2441 	unsigned long long msr;
2442 	unsigned int ratio;
2443 
2444 	get_msr(base_cpu, MSR_TURBO_RATIO_LIMIT1, &msr);
2445 
2446 	fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT1: 0x%08llx\n", base_cpu, msr);
2447 
2448 	ratio = (msr >> 56) & 0xFF;
2449 	if (ratio)
2450 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo 16 active cores\n", ratio, bclk, ratio * bclk);
2451 
2452 	ratio = (msr >> 48) & 0xFF;
2453 	if (ratio)
2454 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo 15 active cores\n", ratio, bclk, ratio * bclk);
2455 
2456 	ratio = (msr >> 40) & 0xFF;
2457 	if (ratio)
2458 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo 14 active cores\n", ratio, bclk, ratio * bclk);
2459 
2460 	ratio = (msr >> 32) & 0xFF;
2461 	if (ratio)
2462 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo 13 active cores\n", ratio, bclk, ratio * bclk);
2463 
2464 	ratio = (msr >> 24) & 0xFF;
2465 	if (ratio)
2466 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo 12 active cores\n", ratio, bclk, ratio * bclk);
2467 
2468 	ratio = (msr >> 16) & 0xFF;
2469 	if (ratio)
2470 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo 11 active cores\n", ratio, bclk, ratio * bclk);
2471 
2472 	ratio = (msr >> 8) & 0xFF;
2473 	if (ratio)
2474 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo 10 active cores\n", ratio, bclk, ratio * bclk);
2475 
2476 	ratio = (msr >> 0) & 0xFF;
2477 	if (ratio)
2478 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo 9 active cores\n", ratio, bclk, ratio * bclk);
2479 	return;
2480 }
2481 
2482 int has_turbo_ratio_group_limits(int family, int model)
2483 {
2484 
2485 	if (!genuine_intel)
2486 		return 0;
2487 
2488 	switch (model) {
2489 	case INTEL_FAM6_ATOM_GOLDMONT:
2490 	case INTEL_FAM6_SKYLAKE_X:
2491 	case INTEL_FAM6_ICELAKE_X:
2492 	case INTEL_FAM6_ATOM_GOLDMONT_D:
2493 	case INTEL_FAM6_ATOM_TREMONT_D:
2494 		return 1;
2495 	}
2496 	return 0;
2497 }
2498 
2499 static void dump_turbo_ratio_limits(int family, int model)
2500 {
2501 	unsigned long long msr, core_counts;
2502 	unsigned int ratio, group_size;
2503 
2504 	get_msr(base_cpu, MSR_TURBO_RATIO_LIMIT, &msr);
2505 	fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT: 0x%08llx\n", base_cpu, msr);
2506 
2507 	if (has_turbo_ratio_group_limits(family, model)) {
2508 		get_msr(base_cpu, MSR_TURBO_RATIO_LIMIT1, &core_counts);
2509 		fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT1: 0x%08llx\n", base_cpu, core_counts);
2510 	} else {
2511 		core_counts = 0x0807060504030201;
2512 	}
2513 
2514 	ratio = (msr >> 56) & 0xFF;
2515 	group_size = (core_counts >> 56) & 0xFF;
2516 	if (ratio)
2517 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo %d active cores\n",
2518 			ratio, bclk, ratio * bclk, group_size);
2519 
2520 	ratio = (msr >> 48) & 0xFF;
2521 	group_size = (core_counts >> 48) & 0xFF;
2522 	if (ratio)
2523 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo %d active cores\n",
2524 			ratio, bclk, ratio * bclk, group_size);
2525 
2526 	ratio = (msr >> 40) & 0xFF;
2527 	group_size = (core_counts >> 40) & 0xFF;
2528 	if (ratio)
2529 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo %d active cores\n",
2530 			ratio, bclk, ratio * bclk, group_size);
2531 
2532 	ratio = (msr >> 32) & 0xFF;
2533 	group_size = (core_counts >> 32) & 0xFF;
2534 	if (ratio)
2535 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo %d active cores\n",
2536 			ratio, bclk, ratio * bclk, group_size);
2537 
2538 	ratio = (msr >> 24) & 0xFF;
2539 	group_size = (core_counts >> 24) & 0xFF;
2540 	if (ratio)
2541 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo %d active cores\n",
2542 			ratio, bclk, ratio * bclk, group_size);
2543 
2544 	ratio = (msr >> 16) & 0xFF;
2545 	group_size = (core_counts >> 16) & 0xFF;
2546 	if (ratio)
2547 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo %d active cores\n",
2548 			ratio, bclk, ratio * bclk, group_size);
2549 
2550 	ratio = (msr >> 8) & 0xFF;
2551 	group_size = (core_counts >> 8) & 0xFF;
2552 	if (ratio)
2553 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo %d active cores\n",
2554 			ratio, bclk, ratio * bclk, group_size);
2555 
2556 	ratio = (msr >> 0) & 0xFF;
2557 	group_size = (core_counts >> 0) & 0xFF;
2558 	if (ratio)
2559 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo %d active cores\n",
2560 			ratio, bclk, ratio * bclk, group_size);
2561 	return;
2562 }
2563 
2564 static void dump_atom_turbo_ratio_limits(void)
2565 {
2566 	unsigned long long msr;
2567 	unsigned int ratio;
2568 
2569 	get_msr(base_cpu, MSR_ATOM_CORE_RATIOS, &msr);
2570 	fprintf(outf, "cpu%d: MSR_ATOM_CORE_RATIOS: 0x%08llx\n", base_cpu, msr & 0xFFFFFFFF);
2571 
2572 	ratio = (msr >> 0) & 0x3F;
2573 	if (ratio)
2574 		fprintf(outf, "%d * %.1f = %.1f MHz minimum operating frequency\n", ratio, bclk, ratio * bclk);
2575 
2576 	ratio = (msr >> 8) & 0x3F;
2577 	if (ratio)
2578 		fprintf(outf, "%d * %.1f = %.1f MHz low frequency mode (LFM)\n", ratio, bclk, ratio * bclk);
2579 
2580 	ratio = (msr >> 16) & 0x3F;
2581 	if (ratio)
2582 		fprintf(outf, "%d * %.1f = %.1f MHz base frequency\n", ratio, bclk, ratio * bclk);
2583 
2584 	get_msr(base_cpu, MSR_ATOM_CORE_TURBO_RATIOS, &msr);
2585 	fprintf(outf, "cpu%d: MSR_ATOM_CORE_TURBO_RATIOS: 0x%08llx\n", base_cpu, msr & 0xFFFFFFFF);
2586 
2587 	ratio = (msr >> 24) & 0x3F;
2588 	if (ratio)
2589 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo 4 active cores\n", ratio, bclk, ratio * bclk);
2590 
2591 	ratio = (msr >> 16) & 0x3F;
2592 	if (ratio)
2593 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo 3 active cores\n", ratio, bclk, ratio * bclk);
2594 
2595 	ratio = (msr >> 8) & 0x3F;
2596 	if (ratio)
2597 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo 2 active cores\n", ratio, bclk, ratio * bclk);
2598 
2599 	ratio = (msr >> 0) & 0x3F;
2600 	if (ratio)
2601 		fprintf(outf, "%d * %.1f = %.1f MHz max turbo 1 active core\n", ratio, bclk, ratio * bclk);
2602 }
2603 
2604 static void dump_knl_turbo_ratio_limits(void)
2605 {
2606 	const unsigned int buckets_no = 7;
2607 
2608 	unsigned long long msr;
2609 	int delta_cores, delta_ratio;
2610 	int i, b_nr;
2611 	unsigned int cores[buckets_no];
2612 	unsigned int ratio[buckets_no];
2613 
2614 	get_msr(base_cpu, MSR_TURBO_RATIO_LIMIT, &msr);
2615 
2616 	fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT: 0x%08llx\n", base_cpu, msr);
2617 
2618 	/*
2619 	 * Turbo encoding in KNL is as follows:
2620 	 * [0] -- Reserved
2621 	 * [7:1] -- Base value of number of active cores of bucket 1.
2622 	 * [15:8] -- Base value of freq ratio of bucket 1.
2623 	 * [20:16] -- +ve delta of number of active cores of bucket 2.
2624 	 * i.e. active cores of bucket 2 =
2625 	 * active cores of bucket 1 + delta
2626 	 * [23:21] -- Negative delta of freq ratio of bucket 2.
2627 	 * i.e. freq ratio of bucket 2 =
2628 	 * freq ratio of bucket 1 - delta
2629 	 * [28:24]-- +ve delta of number of active cores of bucket 3.
2630 	 * [31:29]-- -ve delta of freq ratio of bucket 3.
2631 	 * [36:32]-- +ve delta of number of active cores of bucket 4.
2632 	 * [39:37]-- -ve delta of freq ratio of bucket 4.
2633 	 * [44:40]-- +ve delta of number of active cores of bucket 5.
2634 	 * [47:45]-- -ve delta of freq ratio of bucket 5.
2635 	 * [52:48]-- +ve delta of number of active cores of bucket 6.
2636 	 * [55:53]-- -ve delta of freq ratio of bucket 6.
2637 	 * [60:56]-- +ve delta of number of active cores of bucket 7.
2638 	 * [63:61]-- -ve delta of freq ratio of bucket 7.
2639 	 */
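	/*
	 * Worked example (hypothetical msr): bits [7:1] = 2 and [15:8] = 30
	 * define bucket 1 as up to 2 active cores at ratio 30.  With bits
	 * [20:16] = 2 and [23:21] = 1, bucket 2 is up to 2 + 2 = 4 active
	 * cores at ratio 30 - 1 = 29.
	 */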
2640 
2641 	b_nr = 0;
2642 	cores[b_nr] = (msr & 0xFF) >> 1;
2643 	ratio[b_nr] = (msr >> 8) & 0xFF;
2644 
2645 	for (i = 16; i < 64; i += 8) {
2646 		delta_cores = (msr >> i) & 0x1F;
2647 		delta_ratio = (msr >> (i + 5)) & 0x7;
2648 
2649 		cores[b_nr + 1] = cores[b_nr] + delta_cores;
2650 		ratio[b_nr + 1] = ratio[b_nr] - delta_ratio;
2651 		b_nr++;
2652 	}
2653 
2654 	for (i = buckets_no - 1; i >= 0; i--)
2655 		if (i > 0 ? ratio[i] != ratio[i - 1] : 1)
2656 			fprintf(outf,
2657 				"%d * %.1f = %.1f MHz max turbo %d active cores\n",
2658 				ratio[i], bclk, ratio[i] * bclk, cores[i]);
2659 }
2660 
2661 static void dump_nhm_cst_cfg(void)
2662 {
2663 	unsigned long long msr;
2664 
2665 	get_msr(base_cpu, MSR_PKG_CST_CONFIG_CONTROL, &msr);
2666 
2667 	fprintf(outf, "cpu%d: MSR_PKG_CST_CONFIG_CONTROL: 0x%08llx", base_cpu, msr);
2668 
2669 	fprintf(outf, " (%s%s%s%s%slocked, pkg-cstate-limit=%d (%s)",
2670 		(msr & SNB_C3_AUTO_UNDEMOTE) ? "UNdemote-C3, " : "",
2671 		(msr & SNB_C1_AUTO_UNDEMOTE) ? "UNdemote-C1, " : "",
2672 		(msr & NHM_C3_AUTO_DEMOTE) ? "demote-C3, " : "",
2673 		(msr & NHM_C1_AUTO_DEMOTE) ? "demote-C1, " : "",
2674 		(msr & (1 << 15)) ? "" : "UN", (unsigned int)msr & 0xF, pkg_cstate_limit_strings[pkg_cstate_limit]);
2675 
2676 #define AUTOMATIC_CSTATE_CONVERSION		(1UL << 16)
2677 	if (has_automatic_cstate_conversion) {
2678 		fprintf(outf, ", automatic c-state conversion=%s", (msr & AUTOMATIC_CSTATE_CONVERSION) ? "on" : "off");
2679 	}
2680 
2681 	fprintf(outf, ")\n");
2682 
2683 	return;
2684 }
2685 
2686 static void dump_config_tdp(void)
2687 {
2688 	unsigned long long msr;
2689 
2690 	get_msr(base_cpu, MSR_CONFIG_TDP_NOMINAL, &msr);
2691 	fprintf(outf, "cpu%d: MSR_CONFIG_TDP_NOMINAL: 0x%08llx", base_cpu, msr);
2692 	fprintf(outf, " (base_ratio=%d)\n", (unsigned int)msr & 0xFF);
2693 
2694 	get_msr(base_cpu, MSR_CONFIG_TDP_LEVEL_1, &msr);
2695 	fprintf(outf, "cpu%d: MSR_CONFIG_TDP_LEVEL_1: 0x%08llx (", base_cpu, msr);
2696 	if (msr) {
2697 		fprintf(outf, "PKG_MIN_PWR_LVL1=%d ", (unsigned int)(msr >> 48) & 0x7FFF);
2698 		fprintf(outf, "PKG_MAX_PWR_LVL1=%d ", (unsigned int)(msr >> 32) & 0x7FFF);
2699 		fprintf(outf, "LVL1_RATIO=%d ", (unsigned int)(msr >> 16) & 0xFF);
2700 		fprintf(outf, "PKG_TDP_LVL1=%d", (unsigned int)(msr) & 0x7FFF);
2701 	}
2702 	fprintf(outf, ")\n");
2703 
2704 	get_msr(base_cpu, MSR_CONFIG_TDP_LEVEL_2, &msr);
2705 	fprintf(outf, "cpu%d: MSR_CONFIG_TDP_LEVEL_2: 0x%08llx (", base_cpu, msr);
2706 	if (msr) {
2707 		fprintf(outf, "PKG_MIN_PWR_LVL2=%d ", (unsigned int)(msr >> 48) & 0x7FFF);
2708 		fprintf(outf, "PKG_MAX_PWR_LVL2=%d ", (unsigned int)(msr >> 32) & 0x7FFF);
2709 		fprintf(outf, "LVL2_RATIO=%d ", (unsigned int)(msr >> 16) & 0xFF);
2710 		fprintf(outf, "PKG_TDP_LVL2=%d", (unsigned int)(msr) & 0x7FFF);
2711 	}
2712 	fprintf(outf, ")\n");
2713 
2714 	get_msr(base_cpu, MSR_CONFIG_TDP_CONTROL, &msr);
2715 	fprintf(outf, "cpu%d: MSR_CONFIG_TDP_CONTROL: 0x%08llx (", base_cpu, msr);
2716 	if ((msr) & 0x3)
2717 		fprintf(outf, "TDP_LEVEL=%d ", (unsigned int)(msr) & 0x3);
2718 	fprintf(outf, " lock=%d", (unsigned int)(msr >> 31) & 1);
2719 	fprintf(outf, ")\n");
2720 
2721 	get_msr(base_cpu, MSR_TURBO_ACTIVATION_RATIO, &msr);
2722 	fprintf(outf, "cpu%d: MSR_TURBO_ACTIVATION_RATIO: 0x%08llx (", base_cpu, msr);
2723 	fprintf(outf, "MAX_NON_TURBO_RATIO=%d", (unsigned int)(msr) & 0xFF);
2724 	fprintf(outf, " lock=%d", (unsigned int)(msr >> 31) & 1);
2725 	fprintf(outf, ")\n");
2726 }
2727 
2728 unsigned int irtl_time_units[] = { 1, 32, 1024, 32768, 1048576, 33554432, 0, 0 };
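/*
 * Example: an IRTL MSR with time-unit field 2 (1024 ns) and a limit field
 * of 0x80 decodes in print_irtl() to 0x80 * 1024 = 131072 ns.
 */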
2729 
2730 void print_irtl(void)
2731 {
2732 	unsigned long long msr;
2733 
2734 	get_msr(base_cpu, MSR_PKGC3_IRTL, &msr);
2735 	fprintf(outf, "cpu%d: MSR_PKGC3_IRTL: 0x%08llx (", base_cpu, msr);
2736 	fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT",
2737 		(msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
2738 
2739 	get_msr(base_cpu, MSR_PKGC6_IRTL, &msr);
2740 	fprintf(outf, "cpu%d: MSR_PKGC6_IRTL: 0x%08llx (", base_cpu, msr);
2741 	fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT",
2742 		(msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
2743 
2744 	get_msr(base_cpu, MSR_PKGC7_IRTL, &msr);
2745 	fprintf(outf, "cpu%d: MSR_PKGC7_IRTL: 0x%08llx (", base_cpu, msr);
2746 	fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT",
2747 		(msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
2748 
2749 	if (!do_irtl_hsw)
2750 		return;
2751 
2752 	get_msr(base_cpu, MSR_PKGC8_IRTL, &msr);
2753 	fprintf(outf, "cpu%d: MSR_PKGC8_IRTL: 0x%08llx (", base_cpu, msr);
2754 	fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT",
2755 		(msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
2756 
2757 	get_msr(base_cpu, MSR_PKGC9_IRTL, &msr);
2758 	fprintf(outf, "cpu%d: MSR_PKGC9_IRTL: 0x%08llx (", base_cpu, msr);
2759 	fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT",
2760 		(msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
2761 
2762 	get_msr(base_cpu, MSR_PKGC10_IRTL, &msr);
2763 	fprintf(outf, "cpu%d: MSR_PKGC10_IRTL: 0x%08llx (", base_cpu, msr);
2764 	fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT",
2765 		(msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
2766 
2767 }
2768 
2769 void free_fd_percpu(void)
2770 {
2771 	int i;
2772 
2773 	for (i = 0; i < topo.max_cpu_num + 1; ++i) {
2774 		if (fd_percpu[i] != 0)
2775 			close(fd_percpu[i]);
2776 	}
2777 
2778 	free(fd_percpu);
2779 }
2780 
2781 void free_all_buffers(void)
2782 {
2783 	int i;
2784 
2785 	CPU_FREE(cpu_present_set);
2786 	cpu_present_set = NULL;
2787 	cpu_present_setsize = 0;
2788 
2789 	CPU_FREE(cpu_affinity_set);
2790 	cpu_affinity_set = NULL;
2791 	cpu_affinity_setsize = 0;
2792 
2793 	free(thread_even);
2794 	free(core_even);
2795 	free(package_even);
2796 
2797 	thread_even = NULL;
2798 	core_even = NULL;
2799 	package_even = NULL;
2800 
2801 	free(thread_odd);
2802 	free(core_odd);
2803 	free(package_odd);
2804 
2805 	thread_odd = NULL;
2806 	core_odd = NULL;
2807 	package_odd = NULL;
2808 
2809 	free(output_buffer);
2810 	output_buffer = NULL;
2811 	outp = NULL;
2812 
2813 	free_fd_percpu();
2814 
2815 	free(irq_column_2_cpu);
2816 	free(irqs_per_cpu);
2817 
2818 	for (i = 0; i <= topo.max_cpu_num; ++i) {
2819 		if (cpus[i].put_ids)
2820 			CPU_FREE(cpus[i].put_ids);
2821 	}
2822 	free(cpus);
2823 }
2824 
2825 /*
2826  * Parse a file containing a single int.
2827  * Return 0 if file can not be opened
2828  * Exit if file can be opened, but can not be parsed
2829  */
2830 int parse_int_file(const char *fmt, ...)
2831 {
2832 	va_list args;
2833 	char path[PATH_MAX];
2834 	FILE *filep;
2835 	int value;
2836 
2837 	va_start(args, fmt);
2838 	vsnprintf(path, sizeof(path), fmt, args);
2839 	va_end(args);
2840 	filep = fopen(path, "r");
2841 	if (!filep)
2842 		return 0;
2843 	if (fscanf(filep, "%d", &value) != 1)
2844 		err(1, "%s: failed to parse number from file", path);
2845 	fclose(filep);
2846 	return value;
2847 }
2848 
2849 /*
2850  * cpu_is_first_core_in_package(cpu)
2851  * return 1 if given CPU is 1st core in package
2852  */
2853 int cpu_is_first_core_in_package(int cpu)
2854 {
2855 	return cpu == parse_int_file("/sys/devices/system/cpu/cpu%d/topology/core_siblings_list", cpu);
2856 }
2857 
2858 int get_physical_package_id(int cpu)
2859 {
2860 	return parse_int_file("/sys/devices/system/cpu/cpu%d/topology/physical_package_id", cpu);
2861 }
2862 
2863 int get_die_id(int cpu)
2864 {
2865 	return parse_int_file("/sys/devices/system/cpu/cpu%d/topology/die_id", cpu);
2866 }
2867 
2868 int get_core_id(int cpu)
2869 {
2870 	return parse_int_file("/sys/devices/system/cpu/cpu%d/topology/core_id", cpu);
2871 }
2872 
2873 void set_node_data(void)
2874 {
2875 	int pkg, node, lnode, cpu, cpux;
2876 	int cpu_count;
2877 
2878 	/* initialize logical_node_id */
2879 	for (cpu = 0; cpu <= topo.max_cpu_num; ++cpu)
2880 		cpus[cpu].logical_node_id = -1;
2881 
2882 	cpu_count = 0;
2883 	for (pkg = 0; pkg < topo.num_packages; pkg++) {
2884 		lnode = 0;
2885 		for (cpu = 0; cpu <= topo.max_cpu_num; ++cpu) {
2886 			if (cpus[cpu].physical_package_id != pkg)
2887 				continue;
2888 			/* find a cpu with an unset logical_node_id */
2889 			if (cpus[cpu].logical_node_id != -1)
2890 				continue;
2891 			cpus[cpu].logical_node_id = lnode;
2892 			node = cpus[cpu].physical_node_id;
2893 			cpu_count++;
2894 			/*
2895 			 * find all matching cpus on this pkg and set
2896 			 * the logical_node_id
2897 			 */
2898 			for (cpux = cpu; cpux <= topo.max_cpu_num; cpux++) {
2899 				if ((cpus[cpux].physical_package_id == pkg) && (cpus[cpux].physical_node_id == node)) {
2900 					cpus[cpux].logical_node_id = lnode;
2901 					cpu_count++;
2902 				}
2903 			}
2904 			lnode++;
2905 			if (lnode > topo.nodes_per_pkg)
2906 				topo.nodes_per_pkg = lnode;
2907 		}
2908 		if (cpu_count >= topo.max_cpu_num)
2909 			break;
2910 	}
2911 }
2912 
2913 int get_physical_node_id(struct cpu_topology *thiscpu)
2914 {
2915 	char path[80];
2916 	FILE *filep;
2917 	int i;
2918 	int cpu = thiscpu->logical_cpu_id;
2919 
2920 	for (i = 0; i <= topo.max_cpu_num; i++) {
2921 		sprintf(path, "/sys/devices/system/cpu/cpu%d/node%i/cpulist", cpu, i);
2922 		filep = fopen(path, "r");
2923 		if (!filep)
2924 			continue;
2925 		fclose(filep);
2926 		return i;
2927 	}
2928 	return -1;
2929 }
2930 
2931 int get_thread_siblings(struct cpu_topology *thiscpu)
2932 {
2933 	char path[80], character;
2934 	FILE *filep;
2935 	unsigned long map;
2936 	int so, shift, sib_core;
2937 	int cpu = thiscpu->logical_cpu_id;
2938 	int offset = topo.max_cpu_num + 1;
2939 	size_t size;
2940 	int thread_id = 0;
2941 
2942 	thiscpu->put_ids = CPU_ALLOC((topo.max_cpu_num + 1));
2943 	if (thiscpu->thread_id < 0)
2944 		thiscpu->thread_id = thread_id++;
2945 	if (!thiscpu->put_ids)
2946 		return -1;
2947 
2948 	size = CPU_ALLOC_SIZE((topo.max_cpu_num + 1));
2949 	CPU_ZERO_S(size, thiscpu->put_ids);
2950 
2951 	sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", cpu);
2952 	filep = fopen(path, "r");
2953 
2954 	if (!filep) {
2955 		warnx("%s: open failed", path);
2956 		return -1;
2957 	}
2958 	do {
2959 		offset -= BITMASK_SIZE;
2960 		if (fscanf(filep, "%lx%c", &map, &character) != 2)
2961 			err(1, "%s: failed to parse file", path);
2962 		for (shift = 0; shift < BITMASK_SIZE; shift++) {
2963 			if ((map >> shift) & 0x1) {
2964 				so = shift + offset;
2965 				sib_core = get_core_id(so);
2966 				if (sib_core == thiscpu->physical_core_id) {
2967 					CPU_SET_S(so, size, thiscpu->put_ids);
2968 					if ((so != cpu) && (cpus[so].thread_id < 0))
2969 						cpus[so].thread_id = thread_id++;
2970 				}
2971 			}
2972 		}
2973 	} while (!strncmp(&character, ",", 1));
2974 	fclose(filep);
2975 
2976 	return CPU_COUNT_S(size, thiscpu->put_ids);
2977 }
2978 
2979 /*
2980  * run func(thread, core, package) in topology order
2981  * skip non-present cpus
2982  */
2983 
2984 int for_all_cpus_2(int (func) (struct thread_data *, struct core_data *,
2985 			       struct pkg_data *, struct thread_data *, struct core_data *,
2986 			       struct pkg_data *), struct thread_data *thread_base,
2987 		   struct core_data *core_base, struct pkg_data *pkg_base,
2988 		   struct thread_data *thread_base2, struct core_data *core_base2, struct pkg_data *pkg_base2)
2989 {
2990 	int retval, pkg_no, node_no, core_no, thread_no;
2991 
2992 	for (pkg_no = 0; pkg_no < topo.num_packages; ++pkg_no) {
2993 		for (node_no = 0; node_no < topo.nodes_per_pkg; ++node_no) {
2994 			for (core_no = 0; core_no < topo.cores_per_node; ++core_no) {
2995 				for (thread_no = 0; thread_no < topo.threads_per_core; ++thread_no) {
2996 					struct thread_data *t, *t2;
2997 					struct core_data *c, *c2;
2998 					struct pkg_data *p, *p2;
2999 
3000 					t = GET_THREAD(thread_base, thread_no, core_no, node_no, pkg_no);
3001 
3002 					if (cpu_is_not_present(t->cpu_id))
3003 						continue;
3004 
3005 					t2 = GET_THREAD(thread_base2, thread_no, core_no, node_no, pkg_no);
3006 
3007 					c = GET_CORE(core_base, core_no, node_no, pkg_no);
3008 					c2 = GET_CORE(core_base2, core_no, node_no, pkg_no);
3009 
3010 					p = GET_PKG(pkg_base, pkg_no);
3011 					p2 = GET_PKG(pkg_base2, pkg_no);
3012 
3013 					retval = func(t, c, p, t2, c2, p2);
3014 					if (retval)
3015 						return retval;
3016 				}
3017 			}
3018 		}
3019 	}
3020 	return 0;
3021 }
3022 
3023 /*
3024  * run func(cpu) on every cpu in /proc/stat
3025  * return max_cpu number
3026  */
3027 int for_all_proc_cpus(int (func) (int))
3028 {
3029 	FILE *fp;
3030 	int cpu_num;
3031 	int retval;
3032 
3033 	fp = fopen_or_die(proc_stat, "r");
3034 
3035 	retval = fscanf(fp, "cpu %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d\n");
3036 	if (retval != 0)
3037 		err(1, "%s: failed to parse format", proc_stat);
3038 
3039 	while (1) {
3040 		retval = fscanf(fp, "cpu%u %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d\n", &cpu_num);
3041 		if (retval != 1)
3042 			break;
3043 
3044 		retval = func(cpu_num);
3045 		if (retval) {
3046 			fclose(fp);
3047 			return (retval);
3048 		}
3049 	}
3050 	fclose(fp);
3051 	return 0;
3052 }
3053 
3054 void re_initialize(void)
3055 {
3056 	free_all_buffers();
3057 	setup_all_buffers();
3058 	fprintf(outf, "turbostat: re-initialized with num_cpus %d\n", topo.num_cpus);
3059 }
3060 
3061 void set_max_cpu_num(void)
3062 {
3063 	FILE *filep;
3064 	int base_cpu;
3065 	unsigned long dummy;
3066 	char pathname[64];
3067 
3068 	base_cpu = sched_getcpu();
3069 	if (base_cpu < 0)
3070 		err(1, "cannot find calling cpu ID");
3071 	sprintf(pathname, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", base_cpu);
3072 
3073 	filep = fopen_or_die(pathname, "r");
3074 	topo.max_cpu_num = 0;
3075 	while (fscanf(filep, "%lx,", &dummy) == 1)
3076 		topo.max_cpu_num += BITMASK_SIZE;
3077 	fclose(filep);
3078 	topo.max_cpu_num--;	/* 0 based */
3079 }
3080 
3081 /*
3082  * count_cpus()
3083  * callback for for_all_proc_cpus(): count each cpu listed in /proc/stat
3084  */
3085 int count_cpus(int cpu)
3086 {
3087 	topo.num_cpus++;
3088 	return 0;
3089 }
3090 
3091 int mark_cpu_present(int cpu)
3092 {
3093 	CPU_SET_S(cpu, cpu_present_setsize, cpu_present_set);
3094 	return 0;
3095 }
3096 
3097 int init_thread_id(int cpu)
3098 {
3099 	cpus[cpu].thread_id = -1;
3100 	return 0;
3101 }
3102 
3103 /*
3104  * snapshot_proc_interrupts()
3105  *
3106  * read and record summary of /proc/interrupts
3107  *
3108  * return 1 if config change requires a restart, else return 0
3109  */
3110 int snapshot_proc_interrupts(void)
3111 {
3112 	static FILE *fp;
3113 	int column, retval;
3114 
3115 	if (fp == NULL)
3116 		fp = fopen_or_die("/proc/interrupts", "r");
3117 	else
3118 		rewind(fp);
3119 
3120 	/* read 1st line of /proc/interrupts to get cpu* name for each column */
3121 	for (column = 0; column < topo.num_cpus; ++column) {
3122 		int cpu_number;
3123 
3124 		retval = fscanf(fp, " CPU%d", &cpu_number);
3125 		if (retval != 1)
3126 			break;
3127 
3128 		if (cpu_number > topo.max_cpu_num) {
3129 			warn("/proc/interrupts: cpu%d: > %d", cpu_number, topo.max_cpu_num);
3130 			return 1;
3131 		}
3132 
3133 		irq_column_2_cpu[column] = cpu_number;
3134 		irqs_per_cpu[cpu_number] = 0;
3135 	}
3136 
3137 	/* read /proc/interrupt count lines and sum up irqs per cpu */
3138 	while (1) {
3139 		int column;
3140 		char buf[64];
3141 
3142 		retval = fscanf(fp, " %63s:", buf);	/* flush irq# "N:" */
3143 		if (retval != 1)
3144 			break;
3145 
3146 		/* read the count per cpu */
3147 		for (column = 0; column < topo.num_cpus; ++column) {
3148 
3149 			int cpu_number, irq_count;
3150 
3151 			retval = fscanf(fp, " %d", &irq_count);
3152 			if (retval != 1)
3153 				break;
3154 
3155 			cpu_number = irq_column_2_cpu[column];
3156 			irqs_per_cpu[cpu_number] += irq_count;
3157 
3158 		}
3159 
3160 		while (getc(fp) != '\n') ;	/* flush interrupt description */
3161 
3162 	}
3163 	return 0;
3164 }
3165 
3166 /*
3167  * snapshot_gfx_rc6_ms()
3168  *
3169  * record snapshot of
3170  * /sys/class/drm/card0/power/rc6_residency_ms
3171  *
3172  * return 1 if config change requires a restart, else return 0
3173  */
3174 int snapshot_gfx_rc6_ms(void)
3175 {
3176 	FILE *fp;
3177 	int retval;
3178 
3179 	fp = fopen_or_die("/sys/class/drm/card0/power/rc6_residency_ms", "r");
3180 
3181 	retval = fscanf(fp, "%lld", &gfx_cur_rc6_ms);
3182 	if (retval != 1)
3183 		err(1, "GFX rc6");
3184 
3185 	fclose(fp);
3186 
3187 	return 0;
3188 }
3189 
3190 /*
3191  * snapshot_gfx_mhz()
3192  *
3193  * record snapshot of
3194  * /sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz
3195  *
3196  * return 1 if config change requires a restart, else return 0
3197  */
3198 int snapshot_gfx_mhz(void)
3199 {
3200 	static FILE *fp;
3201 	int retval;
3202 
3203 	if (fp == NULL)
3204 		fp = fopen_or_die("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", "r");
3205 	else {
3206 		rewind(fp);
3207 		fflush(fp);
3208 	}
3209 
3210 	retval = fscanf(fp, "%d", &gfx_cur_mhz);
3211 	if (retval != 1)
3212 		err(1, "GFX MHz");
3213 
3214 	return 0;
3215 }
3216 
3217 /*
3218  * snapshot_gfx_act_mhz()
3219  *
3220  * record snapshot of
3221  * /sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz
3222  *
3223  * return 1 if config change requires a restart, else return 0
3224  */
3225 int snapshot_gfx_act_mhz(void)
3226 {
3227 	static FILE *fp;
3228 	int retval;
3229 
3230 	if (fp == NULL)
3231 		fp = fopen_or_die("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", "r");
3232 	else {
3233 		rewind(fp);
3234 		fflush(fp);
3235 	}
3236 
3237 	retval = fscanf(fp, "%d", &gfx_act_mhz);
3238 	if (retval != 1)
3239 		err(1, "GFX ACT MHz");
3240 
3241 	return 0;
3242 }
3243 
3244 /*
3245  * snapshot_cpu_lpi_us()
3246  *
3247  * record snapshot of
3248  * /sys/devices/system/cpu/cpuidle/low_power_idle_cpu_residency_us
3249  */
3250 int snapshot_cpu_lpi_us(void)
3251 {
3252 	FILE *fp;
3253 	int retval;
3254 
3255 	fp = fopen_or_die("/sys/devices/system/cpu/cpuidle/low_power_idle_cpu_residency_us", "r");
3256 
3257 	retval = fscanf(fp, "%lld", &cpuidle_cur_cpu_lpi_us);
3258 	if (retval != 1) {
3259 		fprintf(stderr, "Disabling Low Power Idle CPU output\n");
3260 		BIC_NOT_PRESENT(BIC_CPU_LPI);
3261 		fclose(fp);
3262 		return -1;
3263 	}
3264 
3265 	fclose(fp);
3266 
3267 	return 0;
3268 }
3269 
3270 /*
3271  * snapshot_sys_lpi_us()
3272  *
3273  * record snapshot of sys_lpi_file
3274  */
3275 int snapshot_sys_lpi_us(void)
3276 {
3277 	FILE *fp;
3278 	int retval;
3279 
3280 	fp = fopen_or_die(sys_lpi_file, "r");
3281 
3282 	retval = fscanf(fp, "%lld", &cpuidle_cur_sys_lpi_us);
3283 	if (retval != 1) {
3284 		fprintf(stderr, "Disabling Low Power Idle System output\n");
3285 		BIC_NOT_PRESENT(BIC_SYS_LPI);
3286 		fclose(fp);
3287 		return -1;
3288 	}
3289 	fclose(fp);
3290 
3291 	return 0;
3292 }
3293 
3294 /*
3295  * snapshot /proc and /sys files
3296  *
3297  * return 1 if configuration restart needed, else return 0
3298  */
3299 int snapshot_proc_sysfs_files(void)
3300 {
3301 	if (DO_BIC(BIC_IRQ))
3302 		if (snapshot_proc_interrupts())
3303 			return 1;
3304 
3305 	if (DO_BIC(BIC_GFX_rc6))
3306 		snapshot_gfx_rc6_ms();
3307 
3308 	if (DO_BIC(BIC_GFXMHz))
3309 		snapshot_gfx_mhz();
3310 
3311 	if (DO_BIC(BIC_GFXACTMHz))
3312 		snapshot_gfx_act_mhz();
3313 
3314 	if (DO_BIC(BIC_CPU_LPI))
3315 		snapshot_cpu_lpi_us();
3316 
3317 	if (DO_BIC(BIC_SYS_LPI))
3318 		snapshot_sys_lpi_us();
3319 
3320 	return 0;
3321 }
3322 
3323 int exit_requested;
3324 
3325 static void signal_handler(int signal)
3326 {
3327 	switch (signal) {
3328 	case SIGINT:
3329 		exit_requested = 1;
3330 		if (debug)
3331 			fprintf(stderr, " SIGINT\n");
3332 		break;
3333 	case SIGUSR1:
3334 		if (debug > 1)
3335 			fprintf(stderr, "SIGUSR1\n");
3336 		break;
3337 	}
3338 }
3339 
3340 void setup_signal_handler(void)
3341 {
3342 	struct sigaction sa;
3343 
3344 	memset(&sa, 0, sizeof(sa));
3345 
3346 	sa.sa_handler = &signal_handler;
3347 
3348 	if (sigaction(SIGINT, &sa, NULL) < 0)
3349 		err(1, "sigaction SIGINT");
3350 	if (sigaction(SIGUSR1, &sa, NULL) < 0)
3351 		err(1, "sigaction SIGUSR1");
3352 }
3353 
3354 void do_sleep(void)
3355 {
3356 	struct timeval tout;
3357 	struct timespec rest;
3358 	fd_set readfds;
3359 	int retval;
3360 
3361 	FD_ZERO(&readfds);
3362 	FD_SET(0, &readfds);
3363 
3364 	if (ignore_stdin) {
3365 		nanosleep(&interval_ts, NULL);
3366 		return;
3367 	}
3368 
3369 	tout = interval_tv;
3370 	retval = select(1, &readfds, NULL, NULL, &tout);
3371 
3372 	if (retval == 1) {
3373 		switch (getc(stdin)) {
3374 		case 'q':
3375 			exit_requested = 1;
3376 			break;
3377 		case EOF:
3378 			/*
3379 			 * 'stdin' is a pipe closed on the other end. There
3380 			 * won't be any further input.
3381 			 */
3382 			ignore_stdin = 1;
3383 			/* Sleep the rest of the time */
3384 			rest.tv_sec = (tout.tv_sec + tout.tv_usec / 1000000);
3385 			rest.tv_nsec = (tout.tv_usec % 1000000) * 1000;
3386 			nanosleep(&rest, NULL);
3387 		}
3388 	}
3389 }
3390 
3391 int get_msr_sum(int cpu, off_t offset, unsigned long long *msr)
3392 {
3393 	int ret, idx;
3394 	unsigned long long msr_cur, msr_last;
3395 
3396 	if (!per_cpu_msr_sum)
3397 		return 1;
3398 
3399 	idx = offset_to_idx(offset);
3400 	if (idx < 0)
3401 		return idx;
3402 	/* get_msr_sum() = sum + (get_msr() - last) */
3403 	ret = get_msr(cpu, offset, &msr_cur);
3404 	if (ret)
3405 		return ret;
3406 	msr_last = per_cpu_msr_sum[cpu].entries[idx].last;
3407 	DELTA_WRAP32(msr_cur, msr_last);
3408 	*msr = msr_last + per_cpu_msr_sum[cpu].entries[idx].sum;
3409 
3410 	return 0;
3411 }
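/*
 * Illustrative wraparound case for the 32-bit RAPL energy counters: if the
 * previous raw reading was 0xFFFFFF00 and the current one is 0x00000100,
 * DELTA_WRAP32() yields a delta of 0x200, which get_msr_sum() and
 * update_msr_sum() add on top of the accumulated 64-bit sum.
 */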
3412 
3413 timer_t timerid;
3414 
3415 /* Timer callback, update the sum of MSRs periodically. */
3416 static int update_msr_sum(struct thread_data *t, struct core_data *c, struct pkg_data *p)
3417 {
3418 	int i, ret;
3419 	int cpu = t->cpu_id;
3420 
3421 	for (i = IDX_PKG_ENERGY; i < IDX_COUNT; i++) {
3422 		unsigned long long msr_cur, msr_last;
3423 		off_t offset;
3424 
3425 		if (!idx_valid(i))
3426 			continue;
3427 		offset = idx_to_offset(i);
3428 		if (offset < 0)
3429 			continue;
3430 		ret = get_msr(cpu, offset, &msr_cur);
3431 		if (ret) {
3432 			fprintf(outf, "Can not update msr(0x%llx)\n", (unsigned long long)offset);
3433 			continue;
3434 		}
3435 
3436 		msr_last = per_cpu_msr_sum[cpu].entries[i].last;
3437 		per_cpu_msr_sum[cpu].entries[i].last = msr_cur & 0xffffffff;
3438 
3439 		DELTA_WRAP32(msr_cur, msr_last);
3440 		per_cpu_msr_sum[cpu].entries[i].sum += msr_last;
3441 	}
3442 	return 0;
3443 }
3444 
3445 static void msr_record_handler(union sigval v)
3446 {
3447 	for_all_cpus(update_msr_sum, EVEN_COUNTERS);
3448 }
3449 
3450 void msr_sum_record(void)
3451 {
3452 	struct itimerspec its;
3453 	struct sigevent sev;
3454 
3455 	per_cpu_msr_sum = calloc(topo.max_cpu_num + 1, sizeof(struct msr_sum_array));
3456 	if (!per_cpu_msr_sum) {
3457 		fprintf(outf, "Can not allocate memory for long time MSR.\n");
3458 		return;
3459 	}
3460 	/*
3461 	 * Signal handler might be restricted, so use thread notifier instead.
3462 	 */
3463 	memset(&sev, 0, sizeof(struct sigevent));
3464 	sev.sigev_notify = SIGEV_THREAD;
3465 	sev.sigev_notify_function = msr_record_handler;
3466 
3467 	sev.sigev_value.sival_ptr = &timerid;
3468 	if (timer_create(CLOCK_REALTIME, &sev, &timerid) == -1) {
3469 		fprintf(outf, "Can not create timer.\n");
3470 		goto release_msr;
3471 	}
3472 
3473 	its.it_value.tv_sec = 0;
3474 	its.it_value.tv_nsec = 1;
3475 	/*
3476 	 * A wraparound time was calculated earlier.
3477 	 * Some sources state that the peak power for a
3478 	 * microprocessor is usually 1.5 times the TDP rating,
3479 	 * so 2 * TDP is used for safety.
3480 	 */
3481 	its.it_interval.tv_sec = rapl_joule_counter_range / 2;
3482 	its.it_interval.tv_nsec = 0;
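	/*
	 * Example with made-up numbers: if rapl_joule_counter_range is
	 * 400 seconds, the timer below fires every 200 seconds, so the
	 * 32-bit energy counters cannot wrap unnoticed between samples.
	 */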
3483 
3484 	if (timer_settime(timerid, 0, &its, NULL) == -1) {
3485 		fprintf(outf, "Can not set timer.\n");
3486 		goto release_timer;
3487 	}
3488 	return;
3489 
3490 release_timer:
3491 	timer_delete(timerid);
3492 release_msr:
3493 	free(per_cpu_msr_sum);
3494 }
3495 
3496 /*
3497  * set_my_sched_priority(pri)
3498  * return previous
3499  *
3500  * if non-root, do this:
3501  * # /sbin/setcap cap_sys_rawio,cap_sys_nice=+ep /usr/bin/turbostat
3502  */
3503 int set_my_sched_priority(int priority)
3504 {
3505 	int retval;
3506 	int original_priority;
3507 
3508 	errno = 0;
3509 	original_priority = getpriority(PRIO_PROCESS, 0);
3510 	if (errno && (original_priority == -1))
3511 		err(errno, "getpriority");
3512 
3513 	retval = setpriority(PRIO_PROCESS, 0, priority);
3514 	if (retval)
3515 		err(retval, "setpriority(%d)", priority);
3516 
3517 	errno = 0;
3518 	retval = getpriority(PRIO_PROCESS, 0);
3519 	if (retval != priority)
3520 		err(retval, "getpriority(%d) != setpriority(%d)", retval, priority);
3521 
3522 	return original_priority;
3523 }
3524 
3525 void turbostat_loop()
3526 {
3527 	int retval;
3528 	int restarted = 0;
3529 	int done_iters = 0;
3530 
3531 	setup_signal_handler();
3532 
3533 	/*
3534 	 * elevate own priority for interval mode
3535 	 */
3536 	set_my_sched_priority(-20);
3537 
3538 restart:
3539 	restarted++;
3540 
3541 	snapshot_proc_sysfs_files();
3542 	retval = for_all_cpus(get_counters, EVEN_COUNTERS);
3543 	first_counter_read = 0;
3544 	if (retval < -1) {
3545 		exit(retval);
3546 	} else if (retval == -1) {
3547 		if (restarted > 10) {
3548 			exit(retval);
3549 		}
3550 		re_initialize();
3551 		goto restart;
3552 	}
3553 	restarted = 0;
3554 	done_iters = 0;
3555 	gettimeofday(&tv_even, (struct timezone *)NULL);
3556 
3557 	while (1) {
3558 		if (for_all_proc_cpus(cpu_is_not_present)) {
3559 			re_initialize();
3560 			goto restart;
3561 		}
3562 		do_sleep();
3563 		if (snapshot_proc_sysfs_files())
3564 			goto restart;
3565 		retval = for_all_cpus(get_counters, ODD_COUNTERS);
3566 		if (retval < -1) {
3567 			exit(retval);
3568 		} else if (retval == -1) {
3569 			re_initialize();
3570 			goto restart;
3571 		}
3572 		gettimeofday(&tv_odd, (struct timezone *)NULL);
3573 		timersub(&tv_odd, &tv_even, &tv_delta);
3574 		if (for_all_cpus_2(delta_cpu, ODD_COUNTERS, EVEN_COUNTERS)) {
3575 			re_initialize();
3576 			goto restart;
3577 		}
3578 		compute_average(EVEN_COUNTERS);
3579 		format_all_counters(EVEN_COUNTERS);
3580 		flush_output_stdout();
3581 		if (exit_requested)
3582 			break;
3583 		if (num_iterations && ++done_iters >= num_iterations)
3584 			break;
3585 		do_sleep();
3586 		if (snapshot_proc_sysfs_files())
3587 			goto restart;
3588 		retval = for_all_cpus(get_counters, EVEN_COUNTERS);
3589 		if (retval < -1) {
3590 			exit(retval);
3591 		} else if (retval == -1) {
3592 			re_initialize();
3593 			goto restart;
3594 		}
3595 		gettimeofday(&tv_even, (struct timezone *)NULL);
3596 		timersub(&tv_even, &tv_odd, &tv_delta);
3597 		if (for_all_cpus_2(delta_cpu, EVEN_COUNTERS, ODD_COUNTERS)) {
3598 			re_initialize();
3599 			goto restart;
3600 		}
3601 		compute_average(ODD_COUNTERS);
3602 		format_all_counters(ODD_COUNTERS);
3603 		flush_output_stdout();
3604 		if (exit_requested)
3605 			break;
3606 		if (num_iterations && ++done_iters >= num_iterations)
3607 			break;
3608 	}
3609 }
3610 
3611 void check_dev_msr()
3612 {
3613 	struct stat sb;
3614 	char pathname[32];
3615 
3616 	sprintf(pathname, "/dev/cpu/%d/msr", base_cpu);
3617 	if (stat(pathname, &sb))
3618 		if (system("/sbin/modprobe msr > /dev/null 2>&1"))
3619 			err(-5, "no /dev/cpu/0/msr, Try \"# modprobe msr\" ");
3620 }
3621 
3622 /*
3623  * check for CAP_SYS_RAWIO
3624  * return 0 on success
3625  * return 1 on fail
3626  */
3627 int check_for_cap_sys_rawio(void)
3628 {
3629 	cap_t caps;
3630 	cap_flag_value_t cap_flag_value;
3631 
3632 	caps = cap_get_proc();
3633 	if (caps == NULL)
3634 		err(-6, "cap_get_proc\n");
3635 
3636 	if (cap_get_flag(caps, CAP_SYS_RAWIO, CAP_EFFECTIVE, &cap_flag_value))
3637 		err(-6, "cap_get\n");
3638 
3639 	if (cap_flag_value != CAP_SET) {
3640 		warnx("capget(CAP_SYS_RAWIO) failed," " try \"# setcap cap_sys_rawio=ep %s\"", progname);
3641 		return 1;
3642 	}
3643 
3644 	if (cap_free(caps) == -1)
3645 		err(-6, "cap_free\n");
3646 
3647 	return 0;
3648 }
3649 
3650 void check_permissions(void)
3651 {
3652 	int do_exit = 0;
3653 	char pathname[32];
3654 
3655 	/* check for CAP_SYS_RAWIO */
3656 	do_exit += check_for_cap_sys_rawio();
3657 
3658 	/* test file permissions */
3659 	sprintf(pathname, "/dev/cpu/%d/msr", base_cpu);
3660 	if (euidaccess(pathname, R_OK)) {
3661 		do_exit++;
3662 		warn("/dev/cpu/0/msr open failed, try chown or chmod +r /dev/cpu/*/msr");
3663 	}
3664 
	/* if all else fails, tell them to be root */
3666 	if (do_exit)
3667 		if (getuid() != 0)
3668 			warnx("... or simply run as root");
3669 
3670 	if (do_exit)
3671 		exit(-6);
3672 }
3673 
3674 /*
3675  * NHM adds support for additional MSRs:
3676  *
3677  * MSR_SMI_COUNT                   0x00000034
3678  *
3679  * MSR_PLATFORM_INFO               0x000000ce
3680  * MSR_PKG_CST_CONFIG_CONTROL     0x000000e2
3681  *
3682  * MSR_MISC_PWR_MGMT               0x000001aa
3683  *
3684  * MSR_PKG_C3_RESIDENCY            0x000003f8
3685  * MSR_PKG_C6_RESIDENCY            0x000003f9
3686  * MSR_CORE_C3_RESIDENCY           0x000003fc
3687  * MSR_CORE_C6_RESIDENCY           0x000003fd
3688  *
3689  * Side effect:
3690  * sets global pkg_cstate_limit to decode MSR_PKG_CST_CONFIG_CONTROL
3691  * sets has_misc_feature_control
3692  */
3693 int probe_nhm_msrs(unsigned int family, unsigned int model)
3694 {
3695 	unsigned long long msr;
3696 	unsigned int base_ratio;
3697 	int *pkg_cstate_limits;
3698 
3699 	if (!genuine_intel)
3700 		return 0;
3701 
3702 	if (family != 6)
3703 		return 0;
3704 
3705 	bclk = discover_bclk(family, model);
3706 
3707 	switch (model) {
3708 	case INTEL_FAM6_NEHALEM:	/* Core i7 and i5 Processor - Clarksfield, Lynnfield, Jasper Forest */
3709 	case INTEL_FAM6_NEHALEM_EX:	/* Nehalem-EX Xeon - Beckton */
3710 		pkg_cstate_limits = nhm_pkg_cstate_limits;
3711 		break;
3712 	case INTEL_FAM6_SANDYBRIDGE:	/* SNB */
3713 	case INTEL_FAM6_SANDYBRIDGE_X:	/* SNB Xeon */
3714 	case INTEL_FAM6_IVYBRIDGE:	/* IVB */
3715 	case INTEL_FAM6_IVYBRIDGE_X:	/* IVB Xeon */
3716 		pkg_cstate_limits = snb_pkg_cstate_limits;
3717 		has_misc_feature_control = 1;
3718 		break;
3719 	case INTEL_FAM6_HASWELL:	/* HSW */
3720 	case INTEL_FAM6_HASWELL_G:	/* HSW */
3721 	case INTEL_FAM6_HASWELL_X:	/* HSX */
3722 	case INTEL_FAM6_HASWELL_L:	/* HSW */
3723 	case INTEL_FAM6_BROADWELL:	/* BDW */
3724 	case INTEL_FAM6_BROADWELL_G:	/* BDW */
3725 	case INTEL_FAM6_BROADWELL_X:	/* BDX */
3726 	case INTEL_FAM6_SKYLAKE_L:	/* SKL */
3727 	case INTEL_FAM6_CANNONLAKE_L:	/* CNL */
3728 		pkg_cstate_limits = hsw_pkg_cstate_limits;
3729 		has_misc_feature_control = 1;
3730 		break;
3731 	case INTEL_FAM6_SKYLAKE_X:	/* SKX */
3732 		pkg_cstate_limits = skx_pkg_cstate_limits;
3733 		has_misc_feature_control = 1;
3734 		break;
3735 	case INTEL_FAM6_ICELAKE_X:	/* ICX */
3736 		pkg_cstate_limits = icx_pkg_cstate_limits;
3737 		has_misc_feature_control = 1;
3738 		break;
3739 	case INTEL_FAM6_ATOM_SILVERMONT:	/* BYT */
		no_MSR_MISC_PWR_MGMT = 1;
		/* FALLTHRU */
3741 	case INTEL_FAM6_ATOM_SILVERMONT_D:	/* AVN */
3742 		pkg_cstate_limits = slv_pkg_cstate_limits;
3743 		break;
3744 	case INTEL_FAM6_ATOM_AIRMONT:	/* AMT */
3745 		pkg_cstate_limits = amt_pkg_cstate_limits;
3746 		no_MSR_MISC_PWR_MGMT = 1;
3747 		break;
3748 	case INTEL_FAM6_XEON_PHI_KNL:	/* PHI */
3749 		pkg_cstate_limits = phi_pkg_cstate_limits;
3750 		break;
3751 	case INTEL_FAM6_ATOM_GOLDMONT:	/* BXT */
3752 	case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
3753 	case INTEL_FAM6_ATOM_GOLDMONT_D:	/* DNV */
3754 	case INTEL_FAM6_ATOM_TREMONT:	/* EHL */
3755 	case INTEL_FAM6_ATOM_TREMONT_D:	/* JVL */
3756 		pkg_cstate_limits = glm_pkg_cstate_limits;
3757 		break;
3758 	default:
3759 		return 0;
3760 	}
3761 	get_msr(base_cpu, MSR_PKG_CST_CONFIG_CONTROL, &msr);
3762 	pkg_cstate_limit = pkg_cstate_limits[msr & 0xF];
3763 
3764 	get_msr(base_cpu, MSR_PLATFORM_INFO, &msr);
3765 	base_ratio = (msr >> 8) & 0xFF;
3766 
3767 	base_hz = base_ratio * bclk * 1000000;
3768 	has_base_hz = 1;
3769 	return 1;
3770 }
3771 
3772 /*
3773  * SLV client has support for unique MSRs:
3774  *
3775  * MSR_CC6_DEMOTION_POLICY_CONFIG
3776  * MSR_MC6_DEMOTION_POLICY_CONFIG
3777  */
3778 
3779 int has_slv_msrs(unsigned int family, unsigned int model)
3780 {
3781 	if (!genuine_intel)
3782 		return 0;
3783 
3784 	switch (model) {
3785 	case INTEL_FAM6_ATOM_SILVERMONT:
3786 	case INTEL_FAM6_ATOM_SILVERMONT_MID:
3787 	case INTEL_FAM6_ATOM_AIRMONT_MID:
3788 		return 1;
3789 	}
3790 	return 0;
3791 }
3792 
3793 int is_dnv(unsigned int family, unsigned int model)
3794 {
3795 
3796 	if (!genuine_intel)
3797 		return 0;
3798 
3799 	switch (model) {
3800 	case INTEL_FAM6_ATOM_GOLDMONT_D:
3801 		return 1;
3802 	}
3803 	return 0;
3804 }
3805 
3806 int is_bdx(unsigned int family, unsigned int model)
3807 {
3808 
3809 	if (!genuine_intel)
3810 		return 0;
3811 
3812 	switch (model) {
3813 	case INTEL_FAM6_BROADWELL_X:
3814 		return 1;
3815 	}
3816 	return 0;
3817 }
3818 
3819 int is_skx(unsigned int family, unsigned int model)
3820 {
3821 
3822 	if (!genuine_intel)
3823 		return 0;
3824 
3825 	switch (model) {
3826 	case INTEL_FAM6_SKYLAKE_X:
3827 		return 1;
3828 	}
3829 	return 0;
3830 }
3831 
3832 int is_icx(unsigned int family, unsigned int model)
3833 {
3834 
3835 	if (!genuine_intel)
3836 		return 0;
3837 
3838 	switch (model) {
3839 	case INTEL_FAM6_ICELAKE_X:
3840 		return 1;
3841 	}
3842 	return 0;
3843 }
3844 
3845 int is_ehl(unsigned int family, unsigned int model)
3846 {
3847 	if (!genuine_intel)
3848 		return 0;
3849 
3850 	switch (model) {
3851 	case INTEL_FAM6_ATOM_TREMONT:
3852 		return 1;
3853 	}
3854 	return 0;
3855 }
3856 
3857 int is_jvl(unsigned int family, unsigned int model)
3858 {
3859 	if (!genuine_intel)
3860 		return 0;
3861 
3862 	switch (model) {
3863 	case INTEL_FAM6_ATOM_TREMONT_D:
3864 		return 1;
3865 	}
3866 	return 0;
3867 }
3868 
3869 int has_turbo_ratio_limit(unsigned int family, unsigned int model)
3870 {
3871 	if (has_slv_msrs(family, model))
3872 		return 0;
3873 
3874 	switch (model) {
3875 		/* Nehalem compatible, but do not include turbo-ratio limit support */
3876 	case INTEL_FAM6_NEHALEM_EX:	/* Nehalem-EX Xeon - Beckton */
3877 	case INTEL_FAM6_XEON_PHI_KNL:	/* PHI - Knights Landing (different MSR definition) */
3878 		return 0;
3879 	default:
3880 		return 1;
3881 	}
3882 }
3883 
3884 int has_atom_turbo_ratio_limit(unsigned int family, unsigned int model)
3885 {
3886 	if (has_slv_msrs(family, model))
3887 		return 1;
3888 
3889 	return 0;
3890 }
3891 
3892 int has_ivt_turbo_ratio_limit(unsigned int family, unsigned int model)
3893 {
3894 	if (!genuine_intel)
3895 		return 0;
3896 
3897 	if (family != 6)
3898 		return 0;
3899 
3900 	switch (model) {
3901 	case INTEL_FAM6_IVYBRIDGE_X:	/* IVB Xeon */
3902 	case INTEL_FAM6_HASWELL_X:	/* HSW Xeon */
3903 		return 1;
3904 	default:
3905 		return 0;
3906 	}
3907 }
3908 
3909 int has_hsw_turbo_ratio_limit(unsigned int family, unsigned int model)
3910 {
3911 	if (!genuine_intel)
3912 		return 0;
3913 
3914 	if (family != 6)
3915 		return 0;
3916 
3917 	switch (model) {
3918 	case INTEL_FAM6_HASWELL_X:	/* HSW Xeon */
3919 		return 1;
3920 	default:
3921 		return 0;
3922 	}
3923 }
3924 
3925 int has_knl_turbo_ratio_limit(unsigned int family, unsigned int model)
3926 {
3927 	if (!genuine_intel)
3928 		return 0;
3929 
3930 	if (family != 6)
3931 		return 0;
3932 
3933 	switch (model) {
3934 	case INTEL_FAM6_XEON_PHI_KNL:	/* Knights Landing */
3935 		return 1;
3936 	default:
3937 		return 0;
3938 	}
3939 }
3940 
3941 int has_glm_turbo_ratio_limit(unsigned int family, unsigned int model)
3942 {
3943 	if (!genuine_intel)
3944 		return 0;
3945 
3946 	if (family != 6)
3947 		return 0;
3948 
3949 	switch (model) {
3950 	case INTEL_FAM6_ATOM_GOLDMONT:
3951 	case INTEL_FAM6_SKYLAKE_X:
3952 	case INTEL_FAM6_ICELAKE_X:
3953 		return 1;
3954 	default:
3955 		return 0;
3956 	}
3957 }
3958 
3959 int has_config_tdp(unsigned int family, unsigned int model)
3960 {
3961 	if (!genuine_intel)
3962 		return 0;
3963 
3964 	if (family != 6)
3965 		return 0;
3966 
3967 	switch (model) {
3968 	case INTEL_FAM6_IVYBRIDGE:	/* IVB */
3969 	case INTEL_FAM6_HASWELL:	/* HSW */
3970 	case INTEL_FAM6_HASWELL_X:	/* HSX */
3971 	case INTEL_FAM6_HASWELL_L:	/* HSW */
3972 	case INTEL_FAM6_HASWELL_G:	/* HSW */
3973 	case INTEL_FAM6_BROADWELL:	/* BDW */
3974 	case INTEL_FAM6_BROADWELL_G:	/* BDW */
3975 	case INTEL_FAM6_BROADWELL_X:	/* BDX */
3976 	case INTEL_FAM6_SKYLAKE_L:	/* SKL */
3977 	case INTEL_FAM6_CANNONLAKE_L:	/* CNL */
3978 	case INTEL_FAM6_SKYLAKE_X:	/* SKX */
3979 	case INTEL_FAM6_ICELAKE_X:	/* ICX */
3980 
3981 	case INTEL_FAM6_XEON_PHI_KNL:	/* Knights Landing */
3982 		return 1;
3983 	default:
3984 		return 0;
3985 	}
3986 }
3987 
3988 /*
3989  * tcc_offset_bits:
 * 0: TCC Offset not supported (default)
 * 6: bits 29:24 of MSR_PLATFORM_INFO
 * 4: bits 27:24 of MSR_PLATFORM_INFO
3993  */
3994 void check_tcc_offset(int model)
3995 {
3996 	unsigned long long msr;
3997 
3998 	if (!genuine_intel)
3999 		return;
4000 
4001 	switch (model) {
4002 	case INTEL_FAM6_SKYLAKE_L:
4003 	case INTEL_FAM6_SKYLAKE:
4004 	case INTEL_FAM6_KABYLAKE_L:
4005 	case INTEL_FAM6_KABYLAKE:
4006 	case INTEL_FAM6_ICELAKE_L:
4007 	case INTEL_FAM6_ICELAKE:
4008 	case INTEL_FAM6_TIGERLAKE_L:
4009 	case INTEL_FAM6_TIGERLAKE:
4010 	case INTEL_FAM6_COMETLAKE:
4011 		if (!get_msr(base_cpu, MSR_PLATFORM_INFO, &msr)) {
4012 			msr = (msr >> 30) & 1;
4013 			if (msr)
4014 				tcc_offset_bits = 6;
4015 		}
4016 		return;
4017 	default:
4018 		return;
4019 	}
4020 }
4021 
4022 static void remove_underbar(char *s)
4023 {
4024 	char *to = s;
4025 
4026 	while (*s) {
4027 		if (*s != '_')
4028 			*to++ = *s;
4029 		s++;
4030 	}
4031 
4032 	*to = 0;
4033 }
4034 
4035 static void dump_cstate_pstate_config_info(unsigned int family, unsigned int model)
4036 {
4037 	if (!do_nhm_platform_info)
4038 		return;
4039 
4040 	dump_nhm_platform_info();
4041 
4042 	if (has_hsw_turbo_ratio_limit(family, model))
4043 		dump_hsw_turbo_ratio_limits();
4044 
4045 	if (has_ivt_turbo_ratio_limit(family, model))
4046 		dump_ivt_turbo_ratio_limits();
4047 
4048 	if (has_turbo_ratio_limit(family, model))
4049 		dump_turbo_ratio_limits(family, model);
4050 
4051 	if (has_atom_turbo_ratio_limit(family, model))
4052 		dump_atom_turbo_ratio_limits();
4053 
4054 	if (has_knl_turbo_ratio_limit(family, model))
4055 		dump_knl_turbo_ratio_limits();
4056 
4057 	if (has_config_tdp(family, model))
4058 		dump_config_tdp();
4059 
4060 	dump_nhm_cst_cfg();
4061 }
4062 
4063 static void dump_sysfs_file(char *path)
4064 {
4065 	FILE *input;
4066 	char cpuidle_buf[64];
4067 
4068 	input = fopen(path, "r");
4069 	if (input == NULL) {
4070 		if (debug)
4071 			fprintf(outf, "NSFOD %s\n", path);
4072 		return;
4073 	}
4074 	if (!fgets(cpuidle_buf, sizeof(cpuidle_buf), input))
4075 		err(1, "%s: failed to read file", path);
4076 	fclose(input);
4077 
4078 	fprintf(outf, "%s: %s", strrchr(path, '/') + 1, cpuidle_buf);
4079 }
4080 
4081 static void dump_sysfs_cstate_config(void)
4082 {
4083 	char path[64];
4084 	char name_buf[16];
4085 	char desc[64];
4086 	FILE *input;
4087 	int state;
4088 	char *sp;
4089 
4090 	if (access("/sys/devices/system/cpu/cpuidle", R_OK)) {
4091 		fprintf(outf, "cpuidle not loaded\n");
4092 		return;
4093 	}
4094 
4095 	dump_sysfs_file("/sys/devices/system/cpu/cpuidle/current_driver");
4096 	dump_sysfs_file("/sys/devices/system/cpu/cpuidle/current_governor");
4097 	dump_sysfs_file("/sys/devices/system/cpu/cpuidle/current_governor_ro");
4098 
4099 	for (state = 0; state < 10; ++state) {
4100 
4101 		sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", base_cpu, state);
4102 		input = fopen(path, "r");
4103 		if (input == NULL)
4104 			continue;
4105 		if (!fgets(name_buf, sizeof(name_buf), input))
4106 			err(1, "%s: failed to read file", path);
4107 
4108 		/* truncate "C1-HSW\n" to "C1", or truncate "C1\n" to "C1" */
4109 		sp = strchr(name_buf, '-');
4110 		if (!sp)
4111 			sp = strchrnul(name_buf, '\n');
4112 		*sp = '\0';
4113 		fclose(input);
4114 
4115 		remove_underbar(name_buf);
4116 
4117 		sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/desc", base_cpu, state);
4118 		input = fopen(path, "r");
4119 		if (input == NULL)
4120 			continue;
4121 		if (!fgets(desc, sizeof(desc), input))
4122 			err(1, "%s: failed to read file", path);
4123 
4124 		fprintf(outf, "cpu%d: %s: %s", base_cpu, name_buf, desc);
4125 		fclose(input);
4126 	}
4127 }
4128 
4129 static void dump_sysfs_pstate_config(void)
4130 {
4131 	char path[64];
4132 	char driver_buf[64];
4133 	char governor_buf[64];
4134 	FILE *input;
4135 	int turbo;
4136 
4137 	sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_driver", base_cpu);
4138 	input = fopen(path, "r");
4139 	if (input == NULL) {
4140 		fprintf(outf, "NSFOD %s\n", path);
4141 		return;
4142 	}
4143 	if (!fgets(driver_buf, sizeof(driver_buf), input))
4144 		err(1, "%s: failed to read file", path);
4145 	fclose(input);
4146 
4147 	sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_governor", base_cpu);
4148 	input = fopen(path, "r");
4149 	if (input == NULL) {
4150 		fprintf(outf, "NSFOD %s\n", path);
4151 		return;
4152 	}
4153 	if (!fgets(governor_buf, sizeof(governor_buf), input))
4154 		err(1, "%s: failed to read file", path);
4155 	fclose(input);
4156 
4157 	fprintf(outf, "cpu%d: cpufreq driver: %s", base_cpu, driver_buf);
4158 	fprintf(outf, "cpu%d: cpufreq governor: %s", base_cpu, governor_buf);
4159 
4160 	sprintf(path, "/sys/devices/system/cpu/cpufreq/boost");
4161 	input = fopen(path, "r");
4162 	if (input != NULL) {
4163 		if (fscanf(input, "%d", &turbo) != 1)
4164 			err(1, "%s: failed to parse number from file", path);
4165 		fprintf(outf, "cpufreq boost: %d\n", turbo);
4166 		fclose(input);
4167 	}
4168 
4169 	sprintf(path, "/sys/devices/system/cpu/intel_pstate/no_turbo");
4170 	input = fopen(path, "r");
4171 	if (input != NULL) {
4172 		if (fscanf(input, "%d", &turbo) != 1)
4173 			err(1, "%s: failed to parse number from file", path);
4174 		fprintf(outf, "cpufreq intel_pstate no_turbo: %d\n", turbo);
4175 		fclose(input);
4176 	}
4177 }
4178 
4179 /*
4180  * print_epb()
4181  * Decode the ENERGY_PERF_BIAS MSR
4182  */
4183 int print_epb(struct thread_data *t, struct core_data *c, struct pkg_data *p)
4184 {
4185 	char *epb_string;
4186 	int cpu, epb;
4187 
4188 	if (!has_epb)
4189 		return 0;
4190 
4191 	cpu = t->cpu_id;
4192 
4193 	/* EPB is per-package */
4194 	if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE) || !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE))
4195 		return 0;
4196 
4197 	if (cpu_migrate(cpu)) {
4198 		fprintf(outf, "print_epb: Could not migrate to CPU %d\n", cpu);
4199 		return -1;
4200 	}
4201 
4202 	epb = get_epb(cpu);
4203 	if (epb < 0)
4204 		return 0;
4205 
4206 	switch (epb) {
4207 	case ENERGY_PERF_BIAS_PERFORMANCE:
4208 		epb_string = "performance";
4209 		break;
4210 	case ENERGY_PERF_BIAS_NORMAL:
4211 		epb_string = "balanced";
4212 		break;
4213 	case ENERGY_PERF_BIAS_POWERSAVE:
4214 		epb_string = "powersave";
4215 		break;
4216 	default:
4217 		epb_string = "custom";
4218 		break;
4219 	}
4220 	fprintf(outf, "cpu%d: EPB: %d (%s)\n", cpu, epb, epb_string);
4221 
4222 	return 0;
4223 }
4224 
4225 /*
4226  * print_hwp()
4227  * Decode the MSR_HWP_CAPABILITIES
4228  */
4229 int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p)
4230 {
4231 	unsigned long long msr;
4232 	int cpu;
4233 
4234 	if (!has_hwp)
4235 		return 0;
4236 
4237 	cpu = t->cpu_id;
4238 
4239 	/* MSR_HWP_CAPABILITIES is per-package */
4240 	if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE) || !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE))
4241 		return 0;
4242 
4243 	if (cpu_migrate(cpu)) {
4244 		fprintf(outf, "print_hwp: Could not migrate to CPU %d\n", cpu);
4245 		return -1;
4246 	}
4247 
4248 	if (get_msr(cpu, MSR_PM_ENABLE, &msr))
4249 		return 0;
4250 
4251 	fprintf(outf, "cpu%d: MSR_PM_ENABLE: 0x%08llx (%sHWP)\n", cpu, msr, (msr & (1 << 0)) ? "" : "No-");
4252 
	/* MSR_PM_ENABLE[0] == 1 if HWP is enabled and MSRs visible */
4254 	if ((msr & (1 << 0)) == 0)
4255 		return 0;
4256 
4257 	if (get_msr(cpu, MSR_HWP_CAPABILITIES, &msr))
4258 		return 0;
4259 
4260 	fprintf(outf, "cpu%d: MSR_HWP_CAPABILITIES: 0x%08llx "
4261 		"(high %d guar %d eff %d low %d)\n",
4262 		cpu, msr,
4263 		(unsigned int)HWP_HIGHEST_PERF(msr),
4264 		(unsigned int)HWP_GUARANTEED_PERF(msr),
4265 		(unsigned int)HWP_MOSTEFFICIENT_PERF(msr), (unsigned int)HWP_LOWEST_PERF(msr));
4266 
4267 	if (get_msr(cpu, MSR_HWP_REQUEST, &msr))
4268 		return 0;
4269 
4270 	fprintf(outf, "cpu%d: MSR_HWP_REQUEST: 0x%08llx "
4271 		"(min %d max %d des %d epp 0x%x window 0x%x pkg 0x%x)\n",
4272 		cpu, msr,
4273 		(unsigned int)(((msr) >> 0) & 0xff),
4274 		(unsigned int)(((msr) >> 8) & 0xff),
4275 		(unsigned int)(((msr) >> 16) & 0xff),
4276 		(unsigned int)(((msr) >> 24) & 0xff),
		(unsigned int)(((msr) >> 32) & 0x3ff), (unsigned int)(((msr) >> 42) & 0x1));
4278 
4279 	if (has_hwp_pkg) {
4280 		if (get_msr(cpu, MSR_HWP_REQUEST_PKG, &msr))
4281 			return 0;
4282 
4283 		fprintf(outf, "cpu%d: MSR_HWP_REQUEST_PKG: 0x%08llx "
4284 			"(min %d max %d des %d epp 0x%x window 0x%x)\n",
4285 			cpu, msr,
4286 			(unsigned int)(((msr) >> 0) & 0xff),
4287 			(unsigned int)(((msr) >> 8) & 0xff),
4288 			(unsigned int)(((msr) >> 16) & 0xff),
			(unsigned int)(((msr) >> 24) & 0xff), (unsigned int)(((msr) >> 32) & 0x3ff));
4290 	}
4291 	if (has_hwp_notify) {
4292 		if (get_msr(cpu, MSR_HWP_INTERRUPT, &msr))
4293 			return 0;
4294 
4295 		fprintf(outf, "cpu%d: MSR_HWP_INTERRUPT: 0x%08llx "
4296 			"(%s_Guaranteed_Perf_Change, %s_Excursion_Min)\n",
4297 			cpu, msr, ((msr) & 0x1) ? "EN" : "Dis", ((msr) & 0x2) ? "EN" : "Dis");
4298 	}
4299 	if (get_msr(cpu, MSR_HWP_STATUS, &msr))
4300 		return 0;
4301 
4302 	fprintf(outf, "cpu%d: MSR_HWP_STATUS: 0x%08llx "
4303 		"(%sGuaranteed_Perf_Change, %sExcursion_Min)\n",
4304 		cpu, msr, ((msr) & 0x1) ? "" : "No-", ((msr) & 0x2) ? "" : "No-");
4305 
4306 	return 0;
4307 }
4308 
4309 /*
4310  * print_perf_limit()
4311  */
4312 int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data *p)
4313 {
4314 	unsigned long long msr;
4315 	int cpu;
4316 
4317 	cpu = t->cpu_id;
4318 
4319 	/* per-package */
4320 	if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE) || !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE))
4321 		return 0;
4322 
4323 	if (cpu_migrate(cpu)) {
4324 		fprintf(outf, "print_perf_limit: Could not migrate to CPU %d\n", cpu);
4325 		return -1;
4326 	}
4327 
4328 	if (do_core_perf_limit_reasons) {
4329 		get_msr(cpu, MSR_CORE_PERF_LIMIT_REASONS, &msr);
4330 		fprintf(outf, "cpu%d: MSR_CORE_PERF_LIMIT_REASONS, 0x%08llx", cpu, msr);
4331 		fprintf(outf, " (Active: %s%s%s%s%s%s%s%s%s%s%s%s%s%s)",
4332 			(msr & 1 << 15) ? "bit15, " : "",
4333 			(msr & 1 << 14) ? "bit14, " : "",
4334 			(msr & 1 << 13) ? "Transitions, " : "",
4335 			(msr & 1 << 12) ? "MultiCoreTurbo, " : "",
4336 			(msr & 1 << 11) ? "PkgPwrL2, " : "",
4337 			(msr & 1 << 10) ? "PkgPwrL1, " : "",
4338 			(msr & 1 << 9) ? "CorePwr, " : "",
4339 			(msr & 1 << 8) ? "Amps, " : "",
4340 			(msr & 1 << 6) ? "VR-Therm, " : "",
4341 			(msr & 1 << 5) ? "Auto-HWP, " : "",
4342 			(msr & 1 << 4) ? "Graphics, " : "",
4343 			(msr & 1 << 2) ? "bit2, " : "",
4344 			(msr & 1 << 1) ? "ThermStatus, " : "", (msr & 1 << 0) ? "PROCHOT, " : "");
4345 		fprintf(outf, " (Logged: %s%s%s%s%s%s%s%s%s%s%s%s%s%s)\n",
4346 			(msr & 1 << 31) ? "bit31, " : "",
4347 			(msr & 1 << 30) ? "bit30, " : "",
4348 			(msr & 1 << 29) ? "Transitions, " : "",
4349 			(msr & 1 << 28) ? "MultiCoreTurbo, " : "",
4350 			(msr & 1 << 27) ? "PkgPwrL2, " : "",
4351 			(msr & 1 << 26) ? "PkgPwrL1, " : "",
4352 			(msr & 1 << 25) ? "CorePwr, " : "",
4353 			(msr & 1 << 24) ? "Amps, " : "",
4354 			(msr & 1 << 22) ? "VR-Therm, " : "",
4355 			(msr & 1 << 21) ? "Auto-HWP, " : "",
4356 			(msr & 1 << 20) ? "Graphics, " : "",
4357 			(msr & 1 << 18) ? "bit18, " : "",
4358 			(msr & 1 << 17) ? "ThermStatus, " : "", (msr & 1 << 16) ? "PROCHOT, " : "");
4359 
4360 	}
4361 	if (do_gfx_perf_limit_reasons) {
4362 		get_msr(cpu, MSR_GFX_PERF_LIMIT_REASONS, &msr);
4363 		fprintf(outf, "cpu%d: MSR_GFX_PERF_LIMIT_REASONS, 0x%08llx", cpu, msr);
4364 		fprintf(outf, " (Active: %s%s%s%s%s%s%s%s)",
4365 			(msr & 1 << 0) ? "PROCHOT, " : "",
4366 			(msr & 1 << 1) ? "ThermStatus, " : "",
4367 			(msr & 1 << 4) ? "Graphics, " : "",
4368 			(msr & 1 << 6) ? "VR-Therm, " : "",
4369 			(msr & 1 << 8) ? "Amps, " : "",
4370 			(msr & 1 << 9) ? "GFXPwr, " : "",
4371 			(msr & 1 << 10) ? "PkgPwrL1, " : "", (msr & 1 << 11) ? "PkgPwrL2, " : "");
4372 		fprintf(outf, " (Logged: %s%s%s%s%s%s%s%s)\n",
4373 			(msr & 1 << 16) ? "PROCHOT, " : "",
4374 			(msr & 1 << 17) ? "ThermStatus, " : "",
4375 			(msr & 1 << 20) ? "Graphics, " : "",
4376 			(msr & 1 << 22) ? "VR-Therm, " : "",
4377 			(msr & 1 << 24) ? "Amps, " : "",
4378 			(msr & 1 << 25) ? "GFXPwr, " : "",
4379 			(msr & 1 << 26) ? "PkgPwrL1, " : "", (msr & 1 << 27) ? "PkgPwrL2, " : "");
4380 	}
4381 	if (do_ring_perf_limit_reasons) {
4382 		get_msr(cpu, MSR_RING_PERF_LIMIT_REASONS, &msr);
4383 		fprintf(outf, "cpu%d: MSR_RING_PERF_LIMIT_REASONS, 0x%08llx", cpu, msr);
4384 		fprintf(outf, " (Active: %s%s%s%s%s%s)",
4385 			(msr & 1 << 0) ? "PROCHOT, " : "",
4386 			(msr & 1 << 1) ? "ThermStatus, " : "",
4387 			(msr & 1 << 6) ? "VR-Therm, " : "",
4388 			(msr & 1 << 8) ? "Amps, " : "",
4389 			(msr & 1 << 10) ? "PkgPwrL1, " : "", (msr & 1 << 11) ? "PkgPwrL2, " : "");
4390 		fprintf(outf, " (Logged: %s%s%s%s%s%s)\n",
4391 			(msr & 1 << 16) ? "PROCHOT, " : "",
4392 			(msr & 1 << 17) ? "ThermStatus, " : "",
4393 			(msr & 1 << 22) ? "VR-Therm, " : "",
4394 			(msr & 1 << 24) ? "Amps, " : "",
4395 			(msr & 1 << 26) ? "PkgPwrL1, " : "", (msr & 1 << 27) ? "PkgPwrL2, " : "");
4396 	}
4397 	return 0;
4398 }
4399 
4400 #define	RAPL_POWER_GRANULARITY	0x7FFF	/* 15 bit power granularity */
4401 #define	RAPL_TIME_GRANULARITY	0x3F	/* 6 bit time granularity */
4402 
4403 double get_tdp_intel(unsigned int model)
4404 {
4405 	unsigned long long msr;
4406 
4407 	if (do_rapl & RAPL_PKG_POWER_INFO)
4408 		if (!get_msr(base_cpu, MSR_PKG_POWER_INFO, &msr))
4409 			return ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units;
4410 
4411 	switch (model) {
4412 	case INTEL_FAM6_ATOM_SILVERMONT:
4413 	case INTEL_FAM6_ATOM_SILVERMONT_D:
4414 		return 30.0;
4415 	default:
4416 		return 135.0;
4417 	}
4418 }
4419 
4420 double get_tdp_amd(unsigned int family)
4421 {
4422 	/* This is the max stock TDP of HEDT/Server Fam17h+ chips */
4423 	return 280.0;
4424 }
4425 
4426 /*
4427  * rapl_dram_energy_units_probe()
4428  * Energy units are either hard-coded, or come from RAPL Energy Unit MSR.
4429  */
4430 static double rapl_dram_energy_units_probe(int model, double rapl_energy_units)
4431 {
4432 	/* only called for genuine_intel, family 6 */
4433 
4434 	switch (model) {
4435 	case INTEL_FAM6_HASWELL_X:	/* HSX */
4436 	case INTEL_FAM6_BROADWELL_X:	/* BDX */
4437 	case INTEL_FAM6_SKYLAKE_X:	/* SKX */
4438 	case INTEL_FAM6_XEON_PHI_KNL:	/* KNL */
4439 	case INTEL_FAM6_ICELAKE_X:	/* ICX */
4440 		return (rapl_dram_energy_units = 15.3 / 1000000);
4441 	default:
4442 		return (rapl_energy_units);
4443 	}
4444 }
4445 
4446 void rapl_probe_intel(unsigned int family, unsigned int model)
4447 {
4448 	unsigned long long msr;
4449 	unsigned int time_unit;
4450 	double tdp;
4451 
4452 	if (family != 6)
4453 		return;
4454 
4455 	switch (model) {
4456 	case INTEL_FAM6_SANDYBRIDGE:
4457 	case INTEL_FAM6_IVYBRIDGE:
4458 	case INTEL_FAM6_HASWELL:	/* HSW */
4459 	case INTEL_FAM6_HASWELL_L:	/* HSW */
4460 	case INTEL_FAM6_HASWELL_G:	/* HSW */
4461 	case INTEL_FAM6_BROADWELL:	/* BDW */
4462 	case INTEL_FAM6_BROADWELL_G:	/* BDW */
4463 		do_rapl = RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_GFX | RAPL_PKG_POWER_INFO;
4464 		if (rapl_joules) {
4465 			BIC_PRESENT(BIC_Pkg_J);
4466 			BIC_PRESENT(BIC_Cor_J);
4467 			BIC_PRESENT(BIC_GFX_J);
4468 		} else {
4469 			BIC_PRESENT(BIC_PkgWatt);
4470 			BIC_PRESENT(BIC_CorWatt);
4471 			BIC_PRESENT(BIC_GFXWatt);
4472 		}
4473 		break;
4474 	case INTEL_FAM6_ATOM_GOLDMONT:	/* BXT */
4475 	case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
4476 		do_rapl = RAPL_PKG | RAPL_PKG_POWER_INFO;
4477 		if (rapl_joules)
4478 			BIC_PRESENT(BIC_Pkg_J);
4479 		else
4480 			BIC_PRESENT(BIC_PkgWatt);
4481 		break;
4482 	case INTEL_FAM6_ATOM_TREMONT:	/* EHL */
4483 		do_rapl =
4484 		    RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS
4485 		    | RAPL_GFX | RAPL_PKG_POWER_INFO;
4486 		if (rapl_joules) {
4487 			BIC_PRESENT(BIC_Pkg_J);
4488 			BIC_PRESENT(BIC_Cor_J);
4489 			BIC_PRESENT(BIC_RAM_J);
4490 			BIC_PRESENT(BIC_GFX_J);
4491 		} else {
4492 			BIC_PRESENT(BIC_PkgWatt);
4493 			BIC_PRESENT(BIC_CorWatt);
4494 			BIC_PRESENT(BIC_RAMWatt);
4495 			BIC_PRESENT(BIC_GFXWatt);
4496 		}
4497 		break;
4498 	case INTEL_FAM6_ATOM_TREMONT_D:	/* JVL */
4499 		do_rapl = RAPL_PKG | RAPL_PKG_PERF_STATUS | RAPL_PKG_POWER_INFO;
4500 		BIC_PRESENT(BIC_PKG__);
4501 		if (rapl_joules)
4502 			BIC_PRESENT(BIC_Pkg_J);
4503 		else
4504 			BIC_PRESENT(BIC_PkgWatt);
4505 		break;
4506 	case INTEL_FAM6_SKYLAKE_L:	/* SKL */
4507 	case INTEL_FAM6_CANNONLAKE_L:	/* CNL */
4508 		do_rapl =
4509 		    RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS
4510 		    | RAPL_GFX | RAPL_PKG_POWER_INFO;
4511 		BIC_PRESENT(BIC_PKG__);
4512 		BIC_PRESENT(BIC_RAM__);
4513 		if (rapl_joules) {
4514 			BIC_PRESENT(BIC_Pkg_J);
4515 			BIC_PRESENT(BIC_Cor_J);
4516 			BIC_PRESENT(BIC_RAM_J);
4517 			BIC_PRESENT(BIC_GFX_J);
4518 		} else {
4519 			BIC_PRESENT(BIC_PkgWatt);
4520 			BIC_PRESENT(BIC_CorWatt);
4521 			BIC_PRESENT(BIC_RAMWatt);
4522 			BIC_PRESENT(BIC_GFXWatt);
4523 		}
4524 		break;
4525 	case INTEL_FAM6_HASWELL_X:	/* HSX */
4526 	case INTEL_FAM6_BROADWELL_X:	/* BDX */
4527 	case INTEL_FAM6_SKYLAKE_X:	/* SKX */
4528 	case INTEL_FAM6_ICELAKE_X:	/* ICX */
4529 	case INTEL_FAM6_XEON_PHI_KNL:	/* KNL */
4530 		do_rapl =
4531 		    RAPL_PKG | RAPL_DRAM | RAPL_DRAM_POWER_INFO | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS |
4532 		    RAPL_PKG_POWER_INFO;
4533 		BIC_PRESENT(BIC_PKG__);
4534 		BIC_PRESENT(BIC_RAM__);
4535 		if (rapl_joules) {
4536 			BIC_PRESENT(BIC_Pkg_J);
4537 			BIC_PRESENT(BIC_RAM_J);
4538 		} else {
4539 			BIC_PRESENT(BIC_PkgWatt);
4540 			BIC_PRESENT(BIC_RAMWatt);
4541 		}
4542 		break;
4543 	case INTEL_FAM6_SANDYBRIDGE_X:
4544 	case INTEL_FAM6_IVYBRIDGE_X:
4545 		do_rapl =
4546 		    RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_DRAM | RAPL_DRAM_POWER_INFO | RAPL_PKG_PERF_STATUS |
4547 		    RAPL_DRAM_PERF_STATUS | RAPL_PKG_POWER_INFO;
4548 		BIC_PRESENT(BIC_PKG__);
4549 		BIC_PRESENT(BIC_RAM__);
4550 		if (rapl_joules) {
4551 			BIC_PRESENT(BIC_Pkg_J);
4552 			BIC_PRESENT(BIC_Cor_J);
4553 			BIC_PRESENT(BIC_RAM_J);
4554 		} else {
4555 			BIC_PRESENT(BIC_PkgWatt);
4556 			BIC_PRESENT(BIC_CorWatt);
4557 			BIC_PRESENT(BIC_RAMWatt);
4558 		}
4559 		break;
4560 	case INTEL_FAM6_ATOM_SILVERMONT:	/* BYT */
4561 	case INTEL_FAM6_ATOM_SILVERMONT_D:	/* AVN */
4562 		do_rapl = RAPL_PKG | RAPL_CORES;
4563 		if (rapl_joules) {
4564 			BIC_PRESENT(BIC_Pkg_J);
4565 			BIC_PRESENT(BIC_Cor_J);
4566 		} else {
4567 			BIC_PRESENT(BIC_PkgWatt);
4568 			BIC_PRESENT(BIC_CorWatt);
4569 		}
4570 		break;
4571 	case INTEL_FAM6_ATOM_GOLDMONT_D:	/* DNV */
4572 		do_rapl =
4573 		    RAPL_PKG | RAPL_DRAM | RAPL_DRAM_POWER_INFO | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS |
4574 		    RAPL_PKG_POWER_INFO | RAPL_CORES_ENERGY_STATUS;
4575 		BIC_PRESENT(BIC_PKG__);
4576 		BIC_PRESENT(BIC_RAM__);
4577 		if (rapl_joules) {
4578 			BIC_PRESENT(BIC_Pkg_J);
4579 			BIC_PRESENT(BIC_Cor_J);
4580 			BIC_PRESENT(BIC_RAM_J);
4581 		} else {
4582 			BIC_PRESENT(BIC_PkgWatt);
4583 			BIC_PRESENT(BIC_CorWatt);
4584 			BIC_PRESENT(BIC_RAMWatt);
4585 		}
4586 		break;
4587 	default:
4588 		return;
4589 	}
4590 
4591 	/* units on package 0, verify later other packages match */
4592 	if (get_msr(base_cpu, MSR_RAPL_POWER_UNIT, &msr))
4593 		return;
4594 
4595 	rapl_power_units = 1.0 / (1 << (msr & 0xF));
4596 	if (model == INTEL_FAM6_ATOM_SILVERMONT)
4597 		rapl_energy_units = 1.0 * (1 << (msr >> 8 & 0x1F)) / 1000000;
4598 	else
4599 		rapl_energy_units = 1.0 / (1 << (msr >> 8 & 0x1F));
4600 
4601 	rapl_dram_energy_units = rapl_dram_energy_units_probe(model, rapl_energy_units);
4602 
4603 	time_unit = msr >> 16 & 0xF;
4604 	if (time_unit == 0)
4605 		time_unit = 0xA;
4606 
4607 	rapl_time_units = 1.0 / (1 << (time_unit));
4608 
4609 	tdp = get_tdp_intel(model);
4610 
4611 	rapl_joule_counter_range = 0xFFFFFFFF * rapl_energy_units / tdp;
4612 	if (!quiet)
4613 		fprintf(outf, "RAPL: %.0f sec. Joule Counter Range, at %.0f Watts\n", rapl_joule_counter_range, tdp);
4614 }
4615 
4616 void rapl_probe_amd(unsigned int family, unsigned int model)
4617 {
4618 	unsigned long long msr;
4619 	unsigned int eax, ebx, ecx, edx;
4620 	unsigned int has_rapl = 0;
4621 	double tdp;
4622 
4623 	if (max_extended_level >= 0x80000007) {
4624 		__cpuid(0x80000007, eax, ebx, ecx, edx);
4625 		/* RAPL (Fam 17h+) */
4626 		has_rapl = edx & (1 << 14);
4627 	}
4628 
4629 	if (!has_rapl || family < 0x17)
4630 		return;
4631 
4632 	do_rapl = RAPL_AMD_F17H | RAPL_PER_CORE_ENERGY;
4633 	if (rapl_joules) {
4634 		BIC_PRESENT(BIC_Pkg_J);
4635 		BIC_PRESENT(BIC_Cor_J);
4636 	} else {
4637 		BIC_PRESENT(BIC_PkgWatt);
4638 		BIC_PRESENT(BIC_CorWatt);
4639 	}
4640 
4641 	if (get_msr(base_cpu, MSR_RAPL_PWR_UNIT, &msr))
4642 		return;
4643 
4644 	rapl_time_units = ldexp(1.0, -(msr >> 16 & 0xf));
4645 	rapl_energy_units = ldexp(1.0, -(msr >> 8 & 0x1f));
4646 	rapl_power_units = ldexp(1.0, -(msr & 0xf));
4647 
4648 	tdp = get_tdp_amd(family);
4649 
4650 	rapl_joule_counter_range = 0xFFFFFFFF * rapl_energy_units / tdp;
4651 	if (!quiet)
4652 		fprintf(outf, "RAPL: %.0f sec. Joule Counter Range, at %.0f Watts\n", rapl_joule_counter_range, tdp);
4653 }
4654 
4655 /*
4656  * rapl_probe()
4657  *
4658  * sets do_rapl, rapl_power_units, rapl_energy_units, rapl_time_units
4659  */
4660 void rapl_probe(unsigned int family, unsigned int model)
4661 {
4662 	if (genuine_intel)
4663 		rapl_probe_intel(family, model);
4664 	if (authentic_amd || hygon_genuine)
4665 		rapl_probe_amd(family, model);
4666 }
4667 
4668 void perf_limit_reasons_probe(unsigned int family, unsigned int model)
4669 {
4670 	if (!genuine_intel)
4671 		return;
4672 
4673 	if (family != 6)
4674 		return;
4675 
4676 	switch (model) {
4677 	case INTEL_FAM6_HASWELL:	/* HSW */
4678 	case INTEL_FAM6_HASWELL_L:	/* HSW */
4679 	case INTEL_FAM6_HASWELL_G:	/* HSW */
		do_gfx_perf_limit_reasons = 1;
		/* FALLTHRU */
4681 	case INTEL_FAM6_HASWELL_X:	/* HSX */
4682 		do_core_perf_limit_reasons = 1;
4683 		do_ring_perf_limit_reasons = 1;
4684 	default:
4685 		return;
4686 	}
4687 }
4688 
4689 void automatic_cstate_conversion_probe(unsigned int family, unsigned int model)
4690 {
4691 	if (is_skx(family, model) || is_bdx(family, model) || is_icx(family, model))
4692 		has_automatic_cstate_conversion = 1;
4693 }
4694 
4695 void prewake_cstate_probe(unsigned int family, unsigned int model)
4696 {
4697 	if (is_icx(family, model))
4698 		dis_cstate_prewake = 1;
4699 }
4700 
4701 int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p)
4702 {
4703 	unsigned long long msr;
4704 	unsigned int dts, dts2;
4705 	int cpu;
4706 
4707 	if (!(do_dts || do_ptm))
4708 		return 0;
4709 
4710 	cpu = t->cpu_id;
4711 
4712 	/* DTS is per-core, no need to print for each thread */
4713 	if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE))
4714 		return 0;
4715 
4716 	if (cpu_migrate(cpu)) {
4717 		fprintf(outf, "print_thermal: Could not migrate to CPU %d\n", cpu);
4718 		return -1;
4719 	}
4720 
4721 	if (do_ptm && (t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) {
4722 		if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_STATUS, &msr))
4723 			return 0;
4724 
4725 		dts = (msr >> 16) & 0x7F;
4726 		fprintf(outf, "cpu%d: MSR_IA32_PACKAGE_THERM_STATUS: 0x%08llx (%d C)\n", cpu, msr, tj_max - dts);
4727 
4728 		if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, &msr))
4729 			return 0;
4730 
4731 		dts = (msr >> 16) & 0x7F;
4732 		dts2 = (msr >> 8) & 0x7F;
4733 		fprintf(outf, "cpu%d: MSR_IA32_PACKAGE_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n",
4734 			cpu, msr, tj_max - dts, tj_max - dts2);
4735 	}
4736 
4737 	if (do_dts && debug) {
4738 		unsigned int resolution;
4739 
4740 		if (get_msr(cpu, MSR_IA32_THERM_STATUS, &msr))
4741 			return 0;
4742 
4743 		dts = (msr >> 16) & 0x7F;
4744 		resolution = (msr >> 27) & 0xF;
4745 		fprintf(outf, "cpu%d: MSR_IA32_THERM_STATUS: 0x%08llx (%d C +/- %d)\n",
4746 			cpu, msr, tj_max - dts, resolution);
4747 
4748 		if (get_msr(cpu, MSR_IA32_THERM_INTERRUPT, &msr))
4749 			return 0;
4750 
4751 		dts = (msr >> 16) & 0x7F;
4752 		dts2 = (msr >> 8) & 0x7F;
4753 		fprintf(outf, "cpu%d: MSR_IA32_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n",
4754 			cpu, msr, tj_max - dts, tj_max - dts2);
4755 	}
4756 
4757 	return 0;
4758 }
4759 
4760 void print_power_limit_msr(int cpu, unsigned long long msr, char *label)
4761 {
4762 	fprintf(outf, "cpu%d: %s: %sabled (%0.3f Watts, %f sec, clamp %sabled)\n",
4763 		cpu, label,
4764 		((msr >> 15) & 1) ? "EN" : "DIS",
4765 		((msr >> 0) & 0x7FFF) * rapl_power_units,
4766 		(1.0 + (((msr >> 22) & 0x3) / 4.0)) * (1 << ((msr >> 17) & 0x1F)) * rapl_time_units,
4767 		(((msr >> 16) & 1) ? "EN" : "DIS"));
4768 
4769 	return;
4770 }
4771 
4772 int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p)
4773 {
4774 	unsigned long long msr;
4775 	const char *msr_name;
4776 	int cpu;
4777 
4778 	if (!do_rapl)
4779 		return 0;
4780 
4781 	/* RAPL counters are per package, so print only for 1st thread/package */
4782 	if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE) || !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE))
4783 		return 0;
4784 
4785 	cpu = t->cpu_id;
4786 	if (cpu_migrate(cpu)) {
4787 		fprintf(outf, "print_rapl: Could not migrate to CPU %d\n", cpu);
4788 		return -1;
4789 	}
4790 
4791 	if (do_rapl & RAPL_AMD_F17H) {
4792 		msr_name = "MSR_RAPL_PWR_UNIT";
4793 		if (get_msr(cpu, MSR_RAPL_PWR_UNIT, &msr))
4794 			return -1;
4795 	} else {
4796 		msr_name = "MSR_RAPL_POWER_UNIT";
4797 		if (get_msr(cpu, MSR_RAPL_POWER_UNIT, &msr))
4798 			return -1;
4799 	}
4800 
4801 	fprintf(outf, "cpu%d: %s: 0x%08llx (%f Watts, %f Joules, %f sec.)\n", cpu, msr_name, msr,
4802 		rapl_power_units, rapl_energy_units, rapl_time_units);
4803 
4804 	if (do_rapl & RAPL_PKG_POWER_INFO) {
4805 
4806 		if (get_msr(cpu, MSR_PKG_POWER_INFO, &msr))
4807 			return -5;
4808 
4809 		fprintf(outf, "cpu%d: MSR_PKG_POWER_INFO: 0x%08llx (%.0f W TDP, RAPL %.0f - %.0f W, %f sec.)\n",
4810 			cpu, msr,
4811 			((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units,
4812 			((msr >> 16) & RAPL_POWER_GRANULARITY) * rapl_power_units,
4813 			((msr >> 32) & RAPL_POWER_GRANULARITY) * rapl_power_units,
4814 			((msr >> 48) & RAPL_TIME_GRANULARITY) * rapl_time_units);
4815 
4816 	}
4817 	if (do_rapl & RAPL_PKG) {
4818 
4819 		if (get_msr(cpu, MSR_PKG_POWER_LIMIT, &msr))
4820 			return -9;
4821 
4822 		fprintf(outf, "cpu%d: MSR_PKG_POWER_LIMIT: 0x%08llx (%slocked)\n",
4823 			cpu, msr, (msr >> 63) & 1 ? "" : "UN");
4824 
4825 		print_power_limit_msr(cpu, msr, "PKG Limit #1");
4826 		fprintf(outf, "cpu%d: PKG Limit #2: %sabled (%0.3f Watts, %f* sec, clamp %sabled)\n",
4827 			cpu,
4828 			((msr >> 47) & 1) ? "EN" : "DIS",
4829 			((msr >> 32) & 0x7FFF) * rapl_power_units,
4830 			(1.0 + (((msr >> 54) & 0x3) / 4.0)) * (1 << ((msr >> 49) & 0x1F)) * rapl_time_units,
4831 			((msr >> 48) & 1) ? "EN" : "DIS");
4832 
4833 		if (get_msr(cpu, MSR_VR_CURRENT_CONFIG, &msr))
4834 			return -9;
4835 
4836 		fprintf(outf, "cpu%d: MSR_VR_CURRENT_CONFIG: 0x%08llx\n", cpu, msr);
4837 		fprintf(outf, "cpu%d: PKG Limit #4: %f Watts (%slocked)\n",
4838 			cpu, ((msr >> 0) & 0x1FFF) * rapl_power_units, (msr >> 31) & 1 ? "" : "UN");
4839 	}
4840 
4841 	if (do_rapl & RAPL_DRAM_POWER_INFO) {
4842 		if (get_msr(cpu, MSR_DRAM_POWER_INFO, &msr))
4843 			return -6;
4844 
4845 		fprintf(outf, "cpu%d: MSR_DRAM_POWER_INFO,: 0x%08llx (%.0f W TDP, RAPL %.0f - %.0f W, %f sec.)\n",
4846 			cpu, msr,
4847 			((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units,
4848 			((msr >> 16) & RAPL_POWER_GRANULARITY) * rapl_power_units,
4849 			((msr >> 32) & RAPL_POWER_GRANULARITY) * rapl_power_units,
4850 			((msr >> 48) & RAPL_TIME_GRANULARITY) * rapl_time_units);
4851 	}
4852 	if (do_rapl & RAPL_DRAM) {
4853 		if (get_msr(cpu, MSR_DRAM_POWER_LIMIT, &msr))
4854 			return -9;
4855 		fprintf(outf, "cpu%d: MSR_DRAM_POWER_LIMIT: 0x%08llx (%slocked)\n",
4856 			cpu, msr, (msr >> 31) & 1 ? "" : "UN");
4857 
4858 		print_power_limit_msr(cpu, msr, "DRAM Limit");
4859 	}
4860 	if (do_rapl & RAPL_CORE_POLICY) {
4861 		if (get_msr(cpu, MSR_PP0_POLICY, &msr))
4862 			return -7;
4863 
4864 		fprintf(outf, "cpu%d: MSR_PP0_POLICY: %lld\n", cpu, msr & 0xF);
4865 	}
4866 	if (do_rapl & RAPL_CORES_POWER_LIMIT) {
4867 		if (get_msr(cpu, MSR_PP0_POWER_LIMIT, &msr))
4868 			return -9;
4869 		fprintf(outf, "cpu%d: MSR_PP0_POWER_LIMIT: 0x%08llx (%slocked)\n",
4870 			cpu, msr, (msr >> 31) & 1 ? "" : "UN");
4871 		print_power_limit_msr(cpu, msr, "Cores Limit");
4872 	}
4873 	if (do_rapl & RAPL_GFX) {
4874 		if (get_msr(cpu, MSR_PP1_POLICY, &msr))
4875 			return -8;
4876 
4877 		fprintf(outf, "cpu%d: MSR_PP1_POLICY: %lld\n", cpu, msr & 0xF);
4878 
4879 		if (get_msr(cpu, MSR_PP1_POWER_LIMIT, &msr))
4880 			return -9;
4881 		fprintf(outf, "cpu%d: MSR_PP1_POWER_LIMIT: 0x%08llx (%slocked)\n",
4882 			cpu, msr, (msr >> 31) & 1 ? "" : "UN");
4883 		print_power_limit_msr(cpu, msr, "GFX Limit");
4884 	}
4885 	return 0;
4886 }
4887 
4888 /*
4889  * SNB adds support for additional MSRs:
4890  *
4891  * MSR_PKG_C7_RESIDENCY            0x000003fa
4892  * MSR_CORE_C7_RESIDENCY           0x000003fe
4893  * MSR_PKG_C2_RESIDENCY            0x0000060d
4894  */
4895 
4896 int has_snb_msrs(unsigned int family, unsigned int model)
4897 {
4898 	if (!genuine_intel)
4899 		return 0;
4900 
4901 	switch (model) {
4902 	case INTEL_FAM6_SANDYBRIDGE:
4903 	case INTEL_FAM6_SANDYBRIDGE_X:
4904 	case INTEL_FAM6_IVYBRIDGE:	/* IVB */
4905 	case INTEL_FAM6_IVYBRIDGE_X:	/* IVB Xeon */
4906 	case INTEL_FAM6_HASWELL:	/* HSW */
4907 	case INTEL_FAM6_HASWELL_X:	/* HSW */
4908 	case INTEL_FAM6_HASWELL_L:	/* HSW */
4909 	case INTEL_FAM6_HASWELL_G:	/* HSW */
4910 	case INTEL_FAM6_BROADWELL:	/* BDW */
4911 	case INTEL_FAM6_BROADWELL_G:	/* BDW */
4912 	case INTEL_FAM6_BROADWELL_X:	/* BDX */
4913 	case INTEL_FAM6_SKYLAKE_L:	/* SKL */
4914 	case INTEL_FAM6_CANNONLAKE_L:	/* CNL */
4915 	case INTEL_FAM6_SKYLAKE_X:	/* SKX */
4916 	case INTEL_FAM6_ICELAKE_X:	/* ICX */
4917 	case INTEL_FAM6_ATOM_GOLDMONT:	/* BXT */
4918 	case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
4919 	case INTEL_FAM6_ATOM_GOLDMONT_D:	/* DNV */
4920 	case INTEL_FAM6_ATOM_TREMONT:	/* EHL */
4921 	case INTEL_FAM6_ATOM_TREMONT_D:	/* JVL */
4922 		return 1;
4923 	}
4924 	return 0;
4925 }
4926 
4927 /*
4928  * HSW ULT added support for C8/C9/C10 MSRs:
4929  *
4930  * MSR_PKG_C8_RESIDENCY		0x00000630
4931  * MSR_PKG_C9_RESIDENCY		0x00000631
4932  * MSR_PKG_C10_RESIDENCY	0x00000632
4933  *
4934  * MSR_PKGC8_IRTL		0x00000633
4935  * MSR_PKGC9_IRTL		0x00000634
4936  * MSR_PKGC10_IRTL		0x00000635
4937  *
4938  */
4939 int has_c8910_msrs(unsigned int family, unsigned int model)
4940 {
4941 	if (!genuine_intel)
4942 		return 0;
4943 
4944 	switch (model) {
4945 	case INTEL_FAM6_HASWELL_L:	/* HSW */
4946 	case INTEL_FAM6_BROADWELL:	/* BDW */
4947 	case INTEL_FAM6_SKYLAKE_L:	/* SKL */
4948 	case INTEL_FAM6_CANNONLAKE_L:	/* CNL */
4949 	case INTEL_FAM6_ATOM_GOLDMONT:	/* BXT */
4950 	case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
4951 	case INTEL_FAM6_ATOM_TREMONT:	/* EHL */
4952 		return 1;
4953 	}
4954 	return 0;
4955 }
4956 
4957 /*
4958  * SKL adds support for additional MSRS:
4959  *
4960  * MSR_PKG_WEIGHTED_CORE_C0_RES    0x00000658
4961  * MSR_PKG_ANY_CORE_C0_RES         0x00000659
4962  * MSR_PKG_ANY_GFXE_C0_RES         0x0000065A
4963  * MSR_PKG_BOTH_CORE_GFXE_C0_RES   0x0000065B
4964  */
4965 int has_skl_msrs(unsigned int family, unsigned int model)
4966 {
4967 	if (!genuine_intel)
4968 		return 0;
4969 
4970 	switch (model) {
4971 	case INTEL_FAM6_SKYLAKE_L:	/* SKL */
4972 	case INTEL_FAM6_CANNONLAKE_L:	/* CNL */
4973 		return 1;
4974 	}
4975 	return 0;
4976 }
4977 
4978 int is_slm(unsigned int family, unsigned int model)
4979 {
4980 	if (!genuine_intel)
4981 		return 0;
4982 	switch (model) {
4983 	case INTEL_FAM6_ATOM_SILVERMONT:	/* BYT */
4984 	case INTEL_FAM6_ATOM_SILVERMONT_D:	/* AVN */
4985 		return 1;
4986 	}
4987 	return 0;
4988 }
4989 
4990 int is_knl(unsigned int family, unsigned int model)
4991 {
4992 	if (!genuine_intel)
4993 		return 0;
4994 	switch (model) {
4995 	case INTEL_FAM6_XEON_PHI_KNL:	/* KNL */
4996 		return 1;
4997 	}
4998 	return 0;
4999 }
5000 
5001 int is_cnl(unsigned int family, unsigned int model)
5002 {
5003 	if (!genuine_intel)
5004 		return 0;
5005 
5006 	switch (model) {
5007 	case INTEL_FAM6_CANNONLAKE_L:	/* CNL */
5008 		return 1;
5009 	}
5010 
5011 	return 0;
5012 }
5013 
5014 unsigned int get_aperf_mperf_multiplier(unsigned int family, unsigned int model)
5015 {
5016 	if (is_knl(family, model))
5017 		return 1024;
5018 	return 1;
5019 }
5020 
5021 #define SLM_BCLK_FREQS 5
5022 double slm_freq_table[SLM_BCLK_FREQS] = { 83.3, 100.0, 133.3, 116.7, 80.0 };
5023 
5024 double slm_bclk(void)
5025 {
5026 	unsigned long long msr = 3;
5027 	unsigned int i;
5028 	double freq;
5029 
5030 	if (get_msr(base_cpu, MSR_FSB_FREQ, &msr))
5031 		fprintf(outf, "SLM BCLK: unknown\n");
5032 
5033 	i = msr & 0xf;
5034 	if (i >= SLM_BCLK_FREQS) {
5035 		fprintf(outf, "SLM BCLK[%d] invalid\n", i);
5036 		i = 3;
5037 	}
5038 	freq = slm_freq_table[i];
5039 
5040 	if (!quiet)
5041 		fprintf(outf, "SLM BCLK: %.1f Mhz\n", freq);
5042 
5043 	return freq;
5044 }
5045 
5046 double discover_bclk(unsigned int family, unsigned int model)
5047 {
5048 	if (has_snb_msrs(family, model) || is_knl(family, model))
5049 		return 100.00;
5050 	else if (is_slm(family, model))
5051 		return slm_bclk();
5052 	else
5053 		return 133.33;
5054 }
5055 
5056 int get_cpu_type(struct thread_data *t, struct core_data *c, struct pkg_data *p)
5057 {
5058 	unsigned int eax, ebx, ecx, edx;
5059 
5060 	if (!genuine_intel)
5061 		return 0;
5062 
5063 	if (cpu_migrate(t->cpu_id)) {
5064 		fprintf(outf, "Could not migrate to CPU %d\n", t->cpu_id);
5065 		return -1;
5066 	}
5067 
5068 	if (max_level < 0x1a)
5069 		return 0;
5070 
5071 	__cpuid(0x1a, eax, ebx, ecx, edx);
5072 	eax = (eax >> 24) & 0xFF;
5073 	if (eax == 0x20)
5074 		t->is_atom = true;
5075 	return 0;
5076 }
5077 
5078 /*
5079  * MSR_IA32_TEMPERATURE_TARGET indicates the temperature where
5080  * the Thermal Control Circuit (TCC) activates.
5081  * This is usually equal to tjMax.
5082  *
5083  * Older processors do not have this MSR, so there we guess,
5084  * but also allow cmdline over-ride with -T.
5085  *
5086  * Several MSR temperature values are in units of degrees-C
5087  * below this value, including the Digital Thermal Sensor (DTS),
5088  * Package Thermal Management Sensor (PTM), and thermal event thresholds.
5089  */
5090 int set_temperature_target(struct thread_data *t, struct core_data *c, struct pkg_data *p)
5091 {
5092 	unsigned long long msr;
5093 	unsigned int tcc_default, tcc_offset;
5094 	int cpu;
5095 
5096 	/* tj_max is used only for dts or ptm */
5097 	if (!(do_dts || do_ptm))
5098 		return 0;
5099 
5100 	/* this is a per-package concept */
5101 	if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE) || !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE))
5102 		return 0;
5103 
5104 	cpu = t->cpu_id;
5105 	if (cpu_migrate(cpu)) {
5106 		fprintf(outf, "Could not migrate to CPU %d\n", cpu);
5107 		return -1;
5108 	}
5109 
5110 	if (tj_max_override != 0) {
5111 		tj_max = tj_max_override;
5112 		fprintf(outf, "cpu%d: Using cmdline TCC Target (%d C)\n", cpu, tj_max);
5113 		return 0;
5114 	}
5115 
5116 	/* Temperature Target MSR is Nehalem and newer only */
5117 	if (!do_nhm_platform_info)
5118 		goto guess;
5119 
5120 	if (get_msr(base_cpu, MSR_IA32_TEMPERATURE_TARGET, &msr))
5121 		goto guess;
5122 
5123 	tcc_default = (msr >> 16) & 0xFF;
5124 
5125 	if (!quiet) {
5126 		switch (tcc_offset_bits) {
5127 		case 4:
5128 			tcc_offset = (msr >> 24) & 0xF;
5129 			fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C) (%d default - %d offset)\n",
5130 				cpu, msr, tcc_default - tcc_offset, tcc_default, tcc_offset);
5131 			break;
5132 		case 6:
5133 			tcc_offset = (msr >> 24) & 0x3F;
5134 			fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C) (%d default - %d offset)\n",
5135 				cpu, msr, tcc_default - tcc_offset, tcc_default, tcc_offset);
5136 			break;
5137 		default:
5138 			fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C)\n", cpu, msr, tcc_default);
5139 			break;
5140 		}
5141 	}
5142 
5143 	if (!tcc_default)
5144 		goto guess;
5145 
5146 	tj_max = tcc_default;
5147 
5148 	return 0;
5149 
5150 guess:
5151 	tj_max = TJMAX_DEFAULT;
5152 	fprintf(outf, "cpu%d: Guessing tjMax %d C, Please use -T to specify\n", cpu, tj_max);
5153 
5154 	return 0;
5155 }
5156 
5157 void decode_feature_control_msr(void)
5158 {
5159 	unsigned long long msr;
5160 
5161 	if (!get_msr(base_cpu, MSR_IA32_FEAT_CTL, &msr))
5162 		fprintf(outf, "cpu%d: MSR_IA32_FEATURE_CONTROL: 0x%08llx (%sLocked %s)\n",
5163 			base_cpu, msr, msr & FEAT_CTL_LOCKED ? "" : "UN-", msr & (1 << 18) ? "SGX" : "");
5164 }
5165 
5166 void decode_misc_enable_msr(void)
5167 {
5168 	unsigned long long msr;
5169 
5170 	if (!genuine_intel)
5171 		return;
5172 
5173 	if (!get_msr(base_cpu, MSR_IA32_MISC_ENABLE, &msr))
5174 		fprintf(outf, "cpu%d: MSR_IA32_MISC_ENABLE: 0x%08llx (%sTCC %sEIST %sMWAIT %sPREFETCH %sTURBO)\n",
5175 			base_cpu, msr,
5176 			msr & MSR_IA32_MISC_ENABLE_TM1 ? "" : "No-",
5177 			msr & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP ? "" : "No-",
5178 			msr & MSR_IA32_MISC_ENABLE_MWAIT ? "" : "No-",
5179 			msr & MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE ? "No-" : "",
5180 			msr & MSR_IA32_MISC_ENABLE_TURBO_DISABLE ? "No-" : "");
5181 }
5182 
5183 void decode_misc_feature_control(void)
5184 {
5185 	unsigned long long msr;
5186 
5187 	if (!has_misc_feature_control)
5188 		return;
5189 
5190 	if (!get_msr(base_cpu, MSR_MISC_FEATURE_CONTROL, &msr))
5191 		fprintf(outf,
5192 			"cpu%d: MSR_MISC_FEATURE_CONTROL: 0x%08llx (%sL2-Prefetch %sL2-Prefetch-pair %sL1-Prefetch %sL1-IP-Prefetch)\n",
			base_cpu, msr, msr & (1 << 0) ? "No-" : "", msr & (1 << 1) ? "No-" : "",
			msr & (1 << 2) ? "No-" : "", msr & (1 << 3) ? "No-" : "");
5195 }
5196 
5197 /*
5198  * Decode MSR_MISC_PWR_MGMT
5199  *
 * Decode the bits according to the Nehalem documentation:
 * bit[0] seems to keep the same meaning on later generations,
 * bit[1] less so.
5203  */
5204 void decode_misc_pwr_mgmt_msr(void)
5205 {
5206 	unsigned long long msr;
5207 
5208 	if (!do_nhm_platform_info)
5209 		return;
5210 
5211 	if (no_MSR_MISC_PWR_MGMT)
5212 		return;
5213 
5214 	if (!get_msr(base_cpu, MSR_MISC_PWR_MGMT, &msr))
5215 		fprintf(outf, "cpu%d: MSR_MISC_PWR_MGMT: 0x%08llx (%sable-EIST_Coordination %sable-EPB %sable-OOB)\n",
5216 			base_cpu, msr,
5217 			msr & (1 << 0) ? "DIS" : "EN", msr & (1 << 1) ? "EN" : "DIS", msr & (1 << 8) ? "EN" : "DIS");
5218 }
5219 
5220 /*
5221  * Decode MSR_CC6_DEMOTION_POLICY_CONFIG, MSR_MC6_DEMOTION_POLICY_CONFIG
5222  *
 * These MSRs are present on Silvermont processors,
 * e.g. the Intel Atom processor E3000 series (Bay Trail), and friends.
5225  */
5226 void decode_c6_demotion_policy_msr(void)
5227 {
5228 	unsigned long long msr;
5229 
5230 	if (!get_msr(base_cpu, MSR_CC6_DEMOTION_POLICY_CONFIG, &msr))
5231 		fprintf(outf, "cpu%d: MSR_CC6_DEMOTION_POLICY_CONFIG: 0x%08llx (%sable-CC6-Demotion)\n",
5232 			base_cpu, msr, msr & (1 << 0) ? "EN" : "DIS");
5233 
5234 	if (!get_msr(base_cpu, MSR_MC6_DEMOTION_POLICY_CONFIG, &msr))
5235 		fprintf(outf, "cpu%d: MSR_MC6_DEMOTION_POLICY_CONFIG: 0x%08llx (%sable-MC6-Demotion)\n",
5236 			base_cpu, msr, msr & (1 << 0) ? "EN" : "DIS");
5237 }
5238 
5239 /*
 * When models are equivalent for turbostat's purposes, map them to a
 * single model number and reuse that model's support code.
5241  */
5242 unsigned int intel_model_duplicates(unsigned int model)
5243 {
5244 
5245 	switch (model) {
5246 	case INTEL_FAM6_NEHALEM_EP:	/* Core i7, Xeon 5500 series - Bloomfield, Gainstown NHM-EP */
5247 	case INTEL_FAM6_NEHALEM:	/* Core i7 and i5 Processor - Clarksfield, Lynnfield, Jasper Forest */
5248 	case 0x1F:		/* Core i7 and i5 Processor - Nehalem */
5249 	case INTEL_FAM6_WESTMERE:	/* Westmere Client - Clarkdale, Arrandale */
5250 	case INTEL_FAM6_WESTMERE_EP:	/* Westmere EP - Gulftown */
5251 		return INTEL_FAM6_NEHALEM;
5252 
5253 	case INTEL_FAM6_NEHALEM_EX:	/* Nehalem-EX Xeon - Beckton */
5254 	case INTEL_FAM6_WESTMERE_EX:	/* Westmere-EX Xeon - Eagleton */
5255 		return INTEL_FAM6_NEHALEM_EX;
5256 
5257 	case INTEL_FAM6_XEON_PHI_KNM:
5258 		return INTEL_FAM6_XEON_PHI_KNL;
5259 
5260 	case INTEL_FAM6_BROADWELL_X:
5261 	case INTEL_FAM6_BROADWELL_D:	/* BDX-DE */
5262 		return INTEL_FAM6_BROADWELL_X;
5263 
5264 	case INTEL_FAM6_SKYLAKE_L:
5265 	case INTEL_FAM6_SKYLAKE:
5266 	case INTEL_FAM6_KABYLAKE_L:
5267 	case INTEL_FAM6_KABYLAKE:
5268 	case INTEL_FAM6_COMETLAKE_L:
5269 	case INTEL_FAM6_COMETLAKE:
5270 		return INTEL_FAM6_SKYLAKE_L;
5271 
5272 	case INTEL_FAM6_ICELAKE_L:
5273 	case INTEL_FAM6_ICELAKE_NNPI:
5274 	case INTEL_FAM6_TIGERLAKE_L:
5275 	case INTEL_FAM6_TIGERLAKE:
5276 	case INTEL_FAM6_ROCKETLAKE:
5277 	case INTEL_FAM6_LAKEFIELD:
5278 	case INTEL_FAM6_ALDERLAKE:
5279 	case INTEL_FAM6_ALDERLAKE_L:
5280 		return INTEL_FAM6_CANNONLAKE_L;
5281 
5282 	case INTEL_FAM6_ATOM_TREMONT_L:
5283 		return INTEL_FAM6_ATOM_TREMONT;
5284 
5285 	case INTEL_FAM6_ICELAKE_D:
5286 	case INTEL_FAM6_SAPPHIRERAPIDS_X:
5287 		return INTEL_FAM6_ICELAKE_X;
5288 	}
5289 	return model;
5290 }
5291 
5292 void print_dev_latency(void)
5293 {
5294 	char *path = "/dev/cpu_dma_latency";
5295 	int fd;
5296 	int value;
5297 	int retval;
5298 
5299 	fd = open(path, O_RDONLY);
5300 	if (fd < 0) {
5301 		warn("fopen %s\n", path);
5302 		return;
5303 	}
5304 
5305 	retval = read(fd, (void *)&value, sizeof(int));
5306 	if (retval != sizeof(int)) {
5307 		warn("read %s\n", path);
5308 		close(fd);
5309 		return;
5310 	}
5311 	fprintf(outf, "/dev/cpu_dma_latency: %d usec (%s)\n", value, value == 2000000000 ? "default" : "constrained");
5312 
5313 	close(fd);
5314 }
5315 
5316 /*
 * Linux perf manages the HW instructions-retired counter
 * by enabling it when requested, and hiding rollover.
5319  */
5320 void linux_perf_init(void)
5321 {
5322 	if (!BIC_IS_ENABLED(BIC_IPC))
5323 		return;
5324 
5325 	if (access("/proc/sys/kernel/perf_event_paranoid", F_OK))
5326 		return;
5327 
5328 	fd_instr_count_percpu = calloc(topo.max_cpu_num + 1, sizeof(int));
5329 	if (fd_instr_count_percpu == NULL)
5330 		err(-1, "calloc fd_instr_count_percpu");
5331 
5332 	BIC_PRESENT(BIC_IPC);
5333 }
5334 
5335 void process_cpuid()
5336 {
5337 	unsigned int eax, ebx, ecx, edx;
5338 	unsigned int fms, family, model, stepping, ecx_flags, edx_flags;
5339 	unsigned int has_turbo;
5340 	unsigned long long ucode_patch = 0;
5341 
5342 	eax = ebx = ecx = edx = 0;
5343 
5344 	__cpuid(0, max_level, ebx, ecx, edx);
5345 
5346 	if (ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69)
5347 		genuine_intel = 1;
5348 	else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
5349 		authentic_amd = 1;
5350 	else if (ebx == 0x6f677948 && ecx == 0x656e6975 && edx == 0x6e65476e)
5351 		hygon_genuine = 1;
5352 
5353 	if (!quiet)
5354 		fprintf(outf, "CPUID(0): %.4s%.4s%.4s 0x%x CPUID levels\n",
5355 			(char *)&ebx, (char *)&edx, (char *)&ecx, max_level);
5356 
5357 	__cpuid(1, fms, ebx, ecx, edx);
5358 	family = (fms >> 8) & 0xf;
5359 	model = (fms >> 4) & 0xf;
5360 	stepping = fms & 0xf;
5361 	if (family == 0xf)
5362 		family += (fms >> 20) & 0xff;
5363 	if (family >= 6)
5364 		model += ((fms >> 16) & 0xf) << 4;
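	/*
	 * Example of the decode above (illustrative): fms = 0x000906ea
	 * yields family 0x6, model 0x9e (0xe + (0x9 << 4)) and stepping 0xa.
	 */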
5365 	ecx_flags = ecx;
5366 	edx_flags = edx;
5367 
5368 	if (get_msr(sched_getcpu(), MSR_IA32_UCODE_REV, &ucode_patch))
5369 		warnx("get_msr(UCODE)\n");
5370 
5371 	/*
5372 	 * check max extended function levels of CPUID.
5373 	 * This is needed to check for invariant TSC.
5374 	 * This check is valid for both Intel and AMD.
5375 	 */
5376 	ebx = ecx = edx = 0;
5377 	__cpuid(0x80000000, max_extended_level, ebx, ecx, edx);
5378 
5379 	if (!quiet) {
5380 		fprintf(outf, "CPUID(1): family:model:stepping 0x%x:%x:%x (%d:%d:%d) microcode 0x%x\n",
5381 			family, model, stepping, family, model, stepping,
5382 			(unsigned int)((ucode_patch >> 32) & 0xFFFFFFFF));
5383 		fprintf(outf, "CPUID(0x80000000): max_extended_levels: 0x%x\n", max_extended_level);
5384 		fprintf(outf, "CPUID(1): %s %s %s %s %s %s %s %s %s %s\n",
5385 			ecx_flags & (1 << 0) ? "SSE3" : "-",
5386 			ecx_flags & (1 << 3) ? "MONITOR" : "-",
5387 			ecx_flags & (1 << 6) ? "SMX" : "-",
5388 			ecx_flags & (1 << 7) ? "EIST" : "-",
5389 			ecx_flags & (1 << 8) ? "TM2" : "-",
5390 			edx_flags & (1 << 4) ? "TSC" : "-",
5391 			edx_flags & (1 << 5) ? "MSR" : "-",
5392 			edx_flags & (1 << 22) ? "ACPI-TM" : "-",
5393 			edx_flags & (1 << 28) ? "HT" : "-", edx_flags & (1 << 29) ? "TM" : "-");
5394 	}
5395 	if (genuine_intel) {
5396 		model_orig = model;
5397 		model = intel_model_duplicates(model);
5398 	}
5399 
5400 	if (!(edx_flags & (1 << 5)))
5401 		errx(1, "CPUID: no MSR");
5402 
5403 	if (max_extended_level >= 0x80000007) {
5404 
5405 		/*
5406 		 * Non-Stop TSC is advertised by CPUID.EAX=0x80000007: EDX.bit8
5407 		 * this check is valid for both Intel and AMD
5408 		 */
5409 		__cpuid(0x80000007, eax, ebx, ecx, edx);
5410 		has_invariant_tsc = edx & (1 << 8);
5411 	}
5412 
5413 	/*
5414 	 * APERF/MPERF is advertised by CPUID.EAX=0x6: ECX.bit0
5415 	 * this check is valid for both Intel and AMD
5416 	 */
5417 
5418 	__cpuid(0x6, eax, ebx, ecx, edx);
5419 	has_aperf = ecx & (1 << 0);
5420 	if (has_aperf) {
5421 		BIC_PRESENT(BIC_Avg_MHz);
5422 		BIC_PRESENT(BIC_Busy);
5423 		BIC_PRESENT(BIC_Bzy_MHz);
5424 	}
5425 	do_dts = eax & (1 << 0);
5426 	if (do_dts)
5427 		BIC_PRESENT(BIC_CoreTmp);
5428 	has_turbo = eax & (1 << 1);
5429 	do_ptm = eax & (1 << 6);
5430 	if (do_ptm)
5431 		BIC_PRESENT(BIC_PkgTmp);
5432 	has_hwp = eax & (1 << 7);
5433 	has_hwp_notify = eax & (1 << 8);
5434 	has_hwp_activity_window = eax & (1 << 9);
5435 	has_hwp_epp = eax & (1 << 10);
5436 	has_hwp_pkg = eax & (1 << 11);
5437 	has_epb = ecx & (1 << 3);
5438 
5439 	if (!quiet)
5440 		fprintf(outf, "CPUID(6): %sAPERF, %sTURBO, %sDTS, %sPTM, %sHWP, "
5441 			"%sHWPnotify, %sHWPwindow, %sHWPepp, %sHWPpkg, %sEPB\n",
5442 			has_aperf ? "" : "No-",
5443 			has_turbo ? "" : "No-",
5444 			do_dts ? "" : "No-",
5445 			do_ptm ? "" : "No-",
5446 			has_hwp ? "" : "No-",
5447 			has_hwp_notify ? "" : "No-",
5448 			has_hwp_activity_window ? "" : "No-",
5449 			has_hwp_epp ? "" : "No-", has_hwp_pkg ? "" : "No-", has_epb ? "" : "No-");
5450 
5451 	if (!quiet)
5452 		decode_misc_enable_msr();
5453 
5454 	if (max_level >= 0x7 && !quiet) {
5455 		int has_sgx;
5456 
5457 		ecx = 0;
5458 
5459 		__cpuid_count(0x7, 0, eax, ebx, ecx, edx);
5460 
5461 		has_sgx = ebx & (1 << 2);
5462 		fprintf(outf, "CPUID(7): %sSGX\n", has_sgx ? "" : "No-");
5463 
5464 		if (has_sgx)
5465 			decode_feature_control_msr();
5466 	}
5467 
5468 	if (max_level >= 0x15) {
5469 		unsigned int eax_crystal;
5470 		unsigned int ebx_tsc;
5471 
5472 		/*
5473 		 * CPUID 15H TSC/Crystal ratio, possibly Crystal Hz
5474 		 */
5475 		eax_crystal = ebx_tsc = crystal_hz = edx = 0;
5476 		__cpuid(0x15, eax_crystal, ebx_tsc, crystal_hz, edx);
5477 
5478 		if (ebx_tsc != 0) {
5479 
			if (!quiet)
5481 				fprintf(outf, "CPUID(0x15): eax_crystal: %d ebx_tsc: %d ecx_crystal_hz: %d\n",
5482 					eax_crystal, ebx_tsc, crystal_hz);
5483 
5484 			if (crystal_hz == 0)
5485 				switch (model) {
5486 				case INTEL_FAM6_SKYLAKE_L:	/* SKL */
5487 					crystal_hz = 24000000;	/* 24.0 MHz */
5488 					break;
5489 				case INTEL_FAM6_ATOM_GOLDMONT_D:	/* DNV */
5490 					crystal_hz = 25000000;	/* 25.0 MHz */
5491 					break;
5492 				case INTEL_FAM6_ATOM_GOLDMONT:	/* BXT */
5493 				case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
5494 					crystal_hz = 19200000;	/* 19.2 MHz */
5495 					break;
5496 				default:
5497 					crystal_hz = 0;
5498 				}
5499 
5500 			if (crystal_hz) {
				tsc_hz = (unsigned long long)crystal_hz * ebx_tsc / eax_crystal;
5502 				if (!quiet)
5503 					fprintf(outf, "TSC: %lld MHz (%d Hz * %d / %d / 1000000)\n",
5504 						tsc_hz / 1000000, crystal_hz, ebx_tsc, eax_crystal);
5505 			}
5506 		}
5507 	}
5508 	if (max_level >= 0x16) {
5509 		unsigned int base_mhz, max_mhz, bus_mhz, edx;
5510 
5511 		/*
5512 		 * CPUID 16H Base MHz, Max MHz, Bus MHz
5513 		 */
5514 		base_mhz = max_mhz = bus_mhz = edx = 0;
5515 
5516 		__cpuid(0x16, base_mhz, max_mhz, bus_mhz, edx);
5517 		if (!quiet)
5518 			fprintf(outf, "CPUID(0x16): base_mhz: %d max_mhz: %d bus_mhz: %d\n",
5519 				base_mhz, max_mhz, bus_mhz);
5520 	}
5521 
5522 	if (has_aperf)
5523 		aperf_mperf_multiplier = get_aperf_mperf_multiplier(family, model);
5524 
5525 	BIC_PRESENT(BIC_IRQ);
5526 	BIC_PRESENT(BIC_TSC_MHz);
5527 
5528 	if (probe_nhm_msrs(family, model)) {
5529 		do_nhm_platform_info = 1;
5530 		BIC_PRESENT(BIC_CPU_c1);
5531 		BIC_PRESENT(BIC_CPU_c3);
5532 		BIC_PRESENT(BIC_CPU_c6);
5533 		BIC_PRESENT(BIC_SMI);
5534 	}
5535 	do_snb_cstates = has_snb_msrs(family, model);
5536 
5537 	if (do_snb_cstates)
5538 		BIC_PRESENT(BIC_CPU_c7);
5539 
5540 	do_irtl_snb = has_snb_msrs(family, model);
5541 	if (do_snb_cstates && (pkg_cstate_limit >= PCL__2))
5542 		BIC_PRESENT(BIC_Pkgpc2);
5543 	if (pkg_cstate_limit >= PCL__3)
5544 		BIC_PRESENT(BIC_Pkgpc3);
5545 	if (pkg_cstate_limit >= PCL__6)
5546 		BIC_PRESENT(BIC_Pkgpc6);
5547 	if (do_snb_cstates && (pkg_cstate_limit >= PCL__7))
5548 		BIC_PRESENT(BIC_Pkgpc7);
5549 	if (has_slv_msrs(family, model)) {
5550 		BIC_NOT_PRESENT(BIC_Pkgpc2);
5551 		BIC_NOT_PRESENT(BIC_Pkgpc3);
5552 		BIC_PRESENT(BIC_Pkgpc6);
5553 		BIC_NOT_PRESENT(BIC_Pkgpc7);
5554 		BIC_PRESENT(BIC_Mod_c6);
5555 		use_c1_residency_msr = 1;
5556 	}
5557 	if (is_jvl(family, model)) {
5558 		BIC_NOT_PRESENT(BIC_CPU_c3);
5559 		BIC_NOT_PRESENT(BIC_CPU_c7);
5560 		BIC_NOT_PRESENT(BIC_Pkgpc2);
5561 		BIC_NOT_PRESENT(BIC_Pkgpc3);
5562 		BIC_NOT_PRESENT(BIC_Pkgpc6);
5563 		BIC_NOT_PRESENT(BIC_Pkgpc7);
5564 	}
5565 	if (is_dnv(family, model)) {
5566 		BIC_PRESENT(BIC_CPU_c1);
5567 		BIC_NOT_PRESENT(BIC_CPU_c3);
5568 		BIC_NOT_PRESENT(BIC_Pkgpc3);
5569 		BIC_NOT_PRESENT(BIC_CPU_c7);
5570 		BIC_NOT_PRESENT(BIC_Pkgpc7);
5571 		use_c1_residency_msr = 1;
5572 	}
5573 	if (is_skx(family, model) || is_icx(family, model)) {
5574 		BIC_NOT_PRESENT(BIC_CPU_c3);
5575 		BIC_NOT_PRESENT(BIC_Pkgpc3);
5576 		BIC_NOT_PRESENT(BIC_CPU_c7);
5577 		BIC_NOT_PRESENT(BIC_Pkgpc7);
5578 	}
5579 	if (is_bdx(family, model)) {
5580 		BIC_NOT_PRESENT(BIC_CPU_c7);
5581 		BIC_NOT_PRESENT(BIC_Pkgpc7);
5582 	}
5583 	if (has_c8910_msrs(family, model)) {
5584 		if (pkg_cstate_limit >= PCL__8)
5585 			BIC_PRESENT(BIC_Pkgpc8);
5586 		if (pkg_cstate_limit >= PCL__9)
5587 			BIC_PRESENT(BIC_Pkgpc9);
5588 		if (pkg_cstate_limit >= PCL_10)
5589 			BIC_PRESENT(BIC_Pkgpc10);
5590 	}
5591 	do_irtl_hsw = has_c8910_msrs(family, model);
5592 	if (has_skl_msrs(family, model)) {
5593 		BIC_PRESENT(BIC_Totl_c0);
5594 		BIC_PRESENT(BIC_Any_c0);
5595 		BIC_PRESENT(BIC_GFX_c0);
5596 		BIC_PRESENT(BIC_CPUGFX);
5597 	}
5598 	do_slm_cstates = is_slm(family, model);
5599 	do_knl_cstates = is_knl(family, model);
5600 
5601 	if (do_slm_cstates || do_knl_cstates || is_cnl(family, model) || is_ehl(family, model))
5602 		BIC_NOT_PRESENT(BIC_CPU_c3);
5603 
5604 	if (!quiet)
5605 		decode_misc_pwr_mgmt_msr();
5606 
5607 	if (!quiet && has_slv_msrs(family, model))
5608 		decode_c6_demotion_policy_msr();
5609 
5610 	rapl_probe(family, model);
5611 	perf_limit_reasons_probe(family, model);
5612 	automatic_cstate_conversion_probe(family, model);
5613 
5614 	check_tcc_offset(model_orig);
5615 
5616 	if (!quiet)
5617 		dump_cstate_pstate_config_info(family, model);
5618 
5619 	if (!quiet)
5620 		print_dev_latency();
5621 	if (!quiet)
5622 		dump_sysfs_cstate_config();
5623 	if (!quiet)
5624 		dump_sysfs_pstate_config();
5625 
5626 	if (has_skl_msrs(family, model) || is_ehl(family, model))
5627 		calculate_tsc_tweak();
5628 
5629 	if (!access("/sys/class/drm/card0/power/rc6_residency_ms", R_OK))
5630 		BIC_PRESENT(BIC_GFX_rc6);
5631 
5632 	if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK))
5633 		BIC_PRESENT(BIC_GFXMHz);
5634 
5635 	if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK))
5636 		BIC_PRESENT(BIC_GFXACTMHz);
5637 
5638 	if (!access("/sys/devices/system/cpu/cpuidle/low_power_idle_cpu_residency_us", R_OK))
5639 		BIC_PRESENT(BIC_CPU_LPI);
5640 	else
5641 		BIC_NOT_PRESENT(BIC_CPU_LPI);
5642 
5643 	if (!access("/sys/devices/system/cpu/cpu0/thermal_throttle/core_throttle_count", R_OK))
5644 		BIC_PRESENT(BIC_CORE_THROT_CNT);
5645 	else
5646 		BIC_NOT_PRESENT(BIC_CORE_THROT_CNT);
5647 
5648 	if (!access(sys_lpi_file_sysfs, R_OK)) {
5649 		sys_lpi_file = sys_lpi_file_sysfs;
5650 		BIC_PRESENT(BIC_SYS_LPI);
5651 	} else if (!access(sys_lpi_file_debugfs, R_OK)) {
5652 		sys_lpi_file = sys_lpi_file_debugfs;
5653 		BIC_PRESENT(BIC_SYS_LPI);
5654 	} else {
5655 		sys_lpi_file_sysfs = NULL;
5656 		BIC_NOT_PRESENT(BIC_SYS_LPI);
5657 	}
5658 
5659 	if (!quiet)
5660 		decode_misc_feature_control();
5661 
5662 	return;
5663 }
5664 
5665 /*
 * In /dev/cpu/, return success for names that are numbers,
 * i.e. filter out ".", "..", "microcode".
5668  */
5669 int dir_filter(const struct dirent *dirp)
5670 {
5671 	if (isdigit(dirp->d_name[0]))
5672 		return 1;
5673 	else
5674 		return 0;
5675 }
5676 
5677 int open_dev_cpu_msr(int dummy1)
5678 {
5679 	return 0;
5680 }
5681 
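/*
 * topology_probe()
 * Count the present CPUs, allocate cpu_present_set and cpu_affinity_set,
 * and record each online CPU's package, die, node, core and thread ids,
 * sizing topo.cores_per_node, topo.num_die, topo.num_packages and
 * topo.threads_per_core along the way.
 */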
5682 void topology_probe()
5683 {
5684 	int i;
5685 	int max_core_id = 0;
5686 	int max_package_id = 0;
5687 	int max_die_id = 0;
5688 	int max_siblings = 0;
5689 
5690 	/* Initialize num_cpus, max_cpu_num */
5691 	set_max_cpu_num();
5692 	topo.num_cpus = 0;
5693 	for_all_proc_cpus(count_cpus);
5694 	if (!summary_only && topo.num_cpus > 1)
5695 		BIC_PRESENT(BIC_CPU);
5696 
5697 	if (debug > 1)
5698 		fprintf(outf, "num_cpus %d max_cpu_num %d\n", topo.num_cpus, topo.max_cpu_num);
5699 
5700 	cpus = calloc(1, (topo.max_cpu_num + 1) * sizeof(struct cpu_topology));
5701 	if (cpus == NULL)
5702 		err(1, "calloc cpus");
5703 
5704 	/*
5705 	 * Allocate and initialize cpu_present_set
5706 	 */
5707 	cpu_present_set = CPU_ALLOC((topo.max_cpu_num + 1));
5708 	if (cpu_present_set == NULL)
5709 		err(3, "CPU_ALLOC");
5710 	cpu_present_setsize = CPU_ALLOC_SIZE((topo.max_cpu_num + 1));
5711 	CPU_ZERO_S(cpu_present_setsize, cpu_present_set);
5712 	for_all_proc_cpus(mark_cpu_present);
5713 
5714 	/*
5715 	 * Validate that all cpus in cpu_subset are also in cpu_present_set
5716 	 */
5717 	for (i = 0; i < CPU_SUBSET_MAXCPUS; ++i) {
5718 		if (CPU_ISSET_S(i, cpu_subset_size, cpu_subset))
5719 			if (!CPU_ISSET_S(i, cpu_present_setsize, cpu_present_set))
5720 				err(1, "cpu%d not present", i);
5721 	}
5722 
5723 	/*
5724 	 * Allocate and initialize cpu_affinity_set
5725 	 */
5726 	cpu_affinity_set = CPU_ALLOC((topo.max_cpu_num + 1));
5727 	if (cpu_affinity_set == NULL)
5728 		err(3, "CPU_ALLOC");
5729 	cpu_affinity_setsize = CPU_ALLOC_SIZE((topo.max_cpu_num + 1));
5730 	CPU_ZERO_S(cpu_affinity_setsize, cpu_affinity_set);
5731 
5732 	for_all_proc_cpus(init_thread_id);
5733 
5734 	/*
5735 	 * For online cpus
5736 	 * find max_core_id, max_package_id
5737 	 */
5738 	for (i = 0; i <= topo.max_cpu_num; ++i) {
5739 		int siblings;
5740 
5741 		if (cpu_is_not_present(i)) {
5742 			if (debug > 1)
5743 				fprintf(outf, "cpu%d NOT PRESENT\n", i);
5744 			continue;
5745 		}
5746 
5747 		cpus[i].logical_cpu_id = i;
5748 
5749 		/* get package information */
5750 		cpus[i].physical_package_id = get_physical_package_id(i);
5751 		if (cpus[i].physical_package_id > max_package_id)
5752 			max_package_id = cpus[i].physical_package_id;
5753 
5754 		/* get die information */
5755 		cpus[i].die_id = get_die_id(i);
5756 		if (cpus[i].die_id > max_die_id)
5757 			max_die_id = cpus[i].die_id;
5758 
5759 		/* get numa node information */
5760 		cpus[i].physical_node_id = get_physical_node_id(&cpus[i]);
5761 		if (cpus[i].physical_node_id > topo.max_node_num)
5762 			topo.max_node_num = cpus[i].physical_node_id;
5763 
5764 		/* get core information */
5765 		cpus[i].physical_core_id = get_core_id(i);
5766 		if (cpus[i].physical_core_id > max_core_id)
5767 			max_core_id = cpus[i].physical_core_id;
5768 
5769 		/* get thread information */
5770 		siblings = get_thread_siblings(&cpus[i]);
5771 		if (siblings > max_siblings)
5772 			max_siblings = siblings;
5773 		if (cpus[i].thread_id == 0)
5774 			topo.num_cores++;
5775 	}
5776 
5777 	topo.cores_per_node = max_core_id + 1;
5778 	if (debug > 1)
5779 		fprintf(outf, "max_core_id %d, sizing for %d cores per package\n", max_core_id, topo.cores_per_node);
5780 	if (!summary_only && topo.cores_per_node > 1)
5781 		BIC_PRESENT(BIC_Core);
5782 
5783 	topo.num_die = max_die_id + 1;
5784 	if (debug > 1)
5785 		fprintf(outf, "max_die_id %d, sizing for %d die\n", max_die_id, topo.num_die);
5786 	if (!summary_only && topo.num_die > 1)
5787 		BIC_PRESENT(BIC_Die);
5788 
5789 	topo.num_packages = max_package_id + 1;
5790 	if (debug > 1)
5791 		fprintf(outf, "max_package_id %d, sizing for %d packages\n", max_package_id, topo.num_packages);
5792 	if (!summary_only && topo.num_packages > 1)
5793 		BIC_PRESENT(BIC_Package);
5794 
5795 	set_node_data();
5796 	if (debug > 1)
5797 		fprintf(outf, "nodes_per_pkg %d\n", topo.nodes_per_pkg);
5798 	if (!summary_only && topo.nodes_per_pkg > 1)
5799 		BIC_PRESENT(BIC_Node);
5800 
5801 	topo.threads_per_core = max_siblings;
5802 	if (debug > 1)
5803 		fprintf(outf, "max_siblings %d\n", max_siblings);
5804 
5805 	if (debug < 1)
5806 		return;
5807 
5808 	for (i = 0; i <= topo.max_cpu_num; ++i) {
5809 		if (cpu_is_not_present(i))
5810 			continue;
5811 		fprintf(outf,
5812 			"cpu %d pkg %d die %d node %d lnode %d core %d thread %d\n",
5813 			i, cpus[i].physical_package_id, cpus[i].die_id,
5814 			cpus[i].physical_node_id, cpus[i].logical_node_id, cpus[i].physical_core_id, cpus[i].thread_id);
5815 	}
5816 
5817 }
5818 
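/*
 * allocate_counters()
 * Allocate one snapshot set of per-thread, per-core and per-package
 * counter arrays (called once each for the "even" and "odd" sets),
 * marking unused thread/core slots with an id of -1.
 */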
5819 void allocate_counters(struct thread_data **t, struct core_data **c, struct pkg_data **p)
5820 {
5821 	int i;
5822 	int num_cores = topo.cores_per_node * topo.nodes_per_pkg * topo.num_packages;
5823 	int num_threads = topo.threads_per_core * num_cores;
5824 
5825 	*t = calloc(num_threads, sizeof(struct thread_data));
5826 	if (*t == NULL)
5827 		goto error;
5828 
5829 	for (i = 0; i < num_threads; i++)
5830 		(*t)[i].cpu_id = -1;
5831 
5832 	*c = calloc(num_cores, sizeof(struct core_data));
5833 	if (*c == NULL)
5834 		goto error;
5835 
5836 	for (i = 0; i < num_cores; i++)
5837 		(*c)[i].core_id = -1;
5838 
5839 	*p = calloc(topo.num_packages, sizeof(struct pkg_data));
5840 	if (*p == NULL)
5841 		goto error;
5842 
5843 	for (i = 0; i < topo.num_packages; i++)
5844 		(*p)[i].package_id = i;
5845 
5846 	return;
5847 error:
5848 	err(1, "calloc counters");
5849 }
5850 
5851 /*
5852  * init_counter()
5853  *
 * set CPU_IS_FIRST_THREAD_IN_CORE and CPU_IS_FIRST_CORE_IN_PACKAGE
5855  */
5856 void init_counter(struct thread_data *thread_base, struct core_data *core_base, struct pkg_data *pkg_base, int cpu_id)
5857 {
5858 	int pkg_id = cpus[cpu_id].physical_package_id;
5859 	int node_id = cpus[cpu_id].logical_node_id;
5860 	int core_id = cpus[cpu_id].physical_core_id;
5861 	int thread_id = cpus[cpu_id].thread_id;
5862 	struct thread_data *t;
5863 	struct core_data *c;
5864 	struct pkg_data *p;
5865 
5866 	/* Workaround for systems where physical_node_id==-1
5867 	 * and logical_node_id==(-1 - topo.num_cpus)
5868 	 */
5869 	if (node_id < 0)
5870 		node_id = 0;
5871 
5872 	t = GET_THREAD(thread_base, thread_id, core_id, node_id, pkg_id);
5873 	c = GET_CORE(core_base, core_id, node_id, pkg_id);
5874 	p = GET_PKG(pkg_base, pkg_id);
5875 
5876 	t->cpu_id = cpu_id;
5877 	if (thread_id == 0) {
5878 		t->flags |= CPU_IS_FIRST_THREAD_IN_CORE;
5879 		if (cpu_is_first_core_in_package(cpu_id))
5880 			t->flags |= CPU_IS_FIRST_CORE_IN_PACKAGE;
5881 	}
5882 
5883 	c->core_id = core_id;
5884 	p->package_id = pkg_id;
5885 }
5886 
5887 int initialize_counters(int cpu_id)
5888 {
5889 	init_counter(EVEN_COUNTERS, cpu_id);
5890 	init_counter(ODD_COUNTERS, cpu_id);
5891 	return 0;
5892 }
5893 
5894 void allocate_output_buffer()
5895 {
5896 	output_buffer = calloc(1, (1 + topo.num_cpus) * 2048);
5897 	outp = output_buffer;
5898 	if (outp == NULL)
5899 		err(-1, "calloc output buffer");
5900 }
5901 
5902 void allocate_fd_percpu(void)
5903 {
5904 	fd_percpu = calloc(topo.max_cpu_num + 1, sizeof(int));
5905 	if (fd_percpu == NULL)
5906 		err(-1, "calloc fd_percpu");
5907 }
5908 
5909 void allocate_irq_buffers(void)
5910 {
5911 	irq_column_2_cpu = calloc(topo.num_cpus, sizeof(int));
5912 	if (irq_column_2_cpu == NULL)
5913 		err(-1, "calloc %d", topo.num_cpus);
5914 
5915 	irqs_per_cpu = calloc(topo.max_cpu_num + 1, sizeof(int));
5916 	if (irqs_per_cpu == NULL)
5917 		err(-1, "calloc %d", topo.max_cpu_num + 1);
5918 }
5919 
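/*
 * setup_all_buffers()
 * Probe topology, then allocate the IRQ, file-descriptor, counter and
 * output buffers and initialize a counter slot for every present CPU.
 */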
5920 void setup_all_buffers(void)
5921 {
5922 	topology_probe();
5923 	allocate_irq_buffers();
5924 	allocate_fd_percpu();
5925 	allocate_counters(&thread_even, &core_even, &package_even);
5926 	allocate_counters(&thread_odd, &core_odd, &package_odd);
5927 	allocate_output_buffer();
5928 	for_all_proc_cpus(initialize_counters);
5929 }
5930 
5931 void set_base_cpu(void)
5932 {
5933 	base_cpu = sched_getcpu();
5934 	if (base_cpu < 0)
5935 		err(-ENODEV, "No valid cpus found");
5936 
5937 	if (debug > 1)
5938 		fprintf(outf, "base_cpu = %d\n", base_cpu);
5939 }
5940 
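/*
 * turbostat_init()
 * One-time initialization: probe topology and allocate buffers, verify
 * MSR access permissions, identify the CPU via CPUID, set up perf for
 * the IPC counter, and (unless --quiet) dump HWP/EPB/perf-limit/RAPL/
 * thermal configuration.
 */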
5941 void turbostat_init()
5942 {
5943 	setup_all_buffers();
5944 	set_base_cpu();
5945 	check_dev_msr();
5946 	check_permissions();
5947 	process_cpuid();
5948 	linux_perf_init();
5949 
5950 	if (!quiet)
5951 		for_all_cpus(print_hwp, ODD_COUNTERS);
5952 
5953 	if (!quiet)
5954 		for_all_cpus(print_epb, ODD_COUNTERS);
5955 
5956 	if (!quiet)
5957 		for_all_cpus(print_perf_limit, ODD_COUNTERS);
5958 
5959 	if (!quiet)
5960 		for_all_cpus(print_rapl, ODD_COUNTERS);
5961 
5962 	for_all_cpus(set_temperature_target, ODD_COUNTERS);
5963 
5964 	for_all_cpus(get_cpu_type, ODD_COUNTERS);
5965 	for_all_cpus(get_cpu_type, EVEN_COUNTERS);
5966 
5967 	if (!quiet)
5968 		for_all_cpus(print_thermal, ODD_COUNTERS);
5969 
5970 	if (!quiet && do_irtl_snb)
5971 		print_irtl();
5972 
5973 	if (DO_BIC(BIC_IPC))
5974 		(void)get_instr_count_fd(base_cpu);
5975 }
5976 
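/*
 * fork_it()
 * Snapshot the counters, fork/exec the given command, wait for it to
 * exit, snapshot again, and print the counter deltas over the child's
 * runtime.  Returns the child's exit status.
 */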
5977 int fork_it(char **argv)
5978 {
5979 	pid_t child_pid;
5980 	int status;
5981 
5982 	snapshot_proc_sysfs_files();
5983 	status = for_all_cpus(get_counters, EVEN_COUNTERS);
5984 	first_counter_read = 0;
5985 	if (status)
5986 		exit(status);
5987 	/* clear affinity side-effect of get_counters() */
5988 	sched_setaffinity(0, cpu_present_setsize, cpu_present_set);
5989 	gettimeofday(&tv_even, (struct timezone *)NULL);
5990 
5991 	child_pid = fork();
5992 	if (!child_pid) {
5993 		/* child */
5994 		execvp(argv[0], argv);
5995 		err(errno, "exec %s", argv[0]);
5996 	} else {
5997 
5998 		/* parent */
5999 		if (child_pid == -1)
6000 			err(1, "fork");
6001 
6002 		signal(SIGINT, SIG_IGN);
6003 		signal(SIGQUIT, SIG_IGN);
6004 		if (waitpid(child_pid, &status, 0) == -1)
6005 			err(status, "waitpid");
6006 
6007 		if (WIFEXITED(status))
6008 			status = WEXITSTATUS(status);
6009 	}
6010 	/*
6011 	 * n.b. fork_it() does not check for errors from for_all_cpus()
6012 	 * because re-starting is problematic when forking
6013 	 */
6014 	snapshot_proc_sysfs_files();
6015 	for_all_cpus(get_counters, ODD_COUNTERS);
6016 	gettimeofday(&tv_odd, (struct timezone *)NULL);
6017 	timersub(&tv_odd, &tv_even, &tv_delta);
6018 	if (for_all_cpus_2(delta_cpu, ODD_COUNTERS, EVEN_COUNTERS))
6019 		fprintf(outf, "%s: Counter reset detected\n", progname);
6020 	else {
6021 		compute_average(EVEN_COUNTERS);
6022 		format_all_counters(EVEN_COUNTERS);
6023 	}
6024 
6025 	fprintf(outf, "%.6f sec\n", tv_delta.tv_sec + tv_delta.tv_usec / 1000000.0);
6026 
6027 	flush_output_stderr();
6028 
6029 	return status;
6030 }
6031 
6032 int get_and_dump_counters(void)
6033 {
6034 	int status;
6035 
6036 	snapshot_proc_sysfs_files();
6037 	status = for_all_cpus(get_counters, ODD_COUNTERS);
6038 	if (status)
6039 		return status;
6040 
6041 	status = for_all_cpus(dump_counters, ODD_COUNTERS);
6042 	if (status)
6043 		return status;
6044 
6045 	flush_output_stdout();
6046 
6047 	return status;
6048 }
6049 
6050 void print_version()
6051 {
6052 	fprintf(outf, "turbostat version 21.05.04" " - Len Brown <lenb@kernel.org>\n");
6053 }
6054 
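/*
 * add_counter()
 * Allocate an msr_counter describing an added MSR or sysfs counter and
 * link it onto the thread, core or package counter list according to
 * scope, enforcing the MAX_ADDED counter limits.
 */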
6055 int add_counter(unsigned int msr_num, char *path, char *name,
6056 		unsigned int width, enum counter_scope scope,
6057 		enum counter_type type, enum counter_format format, int flags)
6058 {
6059 	struct msr_counter *msrp;
6060 
6061 	msrp = calloc(1, sizeof(struct msr_counter));
6062 	if (msrp == NULL) {
6063 		perror("calloc");
6064 		exit(1);
6065 	}
6066 
6067 	msrp->msr_num = msr_num;
6068 	strncpy(msrp->name, name, NAME_BYTES - 1);
6069 	if (path)
6070 		strncpy(msrp->path, path, PATH_BYTES - 1);
6071 	msrp->width = width;
6072 	msrp->type = type;
6073 	msrp->format = format;
6074 	msrp->flags = flags;
6075 
6076 	switch (scope) {
6077 
6078 	case SCOPE_CPU:
6079 		msrp->next = sys.tp;
6080 		sys.tp = msrp;
6081 		sys.added_thread_counters++;
6082 		if (sys.added_thread_counters > MAX_ADDED_THREAD_COUNTERS) {
			fprintf(stderr, "exceeded max %d added thread counters\n", MAX_ADDED_THREAD_COUNTERS);
6084 			exit(-1);
6085 		}
6086 		break;
6087 
6088 	case SCOPE_CORE:
6089 		msrp->next = sys.cp;
6090 		sys.cp = msrp;
6091 		sys.added_core_counters++;
6092 		if (sys.added_core_counters > MAX_ADDED_COUNTERS) {
6093 			fprintf(stderr, "exceeded max %d added core counters\n", MAX_ADDED_COUNTERS);
6094 			exit(-1);
6095 		}
6096 		break;
6097 
6098 	case SCOPE_PACKAGE:
6099 		msrp->next = sys.pp;
6100 		sys.pp = msrp;
6101 		sys.added_package_counters++;
6102 		if (sys.added_package_counters > MAX_ADDED_COUNTERS) {
6103 			fprintf(stderr, "exceeded max %d added package counters\n", MAX_ADDED_COUNTERS);
6104 			exit(-1);
6105 		}
6106 		break;
6107 	}
6108 
6109 	return 0;
6110 }
6111 
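/*
 * parse_add_command()
 * Parse the comma-separated --add argument, for example:
 *	--add msr0x10,u64,cpu,delta,MY_TSC
 * Tokens select the MSR number or sysfs path, the width (u32/u64),
 * scope (cpu/core/package), type (cycles/seconds/usec), format
 * (raw/delta/percent) and the column name; unspecified fields keep
 * their defaults.
 */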
6112 void parse_add_command(char *add_command)
6113 {
6114 	int msr_num = 0;
6115 	char *path = NULL;
6116 	char name_buffer[NAME_BYTES] = "";
6117 	int width = 64;
6118 	int fail = 0;
6119 	enum counter_scope scope = SCOPE_CPU;
6120 	enum counter_type type = COUNTER_CYCLES;
6121 	enum counter_format format = FORMAT_DELTA;
6122 
6123 	while (add_command) {
6124 
6125 		if (sscanf(add_command, "msr0x%x", &msr_num) == 1)
6126 			goto next;
6127 
6128 		if (sscanf(add_command, "msr%d", &msr_num) == 1)
6129 			goto next;
6130 
6131 		if (*add_command == '/') {
6132 			path = add_command;
6133 			goto next;
6134 		}
6135 
6136 		if (sscanf(add_command, "u%d", &width) == 1) {
6137 			if ((width == 32) || (width == 64))
6138 				goto next;
6139 			width = 64;
6140 		}
6141 		if (!strncmp(add_command, "cpu", strlen("cpu"))) {
6142 			scope = SCOPE_CPU;
6143 			goto next;
6144 		}
6145 		if (!strncmp(add_command, "core", strlen("core"))) {
6146 			scope = SCOPE_CORE;
6147 			goto next;
6148 		}
6149 		if (!strncmp(add_command, "package", strlen("package"))) {
6150 			scope = SCOPE_PACKAGE;
6151 			goto next;
6152 		}
6153 		if (!strncmp(add_command, "cycles", strlen("cycles"))) {
6154 			type = COUNTER_CYCLES;
6155 			goto next;
6156 		}
6157 		if (!strncmp(add_command, "seconds", strlen("seconds"))) {
6158 			type = COUNTER_SECONDS;
6159 			goto next;
6160 		}
6161 		if (!strncmp(add_command, "usec", strlen("usec"))) {
6162 			type = COUNTER_USEC;
6163 			goto next;
6164 		}
6165 		if (!strncmp(add_command, "raw", strlen("raw"))) {
6166 			format = FORMAT_RAW;
6167 			goto next;
6168 		}
6169 		if (!strncmp(add_command, "delta", strlen("delta"))) {
6170 			format = FORMAT_DELTA;
6171 			goto next;
6172 		}
6173 		if (!strncmp(add_command, "percent", strlen("percent"))) {
6174 			format = FORMAT_PERCENT;
6175 			goto next;
6176 		}
6177 
6178 		if (sscanf(add_command, "%18s,%*s", name_buffer) == 1) {	/* 18 < NAME_BYTES */
6179 			char *eos;
6180 
6181 			eos = strchr(name_buffer, ',');
6182 			if (eos)
6183 				*eos = '\0';
6184 			goto next;
6185 		}
6186 
6187 next:
6188 		add_command = strchr(add_command, ',');
6189 		if (add_command) {
6190 			*add_command = '\0';
6191 			add_command++;
6192 		}
6193 
6194 	}
6195 	if ((msr_num == 0) && (path == NULL)) {
6196 		fprintf(stderr, "--add: (msrDDD | msr0xXXX | /path_to_counter ) required\n");
6197 		fail++;
6198 	}
6199 
6200 	/* generate default column header */
6201 	if (*name_buffer == '\0') {
6202 		if (width == 32)
6203 			sprintf(name_buffer, "M0x%x%s", msr_num, format == FORMAT_PERCENT ? "%" : "");
6204 		else
6205 			sprintf(name_buffer, "M0X%x%s", msr_num, format == FORMAT_PERCENT ? "%" : "");
6206 	}
6207 
6208 	if (add_counter(msr_num, path, name_buffer, width, scope, type, format, 0))
6209 		fail++;
6210 
6211 	if (fail) {
6212 		help();
6213 		exit(1);
6214 	}
6215 }
6216 
6217 int is_deferred_add(char *name)
6218 {
6219 	int i;
6220 
6221 	for (i = 0; i < deferred_add_index; ++i)
6222 		if (!strcmp(name, deferred_add_names[i]))
6223 			return 1;
6224 	return 0;
6225 }
6226 
6227 int is_deferred_skip(char *name)
6228 {
6229 	int i;
6230 
6231 	for (i = 0; i < deferred_skip_index; ++i)
6232 		if (!strcmp(name, deferred_skip_names[i]))
6233 			return 1;
6234 	return 0;
6235 }
6236 
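/*
 * probe_sysfs()
 * Discover cpuidle states under
 * /sys/devices/system/cpu/cpuN/cpuidle/stateN/name and register two
 * counters per state: "time" as a percent-of-interval residency column
 * (name suffixed with '%') and "usage" as a delta of state entry counts.
 */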
6237 void probe_sysfs(void)
6238 {
6239 	char path[64];
6240 	char name_buf[16];
6241 	FILE *input;
6242 	int state;
6243 	char *sp;
6244 
6245 	for (state = 10; state >= 0; --state) {
6246 
6247 		sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", base_cpu, state);
6248 		input = fopen(path, "r");
6249 		if (input == NULL)
6250 			continue;
6251 		if (!fgets(name_buf, sizeof(name_buf), input))
6252 			err(1, "%s: failed to read file", path);
6253 
6254 		/* truncate "C1-HSW\n" to "C1", or truncate "C1\n" to "C1" */
6255 		sp = strchr(name_buf, '-');
6256 		if (!sp)
6257 			sp = strchrnul(name_buf, '\n');
6258 		*sp = '%';
6259 		*(sp + 1) = '\0';
6260 
6261 		remove_underbar(name_buf);
6262 
6263 		fclose(input);
6264 
6265 		sprintf(path, "cpuidle/state%d/time", state);
6266 
6267 		if (!DO_BIC(BIC_sysfs) && !is_deferred_add(name_buf))
6268 			continue;
6269 
6270 		if (is_deferred_skip(name_buf))
6271 			continue;
6272 
6273 		add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_USEC, FORMAT_PERCENT, SYSFS_PERCPU);
6274 	}
6275 
6276 	for (state = 10; state >= 0; --state) {
6277 
6278 		sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", base_cpu, state);
6279 		input = fopen(path, "r");
6280 		if (input == NULL)
6281 			continue;
6282 		if (!fgets(name_buf, sizeof(name_buf), input))
6283 			err(1, "%s: failed to read file", path);
6284 		/* truncate "C1-HSW\n" to "C1", or truncate "C1\n" to "C1" */
6285 		sp = strchr(name_buf, '-');
6286 		if (!sp)
6287 			sp = strchrnul(name_buf, '\n');
6288 		*sp = '\0';
6289 		fclose(input);
6290 
6291 		remove_underbar(name_buf);
6292 
6293 		sprintf(path, "cpuidle/state%d/usage", state);
6294 
6295 		if (!DO_BIC(BIC_sysfs) && !is_deferred_add(name_buf))
6296 			continue;
6297 
6298 		if (is_deferred_skip(name_buf))
6299 			continue;
6300 
6301 		add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_ITEMS, FORMAT_DELTA, SYSFS_PERCPU);
6302 	}
6303 
6304 }
6305 
6306 /*
 * Parse a cpu set with the following syntax:
 * 1,2,4..6,8-10 and set the corresponding bits in cpu_subset.
6309  */
6310 void parse_cpu_command(char *optarg)
6311 {
6312 	unsigned int start, end;
6313 	char *next;
6314 
6315 	if (!strcmp(optarg, "core")) {
6316 		if (cpu_subset)
6317 			goto error;
6318 		show_core_only++;
6319 		return;
6320 	}
6321 	if (!strcmp(optarg, "package")) {
6322 		if (cpu_subset)
6323 			goto error;
6324 		show_pkg_only++;
6325 		return;
6326 	}
6327 	if (show_core_only || show_pkg_only)
6328 		goto error;
6329 
6330 	cpu_subset = CPU_ALLOC(CPU_SUBSET_MAXCPUS);
6331 	if (cpu_subset == NULL)
6332 		err(3, "CPU_ALLOC");
6333 	cpu_subset_size = CPU_ALLOC_SIZE(CPU_SUBSET_MAXCPUS);
6334 
6335 	CPU_ZERO_S(cpu_subset_size, cpu_subset);
6336 
6337 	next = optarg;
6338 
6339 	while (next && *next) {
6340 
6341 		if (*next == '-')	/* no negative cpu numbers */
6342 			goto error;
6343 
6344 		start = strtoul(next, &next, 10);
6345 
6346 		if (start >= CPU_SUBSET_MAXCPUS)
6347 			goto error;
6348 		CPU_SET_S(start, cpu_subset_size, cpu_subset);
6349 
6350 		if (*next == '\0')
6351 			break;
6352 
6353 		if (*next == ',') {
6354 			next += 1;
6355 			continue;
6356 		}
6357 
6358 		if (*next == '-') {
6359 			next += 1;	/* start range */
6360 		} else if (*next == '.') {
6361 			next += 1;
6362 			if (*next == '.')
6363 				next += 1;	/* start range */
6364 			else
6365 				goto error;
6366 		}
6367 
6368 		end = strtoul(next, &next, 10);
6369 		if (end <= start)
6370 			goto error;
6371 
6372 		while (++start <= end) {
6373 			if (start >= CPU_SUBSET_MAXCPUS)
6374 				goto error;
6375 			CPU_SET_S(start, cpu_subset_size, cpu_subset);
6376 		}
6377 
6378 		if (*next == ',')
6379 			next += 1;
6380 		else if (*next != '\0')
6381 			goto error;
6382 	}
6383 
6384 	return;
6385 
6386 error:
6387 	fprintf(stderr, "\"--cpu %s\" malformed\n", optarg);
6388 	help();
6389 	exit(-1);
6390 }
6391 
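/*
 * cmdline()
 * Parse command-line options with getopt_long_only(), dispatching to
 * the handlers above (counter add/show/hide, interval, iteration
 * counts, output file, etc.).
 */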
6392 void cmdline(int argc, char **argv)
6393 {
6394 	int opt;
6395 	int option_index = 0;
6396 	static struct option long_options[] = {
6397 		{ "add", required_argument, 0, 'a' },
6398 		{ "cpu", required_argument, 0, 'c' },
6399 		{ "Dump", no_argument, 0, 'D' },
6400 		{ "debug", no_argument, 0, 'd' },	/* internal, not documented */
6401 		{ "enable", required_argument, 0, 'e' },
6402 		{ "interval", required_argument, 0, 'i' },
6403 		{ "IPC", no_argument, 0, 'I' },
6404 		{ "num_iterations", required_argument, 0, 'n' },
6405 		{ "header_iterations", required_argument, 0, 'N' },
6406 		{ "help", no_argument, 0, 'h' },
6407 		{ "hide", required_argument, 0, 'H' },	// meh, -h taken by --help
6408 		{ "Joules", no_argument, 0, 'J' },
6409 		{ "list", no_argument, 0, 'l' },
6410 		{ "out", required_argument, 0, 'o' },
6411 		{ "quiet", no_argument, 0, 'q' },
6412 		{ "show", required_argument, 0, 's' },
6413 		{ "Summary", no_argument, 0, 'S' },
6414 		{ "TCC", required_argument, 0, 'T' },
6415 		{ "version", no_argument, 0, 'v' },
6416 		{ 0, 0, 0, 0 }
6417 	};
6418 
6419 	progname = argv[0];
6420 
6421 	while ((opt = getopt_long_only(argc, argv, "+C:c:Dde:hi:Jn:o:qST:v", long_options, &option_index)) != -1) {
6422 		switch (opt) {
6423 		case 'a':
6424 			parse_add_command(optarg);
6425 			break;
6426 		case 'c':
6427 			parse_cpu_command(optarg);
6428 			break;
6429 		case 'D':
6430 			dump_only++;
6431 			break;
6432 		case 'e':
6433 			/* --enable specified counter */
6434 			bic_enabled = bic_enabled | bic_lookup(optarg, SHOW_LIST);
6435 			break;
6436 		case 'd':
6437 			debug++;
6438 			ENABLE_BIC(BIC_DISABLED_BY_DEFAULT);
6439 			break;
6440 		case 'H':
6441 			/*
6442 			 * --hide: do not show those specified
6443 			 *  multiple invocations simply clear more bits in enabled mask
6444 			 */
6445 			bic_enabled &= ~bic_lookup(optarg, HIDE_LIST);
6446 			break;
6447 		case 'h':
6448 		default:
6449 			help();
6450 			exit(1);
6451 		case 'i':
6452 			{
6453 				double interval = strtod(optarg, NULL);
6454 
6455 				if (interval < 0.001) {
6456 					fprintf(outf, "interval %f seconds is too small\n", interval);
6457 					exit(2);
6458 				}
6459 
6460 				interval_tv.tv_sec = interval_ts.tv_sec = interval;
6461 				interval_tv.tv_usec = (interval - interval_tv.tv_sec) * 1000000;
6462 				interval_ts.tv_nsec = (interval - interval_ts.tv_sec) * 1000000000;
6463 			}
6464 			break;
6465 		case 'J':
6466 			rapl_joules++;
6467 			break;
6468 		case 'l':
6469 			ENABLE_BIC(BIC_DISABLED_BY_DEFAULT);
6470 			list_header_only++;
6471 			quiet++;
6472 			break;
6473 		case 'o':
6474 			outf = fopen_or_die(optarg, "w");
6475 			break;
6476 		case 'q':
6477 			quiet = 1;
6478 			break;
6479 		case 'n':
6480 			num_iterations = strtod(optarg, NULL);
6481 
6482 			if (num_iterations <= 0) {
				fprintf(outf, "iterations %d should be a positive number\n", num_iterations);
6484 				exit(2);
6485 			}
6486 			break;
6487 		case 'N':
6488 			header_iterations = strtod(optarg, NULL);
6489 
6490 			if (header_iterations <= 0) {
				fprintf(outf, "iterations %d should be a positive number\n", header_iterations);
6492 				exit(2);
6493 			}
6494 			break;
6495 		case 's':
6496 			/*
6497 			 * --show: show only those specified
6498 			 *  The 1st invocation will clear and replace the enabled mask
6499 			 *  subsequent invocations can add to it.
6500 			 */
6501 			if (shown == 0)
6502 				bic_enabled = bic_lookup(optarg, SHOW_LIST);
6503 			else
6504 				bic_enabled |= bic_lookup(optarg, SHOW_LIST);
6505 			shown = 1;
6506 			break;
6507 		case 'S':
6508 			summary_only++;
6509 			break;
6510 		case 'T':
6511 			tj_max_override = atoi(optarg);
6512 			break;
6513 		case 'v':
6514 			print_version();
6515 			exit(0);
6516 			break;
6517 		}
6518 	}
6519 }
6520 
6521 int main(int argc, char **argv)
6522 {
6523 	outf = stderr;
6524 	cmdline(argc, argv);
6525 
6526 	if (!quiet)
6527 		print_version();
6528 
6529 	probe_sysfs();
6530 
6531 	turbostat_init();
6532 
6533 	msr_sum_record();
6534 
6535 	/* dump counters and exit */
6536 	if (dump_only)
6537 		return get_and_dump_counters();
6538 
6539 	/* list header and exit */
6540 	if (list_header_only) {
6541 		print_header(",");
6542 		flush_output_stdout();
6543 		return 0;
6544 	}
6545 
6546 	/*
6547 	 * if any params left, it must be a command to fork
6548 	 */
6549 	if (argc - optind)
6550 		return fork_it(argv + optind);
6551 	else
6552 		turbostat_loop();
6553 
6554 	return 0;
6555 }
6556