// SPDX-License-Identifier: GPL-2.0-only
/*
 * Support Intel/AMD RAPL energy consumption counters
 * Copyright (C) 2013 Google, Inc., Stephane Eranian
 *
 * Intel RAPL interface is specified in the IA-32 Manual Vol3b
 * section 14.7.1 (September 2013)
 *
 * AMD RAPL interface for Fam17h is described in the public PPR:
 * https://bugzilla.kernel.org/show_bug.cgi?id=206537
 *
 * RAPL provides more controls than just reporting energy consumption;
 * however, here we only expose the free running energy consumption
 * counters (pp0, pkg, dram, gpu, psys).
 *
 * Each of those counters increments in a power unit defined by the
 * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules
 * but it can vary.
 *
 * Counter to rapl events mappings:
 *
 *  pp0 counter: consumption of all physical cores (power plane 0)
 *	  event: rapl_energy_cores
 *    perf code: 0x1
 *
 *  pkg counter: consumption of the whole processor package
 *	  event: rapl_energy_pkg
 *    perf code: 0x2
 *
 * dram counter: consumption of the dram domain (servers only)
 *	  event: rapl_energy_dram
 *    perf code: 0x3
 *
 *  gpu counter: consumption of the builtin-gpu domain (client only)
 *	  event: rapl_energy_gpu
 *    perf code: 0x4
 *
 * psys counter: consumption of the builtin-psys domain (client only)
 *	  event: rapl_energy_psys
 *    perf code: 0x5
 *
 * We manage those counters as free running (read-only). They may be
 * used simultaneously by other tools, such as turbostat.
 *
 * The events only support system-wide mode counting. There is no
 * sampling support because it does not make sense and is not
 * supported by the RAPL hardware.
 *
 * Because we want to avoid floating-point operations in the kernel,
 * the events are all reported in fixed point arithmetic (32.32).
 * Tools must adjust the counts to convert them to Watts using
 * the duration of the measurement. Tools may use a function such as
 * ldexp(raw_count, -32);
 */
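/*
 * For example, a userspace tool could do that conversion like this
 * (illustrative sketch only, not part of this driver; raw_count and
 * elapsed_sec are hypothetical variables):
 *
 *	#include <math.h>
 *
 *	double joules = ldexp((double)raw_count, -32);
 *	double watts  = joules / elapsed_sec;
 */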

#define pr_fmt(fmt) "RAPL PMU: " fmt

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/perf_event.h>
#include <linux/nospec.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
#include "perf_event.h"
#include "probe.h"

MODULE_LICENSE("GPL");

/*
 * RAPL energy status counters
 */
enum perf_rapl_events {
	PERF_RAPL_PP0 = 0,		/* all cores */
	PERF_RAPL_PKG,			/* entire package */
	PERF_RAPL_RAM,			/* DRAM */
	PERF_RAPL_PP1,			/* gpu */
	PERF_RAPL_PSYS,			/* psys */

	PERF_RAPL_MAX,
	NR_RAPL_DOMAINS = PERF_RAPL_MAX,
};

static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
	"pp0-core",
	"package",
	"dram",
	"pp1-gpu",
	"psys",
};

/*
 * event code: LSB 8 bits, passed in attr->config
 * any other bit is reserved
 */
#define RAPL_EVENT_MASK	0xFFULL
#define RAPL_CNTR_WIDTH 32

#define RAPL_EVENT_ATTR_STR(_name, v, str)					\
static struct perf_pmu_events_attr event_attr_##v = {				\
	.attr		= __ATTR(_name, 0444, perf_event_sysfs_show, NULL),	\
	.id		= 0,							\
	.event_str	= str,							\
};
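/*
 * For instance, RAPL_EVENT_ATTR_STR(energy-pkg, rapl_pkg, "event=0x02")
 * below expands to:
 *
 *	static struct perf_pmu_events_attr event_attr_rapl_pkg = {
 *		.attr		= __ATTR(energy-pkg, 0444, perf_event_sysfs_show, NULL),
 *		.id		= 0,
 *		.event_str	= "event=0x02",
 *	};
 */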

struct rapl_pmu {
	raw_spinlock_t		lock;
	int			n_active;
	int			cpu;
	struct list_head	active_list;
	struct pmu		*pmu;
	ktime_t			timer_interval;
	struct hrtimer		hrtimer;
};

struct rapl_pmus {
	struct pmu		pmu;
	unsigned int		maxdie;
	struct rapl_pmu		*pmus[];
};

enum rapl_unit_quirk {
	RAPL_UNIT_QUIRK_NONE,
	RAPL_UNIT_QUIRK_INTEL_HSW,
	RAPL_UNIT_QUIRK_INTEL_SPR,
};

struct rapl_model {
	struct perf_msr *rapl_msrs;
	unsigned long	events;
	unsigned int	msr_power_unit;
	enum rapl_unit_quirk	unit_quirk;
};

/* 1/2^hw_unit Joule */
static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly;
static struct rapl_pmus *rapl_pmus;
static cpumask_t rapl_cpu_mask;
static unsigned int rapl_cntr_mask;
static u64 rapl_timer_ms;
static struct perf_msr *rapl_msrs;

static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu)
{
	unsigned int dieid = topology_logical_die_id(cpu);

	/*
	 * The unsigned check also catches the '-1' return value for non
	 * existent mappings in the topology map.
	 */
	return dieid < rapl_pmus->maxdie ? rapl_pmus->pmus[dieid] : NULL;
}

static inline u64 rapl_read_counter(struct perf_event *event)
{
	u64 raw;

	rdmsrl(event->hw.event_base, raw);
	return raw;
}

static inline u64 rapl_scale(u64 v, int cfg)
{
	if (cfg > NR_RAPL_DOMAINS) {
		pr_warn("Invalid domain %d, failed to scale data\n", cfg);
		return v;
	}
	/*
	 * scale delta to smallest unit (1/2^32)
	 * users must then scale back: count * 1/2^32 to get Joules,
	 * or use ldexp(count, -32).
	 * Watts = Joules/Time delta
	 */
	return v << (32 - rapl_hw_unit[cfg - 1]);
}
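/*
 * Worked example: with rapl_hw_unit[cfg - 1] == 16 (the 1/2^16 Joules
 * SandyBridge default), a delta of one hardware tick is scaled to
 * 1 << (32 - 16) = 65536 units of 2^-32 Joules, i.e. exactly 2^-16 J.
 */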

static u64 rapl_event_update(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	u64 prev_raw_count, new_raw_count;
	s64 delta, sdelta;
	int shift = RAPL_CNTR_WIDTH;

again:
	prev_raw_count = local64_read(&hwc->prev_count);
	rdmsrl(event->hw.event_base, new_raw_count);

	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
			    new_raw_count) != prev_raw_count) {
		cpu_relax();
		goto again;
	}

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (event-)time and add that to the generic event.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count.
	 */
	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;

	sdelta = rapl_scale(delta, event->hw.config);

	local64_add(sdelta, &event->count);

	return new_raw_count;
}
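/*
 * The shift dance above sign-extends the 32-bit wide hardware count so
 * that a counter wrap still yields the right delta. For example, with
 * prev_raw_count == 0xffffffff and new_raw_count == 0x1:
 *
 *	delta  = (0x1 << 32) - (0xffffffff << 32);	// s64 arithmetic
 *	delta >>= 32;					// delta == 2
 */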

static void rapl_start_hrtimer(struct rapl_pmu *pmu)
{
	hrtimer_start(&pmu->hrtimer, pmu->timer_interval,
		      HRTIMER_MODE_REL_PINNED);
}

static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
{
	struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer);
	struct perf_event *event;
	unsigned long flags;

	if (!pmu->n_active)
		return HRTIMER_NORESTART;

	raw_spin_lock_irqsave(&pmu->lock, flags);

	list_for_each_entry(event, &pmu->active_list, active_entry)
		rapl_event_update(event);

	raw_spin_unlock_irqrestore(&pmu->lock, flags);

	hrtimer_forward_now(hrtimer, pmu->timer_interval);

	return HRTIMER_RESTART;
}

static void rapl_hrtimer_init(struct rapl_pmu *pmu)
{
	struct hrtimer *hr = &pmu->hrtimer;

	hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hr->function = rapl_hrtimer_handle;
}

static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
				   struct perf_event *event)
{
	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
		return;

	event->hw.state = 0;

	list_add_tail(&event->active_entry, &pmu->active_list);

	local64_set(&event->hw.prev_count, rapl_read_counter(event));

	pmu->n_active++;
	if (pmu->n_active == 1)
		rapl_start_hrtimer(pmu);
}

static void rapl_pmu_event_start(struct perf_event *event, int mode)
{
	struct rapl_pmu *pmu = event->pmu_private;
	unsigned long flags;

	raw_spin_lock_irqsave(&pmu->lock, flags);
	__rapl_pmu_event_start(pmu, event);
	raw_spin_unlock_irqrestore(&pmu->lock, flags);
}

static void rapl_pmu_event_stop(struct perf_event *event, int mode)
{
	struct rapl_pmu *pmu = event->pmu_private;
	struct hw_perf_event *hwc = &event->hw;
	unsigned long flags;

	raw_spin_lock_irqsave(&pmu->lock, flags);

	/* mark event as deactivated and stopped */
	if (!(hwc->state & PERF_HES_STOPPED)) {
		WARN_ON_ONCE(pmu->n_active <= 0);
		pmu->n_active--;
		if (pmu->n_active == 0)
			hrtimer_cancel(&pmu->hrtimer);

		list_del(&event->active_entry);

		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
		hwc->state |= PERF_HES_STOPPED;
	}

	/* check if update of sw counter is necessary */
	if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
		/*
		 * Drain the remaining delta count out of an event
		 * that we are disabling:
		 */
		rapl_event_update(event);
		hwc->state |= PERF_HES_UPTODATE;
	}

	raw_spin_unlock_irqrestore(&pmu->lock, flags);
}

static int rapl_pmu_event_add(struct perf_event *event, int mode)
{
	struct rapl_pmu *pmu = event->pmu_private;
	struct hw_perf_event *hwc = &event->hw;
	unsigned long flags;

	raw_spin_lock_irqsave(&pmu->lock, flags);

	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;

	if (mode & PERF_EF_START)
		__rapl_pmu_event_start(pmu, event);

	raw_spin_unlock_irqrestore(&pmu->lock, flags);

	return 0;
}

static void rapl_pmu_event_del(struct perf_event *event, int flags)
{
	rapl_pmu_event_stop(event, PERF_EF_UPDATE);
}

static int rapl_pmu_event_init(struct perf_event *event)
{
	u64 cfg = event->attr.config & RAPL_EVENT_MASK;
	int bit, ret = 0;
	struct rapl_pmu *pmu;

	/* only look at RAPL events */
	if (event->attr.type != rapl_pmus->pmu.type)
		return -ENOENT;

	/* check only supported bits are set */
	if (event->attr.config & ~RAPL_EVENT_MASK)
		return -EINVAL;

	if (event->cpu < 0)
		return -EINVAL;

	event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;

	if (!cfg || cfg >= NR_RAPL_DOMAINS + 1)
		return -EINVAL;

	cfg = array_index_nospec((long)cfg, NR_RAPL_DOMAINS + 1);
	bit = cfg - 1;

	/* check event supported */
	if (!(rapl_cntr_mask & (1 << bit)))
		return -EINVAL;

	/* unsupported modes and filters */
	if (event->attr.sample_period) /* no sampling */
		return -EINVAL;

	/* must be done before validate_group */
	pmu = cpu_to_rapl_pmu(event->cpu);
	if (!pmu)
		return -EINVAL;
	event->cpu = pmu->cpu;
	event->pmu_private = pmu;
	event->hw.event_base = rapl_msrs[bit].msr;
	event->hw.config = cfg;
	event->hw.idx = bit;

	return ret;
}
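/*
 * E.g. a perf_event_attr with config == 0x3 passes the checks above,
 * yields bit == 2 and binds the event to rapl_msrs[PERF_RAPL_RAM].msr
 * (MSR_DRAM_ENERGY_STATUS on Intel), provided that counter was probed.
 */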

static void rapl_pmu_event_read(struct perf_event *event)
{
	rapl_event_update(event);
}

static ssize_t rapl_get_attr_cpumask(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask);
}

static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);

static struct attribute *rapl_pmu_attrs[] = {
	&dev_attr_cpumask.attr,
	NULL,
};

static struct attribute_group rapl_pmu_attr_group = {
	.attrs = rapl_pmu_attrs,
};

RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
RAPL_EVENT_ATTR_STR(energy-pkg  ,   rapl_pkg, "event=0x02");
RAPL_EVENT_ATTR_STR(energy-ram  ,   rapl_ram, "event=0x03");
RAPL_EVENT_ATTR_STR(energy-gpu  ,   rapl_gpu, "event=0x04");
RAPL_EVENT_ATTR_STR(energy-psys,   rapl_psys, "event=0x05");

RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-pkg.unit  ,   rapl_pkg_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-ram.unit  ,   rapl_ram_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-gpu.unit  ,   rapl_gpu_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-psys.unit,   rapl_psys_unit, "Joules");

/*
 * we compute in units of 2^-32 Joules (~0.23 nJ) regardless of the
 * MSR's native unit
 */
RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-pkg.scale,     rapl_pkg_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-ram.scale,     rapl_ram_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-gpu.scale,     rapl_gpu_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-psys.scale,   rapl_psys_scale, "2.3283064365386962890625e-10");

/*
 * There are no default events, but we need to create
 * "events" group (with empty attrs) before updating
 * it with detected events.
 */
static struct attribute *attrs_empty[] = {
	NULL,
};

static struct attribute_group rapl_pmu_events_group = {
	.name = "events",
	.attrs = attrs_empty,
};

PMU_FORMAT_ATTR(event, "config:0-7");
static struct attribute *rapl_formats_attr[] = {
	&format_attr_event.attr,
	NULL,
};

static struct attribute_group rapl_pmu_format_group = {
	.name = "format",
	.attrs = rapl_formats_attr,
};

static const struct attribute_group *rapl_attr_groups[] = {
	&rapl_pmu_attr_group,
	&rapl_pmu_format_group,
	&rapl_pmu_events_group,
	NULL,
};
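/*
 * Once the PMU is registered (as "power", see rapl_pmu_init()), these
 * groups show up under /sys/bus/event_source/devices/power/, so a
 * detected event can be counted system-wide with e.g.:
 *
 *	perf stat -a -e power/energy-pkg/ -- sleep 1
 */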

static struct attribute *rapl_events_cores[] = {
	EVENT_PTR(rapl_cores),
	EVENT_PTR(rapl_cores_unit),
	EVENT_PTR(rapl_cores_scale),
	NULL,
};

static struct attribute_group rapl_events_cores_group = {
	.name  = "events",
	.attrs = rapl_events_cores,
};

static struct attribute *rapl_events_pkg[] = {
	EVENT_PTR(rapl_pkg),
	EVENT_PTR(rapl_pkg_unit),
	EVENT_PTR(rapl_pkg_scale),
	NULL,
};

static struct attribute_group rapl_events_pkg_group = {
	.name  = "events",
	.attrs = rapl_events_pkg,
};

static struct attribute *rapl_events_ram[] = {
	EVENT_PTR(rapl_ram),
	EVENT_PTR(rapl_ram_unit),
	EVENT_PTR(rapl_ram_scale),
	NULL,
};

static struct attribute_group rapl_events_ram_group = {
	.name  = "events",
	.attrs = rapl_events_ram,
};

static struct attribute *rapl_events_gpu[] = {
	EVENT_PTR(rapl_gpu),
	EVENT_PTR(rapl_gpu_unit),
	EVENT_PTR(rapl_gpu_scale),
	NULL,
};

static struct attribute_group rapl_events_gpu_group = {
	.name  = "events",
	.attrs = rapl_events_gpu,
};

static struct attribute *rapl_events_psys[] = {
	EVENT_PTR(rapl_psys),
	EVENT_PTR(rapl_psys_unit),
	EVENT_PTR(rapl_psys_scale),
	NULL,
};

static struct attribute_group rapl_events_psys_group = {
	.name  = "events",
	.attrs = rapl_events_psys,
};

static bool test_msr(int idx, void *data)
{
	return test_bit(idx, (unsigned long *) data);
}

/* Only the lower 32 bits of the MSR represent the energy counter */
#define RAPL_MSR_MASK 0xFFFFFFFF
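
/*
 * Each entry below pairs an energy status MSR with its sysfs event
 * group. The initializer fields are, in order: MSR address, attribute
 * group, availability test, no_check flag, and counter mask (see
 * struct perf_msr in probe.h).
 */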

static struct perf_msr intel_rapl_msrs[] = {
	[PERF_RAPL_PP0]  = { MSR_PP0_ENERGY_STATUS,      &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PKG]  = { MSR_PKG_ENERGY_STATUS,      &rapl_events_pkg_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_RAM]  = { MSR_DRAM_ENERGY_STATUS,     &rapl_events_ram_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PP1]  = { MSR_PP1_ENERGY_STATUS,      &rapl_events_gpu_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group,  test_msr, false, RAPL_MSR_MASK },
};

static struct perf_msr intel_rapl_spr_msrs[] = {
	[PERF_RAPL_PP0]  = { MSR_PP0_ENERGY_STATUS,      &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PKG]  = { MSR_PKG_ENERGY_STATUS,      &rapl_events_pkg_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_RAM]  = { MSR_DRAM_ENERGY_STATUS,     &rapl_events_ram_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PP1]  = { MSR_PP1_ENERGY_STATUS,      &rapl_events_gpu_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group,  test_msr, true, RAPL_MSR_MASK },
};

/*
 * Force to PERF_RAPL_MAX size due to:
 * - perf_msr_probe(PERF_RAPL_MAX)
 * - want to use same event codes across both architectures
 */
static struct perf_msr amd_rapl_msrs[] = {
	[PERF_RAPL_PP0]  = { 0, &rapl_events_cores_group, 0, false, 0 },
	[PERF_RAPL_PKG]  = { MSR_AMD_PKG_ENERGY_STATUS,  &rapl_events_pkg_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_RAM]  = { 0, &rapl_events_ram_group,   0, false, 0 },
	[PERF_RAPL_PP1]  = { 0, &rapl_events_gpu_group,   0, false, 0 },
	[PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group,  0, false, 0 },
};

static int rapl_cpu_offline(unsigned int cpu)
{
	struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
	int target;

	/* Check if exiting cpu is used for collecting rapl events */
	if (!cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask))
		return 0;

	pmu->cpu = -1;
	/* Find a new cpu to collect rapl events */
	target = cpumask_any_but(topology_die_cpumask(cpu), cpu);

	/* Migrate rapl events to the new target */
	if (target < nr_cpu_ids) {
		cpumask_set_cpu(target, &rapl_cpu_mask);
		pmu->cpu = target;
		perf_pmu_migrate_context(pmu->pmu, cpu, target);
	}
	return 0;
}

static int rapl_cpu_online(unsigned int cpu)
{
	struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
	int target;

	if (!pmu) {
		pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
		if (!pmu)
			return -ENOMEM;

		raw_spin_lock_init(&pmu->lock);
		INIT_LIST_HEAD(&pmu->active_list);
		pmu->pmu = &rapl_pmus->pmu;
		pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
		rapl_hrtimer_init(pmu);

		rapl_pmus->pmus[topology_logical_die_id(cpu)] = pmu;
	}

	/*
	 * Check if there is an online cpu in the package which collects rapl
	 * events already.
	 */
	target = cpumask_any_and(&rapl_cpu_mask, topology_die_cpumask(cpu));
	if (target < nr_cpu_ids)
		return 0;

	cpumask_set_cpu(cpu, &rapl_cpu_mask);
	pmu->cpu = cpu;
	return 0;
}

static int rapl_check_hw_unit(struct rapl_model *rm)
{
	u64 msr_rapl_power_unit_bits;
	int i;

	/* protect rdmsrl() to handle virtualization */
	if (rdmsrl_safe(rm->msr_power_unit, &msr_rapl_power_unit_bits))
		return -1;
	for (i = 0; i < NR_RAPL_DOMAINS; i++)
		rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;

	switch (rm->unit_quirk) {
	/*
	 * The DRAM domain on HSW servers and KNL has a fixed energy unit
	 * which can differ from the unit reported by the power unit MSR.
	 * See "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families,
	 * V2 of 2, Datasheet, September 2014, Reference Number: 330784-001".
	 */
	case RAPL_UNIT_QUIRK_INTEL_HSW:
		rapl_hw_unit[PERF_RAPL_RAM] = 16;
		break;
	/* SPR uses a fixed energy unit for the Psys domain. */
	case RAPL_UNIT_QUIRK_INTEL_SPR:
		rapl_hw_unit[PERF_RAPL_PSYS] = 0;
		break;
	default:
		break;
	}

	/*
	 * Calculate the timer rate:
	 * Use a reference of 200W for scaling the timeout to avoid counter
	 * overflows (200W = 200 Joules/sec).
	 * Divide the interval by 2 to avoid lockstep (2 * 100).
	 * If the hw unit is 32, then we use 2 ms: 1/200/2.
	 */
	rapl_timer_ms = 2;
	if (rapl_hw_unit[0] < 32) {
		rapl_timer_ms = (1000 / (2 * 100));
		rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1));
	}
	return 0;
}
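/*
 * Worked example: with rapl_hw_unit[0] == 16, the counter advances at
 * most 200 J/s * 2^16 ticks/J ~= 1.31e7 ticks/s at the 200W reference,
 * so the 32-bit counter wraps after ~327s; rapl_timer_ms becomes
 * 5 * (1 << 15) = 163840 ms, roughly half that wrap interval.
 */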

static void __init rapl_advertise(void)
{
	int i;

	pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n",
		hweight32(rapl_cntr_mask), rapl_timer_ms);

	for (i = 0; i < NR_RAPL_DOMAINS; i++) {
		if (rapl_cntr_mask & (1 << i)) {
			pr_info("hw unit of domain %s 2^-%d Joules\n",
				rapl_domain_names[i], rapl_hw_unit[i]);
		}
	}
}

static void cleanup_rapl_pmus(void)
{
	int i;

	for (i = 0; i < rapl_pmus->maxdie; i++)
		kfree(rapl_pmus->pmus[i]);
	kfree(rapl_pmus);
}

static const struct attribute_group *rapl_attr_update[] = {
	&rapl_events_cores_group,
	&rapl_events_pkg_group,
	&rapl_events_ram_group,
	&rapl_events_gpu_group,
	&rapl_events_psys_group,
	NULL,
};

static int __init init_rapl_pmus(void)
{
	int maxdie = topology_max_packages() * topology_max_die_per_package();
	size_t size;

	size = sizeof(*rapl_pmus) + maxdie * sizeof(struct rapl_pmu *);
	rapl_pmus = kzalloc(size, GFP_KERNEL);
	if (!rapl_pmus)
		return -ENOMEM;

	rapl_pmus->maxdie		= maxdie;
	rapl_pmus->pmu.attr_groups	= rapl_attr_groups;
	rapl_pmus->pmu.attr_update	= rapl_attr_update;
	rapl_pmus->pmu.task_ctx_nr	= perf_invalid_context;
	rapl_pmus->pmu.event_init	= rapl_pmu_event_init;
	rapl_pmus->pmu.add		= rapl_pmu_event_add;
	rapl_pmus->pmu.del		= rapl_pmu_event_del;
	rapl_pmus->pmu.start		= rapl_pmu_event_start;
	rapl_pmus->pmu.stop		= rapl_pmu_event_stop;
	rapl_pmus->pmu.read		= rapl_pmu_event_read;
	rapl_pmus->pmu.module		= THIS_MODULE;
	rapl_pmus->pmu.capabilities	= PERF_PMU_CAP_NO_EXCLUDE;
	return 0;
}

static struct rapl_model model_snb = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_PP1),
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_msrs,
};

static struct rapl_model model_snbep = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM),
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_msrs,
};

static struct rapl_model model_hsw = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM) |
			  BIT(PERF_RAPL_PP1),
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_msrs,
};

static struct rapl_model model_hsx = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM),
	.unit_quirk	= RAPL_UNIT_QUIRK_INTEL_HSW,
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_msrs,
};

static struct rapl_model model_knl = {
	.events		= BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM),
	.unit_quirk	= RAPL_UNIT_QUIRK_INTEL_HSW,
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_msrs,
};

static struct rapl_model model_skl = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM) |
			  BIT(PERF_RAPL_PP1) |
			  BIT(PERF_RAPL_PSYS),
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_msrs,
};

static struct rapl_model model_spr = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM) |
			  BIT(PERF_RAPL_PSYS),
	.unit_quirk	= RAPL_UNIT_QUIRK_INTEL_SPR,
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_spr_msrs,
};

static struct rapl_model model_amd_hygon = {
	.events		= BIT(PERF_RAPL_PKG),
	.msr_power_unit = MSR_AMD_RAPL_POWER_UNIT,
	.rapl_msrs      = amd_rapl_msrs,
};

static const struct x86_cpu_id rapl_model_match[] __initconst = {
	X86_MATCH_FEATURE(X86_FEATURE_RAPL,		&model_amd_hygon),
	X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE,		&model_snb),
	X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X,	&model_snbep),
	X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE,		&model_snb),
	X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X,		&model_snbep),
	X86_MATCH_INTEL_FAM6_MODEL(HASWELL,		&model_hsw),
	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X,		&model_hsx),
	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_L,		&model_hsw),
	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_G,		&model_hsw),
	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL,		&model_hsw),
	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_G,		&model_hsw),
	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X,		&model_hsx),
	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D,		&model_hsx),
	X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL,	&model_knl),
	X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM,	&model_knl),
	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_L,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X,		&model_hsx),
	X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(CANNONLAKE_L,	&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT,	&model_hsw),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_D,	&model_hsw),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS,	&model_hsw),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D,		&model_hsx),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X,		&model_hsx),
	X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT,	&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X,	&model_spr),
	X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X,	&model_spr),
	X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P,	&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S,	&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L,	&model_skl),
	{},
};
MODULE_DEVICE_TABLE(x86cpu, rapl_model_match);

static int __init rapl_pmu_init(void)
{
	const struct x86_cpu_id *id;
	struct rapl_model *rm;
	int ret;

	id = x86_match_cpu(rapl_model_match);
	if (!id)
		return -ENODEV;

	rm = (struct rapl_model *) id->driver_data;

	rapl_msrs = rm->rapl_msrs;

	rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX,
					false, (void *) &rm->events);

	ret = rapl_check_hw_unit(rm);
	if (ret)
		return ret;

	ret = init_rapl_pmus();
	if (ret)
		return ret;

	/*
	 * Install callbacks. Core will call them for each online cpu.
	 */
	ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_RAPL_ONLINE,
				"perf/x86/rapl:online",
				rapl_cpu_online, rapl_cpu_offline);
	if (ret)
		goto out;

	ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1);
	if (ret)
		goto out1;

	rapl_advertise();
	return 0;

out1:
	cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE);
out:
	pr_warn("Initialization failed (%d), disabled\n", ret);
	cleanup_rapl_pmus();
	return ret;
}
module_init(rapl_pmu_init);

static void __exit intel_rapl_exit(void)
{
	cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE);
	perf_pmu_unregister(&rapl_pmus->pmu);
	cleanup_rapl_pmus();
}
module_exit(intel_rapl_exit);