xref: /openbmc/linux/drivers/powercap/intel_rapl_common.c (revision 4ebdac060e5e24a89a7b3ec33ec46a41621e57fe)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Common code for Intel Running Average Power Limit (RAPL) support.
4  * Copyright (c) 2019, Intel Corporation.
5  */
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7 
8 #include <linux/cleanup.h>
9 #include <linux/kernel.h>
10 #include <linux/module.h>
11 #include <linux/list.h>
12 #include <linux/types.h>
13 #include <linux/device.h>
14 #include <linux/slab.h>
15 #include <linux/log2.h>
16 #include <linux/bitmap.h>
17 #include <linux/delay.h>
18 #include <linux/sysfs.h>
19 #include <linux/cpu.h>
20 #include <linux/powercap.h>
21 #include <linux/suspend.h>
22 #include <linux/intel_rapl.h>
23 #include <linux/processor.h>
24 #include <linux/platform_device.h>
25 
26 #include <asm/iosf_mbi.h>
27 #include <asm/cpu_device_id.h>
28 #include <asm/intel-family.h>
29 
30 /* bitmasks for RAPL MSRs, used by primitive access functions */
31 #define ENERGY_STATUS_MASK      0xffffffff
32 
33 #define POWER_LIMIT1_MASK       0x7FFF
34 #define POWER_LIMIT1_ENABLE     BIT(15)
35 #define POWER_LIMIT1_CLAMP      BIT(16)
36 
37 #define POWER_LIMIT2_MASK       (0x7FFFULL<<32)
38 #define POWER_LIMIT2_ENABLE     BIT_ULL(47)
39 #define POWER_LIMIT2_CLAMP      BIT_ULL(48)
40 #define POWER_HIGH_LOCK         BIT_ULL(63)
41 #define POWER_LOW_LOCK          BIT(31)
42 
43 #define POWER_LIMIT4_MASK		0x1FFF
44 
45 #define TIME_WINDOW1_MASK       (0x7FULL<<17)
46 #define TIME_WINDOW2_MASK       (0x7FULL<<49)
47 
48 #define POWER_UNIT_OFFSET	0
49 #define POWER_UNIT_MASK		0x0F
50 
51 #define ENERGY_UNIT_OFFSET	0x08
52 #define ENERGY_UNIT_MASK	0x1F00
53 
54 #define TIME_UNIT_OFFSET	0x10
55 #define TIME_UNIT_MASK		0xF0000
56 
57 #define POWER_INFO_MAX_MASK     (0x7fffULL<<32)
58 #define POWER_INFO_MIN_MASK     (0x7fffULL<<16)
59 #define POWER_INFO_MAX_TIME_WIN_MASK     (0x3fULL<<48)
60 #define POWER_INFO_THERMAL_SPEC_MASK     0x7fff
61 
62 #define PERF_STATUS_THROTTLE_TIME_MASK 0xffffffff
63 #define PP_POLICY_MASK         0x1F
64 
65 /*
66  * SPR has different layout for Psys Domain PowerLimit registers.
67  * There are 17 bits of PL1 and PL2 instead of 15 bits.
68  * The Enable bits and TimeWindow bits are also shifted as a result.
69  */
70 #define PSYS_POWER_LIMIT1_MASK       0x1FFFF
71 #define PSYS_POWER_LIMIT1_ENABLE     BIT(17)
72 
73 #define PSYS_POWER_LIMIT2_MASK       (0x1FFFFULL<<32)
74 #define PSYS_POWER_LIMIT2_ENABLE     BIT_ULL(49)
75 
76 #define PSYS_TIME_WINDOW1_MASK       (0x7FULL<<19)
77 #define PSYS_TIME_WINDOW2_MASK       (0x7FULL<<51)
78 
79 /* bitmasks for RAPL TPMI, used by primitive access functions */
80 #define TPMI_POWER_LIMIT_MASK	0x3FFFF
81 #define TPMI_POWER_LIMIT_ENABLE	BIT_ULL(62)
82 #define TPMI_TIME_WINDOW_MASK	(0x7FULL<<18)
83 #define TPMI_INFO_SPEC_MASK	0x3FFFF
84 #define TPMI_INFO_MIN_MASK	(0x3FFFFULL << 18)
85 #define TPMI_INFO_MAX_MASK	(0x3FFFFULL << 36)
86 #define TPMI_INFO_MAX_TIME_WIN_MASK	(0x7FULL << 54)
87 
88 /* Non HW constants */
89 #define RAPL_PRIMITIVE_DERIVED       BIT(1)	/* not from raw data */
90 #define RAPL_PRIMITIVE_DUMMY         BIT(2)
91 
92 #define TIME_WINDOW_MAX_MSEC 40000
93 #define TIME_WINDOW_MIN_MSEC 250
94 #define ENERGY_UNIT_SCALE    1000	/* scale from driver unit to powercap unit */
/* Unit class of a primitive; selects the conversion done by rapl_unit_xlate(). */
enum unit_type {
	ARBITRARY_UNIT = 0,	/* no translation */
	POWER_UNIT = 1,
	ENERGY_UNIT = 2,
	TIME_UNIT = 3,
};
101 
102 /* per domain data, some are optional */
103 #define NR_RAW_PRIMITIVES (NR_RAPL_PRIMITIVES - 2)
104 
105 #define	DOMAIN_STATE_INACTIVE           BIT(0)
106 #define	DOMAIN_STATE_POWER_LIMIT_SET    BIT(1)
107 
108 static const char *pl_names[NR_POWER_LIMITS] = {
109 	[POWER_LIMIT1] = "long_term",
110 	[POWER_LIMIT2] = "short_term",
111 	[POWER_LIMIT4] = "peak_power",
112 };
113 
/*
 * Per-power-limit attributes. get_pl_prim() maps a (power limit, attribute)
 * pair onto the matching RAPL primitive.
 */
enum pl_prims {
	PL_ENABLE = 0,
	PL_CLAMP = 1,
	PL_LIMIT = 2,
	PL_TIME_WINDOW = 3,
	PL_MAX_POWER = 4,
	PL_LOCK = 5,
};
122 
is_pl_valid(struct rapl_domain * rd,int pl)123 static bool is_pl_valid(struct rapl_domain *rd, int pl)
124 {
125 	if (pl < POWER_LIMIT1 || pl > POWER_LIMIT4)
126 		return false;
127 	return rd->rpl[pl].name ? true : false;
128 }
129 
get_pl_lock_prim(struct rapl_domain * rd,int pl)130 static int get_pl_lock_prim(struct rapl_domain *rd, int pl)
131 {
132 	if (rd->rp->priv->type == RAPL_IF_TPMI) {
133 		if (pl == POWER_LIMIT1)
134 			return PL1_LOCK;
135 		if (pl == POWER_LIMIT2)
136 			return PL2_LOCK;
137 		if (pl == POWER_LIMIT4)
138 			return PL4_LOCK;
139 	}
140 
141 	/* MSR/MMIO Interface doesn't have Lock bit for PL4 */
142 	if (pl == POWER_LIMIT4)
143 		return -EINVAL;
144 
145 	/*
146 	 * Power Limit register that supports two power limits has a different
147 	 * bit position for the Lock bit.
148 	 */
149 	if (rd->rp->priv->limits[rd->id] & BIT(POWER_LIMIT2))
150 		return FW_HIGH_LOCK;
151 	return FW_LOCK;
152 }
153 
get_pl_prim(struct rapl_domain * rd,int pl,enum pl_prims prim)154 static int get_pl_prim(struct rapl_domain *rd, int pl, enum pl_prims prim)
155 {
156 	switch (pl) {
157 	case POWER_LIMIT1:
158 		if (prim == PL_ENABLE)
159 			return PL1_ENABLE;
160 		if (prim == PL_CLAMP && rd->rp->priv->type != RAPL_IF_TPMI)
161 			return PL1_CLAMP;
162 		if (prim == PL_LIMIT)
163 			return POWER_LIMIT1;
164 		if (prim == PL_TIME_WINDOW)
165 			return TIME_WINDOW1;
166 		if (prim == PL_MAX_POWER)
167 			return THERMAL_SPEC_POWER;
168 		if (prim == PL_LOCK)
169 			return get_pl_lock_prim(rd, pl);
170 		return -EINVAL;
171 	case POWER_LIMIT2:
172 		if (prim == PL_ENABLE)
173 			return PL2_ENABLE;
174 		if (prim == PL_CLAMP && rd->rp->priv->type != RAPL_IF_TPMI)
175 			return PL2_CLAMP;
176 		if (prim == PL_LIMIT)
177 			return POWER_LIMIT2;
178 		if (prim == PL_TIME_WINDOW)
179 			return TIME_WINDOW2;
180 		if (prim == PL_MAX_POWER)
181 			return MAX_POWER;
182 		if (prim == PL_LOCK)
183 			return get_pl_lock_prim(rd, pl);
184 		return -EINVAL;
185 	case POWER_LIMIT4:
186 		if (prim == PL_LIMIT)
187 			return POWER_LIMIT4;
188 		if (prim == PL_ENABLE)
189 			return PL4_ENABLE;
190 		/* PL4 would be around two times PL2, use same prim as PL2. */
191 		if (prim == PL_MAX_POWER)
192 			return MAX_POWER;
193 		if (prim == PL_LOCK)
194 			return get_pl_lock_prim(rd, pl);
195 		return -EINVAL;
196 	default:
197 		return -EINVAL;
198 	}
199 }
200 
201 #define power_zone_to_rapl_domain(_zone) \
202 	container_of(_zone, struct rapl_domain, power_zone)
203 
/*
 * Per-interface/platform defaults and hooks. Code in this file reaches the
 * instance in use through get_defaults(), i.e. rp->priv->defaults.
 */
204 struct rapl_defaults {
	/* NOTE(review): presumably the floor frequency register address used by set_floor_freq - confirm */
205 	u8 floor_freq_reg_addr;
	/* unit detection hook for a domain; its call site is not in this part of the file */
206 	int (*check_unit)(struct rapl_domain *rd);
	/* optional (may be NULL); set_domain_enable() calls it once the requested enable state is confirmed */
207 	void (*set_floor_freq)(struct rapl_domain *rd, bool mode);
	/* TIME_UNIT conversion; rapl_unit_xlate() calls it unconditionally for time values */
208 	u64 (*compute_time_window)(struct rapl_domain *rd, u64 val,
209 				    bool to_raw);
	/* NOTE(review): presumably fixed energy units overriding the detected one for DRAM/psys - confirm against users */
210 	unsigned int dram_domain_energy_unit;
211 	unsigned int psys_domain_energy_unit;
	/* when set, prim_fixups() redirects platform-domain primitives to the PSYS_* variants */
212 	bool spr_psys_bits;
213 };
214 static struct rapl_defaults *defaults_msr;
215 static const struct rapl_defaults defaults_tpmi;
216 
get_defaults(struct rapl_package * rp)217 static struct rapl_defaults *get_defaults(struct rapl_package *rp)
218 {
219 	return rp->priv->defaults;
220 }
221 
222 /* Sideband MBI registers */
223 #define IOSF_CPU_POWER_BUDGET_CTL_BYT (0x2)
224 #define IOSF_CPU_POWER_BUDGET_CTL_TNG (0xdf)
225 
226 #define PACKAGE_PLN_INT_SAVED   BIT(0)
227 #define MAX_PRIM_NAME (32)
228 
229 /* Per domain data, used to describe individual knobs such that the access
230  * functions can be consolidated into one instead of many inline functions.
231  */
232 struct rapl_primitive_info {
	/* stringified primitive name (see PRIMITIVE_INFO_INIT); a NULL name makes read/write return -EINVAL */
233 	const char *name;
	/* bit mask of the field inside the register */
234 	u64 mask;
	/* bit offset of the field: reads shift right by it, writes shift left */
235 	int shift;
	/* index into rd->regs[] selecting the register that holds the field */
236 	enum rapl_domain_reg_id id;
	/* unit class handed to rapl_unit_xlate() */
237 	enum unit_type unit;
	/* RAPL_PRIMITIVE_DERIVED and/or RAPL_PRIMITIVE_DUMMY, 0 for plain hardware fields */
238 	u32 flag;
239 };
240 
241 #define PRIMITIVE_INFO_INIT(p, m, s, i, u, f) {	\
242 		.name = #p,			\
243 		.mask = m,			\
244 		.shift = s,			\
245 		.id = i,			\
246 		.unit = u,			\
247 		.flag = f			\
248 	}
249 
250 static void rapl_init_domains(struct rapl_package *rp);
251 static int rapl_read_data_raw(struct rapl_domain *rd,
252 			      enum rapl_primitives prim,
253 			      bool xlate, u64 *data);
254 static int rapl_write_data_raw(struct rapl_domain *rd,
255 			       enum rapl_primitives prim,
256 			       unsigned long long value);
257 static int rapl_read_pl_data(struct rapl_domain *rd, int pl,
258 			      enum pl_prims pl_prim,
259 			      bool xlate, u64 *data);
260 static int rapl_write_pl_data(struct rapl_domain *rd, int pl,
261 			       enum pl_prims pl_prim,
262 			       unsigned long long value);
263 static u64 rapl_unit_xlate(struct rapl_domain *rd,
264 			   enum unit_type type, u64 value, int to_raw);
265 static void package_power_limit_irq_save(struct rapl_package *rp);
266 
267 static LIST_HEAD(rapl_packages);	/* guarded by CPU hotplug lock */
268 
/*
 * Zone name per domain. rapl_init_domains() indexes this table with the
 * domain type, so the entries follow the rapl_domain_type order.
 */
static const char *const rapl_domain_names[] = {
	[0] = "package",
	[1] = "core",
	[2] = "uncore",
	[3] = "dram",
	[4] = "psys",
};
276 
/*
 * powercap get_energy_uj callback: read the translated ENERGY_COUNTER
 * primitive of the zone's domain into @energy_raw.
 *
 * Returns 0 on success, -EIO when the primitive cannot be read (in which
 * case @energy_raw is left untouched).
 */
static int get_energy_counter(struct powercap_zone *power_zone,
			      u64 *energy_raw)
{
	struct rapl_domain *rd;
	u64 counter;
	int ret;

	/* prevent CPU hotplug, make sure the RAPL domain does not go
	 * away while reading the counter.
	 */
	cpus_read_lock();
	rd = power_zone_to_rapl_domain(power_zone);

	if (rapl_read_data_raw(rd, ENERGY_COUNTER, true, &counter)) {
		ret = -EIO;
	} else {
		*energy_raw = counter;
		ret = 0;
	}
	cpus_read_unlock();

	return ret;
}
299 
/*
 * powercap get_max_energy_range_uj callback: report the largest raw counter
 * value (ENERGY_STATUS_MASK) translated with the domain's energy unit.
 * Always returns 0.
 */
static int get_max_energy_counter(struct powercap_zone *pcd_dev, u64 *energy)
{
	*energy = rapl_unit_xlate(power_zone_to_rapl_domain(pcd_dev),
				  ENERGY_UNIT, ENERGY_STATUS_MASK, 0);

	return 0;
}
307 
release_zone(struct powercap_zone * power_zone)308 static int release_zone(struct powercap_zone *power_zone)
309 {
310 	struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
311 	struct rapl_package *rp = rd->rp;
312 
313 	/* package zone is the last zone of a package, we can free
314 	 * memory here since all children has been unregistered.
315 	 */
316 	if (rd->id == RAPL_DOMAIN_PACKAGE) {
317 		kfree(rd);
318 		rp->domains = NULL;
319 	}
320 
321 	return 0;
322 
323 }
324 
find_nr_power_limit(struct rapl_domain * rd)325 static int find_nr_power_limit(struct rapl_domain *rd)
326 {
327 	int i, nr_pl = 0;
328 
329 	for (i = 0; i < NR_POWER_LIMITS; i++) {
330 		if (is_pl_valid(rd, i))
331 			nr_pl++;
332 	}
333 
334 	return nr_pl;
335 }
336 
/*
 * powercap set_enable callback: write the PL1 enable bit of the zone's
 * domain and read it back.
 *
 * Returns the error of the write or of the read-back. When both succeed
 * but the bit did not take the requested value, this is only logged and 0
 * is still returned; the floor frequency hook is then skipped.
 */
static int set_domain_enable(struct powercap_zone *power_zone, bool mode)
{
	struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
	struct rapl_defaults *defaults = get_defaults(rd->rp);
	u64 readback;
	int ret;

	cpus_read_lock();

	ret = rapl_write_pl_data(rd, POWER_LIMIT1, PL_ENABLE, mode);
	if (!ret)
		ret = rapl_read_pl_data(rd, POWER_LIMIT1, PL_ENABLE, false,
					&readback);

	if (!ret) {
		if (mode != readback)
			pr_debug("%s cannot be %s\n", power_zone->name,
				 str_enabled_disabled(mode));
		else if (defaults->set_floor_freq)
			defaults->set_floor_freq(rd, mode);
	}

	cpus_read_unlock();

	return ret;
}
367 
/*
 * powercap get_enable callback: report the PL1 enable bit of the zone's
 * domain in @mode. A locked PL1 is reported as disabled without touching
 * the hardware. Returns 0 or the error of the read (@mode untouched then).
 */
static int get_domain_enable(struct powercap_zone *power_zone, bool *mode)
{
	struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
	u64 enabled;
	int err;

	if (rd->rpl[POWER_LIMIT1].locked) {
		*mode = false;
		return 0;
	}

	cpus_read_lock();
	err = rapl_read_pl_data(rd, POWER_LIMIT1, PL_ENABLE, true, &enabled);
	if (err == 0)
		*mode = enabled;
	cpus_read_unlock();

	return err;
}
386 
387 /* per RAPL domain ops, in the order of rapl_domain_type */
388 static const struct powercap_zone_ops zone_ops[] = {
389 	/* RAPL_DOMAIN_PACKAGE */
390 	{
391 	 .get_energy_uj = get_energy_counter,
392 	 .get_max_energy_range_uj = get_max_energy_counter,
393 	 .release = release_zone,
394 	 .set_enable = set_domain_enable,
395 	 .get_enable = get_domain_enable,
396 	 },
397 	/* RAPL_DOMAIN_PP0 */
398 	{
399 	 .get_energy_uj = get_energy_counter,
400 	 .get_max_energy_range_uj = get_max_energy_counter,
401 	 .release = release_zone,
402 	 .set_enable = set_domain_enable,
403 	 .get_enable = get_domain_enable,
404 	 },
405 	/* RAPL_DOMAIN_PP1 */
406 	{
407 	 .get_energy_uj = get_energy_counter,
408 	 .get_max_energy_range_uj = get_max_energy_counter,
409 	 .release = release_zone,
410 	 .set_enable = set_domain_enable,
411 	 .get_enable = get_domain_enable,
412 	 },
413 	/* RAPL_DOMAIN_DRAM */
414 	{
415 	 .get_energy_uj = get_energy_counter,
416 	 .get_max_energy_range_uj = get_max_energy_counter,
417 	 .release = release_zone,
418 	 .set_enable = set_domain_enable,
419 	 .get_enable = get_domain_enable,
420 	 },
421 	/* RAPL_DOMAIN_PLATFORM */
422 	{
423 	 .get_energy_uj = get_energy_counter,
424 	 .get_max_energy_range_uj = get_max_energy_counter,
425 	 .release = release_zone,
426 	 .set_enable = set_domain_enable,
427 	 .get_enable = get_domain_enable,
428 	 },
429 };
430 
431 /*
432  * Constraint index used by powercap can be different than power limit (PL)
433  * index in that some  PLs maybe missing due to non-existent MSRs. So we
434  * need to convert here by finding the valid PLs only (name populated).
435  */
contraint_to_pl(struct rapl_domain * rd,int cid)436 static int contraint_to_pl(struct rapl_domain *rd, int cid)
437 {
438 	int i, j;
439 
440 	for (i = POWER_LIMIT1, j = 0; i < NR_POWER_LIMITS; i++) {
441 		if (is_pl_valid(rd, i) && j++ == cid) {
442 			pr_debug("%s: index %d\n", __func__, i);
443 			return i;
444 		}
445 	}
446 	pr_err("Cannot find matching power limit for constraint %d\n", cid);
447 
448 	return -EINVAL;
449 }
450 
/*
 * powercap set_power_limit_uw callback: write @power_limit to the power
 * limit selected by constraint @cid. After a successful write the package
 * power limit interrupt state is saved.
 *
 * An unknown @cid yields a negative PL index, which rapl_write_pl_data()
 * rejects with -EINVAL.
 */
static int set_power_limit(struct powercap_zone *power_zone, int cid,
			   u64 power_limit)
{
	struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
	int pl;
	int err;

	cpus_read_lock();
	pl = contraint_to_pl(rd, cid);
	err = rapl_write_pl_data(rd, pl, PL_LIMIT, power_limit);
	if (err == 0)
		package_power_limit_irq_save(rd->rp);
	cpus_read_unlock();

	return err;
}
470 
/*
 * powercap get_power_limit_uw callback: read the translated limit of the
 * power limit selected by constraint @cid into @data. Returns 0 or the
 * error of the read; @data is only written on success.
 */
static int get_current_power_limit(struct powercap_zone *power_zone, int cid,
				   u64 *data)
{
	struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
	u64 limit;
	int pl;
	int err;

	cpus_read_lock();
	pl = contraint_to_pl(rd, cid);
	err = rapl_read_pl_data(rd, pl, PL_LIMIT, true, &limit);
	if (err == 0)
		*data = limit;
	cpus_read_unlock();

	return err;
}
491 
/*
 * powercap set_time_window_us callback: write @window to the time window
 * of the power limit selected by constraint @cid. Returns the result of
 * rapl_write_pl_data().
 */
static int set_time_window(struct powercap_zone *power_zone, int cid,
			   u64 window)
{
	struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
	int err;

	cpus_read_lock();
	err = rapl_write_pl_data(rd, contraint_to_pl(rd, cid), PL_TIME_WINDOW,
				 window);
	cpus_read_unlock();

	return err;
}
508 
/*
 * powercap get_time_window_us callback: read the translated time window of
 * the power limit selected by constraint @cid into @data. Returns 0 or the
 * error of the read; @data is only written on success.
 */
static int get_time_window(struct powercap_zone *power_zone, int cid,
			   u64 *data)
{
	struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
	u64 window;
	int pl;
	int err;

	cpus_read_lock();
	pl = contraint_to_pl(rd, cid);
	err = rapl_read_pl_data(rd, pl, PL_TIME_WINDOW, true, &window);
	if (err == 0)
		*data = window;
	cpus_read_unlock();

	return err;
}
529 
get_constraint_name(struct powercap_zone * power_zone,int cid)530 static const char *get_constraint_name(struct powercap_zone *power_zone,
531 				       int cid)
532 {
533 	struct rapl_domain *rd;
534 	int id;
535 
536 	rd = power_zone_to_rapl_domain(power_zone);
537 	id = contraint_to_pl(rd, cid);
538 	if (id >= 0)
539 		return rd->rpl[id].name;
540 
541 	return NULL;
542 }
543 
/*
 * powercap get_max_power_uw callback: read the translated maximum power of
 * the power limit selected by constraint @cid into @data.
 *
 * Returns 0 or the error of the read. @data is written only on success; in
 * particular the PL4 doubling is no longer applied to the caller's
 * (possibly uninitialized) value when the read failed.
 */
static int get_max_power(struct powercap_zone *power_zone, int cid, u64 *data)
{
	struct rapl_domain *rd;
	u64 val;
	int ret = 0;
	int id;

	cpus_read_lock();
	rd = power_zone_to_rapl_domain(power_zone);
	id = contraint_to_pl(rd, cid);

	ret = rapl_read_pl_data(rd, id, PL_MAX_POWER, true, &val);
	if (!ret) {
		/* As a generalization rule, PL4 would be around two times PL2. */
		if (id == POWER_LIMIT4)
			val *= 2;
		*data = val;
	}

	cpus_read_unlock();

	return ret;
}
567 
568 static const struct powercap_zone_constraint_ops constraint_ops = {
569 	.set_power_limit_uw = set_power_limit,
570 	.get_power_limit_uw = get_current_power_limit,
571 	.set_time_window_us = set_time_window,
572 	.get_time_window_us = get_time_window,
573 	.get_max_power_uw = get_max_power,
574 	.get_name = get_constraint_name,
575 };
576 
577 /* Return the id used for read_raw/write_raw callback */
get_rid(struct rapl_package * rp)578 static int get_rid(struct rapl_package *rp)
579 {
580 	return rp->lead_cpu >= 0 ? rp->lead_cpu : rp->id;
581 }
582 
583 /* called after domain detection and package level data are set */
rapl_init_domains(struct rapl_package * rp)584 static void rapl_init_domains(struct rapl_package *rp)
585 {
586 	enum rapl_domain_type i;
587 	enum rapl_domain_reg_id j;
588 	struct rapl_domain *rd = rp->domains;
589 
590 	for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
591 		unsigned int mask = rp->domain_map & (1 << i);
592 		int t;
593 
594 		if (!mask)
595 			continue;
596 
597 		rd->rp = rp;
598 
599 		if (i == RAPL_DOMAIN_PLATFORM && rp->id > 0) {
600 			snprintf(rd->name, RAPL_DOMAIN_NAME_LENGTH, "psys-%d",
601 				rp->lead_cpu >= 0 ? topology_physical_package_id(rp->lead_cpu) :
602 				rp->id);
603 		} else {
604 			snprintf(rd->name, RAPL_DOMAIN_NAME_LENGTH, "%s",
605 				rapl_domain_names[i]);
606 		}
607 
608 		rd->id = i;
609 
610 		/* PL1 is supported by default */
611 		rp->priv->limits[i] |= BIT(POWER_LIMIT1);
612 
613 		for (t = POWER_LIMIT1; t < NR_POWER_LIMITS; t++) {
614 			if (rp->priv->limits[i] & BIT(t))
615 				rd->rpl[t].name = pl_names[t];
616 		}
617 
618 		for (j = 0; j < RAPL_DOMAIN_REG_MAX; j++)
619 			rd->regs[j] = rp->priv->regs[i][j];
620 
621 		rd++;
622 	}
623 }
624 
/*
 * Convert @value between raw register units and driver units.
 *
 * TIME_UNIT is delegated to the compute_time_window() hook; ARBITRARY_UNIT
 * and unknown types are returned unchanged. For POWER_UNIT and ENERGY_UNIT:
 *   to_raw:  value / unit * scale
 *   !to_raw: value * unit / scale
 * where scale is ENERGY_UNIT_SCALE for energy and 1 for power.
 */
static u64 rapl_unit_xlate(struct rapl_domain *rd, enum unit_type type,
			   u64 value, int to_raw)
{
	struct rapl_defaults *defaults = get_defaults(rd->rp);
	u64 unit;
	u64 scale;

	if (type == TIME_UNIT)
		return defaults->compute_time_window(rd, value, to_raw);

	if (type == POWER_UNIT) {
		unit = rd->power_unit;
		scale = 1;
	} else if (type == ENERGY_UNIT) {
		unit = rd->energy_unit;
		scale = ENERGY_UNIT_SCALE;
	} else {
		return value;
	}

	if (to_raw)
		return div64_u64(value, unit) * scale;

	return div64_u64(value * unit, scale);
}
654 
655 /* RAPL primitives for MSR and MMIO I/F */
656 static struct rapl_primitive_info rpi_msr[NR_RAPL_PRIMITIVES] = {
657 	/* name, mask, shift, msr index, unit divisor */
658 	[POWER_LIMIT1] = PRIMITIVE_INFO_INIT(POWER_LIMIT1, POWER_LIMIT1_MASK, 0,
659 			    RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
660 	[POWER_LIMIT2] = PRIMITIVE_INFO_INIT(POWER_LIMIT2, POWER_LIMIT2_MASK, 32,
661 			    RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
662 	[POWER_LIMIT4] = PRIMITIVE_INFO_INIT(POWER_LIMIT4, POWER_LIMIT4_MASK, 0,
663 				RAPL_DOMAIN_REG_PL4, POWER_UNIT, 0),
664 	[ENERGY_COUNTER] = PRIMITIVE_INFO_INIT(ENERGY_COUNTER, ENERGY_STATUS_MASK, 0,
665 			    RAPL_DOMAIN_REG_STATUS, ENERGY_UNIT, 0),
666 	[FW_LOCK] = PRIMITIVE_INFO_INIT(FW_LOCK, POWER_LOW_LOCK, 31,
667 			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
668 	[FW_HIGH_LOCK] = PRIMITIVE_INFO_INIT(FW_LOCK, POWER_HIGH_LOCK, 63,
669 			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
670 	[PL1_ENABLE] = PRIMITIVE_INFO_INIT(PL1_ENABLE, POWER_LIMIT1_ENABLE, 15,
671 			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
672 	[PL1_CLAMP] = PRIMITIVE_INFO_INIT(PL1_CLAMP, POWER_LIMIT1_CLAMP, 16,
673 			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
674 	[PL2_ENABLE] = PRIMITIVE_INFO_INIT(PL2_ENABLE, POWER_LIMIT2_ENABLE, 47,
675 			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
676 	[PL2_CLAMP] = PRIMITIVE_INFO_INIT(PL2_CLAMP, POWER_LIMIT2_CLAMP, 48,
677 			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
678 	[TIME_WINDOW1] = PRIMITIVE_INFO_INIT(TIME_WINDOW1, TIME_WINDOW1_MASK, 17,
679 			    RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0),
680 	[TIME_WINDOW2] = PRIMITIVE_INFO_INIT(TIME_WINDOW2, TIME_WINDOW2_MASK, 49,
681 			    RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0),
682 	[THERMAL_SPEC_POWER] = PRIMITIVE_INFO_INIT(THERMAL_SPEC_POWER, POWER_INFO_THERMAL_SPEC_MASK,
683 			    0, RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
684 	[MAX_POWER] = PRIMITIVE_INFO_INIT(MAX_POWER, POWER_INFO_MAX_MASK, 32,
685 			    RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
686 	[MIN_POWER] = PRIMITIVE_INFO_INIT(MIN_POWER, POWER_INFO_MIN_MASK, 16,
687 			    RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
688 	[MAX_TIME_WINDOW] = PRIMITIVE_INFO_INIT(MAX_TIME_WINDOW, POWER_INFO_MAX_TIME_WIN_MASK, 48,
689 			    RAPL_DOMAIN_REG_INFO, TIME_UNIT, 0),
690 	[THROTTLED_TIME] = PRIMITIVE_INFO_INIT(THROTTLED_TIME, PERF_STATUS_THROTTLE_TIME_MASK, 0,
691 			    RAPL_DOMAIN_REG_PERF, TIME_UNIT, 0),
692 	[PRIORITY_LEVEL] = PRIMITIVE_INFO_INIT(PRIORITY_LEVEL, PP_POLICY_MASK, 0,
693 			    RAPL_DOMAIN_REG_POLICY, ARBITRARY_UNIT, 0),
694 	[PSYS_POWER_LIMIT1] = PRIMITIVE_INFO_INIT(PSYS_POWER_LIMIT1, PSYS_POWER_LIMIT1_MASK, 0,
695 			    RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
696 	[PSYS_POWER_LIMIT2] = PRIMITIVE_INFO_INIT(PSYS_POWER_LIMIT2, PSYS_POWER_LIMIT2_MASK, 32,
697 			    RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
698 	[PSYS_PL1_ENABLE] = PRIMITIVE_INFO_INIT(PSYS_PL1_ENABLE, PSYS_POWER_LIMIT1_ENABLE, 17,
699 			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
700 	[PSYS_PL2_ENABLE] = PRIMITIVE_INFO_INIT(PSYS_PL2_ENABLE, PSYS_POWER_LIMIT2_ENABLE, 49,
701 			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
702 	[PSYS_TIME_WINDOW1] = PRIMITIVE_INFO_INIT(PSYS_TIME_WINDOW1, PSYS_TIME_WINDOW1_MASK, 19,
703 			    RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0),
704 	[PSYS_TIME_WINDOW2] = PRIMITIVE_INFO_INIT(PSYS_TIME_WINDOW2, PSYS_TIME_WINDOW2_MASK, 51,
705 			    RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0),
706 	/* non-hardware */
707 	[AVERAGE_POWER] = PRIMITIVE_INFO_INIT(AVERAGE_POWER, 0, 0, 0, POWER_UNIT,
708 			    RAPL_PRIMITIVE_DERIVED),
709 };
710 
711 /* RAPL primitives for TPMI I/F */
/*
 * Primitive table selected by rapl_config() for RAPL_IF_TPMI.
 *
 * Each power limit has its own register here (REG_LIMIT for PL1, REG_PL2,
 * REG_PL4), so the limit, enable, lock and time window fields use the same
 * mask and shift in every one of them. Primitives without an entry keep a
 * NULL name and are rejected by rapl_read_data_raw()/rapl_write_data_raw().
 */
712 static struct rapl_primitive_info rpi_tpmi[NR_RAPL_PRIMITIVES] = {
713 	/* name, mask, shift, register id, unit type, flag */
714 	[POWER_LIMIT1] = PRIMITIVE_INFO_INIT(POWER_LIMIT1, TPMI_POWER_LIMIT_MASK, 0,
715 		RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
716 	[POWER_LIMIT2] = PRIMITIVE_INFO_INIT(POWER_LIMIT2, TPMI_POWER_LIMIT_MASK, 0,
717 		RAPL_DOMAIN_REG_PL2, POWER_UNIT, 0),
718 	[POWER_LIMIT4] = PRIMITIVE_INFO_INIT(POWER_LIMIT4, TPMI_POWER_LIMIT_MASK, 0,
719 		RAPL_DOMAIN_REG_PL4, POWER_UNIT, 0),
720 	[ENERGY_COUNTER] = PRIMITIVE_INFO_INIT(ENERGY_COUNTER, ENERGY_STATUS_MASK, 0,
721 		RAPL_DOMAIN_REG_STATUS, ENERGY_UNIT, 0),
722 	[PL1_LOCK] = PRIMITIVE_INFO_INIT(PL1_LOCK, POWER_HIGH_LOCK, 63,
723 		RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
724 	[PL2_LOCK] = PRIMITIVE_INFO_INIT(PL2_LOCK, POWER_HIGH_LOCK, 63,
725 		RAPL_DOMAIN_REG_PL2, ARBITRARY_UNIT, 0),
726 	[PL4_LOCK] = PRIMITIVE_INFO_INIT(PL4_LOCK, POWER_HIGH_LOCK, 63,
727 		RAPL_DOMAIN_REG_PL4, ARBITRARY_UNIT, 0),
728 	[PL1_ENABLE] = PRIMITIVE_INFO_INIT(PL1_ENABLE, TPMI_POWER_LIMIT_ENABLE, 62,
729 		RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
730 	[PL2_ENABLE] = PRIMITIVE_INFO_INIT(PL2_ENABLE, TPMI_POWER_LIMIT_ENABLE, 62,
731 		RAPL_DOMAIN_REG_PL2, ARBITRARY_UNIT, 0),
732 	[PL4_ENABLE] = PRIMITIVE_INFO_INIT(PL4_ENABLE, TPMI_POWER_LIMIT_ENABLE, 62,
733 		RAPL_DOMAIN_REG_PL4, ARBITRARY_UNIT, 0),
734 	[TIME_WINDOW1] = PRIMITIVE_INFO_INIT(TIME_WINDOW1, TPMI_TIME_WINDOW_MASK, 18,
735 		RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0),
736 	[TIME_WINDOW2] = PRIMITIVE_INFO_INIT(TIME_WINDOW2, TPMI_TIME_WINDOW_MASK, 18,
737 		RAPL_DOMAIN_REG_PL2, TIME_UNIT, 0),
738 	[THERMAL_SPEC_POWER] = PRIMITIVE_INFO_INIT(THERMAL_SPEC_POWER, TPMI_INFO_SPEC_MASK, 0,
739 		RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
740 	[MAX_POWER] = PRIMITIVE_INFO_INIT(MAX_POWER, TPMI_INFO_MAX_MASK, 36,
741 		RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
742 	[MIN_POWER] = PRIMITIVE_INFO_INIT(MIN_POWER, TPMI_INFO_MIN_MASK, 18,
743 		RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
744 	[MAX_TIME_WINDOW] = PRIMITIVE_INFO_INIT(MAX_TIME_WINDOW, TPMI_INFO_MAX_TIME_WIN_MASK, 54,
745 		RAPL_DOMAIN_REG_INFO, TIME_UNIT, 0),
746 	[THROTTLED_TIME] = PRIMITIVE_INFO_INIT(THROTTLED_TIME, PERF_STATUS_THROTTLE_TIME_MASK, 0,
747 		RAPL_DOMAIN_REG_PERF, TIME_UNIT, 0),
748 	/* non-hardware: value comes from rd->rdd.primitives[], not from a register field */
749 	[AVERAGE_POWER] = PRIMITIVE_INFO_INIT(AVERAGE_POWER, 0, 0, 0,
750 		POWER_UNIT, RAPL_PRIMITIVE_DERIVED),
751 };
752 
get_rpi(struct rapl_package * rp,int prim)753 static struct rapl_primitive_info *get_rpi(struct rapl_package *rp, int prim)
754 {
755 	struct rapl_primitive_info *rpi = rp->priv->rpi;
756 
757 	if (prim < 0 || prim >= NR_RAPL_PRIMITIVES || !rpi)
758 		return NULL;
759 
760 	return &rpi[prim];
761 }
762 
rapl_config(struct rapl_package * rp)763 static int rapl_config(struct rapl_package *rp)
764 {
765 	switch (rp->priv->type) {
766 	/* MMIO I/F shares the same register layout as MSR registers */
767 	case RAPL_IF_MMIO:
768 	case RAPL_IF_MSR:
769 		rp->priv->defaults = (void *)defaults_msr;
770 		rp->priv->rpi = (void *)rpi_msr;
771 		break;
772 	case RAPL_IF_TPMI:
773 		rp->priv->defaults = (void *)&defaults_tpmi;
774 		rp->priv->rpi = (void *)rpi_tpmi;
775 		break;
776 	default:
777 		return -EINVAL;
778 	}
779 
780 	/* defaults_msr can be NULL on unsupported platforms */
781 	if (!rp->priv->defaults || !rp->priv->rpi)
782 		return -ENODEV;
783 
784 	return 0;
785 }
786 
787 static enum rapl_primitives
prim_fixups(struct rapl_domain * rd,enum rapl_primitives prim)788 prim_fixups(struct rapl_domain *rd, enum rapl_primitives prim)
789 {
790 	struct rapl_defaults *defaults = get_defaults(rd->rp);
791 
792 	if (!defaults->spr_psys_bits)
793 		return prim;
794 
795 	if (rd->id != RAPL_DOMAIN_PLATFORM)
796 		return prim;
797 
798 	switch (prim) {
799 	case POWER_LIMIT1:
800 		return PSYS_POWER_LIMIT1;
801 	case POWER_LIMIT2:
802 		return PSYS_POWER_LIMIT2;
803 	case PL1_ENABLE:
804 		return PSYS_PL1_ENABLE;
805 	case PL2_ENABLE:
806 		return PSYS_PL2_ENABLE;
807 	case TIME_WINDOW1:
808 		return PSYS_TIME_WINDOW1;
809 	case TIME_WINDOW2:
810 		return PSYS_TIME_WINDOW2;
811 	default:
812 		return prim;
813 	}
814 }
815 
816 /* Read primitive data based on its related struct rapl_primitive_info.
817  * if xlate flag is set, return translated data based on data units, i.e.
818  * time, energy, and power.
819  * RAPL MSRs are non-architectual and are laid out not consistently across
820  * domains. Here we use primitive info to allow writing consolidated access
821  * functions.
822  * For a given primitive, it is processed by MSR mask and shift. Unit conversion
823  * is pre-assigned based on RAPL unit MSRs read at init time.
824  * 63-------------------------- 31--------------------------- 0
825  * |                           xxxxx (mask)                   |
826  * |                                |<- shift ----------------|
827  * 63-------------------------- 31--------------------------- 0
828  */
rapl_read_data_raw(struct rapl_domain * rd,enum rapl_primitives prim,bool xlate,u64 * data)829 static int rapl_read_data_raw(struct rapl_domain *rd,
830 			      enum rapl_primitives prim, bool xlate, u64 *data)
831 {
832 	u64 value;
833 	enum rapl_primitives prim_fixed = prim_fixups(rd, prim);
834 	struct rapl_primitive_info *rpi = get_rpi(rd->rp, prim_fixed);
835 	struct reg_action ra;
836 
837 	if (!rpi || !rpi->name || rpi->flag & RAPL_PRIMITIVE_DUMMY)
838 		return -EINVAL;
839 
840 	ra.reg = rd->regs[rpi->id];
841 	if (!ra.reg.val)
842 		return -EINVAL;
843 
844 	/* non-hardware data are collected by the polling thread */
845 	if (rpi->flag & RAPL_PRIMITIVE_DERIVED) {
846 		*data = rd->rdd.primitives[prim];
847 		return 0;
848 	}
849 
850 	ra.mask = rpi->mask;
851 
852 	if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra)) {
853 		pr_debug("failed to read reg 0x%llx for %s:%s\n", ra.reg.val, rd->rp->name, rd->name);
854 		return -EIO;
855 	}
856 
857 	value = ra.value >> rpi->shift;
858 
859 	if (xlate)
860 		*data = rapl_unit_xlate(rd, rpi->unit, value, 0);
861 	else
862 		*data = value;
863 
864 	return 0;
865 }
866 
867 /* Similar use of primitive info in the read counterpart */
rapl_write_data_raw(struct rapl_domain * rd,enum rapl_primitives prim,unsigned long long value)868 static int rapl_write_data_raw(struct rapl_domain *rd,
869 			       enum rapl_primitives prim,
870 			       unsigned long long value)
871 {
872 	enum rapl_primitives prim_fixed = prim_fixups(rd, prim);
873 	struct rapl_primitive_info *rpi = get_rpi(rd->rp, prim_fixed);
874 	u64 bits;
875 	struct reg_action ra;
876 	int ret;
877 
878 	if (!rpi || !rpi->name || rpi->flag & RAPL_PRIMITIVE_DUMMY)
879 		return -EINVAL;
880 
881 	bits = rapl_unit_xlate(rd, rpi->unit, value, 1);
882 	bits <<= rpi->shift;
883 	bits &= rpi->mask;
884 
885 	memset(&ra, 0, sizeof(ra));
886 
887 	ra.reg = rd->regs[rpi->id];
888 	ra.mask = rpi->mask;
889 	ra.value = bits;
890 
891 	ret = rd->rp->priv->write_raw(get_rid(rd->rp), &ra);
892 
893 	return ret;
894 }
895 
/*
 * Read attribute @pl_prim of power limit @pl. Returns -EINVAL when @pl is
 * not valid for @rd, otherwise the result of rapl_read_data_raw() (which
 * also rejects an attribute that has no primitive).
 */
static int rapl_read_pl_data(struct rapl_domain *rd, int pl,
			      enum pl_prims pl_prim, bool xlate, u64 *data)
{
	enum rapl_primitives prim;

	if (!is_pl_valid(rd, pl))
		return -EINVAL;

	prim = get_pl_prim(rd, pl, pl_prim);

	return rapl_read_data_raw(rd, prim, xlate, data);
}
906 
rapl_write_pl_data(struct rapl_domain * rd,int pl,enum pl_prims pl_prim,unsigned long long value)907 static int rapl_write_pl_data(struct rapl_domain *rd, int pl,
908 			       enum pl_prims pl_prim,
909 			       unsigned long long value)
910 {
911 	enum rapl_primitives prim = get_pl_prim(rd, pl, pl_prim);
912 
913 	if (!is_pl_valid(rd, pl))
914 		return -EINVAL;
915 
916 	if (rd->rpl[pl].locked) {
917 		pr_debug("%s:%s:%s locked by BIOS\n", rd->rp->name, rd->name, pl_names[pl]);
918 		return -EACCES;
919 	}
920 
921 	return rapl_write_data_raw(rd, prim, value);
922 }
923 /*
924  * Raw RAPL data stored in MSRs are in certain scales. We need to
925  * convert them into standard units based on the units reported in
926  * the RAPL unit MSRs. This is specific to CPUs as the method to
927  * calculate units differ on different CPUs.
928  * We convert the units to below format based on CPUs.
929  * i.e.
930  * energy unit: picoJoules  : Represented in picoJoules by default
931  * power unit : microWatts  : Represented in milliWatts by default
932  * time unit  : microseconds: Represented in seconds by default
933  */
rapl_check_unit_core(struct rapl_domain * rd)934 static int rapl_check_unit_core(struct rapl_domain *rd)
935 {
936 	struct reg_action ra;
937 	u32 value;
938 
939 	ra.reg = rd->regs[RAPL_DOMAIN_REG_UNIT];
940 	ra.mask = ~0;
941 	if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra)) {
942 		pr_err("Failed to read power unit REG 0x%llx on %s:%s, exit.\n",
943 			ra.reg.val, rd->rp->name, rd->name);
944 		return -ENODEV;
945 	}
946 
947 	value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET;
948 	rd->energy_unit = ENERGY_UNIT_SCALE * 1000000 / (1 << value);
949 
950 	value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET;
951 	rd->power_unit = 1000000 / (1 << value);
952 
953 	value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET;
954 	rd->time_unit = 1000000 / (1 << value);
955 
956 	pr_debug("Core CPU %s:%s energy=%dpJ, time=%dus, power=%duW\n",
957 		 rd->rp->name, rd->name, rd->energy_unit, rd->time_unit, rd->power_unit);
958 
959 	return 0;
960 }
961 
rapl_check_unit_atom(struct rapl_domain * rd)962 static int rapl_check_unit_atom(struct rapl_domain *rd)
963 {
964 	struct reg_action ra;
965 	u32 value;
966 
967 	ra.reg = rd->regs[RAPL_DOMAIN_REG_UNIT];
968 	ra.mask = ~0;
969 	if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra)) {
970 		pr_err("Failed to read power unit REG 0x%llx on %s:%s, exit.\n",
971 			ra.reg.val, rd->rp->name, rd->name);
972 		return -ENODEV;
973 	}
974 
975 	value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET;
976 	rd->energy_unit = ENERGY_UNIT_SCALE * 1 << value;
977 
978 	value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET;
979 	rd->power_unit = (1 << value) * 1000;
980 
981 	value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET;
982 	rd->time_unit = 1000000 / (1 << value);
983 
984 	pr_debug("Atom %s:%s energy=%dpJ, time=%dus, power=%duW\n",
985 		 rd->rp->name, rd->name, rd->energy_unit, rd->time_unit, rd->power_unit);
986 
987 	return 0;
988 }
989 
power_limit_irq_save_cpu(void * info)990 static void power_limit_irq_save_cpu(void *info)
991 {
992 	u32 l, h = 0;
993 	struct rapl_package *rp = (struct rapl_package *)info;
994 
995 	/* save the state of PLN irq mask bit before disabling it */
996 	rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h);
997 	if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED)) {
998 		rp->power_limit_irq = l & PACKAGE_THERM_INT_PLN_ENABLE;
999 		rp->power_limit_irq |= PACKAGE_PLN_INT_SAVED;
1000 	}
1001 	l &= ~PACKAGE_THERM_INT_PLN_ENABLE;
1002 	wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
1003 }
1004 
1005 /* REVISIT:
1006  * When package power limit is set artificially low by RAPL, LVT
1007  * thermal interrupt for package power limit should be ignored
1008  * since we are not really exceeding the real limit. The intention
1009  * is to avoid excessive interrupts while we are trying to save power.
1010  * A useful feature might be routing the package_power_limit interrupt
1011  * to userspace via eventfd. once we have a usecase, this is simple
1012  * to do by adding an atomic notifier.
1013  */
1014 
package_power_limit_irq_save(struct rapl_package * rp)1015 static void package_power_limit_irq_save(struct rapl_package *rp)
1016 {
1017 	if (rp->lead_cpu < 0)
1018 		return;
1019 
1020 	if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN))
1021 		return;
1022 
1023 	smp_call_function_single(rp->lead_cpu, power_limit_irq_save_cpu, rp, 1);
1024 }
1025 
1026 /*
1027  * Restore per package power limit interrupt enable state. Called from cpu
1028  * hotplug code on package removal.
1029  */
package_power_limit_irq_restore(struct rapl_package * rp)1030 static void package_power_limit_irq_restore(struct rapl_package *rp)
1031 {
1032 	u32 l, h;
1033 
1034 	if (rp->lead_cpu < 0)
1035 		return;
1036 
1037 	if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN))
1038 		return;
1039 
1040 	/* irq enable state not saved, nothing to restore */
1041 	if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED))
1042 		return;
1043 
1044 	rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h);
1045 
1046 	if (rp->power_limit_irq & PACKAGE_THERM_INT_PLN_ENABLE)
1047 		l |= PACKAGE_THERM_INT_PLN_ENABLE;
1048 	else
1049 		l &= ~PACKAGE_THERM_INT_PLN_ENABLE;
1050 
1051 	wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
1052 }
1053 
/*
 * Default .set_floor_freq implementation: write @mode to the clamp bit of
 * power limit 1, and to both the enable and clamp bits of every further
 * power limit. Failures of the individual writes are not reported.
 */
static void set_floor_freq_default(struct rapl_domain *rd, bool mode)
{
	int pl = POWER_LIMIT2;

	/*
	 * Clamping lets the p-state go below the OS requested range:
	 * power capping takes priority over guaranteed frequency.
	 */
	rapl_write_pl_data(rd, POWER_LIMIT1, PL_CLAMP, mode);

	while (pl < NR_POWER_LIMITS) {
		rapl_write_pl_data(rd, pl, PL_ENABLE, mode);
		rapl_write_pl_data(rd, pl, PL_CLAMP, mode);
		pl++;
	}
}
1068 
/*
 * Atom .set_floor_freq implementation, operating on the IOSF-MBI register
 * given by defaults->floor_freq_reg_addr.
 *
 * The register content is read once (while the cached copy is still zero)
 * and kept in a function-static variable. With @enable set, bits 14:8 of
 * that cached value are replaced by the value 1 before it is written back;
 * otherwise the cached value is written back unchanged.
 */
static void set_floor_freq_atom(struct rapl_domain *rd, bool enable)
{
	static u32 power_ctrl_orig_val;
	struct rapl_defaults *defaults = get_defaults(rd->rp);
	u32 mdata;

	if (!defaults->floor_freq_reg_addr) {
		pr_err("Invalid floor frequency config register\n");
		return;
	}

	if (!power_ctrl_orig_val)
		iosf_mbi_read(BT_MBI_UNIT_PMC, MBI_CR_READ,
			      defaults->floor_freq_reg_addr,
			      &power_ctrl_orig_val);

	if (enable)
		mdata = (power_ctrl_orig_val & ~(0x7f << 8)) | (1 << 8);
	else
		mdata = power_ctrl_orig_val;

	iosf_mbi_write(BT_MBI_UNIT_PMC, MBI_CR_WRITE,
		       defaults->floor_freq_reg_addr, mdata);
}
1092 
/*
 * Convert a time window between its 7-bit register encoding and a time
 * expressed in multiples of rd->time_unit.
 *
 * The encoding is 2^Y * (1 + F/4) time units, with Y in bits 4:0 and F in
 * bits 6:5 (Intel Software Developer's manual Vol.3B: CH 14.9.3).
 *
 * @to_raw == false: @value is the register field, the decoded time is
 *                   returned.
 * @to_raw == true:  @value is a time, the register field is returned;
 *                   0 if @value is shorter than one time unit and 0x7f
 *                   (all ones) if the exponent does not fit into 5 bits.
 */
static u64 rapl_compute_time_window_core(struct rapl_domain *rd, u64 value,
					 bool to_raw)
{
	u64 f, y;		/* fraction and exp. used for time unit */

	if (!to_raw) {
		f = (value & 0x60) >> 5;
		y = value & 0x1f;
		/*
		 * y can be 31. A plain int "1 << 31" lands on the sign bit
		 * and is sign-extended when promoted to u64, which corrupts
		 * the product. Do the shift in 64 bits, like the to_raw path
		 * below already does.
		 */
		value = (1ULL << y) * (4 + f) * rd->time_unit / 4;
	} else {
		if (value < rd->time_unit)
			return 0;

		do_div(value, rd->time_unit);
		y = ilog2(value);

		/*
		 * The target hardware field is 7 bits wide, so return all ones
		 * if the exponent is too large.
		 */
		if (y > 0x1f)
			return 0x7f;

		f = div64_u64(4 * (value - (1ULL << y)), 1ULL << y);
		value = (y & 0x1f) | ((f & 0x3) << 5);
	}
	return value;
}
1125 
/*
 * Atom time window conversion: the register value is a plain multiple of
 * rd->time_unit.
 *
 * @to_raw == true:  return @value divided by rd->time_unit.
 * @to_raw == false: return @value multiplied by rd->time_unit; a register
 *                   value of 0 is treated as one time unit, so the result
 *                   is never 0.
 */
static u64 rapl_compute_time_window_atom(struct rapl_domain *rd, u64 value,
					 bool to_raw)
{
	if (to_raw)
		return div64_u64(value, rd->time_unit);

	if (!value)
		return rd->time_unit;

	return value * rd->time_unit;
}
1140 
1141 /* TPMI Unit register has different layout */
1142 #define TPMI_POWER_UNIT_OFFSET	POWER_UNIT_OFFSET
1143 #define TPMI_POWER_UNIT_MASK	POWER_UNIT_MASK
1144 #define TPMI_ENERGY_UNIT_OFFSET	0x06
1145 #define TPMI_ENERGY_UNIT_MASK	0x7C0
1146 #define TPMI_TIME_UNIT_OFFSET	0x0C
1147 #define TPMI_TIME_UNIT_MASK	0xF000
1148 
rapl_check_unit_tpmi(struct rapl_domain * rd)1149 static int rapl_check_unit_tpmi(struct rapl_domain *rd)
1150 {
1151 	struct reg_action ra;
1152 	u32 value;
1153 
1154 	ra.reg = rd->regs[RAPL_DOMAIN_REG_UNIT];
1155 	ra.mask = ~0;
1156 	if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra)) {
1157 		pr_err("Failed to read power unit REG 0x%llx on %s:%s, exit.\n",
1158 			ra.reg.val, rd->rp->name, rd->name);
1159 		return -ENODEV;
1160 	}
1161 
1162 	value = (ra.value & TPMI_ENERGY_UNIT_MASK) >> TPMI_ENERGY_UNIT_OFFSET;
1163 	rd->energy_unit = ENERGY_UNIT_SCALE * 1000000 / (1 << value);
1164 
1165 	value = (ra.value & TPMI_POWER_UNIT_MASK) >> TPMI_POWER_UNIT_OFFSET;
1166 	rd->power_unit = 1000000 / (1 << value);
1167 
1168 	value = (ra.value & TPMI_TIME_UNIT_MASK) >> TPMI_TIME_UNIT_OFFSET;
1169 	rd->time_unit = 1000000 / (1 << value);
1170 
1171 	pr_debug("Core CPU %s:%s energy=%dpJ, time=%dus, power=%duW\n",
1172 		 rd->rp->name, rd->name, rd->energy_unit, rd->time_unit, rd->power_unit);
1173 
1174 	return 0;
1175 }
1176 
/*
 * Sets of per-platform callbacks and constants:
 *   .check_unit          - decodes the unit register of a domain
 *   .set_floor_freq      - floor frequency control (left NULL where unused)
 *   .compute_time_window - converts time windows to/from register format
 *   .floor_freq_reg_addr - register used by set_floor_freq_atom()
 *   .dram_domain_energy_unit / .psys_domain_energy_unit - when non-zero,
 *     rapl_get_domain_unit() uses them instead of the decoded energy unit
 *     for the DRAM / PLATFORM domain.
 */

/* TPMI unit register layout, otherwise the Core helpers. */
static const struct rapl_defaults defaults_tpmi = {
	.check_unit = rapl_check_unit_tpmi,
	/* Reuse existing logic, ignore the PL_CLAMP failures and enable all Power Limits */
	.set_floor_freq = set_floor_freq_default,
	.compute_time_window = rapl_compute_time_window_core,
};

/* Core unit layout and time window encoding, no overrides. */
static const struct rapl_defaults rapl_defaults_core = {
	.floor_freq_reg_addr = 0,
	.check_unit = rapl_check_unit_core,
	.set_floor_freq = set_floor_freq_default,
	.compute_time_window = rapl_compute_time_window_core,
};

/* As rapl_defaults_core, plus a fixed energy unit for the DRAM domain. */
static const struct rapl_defaults rapl_defaults_hsw_server = {
	.check_unit = rapl_check_unit_core,
	.set_floor_freq = set_floor_freq_default,
	.compute_time_window = rapl_compute_time_window_core,
	.dram_domain_energy_unit = 15300,
};

/*
 * As rapl_defaults_core, plus a fixed energy unit for the PLATFORM domain.
 * NOTE(review): .spr_psys_bits is consumed outside this part of the file.
 */
static const struct rapl_defaults rapl_defaults_spr_server = {
	.check_unit = rapl_check_unit_core,
	.set_floor_freq = set_floor_freq_default,
	.compute_time_window = rapl_compute_time_window_core,
	.psys_domain_energy_unit = 1000000000,
	.spr_psys_bits = true,
};

/* Atom helpers, floor frequency via IOSF_CPU_POWER_BUDGET_CTL_BYT. */
static const struct rapl_defaults rapl_defaults_byt = {
	.floor_freq_reg_addr = IOSF_CPU_POWER_BUDGET_CTL_BYT,
	.check_unit = rapl_check_unit_atom,
	.set_floor_freq = set_floor_freq_atom,
	.compute_time_window = rapl_compute_time_window_atom,
};

/* Atom helpers, floor frequency via IOSF_CPU_POWER_BUDGET_CTL_TNG. */
static const struct rapl_defaults rapl_defaults_tng = {
	.floor_freq_reg_addr = IOSF_CPU_POWER_BUDGET_CTL_TNG,
	.check_unit = rapl_check_unit_atom,
	.set_floor_freq = set_floor_freq_atom,
	.compute_time_window = rapl_compute_time_window_atom,
};

/* Atom unit and time window helpers, no floor frequency callback. */
static const struct rapl_defaults rapl_defaults_ann = {
	.floor_freq_reg_addr = 0,
	.check_unit = rapl_check_unit_atom,
	.set_floor_freq = NULL,
	.compute_time_window = rapl_compute_time_window_atom,
};

/* Atom unit and time window helpers, no floor frequency callback. */
static const struct rapl_defaults rapl_defaults_cht = {
	.floor_freq_reg_addr = 0,
	.check_unit = rapl_check_unit_atom,
	.set_floor_freq = NULL,
	.compute_time_window = rapl_compute_time_window_atom,
};

/* Only unit decoding (Core layout); every other member is left zero. */
static const struct rapl_defaults rapl_defaults_amd = {
	.check_unit = rapl_check_unit_core,
};

/*
 * CPU match table. The second argument of each entry is the rapl_defaults
 * instance for that CPU; rapl_init() looks the running CPU up with
 * x86_match_cpu() and stores the entry's driver_data in defaults_msr.
 * The empty entry terminates the table.
 */
static const struct x86_cpu_id rapl_ids[] __initconst = {
	X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X,	&rapl_defaults_core),

	X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X,		&rapl_defaults_core),

	X86_MATCH_INTEL_FAM6_MODEL(HASWELL,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_L,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_G,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X,		&rapl_defaults_hsw_server),

	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_G,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X,		&rapl_defaults_hsw_server),

	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_L,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X,		&rapl_defaults_hsw_server),
	X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(CANNONLAKE_L,	&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_NNPI,	&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X,		&rapl_defaults_hsw_server),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D,		&rapl_defaults_hsw_server),
	X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT,	&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P,        &rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S,	&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L,	&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X,	&rapl_defaults_spr_server),
	X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X,	&rapl_defaults_spr_server),
	X86_MATCH_INTEL_FAM6_MODEL(LAKEFIELD,		&rapl_defaults_core),

	X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT,	&rapl_defaults_byt),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT,	&rapl_defaults_cht),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT_MID,	&rapl_defaults_tng),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT_MID,	&rapl_defaults_ann),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT,	&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS,	&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_D,	&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT,	&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D,	&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L,	&rapl_defaults_core),

	X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL,	&rapl_defaults_hsw_server),
	X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM,	&rapl_defaults_hsw_server),

	X86_MATCH_VENDOR_FAM(AMD, 0x17, &rapl_defaults_amd),
	X86_MATCH_VENDOR_FAM(AMD, 0x19, &rapl_defaults_amd),
	X86_MATCH_VENDOR_FAM(AMD, 0x1A, &rapl_defaults_amd),
	X86_MATCH_VENDOR_FAM(HYGON, 0x18, &rapl_defaults_amd),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, rapl_ids);
1304 
1305 /* Read once for all raw primitive data for domains */
rapl_update_domain_data(struct rapl_package * rp)1306 static void rapl_update_domain_data(struct rapl_package *rp)
1307 {
1308 	int dmn, prim;
1309 	u64 val;
1310 
1311 	for (dmn = 0; dmn < rp->nr_domains; dmn++) {
1312 		pr_debug("update %s domain %s data\n", rp->name,
1313 			 rp->domains[dmn].name);
1314 		/* exclude non-raw primitives */
1315 		for (prim = 0; prim < NR_RAW_PRIMITIVES; prim++) {
1316 			struct rapl_primitive_info *rpi = get_rpi(rp, prim);
1317 
1318 			if (!rapl_read_data_raw(&rp->domains[dmn], prim,
1319 						rpi->unit, &val))
1320 				rp->domains[dmn].rdd.primitives[prim] = val;
1321 		}
1322 	}
1323 
1324 }
1325 
rapl_package_register_powercap(struct rapl_package * rp)1326 static int rapl_package_register_powercap(struct rapl_package *rp)
1327 {
1328 	struct rapl_domain *rd;
1329 	struct powercap_zone *power_zone = NULL;
1330 	int nr_pl, ret;
1331 
1332 	/* Update the domain data of the new package */
1333 	rapl_update_domain_data(rp);
1334 
1335 	/* first we register package domain as the parent zone */
1336 	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
1337 		if (rd->id == RAPL_DOMAIN_PACKAGE) {
1338 			nr_pl = find_nr_power_limit(rd);
1339 			pr_debug("register package domain %s\n", rp->name);
1340 			power_zone = powercap_register_zone(&rd->power_zone,
1341 					    rp->priv->control_type, rp->name,
1342 					    NULL, &zone_ops[rd->id], nr_pl,
1343 					    &constraint_ops);
1344 			if (IS_ERR(power_zone)) {
1345 				pr_debug("failed to register power zone %s\n",
1346 					 rp->name);
1347 				return PTR_ERR(power_zone);
1348 			}
1349 			/* track parent zone in per package/socket data */
1350 			rp->power_zone = power_zone;
1351 			/* done, only one package domain per socket */
1352 			break;
1353 		}
1354 	}
1355 	if (!power_zone) {
1356 		pr_err("no package domain found, unknown topology!\n");
1357 		return -ENODEV;
1358 	}
1359 	/* now register domains as children of the socket/package */
1360 	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
1361 		struct powercap_zone *parent = rp->power_zone;
1362 
1363 		if (rd->id == RAPL_DOMAIN_PACKAGE)
1364 			continue;
1365 		if (rd->id == RAPL_DOMAIN_PLATFORM)
1366 			parent = NULL;
1367 		/* number of power limits per domain varies */
1368 		nr_pl = find_nr_power_limit(rd);
1369 		power_zone = powercap_register_zone(&rd->power_zone,
1370 						    rp->priv->control_type,
1371 						    rd->name, parent,
1372 						    &zone_ops[rd->id], nr_pl,
1373 						    &constraint_ops);
1374 
1375 		if (IS_ERR(power_zone)) {
1376 			pr_debug("failed to register power_zone, %s:%s\n",
1377 				 rp->name, rd->name);
1378 			ret = PTR_ERR(power_zone);
1379 			goto err_cleanup;
1380 		}
1381 	}
1382 	return 0;
1383 
1384 err_cleanup:
1385 	/*
1386 	 * Clean up previously initialized domains within the package if we
1387 	 * failed after the first domain setup.
1388 	 */
1389 	while (--rd >= rp->domains) {
1390 		pr_debug("unregister %s domain %s\n", rp->name, rd->name);
1391 		powercap_unregister_zone(rp->priv->control_type,
1392 					 &rd->power_zone);
1393 	}
1394 
1395 	return ret;
1396 }
1397 
rapl_check_domain(int domain,struct rapl_package * rp)1398 static int rapl_check_domain(int domain, struct rapl_package *rp)
1399 {
1400 	struct reg_action ra;
1401 
1402 	switch (domain) {
1403 	case RAPL_DOMAIN_PACKAGE:
1404 	case RAPL_DOMAIN_PP0:
1405 	case RAPL_DOMAIN_PP1:
1406 	case RAPL_DOMAIN_DRAM:
1407 	case RAPL_DOMAIN_PLATFORM:
1408 		ra.reg = rp->priv->regs[domain][RAPL_DOMAIN_REG_STATUS];
1409 		break;
1410 	default:
1411 		pr_err("invalid domain id %d\n", domain);
1412 		return -EINVAL;
1413 	}
1414 	/* make sure domain counters are available and contains non-zero
1415 	 * values, otherwise skip it.
1416 	 */
1417 
1418 	ra.mask = ENERGY_STATUS_MASK;
1419 	if (rp->priv->read_raw(get_rid(rp), &ra) || !ra.value)
1420 		return -ENODEV;
1421 
1422 	return 0;
1423 }
1424 
1425 /*
1426  * Get per domain energy/power/time unit.
1427  * RAPL Interfaces without per domain unit register will use the package
1428  * scope unit register to set per domain units.
1429  */
rapl_get_domain_unit(struct rapl_domain * rd)1430 static int rapl_get_domain_unit(struct rapl_domain *rd)
1431 {
1432 	struct rapl_defaults *defaults = get_defaults(rd->rp);
1433 	int ret;
1434 
1435 	if (!rd->regs[RAPL_DOMAIN_REG_UNIT].val) {
1436 		if (!rd->rp->priv->reg_unit.val) {
1437 			pr_err("No valid Unit register found\n");
1438 			return -ENODEV;
1439 		}
1440 		rd->regs[RAPL_DOMAIN_REG_UNIT] = rd->rp->priv->reg_unit;
1441 	}
1442 
1443 	if (!defaults->check_unit) {
1444 		pr_err("missing .check_unit() callback\n");
1445 		return -ENODEV;
1446 	}
1447 
1448 	ret = defaults->check_unit(rd);
1449 	if (ret)
1450 		return ret;
1451 
1452 	if (rd->id == RAPL_DOMAIN_DRAM && defaults->dram_domain_energy_unit)
1453 		rd->energy_unit = defaults->dram_domain_energy_unit;
1454 	if (rd->id == RAPL_DOMAIN_PLATFORM && defaults->psys_domain_energy_unit)
1455 		rd->energy_unit = defaults->psys_domain_energy_unit;
1456 	return 0;
1457 }
1458 
1459 /*
1460  * Check if power limits are available. Two cases when they are not available:
1461  * 1. Locked by BIOS, in this case we still provide read-only access so that
1462  *    users can see what limit is set by the BIOS.
1463  * 2. Some CPUs make some domains monitoring only which means PLx MSRs may not
1464  *    exist at all. In this case, we do not show the constraints in powercap.
1465  *
1466  * Called after domains are detected and initialized.
1467  */
rapl_detect_powerlimit(struct rapl_domain * rd)1468 static void rapl_detect_powerlimit(struct rapl_domain *rd)
1469 {
1470 	u64 val64;
1471 	int i;
1472 
1473 	for (i = POWER_LIMIT1; i < NR_POWER_LIMITS; i++) {
1474 		if (!rapl_read_pl_data(rd, i, PL_LOCK, false, &val64)) {
1475 			if (val64) {
1476 				rd->rpl[i].locked = true;
1477 				pr_info("%s:%s:%s locked by BIOS\n",
1478 					rd->rp->name, rd->name, pl_names[i]);
1479 			}
1480 		}
1481 
1482 		if (rapl_read_pl_data(rd, i, PL_LIMIT, false, &val64))
1483 			rd->rpl[i].name = NULL;
1484 	}
1485 }
1486 
/*
 * Detect active and valid domains for the given CPU, caller must
 * ensure the CPU belongs to the targeted package and CPU hotplug is disabled.
 */
rapl_detect_domains(struct rapl_package * rp)1490 static int rapl_detect_domains(struct rapl_package *rp)
1491 {
1492 	struct rapl_domain *rd;
1493 	int i;
1494 
1495 	for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
1496 		/* use physical package id to read counters */
1497 		if (!rapl_check_domain(i, rp)) {
1498 			rp->domain_map |= 1 << i;
1499 			pr_info("Found RAPL domain %s\n", rapl_domain_names[i]);
1500 		}
1501 	}
1502 	rp->nr_domains = bitmap_weight(&rp->domain_map, RAPL_DOMAIN_MAX);
1503 	if (!rp->nr_domains) {
1504 		pr_debug("no valid rapl domains found in %s\n", rp->name);
1505 		return -ENODEV;
1506 	}
1507 	pr_debug("found %d domains on %s\n", rp->nr_domains, rp->name);
1508 
1509 	rp->domains = kcalloc(rp->nr_domains, sizeof(struct rapl_domain),
1510 			      GFP_KERNEL);
1511 	if (!rp->domains)
1512 		return -ENOMEM;
1513 
1514 	rapl_init_domains(rp);
1515 
1516 	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
1517 		rapl_get_domain_unit(rd);
1518 		rapl_detect_powerlimit(rd);
1519 	}
1520 
1521 	return 0;
1522 }
1523 
1524 /* called from CPU hotplug notifier, hotplug lock held */
rapl_remove_package_cpuslocked(struct rapl_package * rp)1525 void rapl_remove_package_cpuslocked(struct rapl_package *rp)
1526 {
1527 	struct rapl_domain *rd, *rd_package = NULL;
1528 
1529 	package_power_limit_irq_restore(rp);
1530 
1531 	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
1532 		int i;
1533 
1534 		for (i = POWER_LIMIT1; i < NR_POWER_LIMITS; i++) {
1535 			rapl_write_pl_data(rd, i, PL_ENABLE, 0);
1536 			rapl_write_pl_data(rd, i, PL_CLAMP, 0);
1537 		}
1538 
1539 		if (rd->id == RAPL_DOMAIN_PACKAGE) {
1540 			rd_package = rd;
1541 			continue;
1542 		}
1543 		pr_debug("remove package, undo power limit on %s: %s\n",
1544 			 rp->name, rd->name);
1545 		powercap_unregister_zone(rp->priv->control_type,
1546 					 &rd->power_zone);
1547 	}
1548 	/* do parent zone last */
1549 	powercap_unregister_zone(rp->priv->control_type,
1550 				 &rd_package->power_zone);
1551 	list_del(&rp->plist);
1552 	kfree(rp);
1553 }
1554 EXPORT_SYMBOL_GPL(rapl_remove_package_cpuslocked);
1555 
/* Same as rapl_remove_package_cpuslocked(), but takes the CPU hotplug read lock itself. */
void rapl_remove_package(struct rapl_package *rp)
{
	cpus_read_lock();
	rapl_remove_package_cpuslocked(rp);
	cpus_read_unlock();
}
EXPORT_SYMBOL_GPL(rapl_remove_package);
1562 
1563 /* caller to ensure CPU hotplug lock is held */
/*
 * Look up a registered package.
 *
 * @id:        a CPU number if @id_is_cpu is set, otherwise the package id
 * @priv:      interface private data; only packages with the same
 *             control_type match
 * @id_is_cpu: when set, @id is translated with topology_logical_die_id()
 *
 * Return: the matching package, or NULL if there is none.
 */
struct rapl_package *rapl_find_package_domain_cpuslocked(int id, struct rapl_if_priv *priv,
							 bool id_is_cpu)
{
	struct rapl_package *rp;
	int uid = id;

	if (id_is_cpu)
		uid = topology_logical_die_id(id);

	list_for_each_entry(rp, &rapl_packages, plist) {
		if (rp->id != uid)
			continue;
		if (rp->priv->control_type != priv->control_type)
			continue;

		return rp;
	}

	return NULL;
}
EXPORT_SYMBOL_GPL(rapl_find_package_domain_cpuslocked);
1584 
/* Same as rapl_find_package_domain_cpuslocked(), but takes the CPU hotplug read lock itself. */
struct rapl_package *rapl_find_package_domain(int id, struct rapl_if_priv *priv, bool id_is_cpu)
{
	struct rapl_package *rp;

	cpus_read_lock();
	rp = rapl_find_package_domain_cpuslocked(id, priv, id_is_cpu);
	cpus_read_unlock();

	return rp;
}
EXPORT_SYMBOL_GPL(rapl_find_package_domain);
1591 
1592 /* called from CPU hotplug notifier, hotplug lock held */
/*
 * Allocate and register a new RAPL package.
 *
 * @id:        a CPU number if @id_is_cpu is set, otherwise the package id
 * @priv:      interface private data the package is bound to
 * @id_is_cpu: selects how @id is interpreted; without a CPU the package
 *             gets lead_cpu == -1
 *
 * Return: the new package, which has been added to the package list, or an
 * ERR_PTR() on failure.
 */
struct rapl_package *rapl_add_package_cpuslocked(int id, struct rapl_if_priv *priv, bool id_is_cpu)
{
	struct rapl_package *rp;
	int ret;

	rp = kzalloc(sizeof(*rp), GFP_KERNEL);
	if (!rp)
		return ERR_PTR(-ENOMEM);

	if (!id_is_cpu) {
		rp->id = id;
		rp->lead_cpu = -1;
		snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH, "package-%d", id);
	} else {
		rp->id = topology_logical_die_id(id);
		rp->lead_cpu = id;
		/* The die number is only part of the name on multi-die packages. */
		if (topology_max_die_per_package() > 1)
			snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH, "package-%d-die-%d",
				 topology_physical_package_id(id), topology_die_id(id));
		else
			snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH, "package-%d",
				 topology_physical_package_id(id));
	}

	rp->priv = priv;
	ret = rapl_config(rp);
	if (ret)
		goto err_free_package;

	/* check if the package contains valid domains */
	if (rapl_detect_domains(rp)) {
		ret = -ENODEV;
		goto err_free_package;
	}

	ret = rapl_package_register_powercap(rp);
	if (ret)
		goto err_free_package;

	INIT_LIST_HEAD(&rp->plist);
	list_add(&rp->plist, &rapl_packages);
	return rp;

err_free_package:
	kfree(rp->domains);
	kfree(rp);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(rapl_add_package_cpuslocked);
1640 
/* Same as rapl_add_package_cpuslocked(), but takes the CPU hotplug read lock itself. */
struct rapl_package *rapl_add_package(int id, struct rapl_if_priv *priv, bool id_is_cpu)
{
	struct rapl_package *rp;

	cpus_read_lock();
	rp = rapl_add_package_cpuslocked(id, priv, id_is_cpu);
	cpus_read_unlock();

	return rp;
}
EXPORT_SYMBOL_GPL(rapl_add_package);
1647 
power_limit_state_save(void)1648 static void power_limit_state_save(void)
1649 {
1650 	struct rapl_package *rp;
1651 	struct rapl_domain *rd;
1652 	int ret, i;
1653 
1654 	cpus_read_lock();
1655 	list_for_each_entry(rp, &rapl_packages, plist) {
1656 		if (!rp->power_zone)
1657 			continue;
1658 		rd = power_zone_to_rapl_domain(rp->power_zone);
1659 		for (i = POWER_LIMIT1; i < NR_POWER_LIMITS; i++) {
1660 			ret = rapl_read_pl_data(rd, i, PL_LIMIT, true,
1661 						 &rd->rpl[i].last_power_limit);
1662 			if (ret)
1663 				rd->rpl[i].last_power_limit = 0;
1664 		}
1665 	}
1666 	cpus_read_unlock();
1667 }
1668 
power_limit_state_restore(void)1669 static void power_limit_state_restore(void)
1670 {
1671 	struct rapl_package *rp;
1672 	struct rapl_domain *rd;
1673 	int i;
1674 
1675 	cpus_read_lock();
1676 	list_for_each_entry(rp, &rapl_packages, plist) {
1677 		if (!rp->power_zone)
1678 			continue;
1679 		rd = power_zone_to_rapl_domain(rp->power_zone);
1680 		for (i = POWER_LIMIT1; i < NR_POWER_LIMITS; i++)
1681 			if (rd->rpl[i].last_power_limit)
1682 				rapl_write_pl_data(rd, i, PL_LIMIT,
1683 					       rd->rpl[i].last_power_limit);
1684 	}
1685 	cpus_read_unlock();
1686 }
1687 
rapl_pm_callback(struct notifier_block * nb,unsigned long mode,void * _unused)1688 static int rapl_pm_callback(struct notifier_block *nb,
1689 			    unsigned long mode, void *_unused)
1690 {
1691 	switch (mode) {
1692 	case PM_SUSPEND_PREPARE:
1693 		power_limit_state_save();
1694 		break;
1695 	case PM_POST_SUSPEND:
1696 		power_limit_state_restore();
1697 		break;
1698 	}
1699 	return NOTIFY_OK;
1700 }
1701 
/* Registered by rapl_init() and removed again by rapl_exit(). */
static struct notifier_block rapl_pm_notifier = {
	.notifier_call = rapl_pm_callback,
};

/* "intel_rapl_msr" platform device; only created when rapl_ids matches the CPU. */
static struct platform_device *rapl_msr_platdev;
1707 
rapl_init(void)1708 static int __init rapl_init(void)
1709 {
1710 	const struct x86_cpu_id *id;
1711 	int ret;
1712 
1713 	id = x86_match_cpu(rapl_ids);
1714 	if (id) {
1715 		defaults_msr = (struct rapl_defaults *)id->driver_data;
1716 
1717 		rapl_msr_platdev = platform_device_alloc("intel_rapl_msr", 0);
1718 		if (!rapl_msr_platdev)
1719 			return -ENOMEM;
1720 
1721 		ret = platform_device_add(rapl_msr_platdev);
1722 		if (ret) {
1723 			platform_device_put(rapl_msr_platdev);
1724 			return ret;
1725 		}
1726 	}
1727 
1728 	ret = register_pm_notifier(&rapl_pm_notifier);
1729 	if (ret && rapl_msr_platdev) {
1730 		platform_device_del(rapl_msr_platdev);
1731 		platform_device_put(rapl_msr_platdev);
1732 	}
1733 
1734 	return ret;
1735 }
1736 
/* Module exit: undo rapl_init() - drop the platform device, then the PM notifier. */
static void __exit rapl_exit(void)
{
	platform_device_unregister(rapl_msr_platdev);
	unregister_pm_notifier(&rapl_pm_notifier);
}
1742 
/* rapl_init() is hooked up at fs_initcall level rather than via module_init(). */
fs_initcall(rapl_init);
module_exit(rapl_exit);

MODULE_DESCRIPTION("Intel Runtime Average Power Limit (RAPL) common code");
MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@intel.com>");
MODULE_LICENSE("GPL v2");
1749