xref: /openbmc/linux/drivers/powercap/intel_rapl_common.c (revision 3381df0954199458fa3993db72fb427f0ed1e43b)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Common code for Intel Running Average Power Limit (RAPL) support.
4  * Copyright (c) 2019, Intel Corporation.
5  */
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7 
8 #include <linux/kernel.h>
9 #include <linux/module.h>
10 #include <linux/list.h>
11 #include <linux/types.h>
12 #include <linux/device.h>
13 #include <linux/slab.h>
14 #include <linux/log2.h>
15 #include <linux/bitmap.h>
16 #include <linux/delay.h>
17 #include <linux/sysfs.h>
18 #include <linux/cpu.h>
19 #include <linux/powercap.h>
20 #include <linux/suspend.h>
21 #include <linux/intel_rapl.h>
22 #include <linux/processor.h>
23 #include <linux/platform_device.h>
24 
25 #include <asm/iosf_mbi.h>
26 #include <asm/cpu_device_id.h>
27 #include <asm/intel-family.h>
28 
/* Local defines */
#define MSR_PLATFORM_POWER_LIMIT	0x0000065C

/*
 * bitmasks for RAPL MSRs, used by primitive access functions
 *
 * Each mask is paired with a shift in the rpi[] primitive table; a field is
 * extracted from the register value with that mask and shift.
 */

/* energy status register: counter in bits 31:0 */
#define ENERGY_STATUS_MASK      0xffffffff

/* power limit register, limit #1: value in bits 14:0, enable 15, clamp 16 */
#define POWER_LIMIT1_MASK       0x7FFF
#define POWER_LIMIT1_ENABLE     BIT(15)
#define POWER_LIMIT1_CLAMP      BIT(16)

/*
 * power limit register, limit #2: value in bits 46:32, enable 47, clamp 48.
 * The lock bit is bit 63 for domains with two limits (see the FW_LOCK
 * handling in rapl_read_data_raw()) and bit 31 otherwise.
 */
#define POWER_LIMIT2_MASK       (0x7FFFULL<<32)
#define POWER_LIMIT2_ENABLE     BIT_ULL(47)
#define POWER_LIMIT2_CLAMP      BIT_ULL(48)
#define POWER_HIGH_LOCK         BIT_ULL(63)
#define POWER_LOW_LOCK          BIT(31)

/* 7-bit time window fields for limit #1 (bits 23:17) and #2 (bits 55:49) */
#define TIME_WINDOW1_MASK       (0x7FULL<<17)
#define TIME_WINDOW2_MASK       (0x7FULL<<49)

/* unit register: power unit exponent in bits 3:0 */
#define POWER_UNIT_OFFSET	0
#define POWER_UNIT_MASK		0x0F

/* unit register: energy unit exponent in bits 12:8 */
#define ENERGY_UNIT_OFFSET	0x08
#define ENERGY_UNIT_MASK	0x1F00

/* unit register: time unit exponent in bits 19:16 */
#define TIME_UNIT_OFFSET	0x10
#define TIME_UNIT_MASK		0xF0000

/*
 * power info register: thermal spec power in bits 14:0, minimum power in
 * bits 30:16, maximum power in bits 46:32, maximum time window in bits 53:48
 */
#define POWER_INFO_MAX_MASK     (0x7fffULL<<32)
#define POWER_INFO_MIN_MASK     (0x7fffULL<<16)
#define POWER_INFO_MAX_TIME_WIN_MASK     (0x3fULL<<48)
#define POWER_INFO_THERMAL_SPEC_MASK     0x7fff

/* perf status register (throttled time) and policy register (priority) */
#define PERF_STATUS_THROTTLE_TIME_MASK 0xffffffff
#define PP_POLICY_MASK         0x1F
64 
/* Non HW constants */
/* flag bits stored in struct rapl_primitive_info.flag */
#define RAPL_PRIMITIVE_DERIVED       BIT(1)	/* not from raw data */
/* primitives carrying this flag are rejected by rapl_read_data_raw() */
#define RAPL_PRIMITIVE_DUMMY         BIT(2)

#define TIME_WINDOW_MAX_MSEC 40000
#define TIME_WINDOW_MIN_MSEC 250
#define ENERGY_UNIT_SCALE    1000	/* scale from driver unit to powercap unit */
/*
 * Kind of unit conversion applied to a primitive by rapl_unit_xlate().
 * ARBITRARY_UNIT must stay 0: rapl_update_domain_data() passes the unit
 * value as the boolean "translate" argument of rapl_read_data_raw().
 */
enum unit_type {
	ARBITRARY_UNIT,		/* no translation */
	POWER_UNIT,		/* scaled by the package power unit */
	ENERGY_UNIT,		/* scaled by the domain or package energy unit */
	TIME_UNIT,		/* converted by the compute_time_window() hook */
};
78 
/* per domain data, some are optional */
/*
 * Number of leading entries in the primitive table that are read from
 * registers; used as the loop bound in rapl_update_domain_data().
 */
#define NR_RAW_PRIMITIVES (NR_RAPL_PRIMITIVES - 2)

/* bits of struct rapl_domain.state */
#define	DOMAIN_STATE_INACTIVE           BIT(0)
#define	DOMAIN_STATE_POWER_LIMIT_SET    BIT(1)
/* when set, the set_* zone/constraint callbacks refuse with -EACCES */
#define DOMAIN_STATE_BIOS_LOCKED        BIT(2)
85 
/* constraint names assigned to rpl[0] and rpl[1] in rapl_init_domains() */
static const char pl1_name[] = "long_term";
static const char pl2_name[] = "short_term";

/* map an embedded powercap zone back to the rapl_domain that contains it */
#define power_zone_to_rapl_domain(_zone) \
	container_of(_zone, struct rapl_domain, power_zone)
91 
/*
 * Per CPU model behavior, selected through the rapl_ids[] match table.
 */
struct rapl_defaults {
	/* IOSF MBI register used by set_floor_freq_atom(); 0 means none */
	u8 floor_freq_reg_addr;
	/* reads the unit register and fills in the package's units */
	int (*check_unit)(struct rapl_package *rp, int cpu);
	/* optional; set_domain_enable() skips the call when NULL */
	void (*set_floor_freq)(struct rapl_domain *rd, bool mode);
	/* converts a time window between raw encoding and time units */
	u64 (*compute_time_window)(struct rapl_package *rp, u64 val,
				    bool to_raw);
	/* when non-zero, overrides the energy unit of the DRAM domain */
	unsigned int dram_domain_energy_unit;
};
/* defaults in effect for the running CPU */
static struct rapl_defaults *rapl_defaults;
101 
/* Sideband MBI registers */
/* floor frequency control register used by rapl_defaults_byt */
#define IOSF_CPU_POWER_BUDGET_CTL_BYT (0x2)
/* floor frequency control register used by rapl_defaults_tng */
#define IOSF_CPU_POWER_BUDGET_CTL_TNG (0xdf)

/* set in rp->power_limit_irq once the PLN interrupt enable bit was saved */
#define PACKAGE_PLN_INT_SAVED   BIT(0)
#define MAX_PRIM_NAME (32)
108 
/* per domain data. used to describe individual knobs such that access function
 * can be consolidated into one instead of many inline functions.
 */
struct rapl_primitive_info {
	const char *name;	/* NULL marks an invalid entry */
	u64 mask;		/* bits of the field inside the register */
	int shift;		/* position of the field's lowest bit */
	enum rapl_domain_reg_id id;	/* index into rd->regs[] */
	enum unit_type unit;	/* conversion applied by rapl_unit_xlate() */
	u32 flag;		/* RAPL_PRIMITIVE_* bits */
};
120 
/*
 * Initializer for one struct rapl_primitive_info entry:
 * p = primitive (stringified into .name), m = mask, s = shift,
 * i = register id, u = unit type, f = flags.
 */
#define PRIMITIVE_INFO_INIT(p, m, s, i, u, f) {	\
		.name = #p,			\
		.mask = m,			\
		.shift = s,			\
		.id = i,			\
		.unit = u,			\
		.flag = f			\
	}
129 
/* forward declarations for helpers used before their definition */
static void rapl_init_domains(struct rapl_package *rp);
static int rapl_read_data_raw(struct rapl_domain *rd,
			      enum rapl_primitives prim,
			      bool xlate, u64 *data);
static int rapl_write_data_raw(struct rapl_domain *rd,
			       enum rapl_primitives prim,
			       unsigned long long value);
static u64 rapl_unit_xlate(struct rapl_domain *rd,
			   enum unit_type type, u64 value, int to_raw);
static void package_power_limit_irq_save(struct rapl_package *rp);

static LIST_HEAD(rapl_packages);	/* guarded by CPU hotplug lock */
142 
/*
 * Domain names, indexed by enum rapl_domain_type (rapl_init_domains()
 * assigns rd->name = rapl_domain_names[type]).
 */
static const char *const rapl_domain_names[] = {
	"package",
	"core",
	"uncore",
	"dram",
	"psys",
};
150 
151 static int get_energy_counter(struct powercap_zone *power_zone,
152 			      u64 *energy_raw)
153 {
154 	struct rapl_domain *rd;
155 	u64 energy_now;
156 
157 	/* prevent CPU hotplug, make sure the RAPL domain does not go
158 	 * away while reading the counter.
159 	 */
160 	get_online_cpus();
161 	rd = power_zone_to_rapl_domain(power_zone);
162 
163 	if (!rapl_read_data_raw(rd, ENERGY_COUNTER, true, &energy_now)) {
164 		*energy_raw = energy_now;
165 		put_online_cpus();
166 
167 		return 0;
168 	}
169 	put_online_cpus();
170 
171 	return -EIO;
172 }
173 
174 static int get_max_energy_counter(struct powercap_zone *pcd_dev, u64 *energy)
175 {
176 	struct rapl_domain *rd = power_zone_to_rapl_domain(pcd_dev);
177 
178 	*energy = rapl_unit_xlate(rd, ENERGY_UNIT, ENERGY_STATUS_MASK, 0);
179 	return 0;
180 }
181 
182 static int release_zone(struct powercap_zone *power_zone)
183 {
184 	struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
185 	struct rapl_package *rp = rd->rp;
186 
187 	/* package zone is the last zone of a package, we can free
188 	 * memory here since all children has been unregistered.
189 	 */
190 	if (rd->id == RAPL_DOMAIN_PACKAGE) {
191 		kfree(rd);
192 		rp->domains = NULL;
193 	}
194 
195 	return 0;
196 
197 }
198 
199 static int find_nr_power_limit(struct rapl_domain *rd)
200 {
201 	int i, nr_pl = 0;
202 
203 	for (i = 0; i < NR_POWER_LIMITS; i++) {
204 		if (rd->rpl[i].name)
205 			nr_pl++;
206 	}
207 
208 	return nr_pl;
209 }
210 
211 static int set_domain_enable(struct powercap_zone *power_zone, bool mode)
212 {
213 	struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
214 
215 	if (rd->state & DOMAIN_STATE_BIOS_LOCKED)
216 		return -EACCES;
217 
218 	get_online_cpus();
219 	rapl_write_data_raw(rd, PL1_ENABLE, mode);
220 	if (rapl_defaults->set_floor_freq)
221 		rapl_defaults->set_floor_freq(rd, mode);
222 	put_online_cpus();
223 
224 	return 0;
225 }
226 
227 static int get_domain_enable(struct powercap_zone *power_zone, bool *mode)
228 {
229 	struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
230 	u64 val;
231 
232 	if (rd->state & DOMAIN_STATE_BIOS_LOCKED) {
233 		*mode = false;
234 		return 0;
235 	}
236 	get_online_cpus();
237 	if (rapl_read_data_raw(rd, PL1_ENABLE, true, &val)) {
238 		put_online_cpus();
239 		return -EIO;
240 	}
241 	*mode = val;
242 	put_online_cpus();
243 
244 	return 0;
245 }
246 
/*
 * per RAPL domain ops, in the order of rapl_domain_type
 *
 * The table is indexed with rd->id when a zone is registered. All entries
 * currently use the same set of callbacks.
 */
static const struct powercap_zone_ops zone_ops[] = {
	/* RAPL_DOMAIN_PACKAGE */
	{
	 .get_energy_uj = get_energy_counter,
	 .get_max_energy_range_uj = get_max_energy_counter,
	 .release = release_zone,
	 .set_enable = set_domain_enable,
	 .get_enable = get_domain_enable,
	 },
	/* RAPL_DOMAIN_PP0 */
	{
	 .get_energy_uj = get_energy_counter,
	 .get_max_energy_range_uj = get_max_energy_counter,
	 .release = release_zone,
	 .set_enable = set_domain_enable,
	 .get_enable = get_domain_enable,
	 },
	/* RAPL_DOMAIN_PP1 */
	{
	 .get_energy_uj = get_energy_counter,
	 .get_max_energy_range_uj = get_max_energy_counter,
	 .release = release_zone,
	 .set_enable = set_domain_enable,
	 .get_enable = get_domain_enable,
	 },
	/* RAPL_DOMAIN_DRAM */
	{
	 .get_energy_uj = get_energy_counter,
	 .get_max_energy_range_uj = get_max_energy_counter,
	 .release = release_zone,
	 .set_enable = set_domain_enable,
	 .get_enable = get_domain_enable,
	 },
	/* RAPL_DOMAIN_PLATFORM */
	{
	 .get_energy_uj = get_energy_counter,
	 .get_max_energy_range_uj = get_max_energy_counter,
	 .release = release_zone,
	 .set_enable = set_domain_enable,
	 .get_enable = get_domain_enable,
	 },
};
290 
291 /*
292  * Constraint index used by powercap can be different than power limit (PL)
293  * index in that some  PLs maybe missing due to non-existent MSRs. So we
294  * need to convert here by finding the valid PLs only (name populated).
295  */
296 static int contraint_to_pl(struct rapl_domain *rd, int cid)
297 {
298 	int i, j;
299 
300 	for (i = 0, j = 0; i < NR_POWER_LIMITS; i++) {
301 		if ((rd->rpl[i].name) && j++ == cid) {
302 			pr_debug("%s: index %d\n", __func__, i);
303 			return i;
304 		}
305 	}
306 	pr_err("Cannot find matching power limit for constraint %d\n", cid);
307 
308 	return -EINVAL;
309 }
310 
311 static int set_power_limit(struct powercap_zone *power_zone, int cid,
312 			   u64 power_limit)
313 {
314 	struct rapl_domain *rd;
315 	struct rapl_package *rp;
316 	int ret = 0;
317 	int id;
318 
319 	get_online_cpus();
320 	rd = power_zone_to_rapl_domain(power_zone);
321 	id = contraint_to_pl(rd, cid);
322 	if (id < 0) {
323 		ret = id;
324 		goto set_exit;
325 	}
326 
327 	rp = rd->rp;
328 
329 	if (rd->state & DOMAIN_STATE_BIOS_LOCKED) {
330 		dev_warn(&power_zone->dev,
331 			 "%s locked by BIOS, monitoring only\n", rd->name);
332 		ret = -EACCES;
333 		goto set_exit;
334 	}
335 
336 	switch (rd->rpl[id].prim_id) {
337 	case PL1_ENABLE:
338 		rapl_write_data_raw(rd, POWER_LIMIT1, power_limit);
339 		break;
340 	case PL2_ENABLE:
341 		rapl_write_data_raw(rd, POWER_LIMIT2, power_limit);
342 		break;
343 	default:
344 		ret = -EINVAL;
345 	}
346 	if (!ret)
347 		package_power_limit_irq_save(rp);
348 set_exit:
349 	put_online_cpus();
350 	return ret;
351 }
352 
353 static int get_current_power_limit(struct powercap_zone *power_zone, int cid,
354 				   u64 *data)
355 {
356 	struct rapl_domain *rd;
357 	u64 val;
358 	int prim;
359 	int ret = 0;
360 	int id;
361 
362 	get_online_cpus();
363 	rd = power_zone_to_rapl_domain(power_zone);
364 	id = contraint_to_pl(rd, cid);
365 	if (id < 0) {
366 		ret = id;
367 		goto get_exit;
368 	}
369 
370 	switch (rd->rpl[id].prim_id) {
371 	case PL1_ENABLE:
372 		prim = POWER_LIMIT1;
373 		break;
374 	case PL2_ENABLE:
375 		prim = POWER_LIMIT2;
376 		break;
377 	default:
378 		put_online_cpus();
379 		return -EINVAL;
380 	}
381 	if (rapl_read_data_raw(rd, prim, true, &val))
382 		ret = -EIO;
383 	else
384 		*data = val;
385 
386 get_exit:
387 	put_online_cpus();
388 
389 	return ret;
390 }
391 
392 static int set_time_window(struct powercap_zone *power_zone, int cid,
393 			   u64 window)
394 {
395 	struct rapl_domain *rd;
396 	int ret = 0;
397 	int id;
398 
399 	get_online_cpus();
400 	rd = power_zone_to_rapl_domain(power_zone);
401 	id = contraint_to_pl(rd, cid);
402 	if (id < 0) {
403 		ret = id;
404 		goto set_time_exit;
405 	}
406 
407 	switch (rd->rpl[id].prim_id) {
408 	case PL1_ENABLE:
409 		rapl_write_data_raw(rd, TIME_WINDOW1, window);
410 		break;
411 	case PL2_ENABLE:
412 		rapl_write_data_raw(rd, TIME_WINDOW2, window);
413 		break;
414 	default:
415 		ret = -EINVAL;
416 	}
417 
418 set_time_exit:
419 	put_online_cpus();
420 	return ret;
421 }
422 
423 static int get_time_window(struct powercap_zone *power_zone, int cid,
424 			   u64 *data)
425 {
426 	struct rapl_domain *rd;
427 	u64 val;
428 	int ret = 0;
429 	int id;
430 
431 	get_online_cpus();
432 	rd = power_zone_to_rapl_domain(power_zone);
433 	id = contraint_to_pl(rd, cid);
434 	if (id < 0) {
435 		ret = id;
436 		goto get_time_exit;
437 	}
438 
439 	switch (rd->rpl[id].prim_id) {
440 	case PL1_ENABLE:
441 		ret = rapl_read_data_raw(rd, TIME_WINDOW1, true, &val);
442 		break;
443 	case PL2_ENABLE:
444 		ret = rapl_read_data_raw(rd, TIME_WINDOW2, true, &val);
445 		break;
446 	default:
447 		put_online_cpus();
448 		return -EINVAL;
449 	}
450 	if (!ret)
451 		*data = val;
452 
453 get_time_exit:
454 	put_online_cpus();
455 
456 	return ret;
457 }
458 
459 static const char *get_constraint_name(struct powercap_zone *power_zone,
460 				       int cid)
461 {
462 	struct rapl_domain *rd;
463 	int id;
464 
465 	rd = power_zone_to_rapl_domain(power_zone);
466 	id = contraint_to_pl(rd, cid);
467 	if (id >= 0)
468 		return rd->rpl[id].name;
469 
470 	return NULL;
471 }
472 
473 static int get_max_power(struct powercap_zone *power_zone, int id, u64 *data)
474 {
475 	struct rapl_domain *rd;
476 	u64 val;
477 	int prim;
478 	int ret = 0;
479 
480 	get_online_cpus();
481 	rd = power_zone_to_rapl_domain(power_zone);
482 	switch (rd->rpl[id].prim_id) {
483 	case PL1_ENABLE:
484 		prim = THERMAL_SPEC_POWER;
485 		break;
486 	case PL2_ENABLE:
487 		prim = MAX_POWER;
488 		break;
489 	default:
490 		put_online_cpus();
491 		return -EINVAL;
492 	}
493 	if (rapl_read_data_raw(rd, prim, true, &val))
494 		ret = -EIO;
495 	else
496 		*data = val;
497 
498 	put_online_cpus();
499 
500 	return ret;
501 }
502 
/* constraint (power limit) callbacks shared by all RAPL zones */
static const struct powercap_zone_constraint_ops constraint_ops = {
	.set_power_limit_uw = set_power_limit,
	.get_power_limit_uw = get_current_power_limit,
	.set_time_window_us = set_time_window,
	.get_time_window_us = get_time_window,
	.get_max_power_uw = get_max_power,
	.get_name = get_constraint_name,
};
511 
512 /* called after domain detection and package level data are set */
513 static void rapl_init_domains(struct rapl_package *rp)
514 {
515 	enum rapl_domain_type i;
516 	enum rapl_domain_reg_id j;
517 	struct rapl_domain *rd = rp->domains;
518 
519 	for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
520 		unsigned int mask = rp->domain_map & (1 << i);
521 
522 		if (!mask)
523 			continue;
524 
525 		rd->rp = rp;
526 		rd->name = rapl_domain_names[i];
527 		rd->id = i;
528 		rd->rpl[0].prim_id = PL1_ENABLE;
529 		rd->rpl[0].name = pl1_name;
530 		/* some domain may support two power limits */
531 		if (rp->priv->limits[i] == 2) {
532 			rd->rpl[1].prim_id = PL2_ENABLE;
533 			rd->rpl[1].name = pl2_name;
534 		}
535 
536 		for (j = 0; j < RAPL_DOMAIN_REG_MAX; j++)
537 			rd->regs[j] = rp->priv->regs[i][j];
538 
539 		if (i == RAPL_DOMAIN_DRAM) {
540 			rd->domain_energy_unit =
541 			    rapl_defaults->dram_domain_energy_unit;
542 			if (rd->domain_energy_unit)
543 				pr_info("DRAM domain energy unit %dpj\n",
544 					rd->domain_energy_unit);
545 		}
546 		rd++;
547 	}
548 }
549 
550 static u64 rapl_unit_xlate(struct rapl_domain *rd, enum unit_type type,
551 			   u64 value, int to_raw)
552 {
553 	u64 units = 1;
554 	struct rapl_package *rp = rd->rp;
555 	u64 scale = 1;
556 
557 	switch (type) {
558 	case POWER_UNIT:
559 		units = rp->power_unit;
560 		break;
561 	case ENERGY_UNIT:
562 		scale = ENERGY_UNIT_SCALE;
563 		/* per domain unit takes precedence */
564 		if (rd->domain_energy_unit)
565 			units = rd->domain_energy_unit;
566 		else
567 			units = rp->energy_unit;
568 		break;
569 	case TIME_UNIT:
570 		return rapl_defaults->compute_time_window(rp, value, to_raw);
571 	case ARBITRARY_UNIT:
572 	default:
573 		return value;
574 	};
575 
576 	if (to_raw)
577 		return div64_u64(value, units) * scale;
578 
579 	value *= units;
580 
581 	return div64_u64(value, scale);
582 }
583 
/*
 * Primitive description table, in the order of enum rapl_primitives (it is
 * indexed directly with the primitive id by the read/write helpers).
 */
static struct rapl_primitive_info rpi[] = {
	/* name, mask, shift, register id, unit type, flags */
	PRIMITIVE_INFO_INIT(ENERGY_COUNTER, ENERGY_STATUS_MASK, 0,
			    RAPL_DOMAIN_REG_STATUS, ENERGY_UNIT, 0),
	PRIMITIVE_INFO_INIT(POWER_LIMIT1, POWER_LIMIT1_MASK, 0,
			    RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
	PRIMITIVE_INFO_INIT(POWER_LIMIT2, POWER_LIMIT2_MASK, 32,
			    RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
	PRIMITIVE_INFO_INIT(FW_LOCK, POWER_LOW_LOCK, 31,
			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
	PRIMITIVE_INFO_INIT(PL1_ENABLE, POWER_LIMIT1_ENABLE, 15,
			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
	PRIMITIVE_INFO_INIT(PL1_CLAMP, POWER_LIMIT1_CLAMP, 16,
			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
	PRIMITIVE_INFO_INIT(PL2_ENABLE, POWER_LIMIT2_ENABLE, 47,
			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
	PRIMITIVE_INFO_INIT(PL2_CLAMP, POWER_LIMIT2_CLAMP, 48,
			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
	PRIMITIVE_INFO_INIT(TIME_WINDOW1, TIME_WINDOW1_MASK, 17,
			    RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0),
	PRIMITIVE_INFO_INIT(TIME_WINDOW2, TIME_WINDOW2_MASK, 49,
			    RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0),
	PRIMITIVE_INFO_INIT(THERMAL_SPEC_POWER, POWER_INFO_THERMAL_SPEC_MASK,
			    0, RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
	PRIMITIVE_INFO_INIT(MAX_POWER, POWER_INFO_MAX_MASK, 32,
			    RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
	PRIMITIVE_INFO_INIT(MIN_POWER, POWER_INFO_MIN_MASK, 16,
			    RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
	PRIMITIVE_INFO_INIT(MAX_TIME_WINDOW, POWER_INFO_MAX_TIME_WIN_MASK, 48,
			    RAPL_DOMAIN_REG_INFO, TIME_UNIT, 0),
	PRIMITIVE_INFO_INIT(THROTTLED_TIME, PERF_STATUS_THROTTLE_TIME_MASK, 0,
			    RAPL_DOMAIN_REG_PERF, TIME_UNIT, 0),
	PRIMITIVE_INFO_INIT(PRIORITY_LEVEL, PP_POLICY_MASK, 0,
			    RAPL_DOMAIN_REG_POLICY, ARBITRARY_UNIT, 0),
	/* non-hardware */
	PRIMITIVE_INFO_INIT(AVERAGE_POWER, 0, 0, 0, POWER_UNIT,
			    RAPL_PRIMITIVE_DERIVED),
	/* terminator: a NULL name makes rapl_read_data_raw() return -EINVAL */
	{NULL, 0, 0, 0},
};
624 
625 /* Read primitive data based on its related struct rapl_primitive_info.
626  * if xlate flag is set, return translated data based on data units, i.e.
627  * time, energy, and power.
628  * RAPL MSRs are non-architectual and are laid out not consistently across
629  * domains. Here we use primitive info to allow writing consolidated access
630  * functions.
631  * For a given primitive, it is processed by MSR mask and shift. Unit conversion
632  * is pre-assigned based on RAPL unit MSRs read at init time.
633  * 63-------------------------- 31--------------------------- 0
634  * |                           xxxxx (mask)                   |
635  * |                                |<- shift ----------------|
636  * 63-------------------------- 31--------------------------- 0
637  */
638 static int rapl_read_data_raw(struct rapl_domain *rd,
639 			      enum rapl_primitives prim, bool xlate, u64 *data)
640 {
641 	u64 value;
642 	struct rapl_primitive_info *rp = &rpi[prim];
643 	struct reg_action ra;
644 	int cpu;
645 
646 	if (!rp->name || rp->flag & RAPL_PRIMITIVE_DUMMY)
647 		return -EINVAL;
648 
649 	ra.reg = rd->regs[rp->id];
650 	if (!ra.reg)
651 		return -EINVAL;
652 
653 	cpu = rd->rp->lead_cpu;
654 
655 	/* domain with 2 limits has different bit */
656 	if (prim == FW_LOCK && rd->rp->priv->limits[rd->id] == 2) {
657 		rp->mask = POWER_HIGH_LOCK;
658 		rp->shift = 63;
659 	}
660 	/* non-hardware data are collected by the polling thread */
661 	if (rp->flag & RAPL_PRIMITIVE_DERIVED) {
662 		*data = rd->rdd.primitives[prim];
663 		return 0;
664 	}
665 
666 	ra.mask = rp->mask;
667 
668 	if (rd->rp->priv->read_raw(cpu, &ra)) {
669 		pr_debug("failed to read reg 0x%llx on cpu %d\n", ra.reg, cpu);
670 		return -EIO;
671 	}
672 
673 	value = ra.value >> rp->shift;
674 
675 	if (xlate)
676 		*data = rapl_unit_xlate(rd, rp->unit, value, 0);
677 	else
678 		*data = value;
679 
680 	return 0;
681 }
682 
683 /* Similar use of primitive info in the read counterpart */
684 static int rapl_write_data_raw(struct rapl_domain *rd,
685 			       enum rapl_primitives prim,
686 			       unsigned long long value)
687 {
688 	struct rapl_primitive_info *rp = &rpi[prim];
689 	int cpu;
690 	u64 bits;
691 	struct reg_action ra;
692 	int ret;
693 
694 	cpu = rd->rp->lead_cpu;
695 	bits = rapl_unit_xlate(rd, rp->unit, value, 1);
696 	bits <<= rp->shift;
697 	bits &= rp->mask;
698 
699 	memset(&ra, 0, sizeof(ra));
700 
701 	ra.reg = rd->regs[rp->id];
702 	ra.mask = rp->mask;
703 	ra.value = bits;
704 
705 	ret = rd->rp->priv->write_raw(cpu, &ra);
706 
707 	return ret;
708 }
709 
710 /*
711  * Raw RAPL data stored in MSRs are in certain scales. We need to
712  * convert them into standard units based on the units reported in
713  * the RAPL unit MSRs. This is specific to CPUs as the method to
714  * calculate units differ on different CPUs.
715  * We convert the units to below format based on CPUs.
716  * i.e.
717  * energy unit: picoJoules  : Represented in picoJoules by default
718  * power unit : microWatts  : Represented in milliWatts by default
719  * time unit  : microseconds: Represented in seconds by default
720  */
721 static int rapl_check_unit_core(struct rapl_package *rp, int cpu)
722 {
723 	struct reg_action ra;
724 	u32 value;
725 
726 	ra.reg = rp->priv->reg_unit;
727 	ra.mask = ~0;
728 	if (rp->priv->read_raw(cpu, &ra)) {
729 		pr_err("Failed to read power unit REG 0x%llx on CPU %d, exit.\n",
730 		       rp->priv->reg_unit, cpu);
731 		return -ENODEV;
732 	}
733 
734 	value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET;
735 	rp->energy_unit = ENERGY_UNIT_SCALE * 1000000 / (1 << value);
736 
737 	value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET;
738 	rp->power_unit = 1000000 / (1 << value);
739 
740 	value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET;
741 	rp->time_unit = 1000000 / (1 << value);
742 
743 	pr_debug("Core CPU %s energy=%dpJ, time=%dus, power=%duW\n",
744 		 rp->name, rp->energy_unit, rp->time_unit, rp->power_unit);
745 
746 	return 0;
747 }
748 
749 static int rapl_check_unit_atom(struct rapl_package *rp, int cpu)
750 {
751 	struct reg_action ra;
752 	u32 value;
753 
754 	ra.reg = rp->priv->reg_unit;
755 	ra.mask = ~0;
756 	if (rp->priv->read_raw(cpu, &ra)) {
757 		pr_err("Failed to read power unit REG 0x%llx on CPU %d, exit.\n",
758 		       rp->priv->reg_unit, cpu);
759 		return -ENODEV;
760 	}
761 
762 	value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET;
763 	rp->energy_unit = ENERGY_UNIT_SCALE * 1 << value;
764 
765 	value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET;
766 	rp->power_unit = (1 << value) * 1000;
767 
768 	value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET;
769 	rp->time_unit = 1000000 / (1 << value);
770 
771 	pr_debug("Atom %s energy=%dpJ, time=%dus, power=%duW\n",
772 		 rp->name, rp->energy_unit, rp->time_unit, rp->power_unit);
773 
774 	return 0;
775 }
776 
777 static void power_limit_irq_save_cpu(void *info)
778 {
779 	u32 l, h = 0;
780 	struct rapl_package *rp = (struct rapl_package *)info;
781 
782 	/* save the state of PLN irq mask bit before disabling it */
783 	rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h);
784 	if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED)) {
785 		rp->power_limit_irq = l & PACKAGE_THERM_INT_PLN_ENABLE;
786 		rp->power_limit_irq |= PACKAGE_PLN_INT_SAVED;
787 	}
788 	l &= ~PACKAGE_THERM_INT_PLN_ENABLE;
789 	wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
790 }
791 
792 /* REVISIT:
793  * When package power limit is set artificially low by RAPL, LVT
794  * thermal interrupt for package power limit should be ignored
795  * since we are not really exceeding the real limit. The intention
796  * is to avoid excessive interrupts while we are trying to save power.
797  * A useful feature might be routing the package_power_limit interrupt
798  * to userspace via eventfd. once we have a usecase, this is simple
799  * to do by adding an atomic notifier.
800  */
801 
802 static void package_power_limit_irq_save(struct rapl_package *rp)
803 {
804 	if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN))
805 		return;
806 
807 	smp_call_function_single(rp->lead_cpu, power_limit_irq_save_cpu, rp, 1);
808 }
809 
810 /*
811  * Restore per package power limit interrupt enable state. Called from cpu
812  * hotplug code on package removal.
813  */
814 static void package_power_limit_irq_restore(struct rapl_package *rp)
815 {
816 	u32 l, h;
817 
818 	if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN))
819 		return;
820 
821 	/* irq enable state not saved, nothing to restore */
822 	if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED))
823 		return;
824 
825 	rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h);
826 
827 	if (rp->power_limit_irq & PACKAGE_THERM_INT_PLN_ENABLE)
828 		l |= PACKAGE_THERM_INT_PLN_ENABLE;
829 	else
830 		l &= ~PACKAGE_THERM_INT_PLN_ENABLE;
831 
832 	wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
833 }
834 
835 static void set_floor_freq_default(struct rapl_domain *rd, bool mode)
836 {
837 	int nr_powerlimit = find_nr_power_limit(rd);
838 
839 	/* always enable clamp such that p-state can go below OS requested
840 	 * range. power capping priority over guranteed frequency.
841 	 */
842 	rapl_write_data_raw(rd, PL1_CLAMP, mode);
843 
844 	/* some domains have pl2 */
845 	if (nr_powerlimit > 1) {
846 		rapl_write_data_raw(rd, PL2_ENABLE, mode);
847 		rapl_write_data_raw(rd, PL2_CLAMP, mode);
848 	}
849 }
850 
851 static void set_floor_freq_atom(struct rapl_domain *rd, bool enable)
852 {
853 	static u32 power_ctrl_orig_val;
854 	u32 mdata;
855 
856 	if (!rapl_defaults->floor_freq_reg_addr) {
857 		pr_err("Invalid floor frequency config register\n");
858 		return;
859 	}
860 
861 	if (!power_ctrl_orig_val)
862 		iosf_mbi_read(BT_MBI_UNIT_PMC, MBI_CR_READ,
863 			      rapl_defaults->floor_freq_reg_addr,
864 			      &power_ctrl_orig_val);
865 	mdata = power_ctrl_orig_val;
866 	if (enable) {
867 		mdata &= ~(0x7f << 8);
868 		mdata |= 1 << 8;
869 	}
870 	iosf_mbi_write(BT_MBI_UNIT_PMC, MBI_CR_WRITE,
871 		       rapl_defaults->floor_freq_reg_addr, mdata);
872 }
873 
874 static u64 rapl_compute_time_window_core(struct rapl_package *rp, u64 value,
875 					 bool to_raw)
876 {
877 	u64 f, y;		/* fraction and exp. used for time unit */
878 
879 	/*
880 	 * Special processing based on 2^Y*(1+F/4), refer
881 	 * to Intel Software Developer's manual Vol.3B: CH 14.9.3.
882 	 */
883 	if (!to_raw) {
884 		f = (value & 0x60) >> 5;
885 		y = value & 0x1f;
886 		value = (1 << y) * (4 + f) * rp->time_unit / 4;
887 	} else {
888 		do_div(value, rp->time_unit);
889 		y = ilog2(value);
890 		f = div64_u64(4 * (value - (1 << y)), 1 << y);
891 		value = (y & 0x1f) | ((f & 0x3) << 5);
892 	}
893 	return value;
894 }
895 
896 static u64 rapl_compute_time_window_atom(struct rapl_package *rp, u64 value,
897 					 bool to_raw)
898 {
899 	/*
900 	 * Atom time unit encoding is straight forward val * time_unit,
901 	 * where time_unit is default to 1 sec. Never 0.
902 	 */
903 	if (!to_raw)
904 		return (value) ? value *= rp->time_unit : rp->time_unit;
905 
906 	value = div64_u64(value, rp->time_unit);
907 
908 	return value;
909 }
910 
/*
 * Defaults for most models in rapl_ids[]: Core style unit decoding and time
 * window encoding, clamp based floor frequency control, no per domain DRAM
 * energy unit.
 */
static const struct rapl_defaults rapl_defaults_core = {
	.floor_freq_reg_addr = 0,
	.check_unit = rapl_check_unit_core,
	.set_floor_freq = set_floor_freq_default,
	.compute_time_window = rapl_compute_time_window_core,
};
917 
/*
 * Same as rapl_defaults_core, plus a fixed energy unit for the DRAM domain
 * that overrides the one derived from the unit register.
 */
static const struct rapl_defaults rapl_defaults_hsw_server = {
	.check_unit = rapl_check_unit_core,
	.set_floor_freq = set_floor_freq_default,
	.compute_time_window = rapl_compute_time_window_core,
	.dram_domain_energy_unit = 15300,
};
924 
/*
 * Atom defaults matched for ATOM_SILVERMONT in rapl_ids[]: Atom unit
 * decoding, floor frequency controlled through the BYT sideband register.
 */
static const struct rapl_defaults rapl_defaults_byt = {
	.floor_freq_reg_addr = IOSF_CPU_POWER_BUDGET_CTL_BYT,
	.check_unit = rapl_check_unit_atom,
	.set_floor_freq = set_floor_freq_atom,
	.compute_time_window = rapl_compute_time_window_atom,
};
931 
/*
 * Atom defaults matched for ATOM_SILVERMONT_MID in rapl_ids[]: Atom unit
 * decoding, floor frequency controlled through the TNG sideband register.
 */
static const struct rapl_defaults rapl_defaults_tng = {
	.floor_freq_reg_addr = IOSF_CPU_POWER_BUDGET_CTL_TNG,
	.check_unit = rapl_check_unit_atom,
	.set_floor_freq = set_floor_freq_atom,
	.compute_time_window = rapl_compute_time_window_atom,
};
938 
/*
 * Atom defaults matched for ATOM_AIRMONT_MID in rapl_ids[]: Atom unit
 * decoding, no floor frequency hook.
 */
static const struct rapl_defaults rapl_defaults_ann = {
	.floor_freq_reg_addr = 0,
	.check_unit = rapl_check_unit_atom,
	.set_floor_freq = NULL,
	.compute_time_window = rapl_compute_time_window_atom,
};
945 
/*
 * Atom defaults matched for ATOM_AIRMONT in rapl_ids[]: Atom unit decoding,
 * no floor frequency hook.
 */
static const struct rapl_defaults rapl_defaults_cht = {
	.floor_freq_reg_addr = 0,
	.check_unit = rapl_check_unit_atom,
	.set_floor_freq = NULL,
	.compute_time_window = rapl_compute_time_window_atom,
};
952 
/*
 * Supported CPU models. The second macro argument of each entry is the
 * struct rapl_defaults to use for that model. The empty entry terminates
 * the table.
 */
static const struct x86_cpu_id rapl_ids[] __initconst = {
	X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X,	&rapl_defaults_core),

	X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X,		&rapl_defaults_core),

	X86_MATCH_INTEL_FAM6_MODEL(HASWELL,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_L,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_G,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X,		&rapl_defaults_hsw_server),

	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_G,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X,		&rapl_defaults_hsw_server),

	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_L,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X,		&rapl_defaults_hsw_server),
	X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(CANNONLAKE_L,	&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_NNPI,	&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X,		&rapl_defaults_hsw_server),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D,		&rapl_defaults_hsw_server),
	X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L,		&rapl_defaults_core),

	X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT,	&rapl_defaults_byt),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT,	&rapl_defaults_cht),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT_MID,	&rapl_defaults_tng),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT_MID,	&rapl_defaults_ann),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT,	&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS,	&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_D,	&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D,	&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L,	&rapl_defaults_core),

	X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL,	&rapl_defaults_hsw_server),
	X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM,	&rapl_defaults_hsw_server),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, rapl_ids);
1000 
1001 /* Read once for all raw primitive data for domains */
1002 static void rapl_update_domain_data(struct rapl_package *rp)
1003 {
1004 	int dmn, prim;
1005 	u64 val;
1006 
1007 	for (dmn = 0; dmn < rp->nr_domains; dmn++) {
1008 		pr_debug("update %s domain %s data\n", rp->name,
1009 			 rp->domains[dmn].name);
1010 		/* exclude non-raw primitives */
1011 		for (prim = 0; prim < NR_RAW_PRIMITIVES; prim++) {
1012 			if (!rapl_read_data_raw(&rp->domains[dmn], prim,
1013 						rpi[prim].unit, &val))
1014 				rp->domains[dmn].rdd.primitives[prim] = val;
1015 		}
1016 	}
1017 
1018 }
1019 
1020 static int rapl_package_register_powercap(struct rapl_package *rp)
1021 {
1022 	struct rapl_domain *rd;
1023 	struct powercap_zone *power_zone = NULL;
1024 	int nr_pl, ret;
1025 
1026 	/* Update the domain data of the new package */
1027 	rapl_update_domain_data(rp);
1028 
1029 	/* first we register package domain as the parent zone */
1030 	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
1031 		if (rd->id == RAPL_DOMAIN_PACKAGE) {
1032 			nr_pl = find_nr_power_limit(rd);
1033 			pr_debug("register package domain %s\n", rp->name);
1034 			power_zone = powercap_register_zone(&rd->power_zone,
1035 					    rp->priv->control_type, rp->name,
1036 					    NULL, &zone_ops[rd->id], nr_pl,
1037 					    &constraint_ops);
1038 			if (IS_ERR(power_zone)) {
1039 				pr_debug("failed to register power zone %s\n",
1040 					 rp->name);
1041 				return PTR_ERR(power_zone);
1042 			}
1043 			/* track parent zone in per package/socket data */
1044 			rp->power_zone = power_zone;
1045 			/* done, only one package domain per socket */
1046 			break;
1047 		}
1048 	}
1049 	if (!power_zone) {
1050 		pr_err("no package domain found, unknown topology!\n");
1051 		return -ENODEV;
1052 	}
1053 	/* now register domains as children of the socket/package */
1054 	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
1055 		if (rd->id == RAPL_DOMAIN_PACKAGE)
1056 			continue;
1057 		/* number of power limits per domain varies */
1058 		nr_pl = find_nr_power_limit(rd);
1059 		power_zone = powercap_register_zone(&rd->power_zone,
1060 						    rp->priv->control_type,
1061 						    rd->name, rp->power_zone,
1062 						    &zone_ops[rd->id], nr_pl,
1063 						    &constraint_ops);
1064 
1065 		if (IS_ERR(power_zone)) {
1066 			pr_debug("failed to register power_zone, %s:%s\n",
1067 				 rp->name, rd->name);
1068 			ret = PTR_ERR(power_zone);
1069 			goto err_cleanup;
1070 		}
1071 	}
1072 	return 0;
1073 
1074 err_cleanup:
1075 	/*
1076 	 * Clean up previously initialized domains within the package if we
1077 	 * failed after the first domain setup.
1078 	 */
1079 	while (--rd >= rp->domains) {
1080 		pr_debug("unregister %s domain %s\n", rp->name, rd->name);
1081 		powercap_unregister_zone(rp->priv->control_type,
1082 					 &rd->power_zone);
1083 	}
1084 
1085 	return ret;
1086 }
1087 
1088 int rapl_add_platform_domain(struct rapl_if_priv *priv)
1089 {
1090 	struct rapl_domain *rd;
1091 	struct powercap_zone *power_zone;
1092 	struct reg_action ra;
1093 	int ret;
1094 
1095 	ra.reg = priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_STATUS];
1096 	ra.mask = ~0;
1097 	ret = priv->read_raw(0, &ra);
1098 	if (ret || !ra.value)
1099 		return -ENODEV;
1100 
1101 	ra.reg = priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_LIMIT];
1102 	ra.mask = ~0;
1103 	ret = priv->read_raw(0, &ra);
1104 	if (ret || !ra.value)
1105 		return -ENODEV;
1106 
1107 	rd = kzalloc(sizeof(*rd), GFP_KERNEL);
1108 	if (!rd)
1109 		return -ENOMEM;
1110 
1111 	rd->name = rapl_domain_names[RAPL_DOMAIN_PLATFORM];
1112 	rd->id = RAPL_DOMAIN_PLATFORM;
1113 	rd->regs[RAPL_DOMAIN_REG_LIMIT] =
1114 	    priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_LIMIT];
1115 	rd->regs[RAPL_DOMAIN_REG_STATUS] =
1116 	    priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_STATUS];
1117 	rd->rpl[0].prim_id = PL1_ENABLE;
1118 	rd->rpl[0].name = pl1_name;
1119 	rd->rpl[1].prim_id = PL2_ENABLE;
1120 	rd->rpl[1].name = pl2_name;
1121 	rd->rp = rapl_find_package_domain(0, priv);
1122 
1123 	power_zone = powercap_register_zone(&rd->power_zone, priv->control_type,
1124 					    "psys", NULL,
1125 					    &zone_ops[RAPL_DOMAIN_PLATFORM],
1126 					    2, &constraint_ops);
1127 
1128 	if (IS_ERR(power_zone)) {
1129 		kfree(rd);
1130 		return PTR_ERR(power_zone);
1131 	}
1132 
1133 	priv->platform_rapl_domain = rd;
1134 
1135 	return 0;
1136 }
1137 EXPORT_SYMBOL_GPL(rapl_add_platform_domain);
1138 
1139 void rapl_remove_platform_domain(struct rapl_if_priv *priv)
1140 {
1141 	if (priv->platform_rapl_domain) {
1142 		powercap_unregister_zone(priv->control_type,
1143 				 &priv->platform_rapl_domain->power_zone);
1144 		kfree(priv->platform_rapl_domain);
1145 	}
1146 }
1147 EXPORT_SYMBOL_GPL(rapl_remove_platform_domain);
1148 
1149 static int rapl_check_domain(int cpu, int domain, struct rapl_package *rp)
1150 {
1151 	struct reg_action ra;
1152 
1153 	switch (domain) {
1154 	case RAPL_DOMAIN_PACKAGE:
1155 	case RAPL_DOMAIN_PP0:
1156 	case RAPL_DOMAIN_PP1:
1157 	case RAPL_DOMAIN_DRAM:
1158 		ra.reg = rp->priv->regs[domain][RAPL_DOMAIN_REG_STATUS];
1159 		break;
1160 	case RAPL_DOMAIN_PLATFORM:
1161 		/* PSYS(PLATFORM) is not a CPU domain, so avoid printng error */
1162 		return -EINVAL;
1163 	default:
1164 		pr_err("invalid domain id %d\n", domain);
1165 		return -EINVAL;
1166 	}
1167 	/* make sure domain counters are available and contains non-zero
1168 	 * values, otherwise skip it.
1169 	 */
1170 
1171 	ra.mask = ~0;
1172 	if (rp->priv->read_raw(cpu, &ra) || !ra.value)
1173 		return -ENODEV;
1174 
1175 	return 0;
1176 }
1177 
1178 /*
1179  * Check if power limits are available. Two cases when they are not available:
1180  * 1. Locked by BIOS, in this case we still provide read-only access so that
1181  *    users can see what limit is set by the BIOS.
1182  * 2. Some CPUs make some domains monitoring only which means PLx MSRs may not
1183  *    exist at all. In this case, we do not show the constraints in powercap.
1184  *
1185  * Called after domains are detected and initialized.
1186  */
1187 static void rapl_detect_powerlimit(struct rapl_domain *rd)
1188 {
1189 	u64 val64;
1190 	int i;
1191 
1192 	/* check if the domain is locked by BIOS, ignore if MSR doesn't exist */
1193 	if (!rapl_read_data_raw(rd, FW_LOCK, false, &val64)) {
1194 		if (val64) {
1195 			pr_info("RAPL %s domain %s locked by BIOS\n",
1196 				rd->rp->name, rd->name);
1197 			rd->state |= DOMAIN_STATE_BIOS_LOCKED;
1198 		}
1199 	}
1200 	/* check if power limit MSR exists, otherwise domain is monitoring only */
1201 	for (i = 0; i < NR_POWER_LIMITS; i++) {
1202 		int prim = rd->rpl[i].prim_id;
1203 
1204 		if (rapl_read_data_raw(rd, prim, false, &val64))
1205 			rd->rpl[i].name = NULL;
1206 	}
1207 }
1208 
1209 /* Detect active and valid domains for the given CPU, caller must
1210  * ensure the CPU belongs to the targeted package and CPU hotlug is disabled.
1211  */
1212 static int rapl_detect_domains(struct rapl_package *rp, int cpu)
1213 {
1214 	struct rapl_domain *rd;
1215 	int i;
1216 
1217 	for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
1218 		/* use physical package id to read counters */
1219 		if (!rapl_check_domain(cpu, i, rp)) {
1220 			rp->domain_map |= 1 << i;
1221 			pr_info("Found RAPL domain %s\n", rapl_domain_names[i]);
1222 		}
1223 	}
1224 	rp->nr_domains = bitmap_weight(&rp->domain_map, RAPL_DOMAIN_MAX);
1225 	if (!rp->nr_domains) {
1226 		pr_debug("no valid rapl domains found in %s\n", rp->name);
1227 		return -ENODEV;
1228 	}
1229 	pr_debug("found %d domains on %s\n", rp->nr_domains, rp->name);
1230 
1231 	rp->domains = kcalloc(rp->nr_domains + 1, sizeof(struct rapl_domain),
1232 			      GFP_KERNEL);
1233 	if (!rp->domains)
1234 		return -ENOMEM;
1235 
1236 	rapl_init_domains(rp);
1237 
1238 	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++)
1239 		rapl_detect_powerlimit(rd);
1240 
1241 	return 0;
1242 }
1243 
1244 /* called from CPU hotplug notifier, hotplug lock held */
1245 void rapl_remove_package(struct rapl_package *rp)
1246 {
1247 	struct rapl_domain *rd, *rd_package = NULL;
1248 
1249 	package_power_limit_irq_restore(rp);
1250 
1251 	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
1252 		rapl_write_data_raw(rd, PL1_ENABLE, 0);
1253 		rapl_write_data_raw(rd, PL1_CLAMP, 0);
1254 		if (find_nr_power_limit(rd) > 1) {
1255 			rapl_write_data_raw(rd, PL2_ENABLE, 0);
1256 			rapl_write_data_raw(rd, PL2_CLAMP, 0);
1257 		}
1258 		if (rd->id == RAPL_DOMAIN_PACKAGE) {
1259 			rd_package = rd;
1260 			continue;
1261 		}
1262 		pr_debug("remove package, undo power limit on %s: %s\n",
1263 			 rp->name, rd->name);
1264 		powercap_unregister_zone(rp->priv->control_type,
1265 					 &rd->power_zone);
1266 	}
1267 	/* do parent zone last */
1268 	powercap_unregister_zone(rp->priv->control_type,
1269 				 &rd_package->power_zone);
1270 	list_del(&rp->plist);
1271 	kfree(rp);
1272 }
1273 EXPORT_SYMBOL_GPL(rapl_remove_package);
1274 
1275 /* caller to ensure CPU hotplug lock is held */
1276 struct rapl_package *rapl_find_package_domain(int cpu, struct rapl_if_priv *priv)
1277 {
1278 	int id = topology_logical_die_id(cpu);
1279 	struct rapl_package *rp;
1280 
1281 	list_for_each_entry(rp, &rapl_packages, plist) {
1282 		if (rp->id == id
1283 		    && rp->priv->control_type == priv->control_type)
1284 			return rp;
1285 	}
1286 
1287 	return NULL;
1288 }
1289 EXPORT_SYMBOL_GPL(rapl_find_package_domain);
1290 
1291 /* called from CPU hotplug notifier, hotplug lock held */
1292 struct rapl_package *rapl_add_package(int cpu, struct rapl_if_priv *priv)
1293 {
1294 	int id = topology_logical_die_id(cpu);
1295 	struct rapl_package *rp;
1296 	struct cpuinfo_x86 *c = &cpu_data(cpu);
1297 	int ret;
1298 
1299 	if (!rapl_defaults)
1300 		return ERR_PTR(-ENODEV);
1301 
1302 	rp = kzalloc(sizeof(struct rapl_package), GFP_KERNEL);
1303 	if (!rp)
1304 		return ERR_PTR(-ENOMEM);
1305 
1306 	/* add the new package to the list */
1307 	rp->id = id;
1308 	rp->lead_cpu = cpu;
1309 	rp->priv = priv;
1310 
1311 	if (topology_max_die_per_package() > 1)
1312 		snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH,
1313 			 "package-%d-die-%d", c->phys_proc_id, c->cpu_die_id);
1314 	else
1315 		snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH, "package-%d",
1316 			 c->phys_proc_id);
1317 
1318 	/* check if the package contains valid domains */
1319 	if (rapl_detect_domains(rp, cpu) || rapl_defaults->check_unit(rp, cpu)) {
1320 		ret = -ENODEV;
1321 		goto err_free_package;
1322 	}
1323 	ret = rapl_package_register_powercap(rp);
1324 	if (!ret) {
1325 		INIT_LIST_HEAD(&rp->plist);
1326 		list_add(&rp->plist, &rapl_packages);
1327 		return rp;
1328 	}
1329 
1330 err_free_package:
1331 	kfree(rp->domains);
1332 	kfree(rp);
1333 	return ERR_PTR(ret);
1334 }
1335 EXPORT_SYMBOL_GPL(rapl_add_package);
1336 
1337 static void power_limit_state_save(void)
1338 {
1339 	struct rapl_package *rp;
1340 	struct rapl_domain *rd;
1341 	int nr_pl, ret, i;
1342 
1343 	get_online_cpus();
1344 	list_for_each_entry(rp, &rapl_packages, plist) {
1345 		if (!rp->power_zone)
1346 			continue;
1347 		rd = power_zone_to_rapl_domain(rp->power_zone);
1348 		nr_pl = find_nr_power_limit(rd);
1349 		for (i = 0; i < nr_pl; i++) {
1350 			switch (rd->rpl[i].prim_id) {
1351 			case PL1_ENABLE:
1352 				ret = rapl_read_data_raw(rd,
1353 						 POWER_LIMIT1, true,
1354 						 &rd->rpl[i].last_power_limit);
1355 				if (ret)
1356 					rd->rpl[i].last_power_limit = 0;
1357 				break;
1358 			case PL2_ENABLE:
1359 				ret = rapl_read_data_raw(rd,
1360 						 POWER_LIMIT2, true,
1361 						 &rd->rpl[i].last_power_limit);
1362 				if (ret)
1363 					rd->rpl[i].last_power_limit = 0;
1364 				break;
1365 			}
1366 		}
1367 	}
1368 	put_online_cpus();
1369 }
1370 
1371 static void power_limit_state_restore(void)
1372 {
1373 	struct rapl_package *rp;
1374 	struct rapl_domain *rd;
1375 	int nr_pl, i;
1376 
1377 	get_online_cpus();
1378 	list_for_each_entry(rp, &rapl_packages, plist) {
1379 		if (!rp->power_zone)
1380 			continue;
1381 		rd = power_zone_to_rapl_domain(rp->power_zone);
1382 		nr_pl = find_nr_power_limit(rd);
1383 		for (i = 0; i < nr_pl; i++) {
1384 			switch (rd->rpl[i].prim_id) {
1385 			case PL1_ENABLE:
1386 				if (rd->rpl[i].last_power_limit)
1387 					rapl_write_data_raw(rd, POWER_LIMIT1,
1388 					    rd->rpl[i].last_power_limit);
1389 				break;
1390 			case PL2_ENABLE:
1391 				if (rd->rpl[i].last_power_limit)
1392 					rapl_write_data_raw(rd, POWER_LIMIT2,
1393 					    rd->rpl[i].last_power_limit);
1394 				break;
1395 			}
1396 		}
1397 	}
1398 	put_online_cpus();
1399 }
1400 
1401 static int rapl_pm_callback(struct notifier_block *nb,
1402 			    unsigned long mode, void *_unused)
1403 {
1404 	switch (mode) {
1405 	case PM_SUSPEND_PREPARE:
1406 		power_limit_state_save();
1407 		break;
1408 	case PM_POST_SUSPEND:
1409 		power_limit_state_restore();
1410 		break;
1411 	}
1412 	return NOTIFY_OK;
1413 }
1414 
1415 static struct notifier_block rapl_pm_notifier = {
1416 	.notifier_call = rapl_pm_callback,
1417 };
1418 
1419 static struct platform_device *rapl_msr_platdev;
1420 
1421 static int __init rapl_init(void)
1422 {
1423 	const struct x86_cpu_id *id;
1424 	int ret;
1425 
1426 	id = x86_match_cpu(rapl_ids);
1427 	if (!id) {
1428 		pr_err("driver does not support CPU family %d model %d\n",
1429 		       boot_cpu_data.x86, boot_cpu_data.x86_model);
1430 
1431 		return -ENODEV;
1432 	}
1433 
1434 	rapl_defaults = (struct rapl_defaults *)id->driver_data;
1435 
1436 	ret = register_pm_notifier(&rapl_pm_notifier);
1437 	if (ret)
1438 		return ret;
1439 
1440 	rapl_msr_platdev = platform_device_alloc("intel_rapl_msr", 0);
1441 	if (!rapl_msr_platdev) {
1442 		ret = -ENOMEM;
1443 		goto end;
1444 	}
1445 
1446 	ret = platform_device_add(rapl_msr_platdev);
1447 	if (ret)
1448 		platform_device_put(rapl_msr_platdev);
1449 
1450 end:
1451 	if (ret)
1452 		unregister_pm_notifier(&rapl_pm_notifier);
1453 
1454 	return ret;
1455 }
1456 
1457 static void __exit rapl_exit(void)
1458 {
1459 	platform_device_unregister(rapl_msr_platdev);
1460 	unregister_pm_notifier(&rapl_pm_notifier);
1461 }
1462 
1463 fs_initcall(rapl_init);
1464 module_exit(rapl_exit);
1465 
1466 MODULE_DESCRIPTION("Intel Runtime Average Power Limit (RAPL) common code");
1467 MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@intel.com>");
1468 MODULE_LICENSE("GPL v2");
1469