1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Common code for Intel Running Average Power Limit (RAPL) support.
4  * Copyright (c) 2019, Intel Corporation.
5  */
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7 
8 #include <linux/kernel.h>
9 #include <linux/module.h>
10 #include <linux/list.h>
11 #include <linux/types.h>
12 #include <linux/device.h>
13 #include <linux/slab.h>
14 #include <linux/log2.h>
15 #include <linux/bitmap.h>
16 #include <linux/delay.h>
17 #include <linux/sysfs.h>
18 #include <linux/cpu.h>
19 #include <linux/powercap.h>
20 #include <linux/suspend.h>
21 #include <linux/intel_rapl.h>
22 #include <linux/processor.h>
23 #include <linux/platform_device.h>
24 
25 #include <asm/iosf_mbi.h>
26 #include <asm/cpu_device_id.h>
27 #include <asm/intel-family.h>
28 
/* bitmasks for RAPL MSRs, used by primitive access functions */
/* energy counter: low 32 bits of the status register */
#define ENERGY_STATUS_MASK      0xffffffff

/* power limit #1: value in bits 14:0, enable at bit 15, clamp at bit 16 */
#define POWER_LIMIT1_MASK       0x7FFF
#define POWER_LIMIT1_ENABLE     BIT(15)
#define POWER_LIMIT1_CLAMP      BIT(16)

/* power limit #2: same three fields, located 32 bits higher */
#define POWER_LIMIT2_MASK       (0x7FFFULL<<32)
#define POWER_LIMIT2_ENABLE     BIT_ULL(47)
#define POWER_LIMIT2_CLAMP      BIT_ULL(48)
/*
 * Lock bits of the limit register. rapl_read_data_raw() uses the high lock
 * for domains that have two power limits and the low lock otherwise.
 */
#define POWER_HIGH_LOCK         BIT_ULL(63)
#define POWER_LOW_LOCK          BIT(31)

/* 7-bit time window fields, one per power limit */
#define TIME_WINDOW1_MASK       (0x7FULL<<17)
#define TIME_WINDOW2_MASK       (0x7FULL<<49)

/*
 * Fields of the unit register. Each *_OFFSET is the right shift that goes
 * with the matching *_MASK (see rapl_check_unit_core()/_atom()).
 */
#define POWER_UNIT_OFFSET	0
#define POWER_UNIT_MASK		0x0F

#define ENERGY_UNIT_OFFSET	0x08
#define ENERGY_UNIT_MASK	0x1F00

#define TIME_UNIT_OFFSET	0x10
#define TIME_UNIT_MASK		0xF0000

/* fields of the power info register */
#define POWER_INFO_MAX_MASK     (0x7fffULL<<32)
#define POWER_INFO_MIN_MASK     (0x7fffULL<<16)
#define POWER_INFO_MAX_TIME_WIN_MASK     (0x3fULL<<48)
#define POWER_INFO_THERMAL_SPEC_MASK     0x7fff

/* throttled time (perf status register) and priority level (policy register) */
#define PERF_STATUS_THROTTLE_TIME_MASK 0xffffffff
#define PP_POLICY_MASK         0x1F

/* Non HW constants */
/* values for rapl_primitive_info.flag */
#define RAPL_PRIMITIVE_DERIVED       BIT(1)	/* not from raw data */
/* rapl_read_data_raw() rejects primitives carrying this flag with -EINVAL */
#define RAPL_PRIMITIVE_DUMMY         BIT(2)

#define TIME_WINDOW_MAX_MSEC 40000
#define TIME_WINDOW_MIN_MSEC 250
#define ENERGY_UNIT_SCALE    1000	/* scale from driver unit to powercap unit */
/* Selects the conversion that rapl_unit_xlate() applies to a value */
enum unit_type {
	ARBITRARY_UNIT,		/* no translation */
	POWER_UNIT,		/* scaled by the package power unit */
	ENERGY_UNIT,		/* scaled by the domain or package energy unit */
	TIME_UNIT,		/* handed to the compute_time_window callback */
};
75 
/*
 * Upper bound of the primitive loop in rapl_update_domain_data(), which
 * skips the non-raw primitives.
 */
#define NR_RAW_PRIMITIVES (NR_RAPL_PRIMITIVES - 2)

/* bits of rapl_domain.state */
#define	DOMAIN_STATE_INACTIVE           BIT(0)
#define	DOMAIN_STATE_POWER_LIMIT_SET    BIT(1)
/* when set, the enable and power limit setters return -EACCES */
#define DOMAIN_STATE_BIOS_LOCKED        BIT(2)

/* constraint names assigned to rpl[0] and rpl[1] by rapl_init_domains() */
static const char pl1_name[] = "long_term";
static const char pl2_name[] = "short_term";

/* map an embedded powercap zone back to the rapl_domain that contains it */
#define power_zone_to_rapl_domain(_zone) \
	container_of(_zone, struct rapl_domain, power_zone)
88 
/*
 * Per CPU model behaviour, selected through the rapl_ids[] table.
 *
 * @floor_freq_reg_addr: register address used by set_floor_freq_atom() for
 *	its IOSF MBI accesses; that function treats 0 as invalid.
 * @check_unit: reads the unit register on @cpu and fills in the package's
 *	energy, power and time units; returns 0 or a negative errno.
 * @set_floor_freq: optional (may be NULL); called by set_domain_enable()
 *	with the requested enable state.
 * @compute_time_window: converts a time window value; @to_raw selects the
 *	direction (see rapl_unit_xlate()).
 * @dram_domain_energy_unit: when non-zero, rapl_init_domains() copies it to
 *	the DRAM domain, where it takes precedence over the package energy
 *	unit.
 */
struct rapl_defaults {
	u8 floor_freq_reg_addr;
	int (*check_unit)(struct rapl_package *rp, int cpu);
	void (*set_floor_freq)(struct rapl_domain *rd, bool mode);
	u64 (*compute_time_window)(struct rapl_package *rp, u64 val,
				    bool to_raw);
	unsigned int dram_domain_energy_unit;
};
/* defaults in effect; the assignment is not in this part of the file */
static struct rapl_defaults *rapl_defaults;
98 
/*
 * Sideband MBI registers, used as floor_freq_reg_addr by rapl_defaults_byt
 * and rapl_defaults_tng respectively.
 */
#define IOSF_CPU_POWER_BUDGET_CTL_BYT (0x2)
#define IOSF_CPU_POWER_BUDGET_CTL_TNG (0xdf)

/* set in rapl_package.power_limit_irq once the PLN enable bit was recorded */
#define PACKAGE_PLN_INT_SAVED   BIT(0)
#define MAX_PRIM_NAME (32)
105 
/* per domain data. used to describe individual knobs such that access function
 * can be consolidated into one instead of many inline functions.
 *
 * @name:  stringified primitive name; a NULL name makes rapl_read_data_raw()
 *         return -EINVAL
 * @mask:  bits of the register that belong to the primitive
 * @shift: position of the primitive's lowest bit within the register
 * @id:    index into rapl_domain.regs[] selecting the register
 * @unit:  conversion applied by rapl_unit_xlate()
 * @flag:  RAPL_PRIMITIVE_* flags
 */
struct rapl_primitive_info {
	const char *name;
	u64 mask;
	int shift;
	enum rapl_domain_reg_id id;
	enum unit_type unit;
	u32 flag;
};

/*
 * Initializer for one rapl_primitive_info entry. Arguments, in order:
 * primitive (stringified into .name), mask, shift, register id, unit, flag.
 */
#define PRIMITIVE_INFO_INIT(p, m, s, i, u, f) {	\
		.name = #p,			\
		.mask = m,			\
		.shift = s,			\
		.id = i,			\
		.unit = u,			\
		.flag = f			\
	}
126 
/* forward declarations of helpers that are used before their definition */
static void rapl_init_domains(struct rapl_package *rp);
static int rapl_read_data_raw(struct rapl_domain *rd,
			      enum rapl_primitives prim,
			      bool xlate, u64 *data);
static int rapl_write_data_raw(struct rapl_domain *rd,
			       enum rapl_primitives prim,
			       unsigned long long value);
static u64 rapl_unit_xlate(struct rapl_domain *rd,
			   enum unit_type type, u64 value, int to_raw);
static void package_power_limit_irq_save(struct rapl_package *rp);

static LIST_HEAD(rapl_packages);	/* guarded by CPU hotplug lock */

/* domain names, indexed by enum rapl_domain_type in rapl_init_domains() */
static const char *const rapl_domain_names[] = {
	"package",
	"core",
	"uncore",
	"dram",
	"psys",
};
147 
148 static int get_energy_counter(struct powercap_zone *power_zone,
149 			      u64 *energy_raw)
150 {
151 	struct rapl_domain *rd;
152 	u64 energy_now;
153 
154 	/* prevent CPU hotplug, make sure the RAPL domain does not go
155 	 * away while reading the counter.
156 	 */
157 	get_online_cpus();
158 	rd = power_zone_to_rapl_domain(power_zone);
159 
160 	if (!rapl_read_data_raw(rd, ENERGY_COUNTER, true, &energy_now)) {
161 		*energy_raw = energy_now;
162 		put_online_cpus();
163 
164 		return 0;
165 	}
166 	put_online_cpus();
167 
168 	return -EIO;
169 }
170 
171 static int get_max_energy_counter(struct powercap_zone *pcd_dev, u64 *energy)
172 {
173 	struct rapl_domain *rd = power_zone_to_rapl_domain(pcd_dev);
174 
175 	*energy = rapl_unit_xlate(rd, ENERGY_UNIT, ENERGY_STATUS_MASK, 0);
176 	return 0;
177 }
178 
179 static int release_zone(struct powercap_zone *power_zone)
180 {
181 	struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
182 	struct rapl_package *rp = rd->rp;
183 
184 	/* package zone is the last zone of a package, we can free
185 	 * memory here since all children has been unregistered.
186 	 */
187 	if (rd->id == RAPL_DOMAIN_PACKAGE) {
188 		kfree(rd);
189 		rp->domains = NULL;
190 	}
191 
192 	return 0;
193 
194 }
195 
196 static int find_nr_power_limit(struct rapl_domain *rd)
197 {
198 	int i, nr_pl = 0;
199 
200 	for (i = 0; i < NR_POWER_LIMITS; i++) {
201 		if (rd->rpl[i].name)
202 			nr_pl++;
203 	}
204 
205 	return nr_pl;
206 }
207 
208 static int set_domain_enable(struct powercap_zone *power_zone, bool mode)
209 {
210 	struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
211 
212 	if (rd->state & DOMAIN_STATE_BIOS_LOCKED)
213 		return -EACCES;
214 
215 	get_online_cpus();
216 	rapl_write_data_raw(rd, PL1_ENABLE, mode);
217 	if (rapl_defaults->set_floor_freq)
218 		rapl_defaults->set_floor_freq(rd, mode);
219 	put_online_cpus();
220 
221 	return 0;
222 }
223 
224 static int get_domain_enable(struct powercap_zone *power_zone, bool *mode)
225 {
226 	struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
227 	u64 val;
228 
229 	if (rd->state & DOMAIN_STATE_BIOS_LOCKED) {
230 		*mode = false;
231 		return 0;
232 	}
233 	get_online_cpus();
234 	if (rapl_read_data_raw(rd, PL1_ENABLE, true, &val)) {
235 		put_online_cpus();
236 		return -EIO;
237 	}
238 	*mode = val;
239 	put_online_cpus();
240 
241 	return 0;
242 }
243 
/*
 * per RAPL domain ops, in the order of rapl_domain_type
 *
 * The table is indexed with rd->id when the zones are registered. All five
 * entries currently install the same set of callbacks.
 */
static const struct powercap_zone_ops zone_ops[] = {
	/* RAPL_DOMAIN_PACKAGE */
	{
	 .get_energy_uj = get_energy_counter,
	 .get_max_energy_range_uj = get_max_energy_counter,
	 .release = release_zone,
	 .set_enable = set_domain_enable,
	 .get_enable = get_domain_enable,
	 },
	/* RAPL_DOMAIN_PP0 */
	{
	 .get_energy_uj = get_energy_counter,
	 .get_max_energy_range_uj = get_max_energy_counter,
	 .release = release_zone,
	 .set_enable = set_domain_enable,
	 .get_enable = get_domain_enable,
	 },
	/* RAPL_DOMAIN_PP1 */
	{
	 .get_energy_uj = get_energy_counter,
	 .get_max_energy_range_uj = get_max_energy_counter,
	 .release = release_zone,
	 .set_enable = set_domain_enable,
	 .get_enable = get_domain_enable,
	 },
	/* RAPL_DOMAIN_DRAM */
	{
	 .get_energy_uj = get_energy_counter,
	 .get_max_energy_range_uj = get_max_energy_counter,
	 .release = release_zone,
	 .set_enable = set_domain_enable,
	 .get_enable = get_domain_enable,
	 },
	/* RAPL_DOMAIN_PLATFORM */
	{
	 .get_energy_uj = get_energy_counter,
	 .get_max_energy_range_uj = get_max_energy_counter,
	 .release = release_zone,
	 .set_enable = set_domain_enable,
	 .get_enable = get_domain_enable,
	 },
};
287 
288 /*
289  * Constraint index used by powercap can be different than power limit (PL)
290  * index in that some  PLs maybe missing due to non-existent MSRs. So we
291  * need to convert here by finding the valid PLs only (name populated).
292  */
293 static int contraint_to_pl(struct rapl_domain *rd, int cid)
294 {
295 	int i, j;
296 
297 	for (i = 0, j = 0; i < NR_POWER_LIMITS; i++) {
298 		if ((rd->rpl[i].name) && j++ == cid) {
299 			pr_debug("%s: index %d\n", __func__, i);
300 			return i;
301 		}
302 	}
303 	pr_err("Cannot find matching power limit for constraint %d\n", cid);
304 
305 	return -EINVAL;
306 }
307 
308 static int set_power_limit(struct powercap_zone *power_zone, int cid,
309 			   u64 power_limit)
310 {
311 	struct rapl_domain *rd;
312 	struct rapl_package *rp;
313 	int ret = 0;
314 	int id;
315 
316 	get_online_cpus();
317 	rd = power_zone_to_rapl_domain(power_zone);
318 	id = contraint_to_pl(rd, cid);
319 	if (id < 0) {
320 		ret = id;
321 		goto set_exit;
322 	}
323 
324 	rp = rd->rp;
325 
326 	if (rd->state & DOMAIN_STATE_BIOS_LOCKED) {
327 		dev_warn(&power_zone->dev,
328 			 "%s locked by BIOS, monitoring only\n", rd->name);
329 		ret = -EACCES;
330 		goto set_exit;
331 	}
332 
333 	switch (rd->rpl[id].prim_id) {
334 	case PL1_ENABLE:
335 		rapl_write_data_raw(rd, POWER_LIMIT1, power_limit);
336 		break;
337 	case PL2_ENABLE:
338 		rapl_write_data_raw(rd, POWER_LIMIT2, power_limit);
339 		break;
340 	default:
341 		ret = -EINVAL;
342 	}
343 	if (!ret)
344 		package_power_limit_irq_save(rp);
345 set_exit:
346 	put_online_cpus();
347 	return ret;
348 }
349 
350 static int get_current_power_limit(struct powercap_zone *power_zone, int cid,
351 				   u64 *data)
352 {
353 	struct rapl_domain *rd;
354 	u64 val;
355 	int prim;
356 	int ret = 0;
357 	int id;
358 
359 	get_online_cpus();
360 	rd = power_zone_to_rapl_domain(power_zone);
361 	id = contraint_to_pl(rd, cid);
362 	if (id < 0) {
363 		ret = id;
364 		goto get_exit;
365 	}
366 
367 	switch (rd->rpl[id].prim_id) {
368 	case PL1_ENABLE:
369 		prim = POWER_LIMIT1;
370 		break;
371 	case PL2_ENABLE:
372 		prim = POWER_LIMIT2;
373 		break;
374 	default:
375 		put_online_cpus();
376 		return -EINVAL;
377 	}
378 	if (rapl_read_data_raw(rd, prim, true, &val))
379 		ret = -EIO;
380 	else
381 		*data = val;
382 
383 get_exit:
384 	put_online_cpus();
385 
386 	return ret;
387 }
388 
389 static int set_time_window(struct powercap_zone *power_zone, int cid,
390 			   u64 window)
391 {
392 	struct rapl_domain *rd;
393 	int ret = 0;
394 	int id;
395 
396 	get_online_cpus();
397 	rd = power_zone_to_rapl_domain(power_zone);
398 	id = contraint_to_pl(rd, cid);
399 	if (id < 0) {
400 		ret = id;
401 		goto set_time_exit;
402 	}
403 
404 	switch (rd->rpl[id].prim_id) {
405 	case PL1_ENABLE:
406 		rapl_write_data_raw(rd, TIME_WINDOW1, window);
407 		break;
408 	case PL2_ENABLE:
409 		rapl_write_data_raw(rd, TIME_WINDOW2, window);
410 		break;
411 	default:
412 		ret = -EINVAL;
413 	}
414 
415 set_time_exit:
416 	put_online_cpus();
417 	return ret;
418 }
419 
420 static int get_time_window(struct powercap_zone *power_zone, int cid,
421 			   u64 *data)
422 {
423 	struct rapl_domain *rd;
424 	u64 val;
425 	int ret = 0;
426 	int id;
427 
428 	get_online_cpus();
429 	rd = power_zone_to_rapl_domain(power_zone);
430 	id = contraint_to_pl(rd, cid);
431 	if (id < 0) {
432 		ret = id;
433 		goto get_time_exit;
434 	}
435 
436 	switch (rd->rpl[id].prim_id) {
437 	case PL1_ENABLE:
438 		ret = rapl_read_data_raw(rd, TIME_WINDOW1, true, &val);
439 		break;
440 	case PL2_ENABLE:
441 		ret = rapl_read_data_raw(rd, TIME_WINDOW2, true, &val);
442 		break;
443 	default:
444 		put_online_cpus();
445 		return -EINVAL;
446 	}
447 	if (!ret)
448 		*data = val;
449 
450 get_time_exit:
451 	put_online_cpus();
452 
453 	return ret;
454 }
455 
456 static const char *get_constraint_name(struct powercap_zone *power_zone,
457 				       int cid)
458 {
459 	struct rapl_domain *rd;
460 	int id;
461 
462 	rd = power_zone_to_rapl_domain(power_zone);
463 	id = contraint_to_pl(rd, cid);
464 	if (id >= 0)
465 		return rd->rpl[id].name;
466 
467 	return NULL;
468 }
469 
470 static int get_max_power(struct powercap_zone *power_zone, int id, u64 *data)
471 {
472 	struct rapl_domain *rd;
473 	u64 val;
474 	int prim;
475 	int ret = 0;
476 
477 	get_online_cpus();
478 	rd = power_zone_to_rapl_domain(power_zone);
479 	switch (rd->rpl[id].prim_id) {
480 	case PL1_ENABLE:
481 		prim = THERMAL_SPEC_POWER;
482 		break;
483 	case PL2_ENABLE:
484 		prim = MAX_POWER;
485 		break;
486 	default:
487 		put_online_cpus();
488 		return -EINVAL;
489 	}
490 	if (rapl_read_data_raw(rd, prim, true, &val))
491 		ret = -EIO;
492 	else
493 		*data = val;
494 
495 	put_online_cpus();
496 
497 	return ret;
498 }
499 
/*
 * Constraint callbacks, shared by every zone this file registers. The
 * constraint index they receive is a powercap index, not a power limit index.
 */
static const struct powercap_zone_constraint_ops constraint_ops = {
	.set_power_limit_uw = set_power_limit,
	.get_power_limit_uw = get_current_power_limit,
	.set_time_window_us = set_time_window,
	.get_time_window_us = get_time_window,
	.get_max_power_uw = get_max_power,
	.get_name = get_constraint_name,
};
508 
509 /* called after domain detection and package level data are set */
510 static void rapl_init_domains(struct rapl_package *rp)
511 {
512 	enum rapl_domain_type i;
513 	enum rapl_domain_reg_id j;
514 	struct rapl_domain *rd = rp->domains;
515 
516 	for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
517 		unsigned int mask = rp->domain_map & (1 << i);
518 
519 		if (!mask)
520 			continue;
521 
522 		rd->rp = rp;
523 		rd->name = rapl_domain_names[i];
524 		rd->id = i;
525 		rd->rpl[0].prim_id = PL1_ENABLE;
526 		rd->rpl[0].name = pl1_name;
527 		/* some domain may support two power limits */
528 		if (rp->priv->limits[i] == 2) {
529 			rd->rpl[1].prim_id = PL2_ENABLE;
530 			rd->rpl[1].name = pl2_name;
531 		}
532 
533 		for (j = 0; j < RAPL_DOMAIN_REG_MAX; j++)
534 			rd->regs[j] = rp->priv->regs[i][j];
535 
536 		if (i == RAPL_DOMAIN_DRAM) {
537 			rd->domain_energy_unit =
538 			    rapl_defaults->dram_domain_energy_unit;
539 			if (rd->domain_energy_unit)
540 				pr_info("DRAM domain energy unit %dpj\n",
541 					rd->domain_energy_unit);
542 		}
543 		rd++;
544 	}
545 }
546 
547 static u64 rapl_unit_xlate(struct rapl_domain *rd, enum unit_type type,
548 			   u64 value, int to_raw)
549 {
550 	u64 units = 1;
551 	struct rapl_package *rp = rd->rp;
552 	u64 scale = 1;
553 
554 	switch (type) {
555 	case POWER_UNIT:
556 		units = rp->power_unit;
557 		break;
558 	case ENERGY_UNIT:
559 		scale = ENERGY_UNIT_SCALE;
560 		/* per domain unit takes precedence */
561 		if (rd->domain_energy_unit)
562 			units = rd->domain_energy_unit;
563 		else
564 			units = rp->energy_unit;
565 		break;
566 	case TIME_UNIT:
567 		return rapl_defaults->compute_time_window(rp, value, to_raw);
568 	case ARBITRARY_UNIT:
569 	default:
570 		return value;
571 	};
572 
573 	if (to_raw)
574 		return div64_u64(value, units) * scale;
575 
576 	value *= units;
577 
578 	return div64_u64(value, scale);
579 }
580 
/*
 * in the order of enum rapl_primitives
 *
 * The read and write helpers index this table with the primitive itself.
 * The final entry has a NULL name, which rapl_read_data_raw() rejects with
 * -EINVAL.
 */
static struct rapl_primitive_info rpi[] = {
	/* primitive name, mask, shift, register id, unit type, flag */
	PRIMITIVE_INFO_INIT(ENERGY_COUNTER, ENERGY_STATUS_MASK, 0,
			    RAPL_DOMAIN_REG_STATUS, ENERGY_UNIT, 0),
	PRIMITIVE_INFO_INIT(POWER_LIMIT1, POWER_LIMIT1_MASK, 0,
			    RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
	PRIMITIVE_INFO_INIT(POWER_LIMIT2, POWER_LIMIT2_MASK, 32,
			    RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
	PRIMITIVE_INFO_INIT(FW_LOCK, POWER_LOW_LOCK, 31,
			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
	PRIMITIVE_INFO_INIT(PL1_ENABLE, POWER_LIMIT1_ENABLE, 15,
			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
	PRIMITIVE_INFO_INIT(PL1_CLAMP, POWER_LIMIT1_CLAMP, 16,
			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
	PRIMITIVE_INFO_INIT(PL2_ENABLE, POWER_LIMIT2_ENABLE, 47,
			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
	PRIMITIVE_INFO_INIT(PL2_CLAMP, POWER_LIMIT2_CLAMP, 48,
			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
	PRIMITIVE_INFO_INIT(TIME_WINDOW1, TIME_WINDOW1_MASK, 17,
			    RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0),
	PRIMITIVE_INFO_INIT(TIME_WINDOW2, TIME_WINDOW2_MASK, 49,
			    RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0),
	PRIMITIVE_INFO_INIT(THERMAL_SPEC_POWER, POWER_INFO_THERMAL_SPEC_MASK,
			    0, RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
	PRIMITIVE_INFO_INIT(MAX_POWER, POWER_INFO_MAX_MASK, 32,
			    RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
	PRIMITIVE_INFO_INIT(MIN_POWER, POWER_INFO_MIN_MASK, 16,
			    RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
	PRIMITIVE_INFO_INIT(MAX_TIME_WINDOW, POWER_INFO_MAX_TIME_WIN_MASK, 48,
			    RAPL_DOMAIN_REG_INFO, TIME_UNIT, 0),
	PRIMITIVE_INFO_INIT(THROTTLED_TIME, PERF_STATUS_THROTTLE_TIME_MASK, 0,
			    RAPL_DOMAIN_REG_PERF, TIME_UNIT, 0),
	PRIMITIVE_INFO_INIT(PRIORITY_LEVEL, PP_POLICY_MASK, 0,
			    RAPL_DOMAIN_REG_POLICY, ARBITRARY_UNIT, 0),
	/* non-hardware */
	PRIMITIVE_INFO_INIT(AVERAGE_POWER, 0, 0, 0, POWER_UNIT,
			    RAPL_PRIMITIVE_DERIVED),
	/* terminator: no name */
	{NULL, 0, 0, 0},
};
621 
622 /* Read primitive data based on its related struct rapl_primitive_info.
623  * if xlate flag is set, return translated data based on data units, i.e.
624  * time, energy, and power.
625  * RAPL MSRs are non-architectual and are laid out not consistently across
626  * domains. Here we use primitive info to allow writing consolidated access
627  * functions.
628  * For a given primitive, it is processed by MSR mask and shift. Unit conversion
629  * is pre-assigned based on RAPL unit MSRs read at init time.
630  * 63-------------------------- 31--------------------------- 0
631  * |                           xxxxx (mask)                   |
632  * |                                |<- shift ----------------|
633  * 63-------------------------- 31--------------------------- 0
634  */
635 static int rapl_read_data_raw(struct rapl_domain *rd,
636 			      enum rapl_primitives prim, bool xlate, u64 *data)
637 {
638 	u64 value;
639 	struct rapl_primitive_info *rp = &rpi[prim];
640 	struct reg_action ra;
641 	int cpu;
642 
643 	if (!rp->name || rp->flag & RAPL_PRIMITIVE_DUMMY)
644 		return -EINVAL;
645 
646 	ra.reg = rd->regs[rp->id];
647 	if (!ra.reg)
648 		return -EINVAL;
649 
650 	cpu = rd->rp->lead_cpu;
651 
652 	/* domain with 2 limits has different bit */
653 	if (prim == FW_LOCK && rd->rp->priv->limits[rd->id] == 2) {
654 		rp->mask = POWER_HIGH_LOCK;
655 		rp->shift = 63;
656 	}
657 	/* non-hardware data are collected by the polling thread */
658 	if (rp->flag & RAPL_PRIMITIVE_DERIVED) {
659 		*data = rd->rdd.primitives[prim];
660 		return 0;
661 	}
662 
663 	ra.mask = rp->mask;
664 
665 	if (rd->rp->priv->read_raw(cpu, &ra)) {
666 		pr_debug("failed to read reg 0x%llx on cpu %d\n", ra.reg, cpu);
667 		return -EIO;
668 	}
669 
670 	value = ra.value >> rp->shift;
671 
672 	if (xlate)
673 		*data = rapl_unit_xlate(rd, rp->unit, value, 0);
674 	else
675 		*data = value;
676 
677 	return 0;
678 }
679 
680 /* Similar use of primitive info in the read counterpart */
681 static int rapl_write_data_raw(struct rapl_domain *rd,
682 			       enum rapl_primitives prim,
683 			       unsigned long long value)
684 {
685 	struct rapl_primitive_info *rp = &rpi[prim];
686 	int cpu;
687 	u64 bits;
688 	struct reg_action ra;
689 	int ret;
690 
691 	cpu = rd->rp->lead_cpu;
692 	bits = rapl_unit_xlate(rd, rp->unit, value, 1);
693 	bits <<= rp->shift;
694 	bits &= rp->mask;
695 
696 	memset(&ra, 0, sizeof(ra));
697 
698 	ra.reg = rd->regs[rp->id];
699 	ra.mask = rp->mask;
700 	ra.value = bits;
701 
702 	ret = rd->rp->priv->write_raw(cpu, &ra);
703 
704 	return ret;
705 }
706 
707 /*
708  * Raw RAPL data stored in MSRs are in certain scales. We need to
709  * convert them into standard units based on the units reported in
710  * the RAPL unit MSRs. This is specific to CPUs as the method to
711  * calculate units differ on different CPUs.
712  * We convert the units to below format based on CPUs.
713  * i.e.
714  * energy unit: picoJoules  : Represented in picoJoules by default
715  * power unit : microWatts  : Represented in milliWatts by default
716  * time unit  : microseconds: Represented in seconds by default
717  */
718 static int rapl_check_unit_core(struct rapl_package *rp, int cpu)
719 {
720 	struct reg_action ra;
721 	u32 value;
722 
723 	ra.reg = rp->priv->reg_unit;
724 	ra.mask = ~0;
725 	if (rp->priv->read_raw(cpu, &ra)) {
726 		pr_err("Failed to read power unit REG 0x%llx on CPU %d, exit.\n",
727 		       rp->priv->reg_unit, cpu);
728 		return -ENODEV;
729 	}
730 
731 	value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET;
732 	rp->energy_unit = ENERGY_UNIT_SCALE * 1000000 / (1 << value);
733 
734 	value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET;
735 	rp->power_unit = 1000000 / (1 << value);
736 
737 	value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET;
738 	rp->time_unit = 1000000 / (1 << value);
739 
740 	pr_debug("Core CPU %s energy=%dpJ, time=%dus, power=%duW\n",
741 		 rp->name, rp->energy_unit, rp->time_unit, rp->power_unit);
742 
743 	return 0;
744 }
745 
746 static int rapl_check_unit_atom(struct rapl_package *rp, int cpu)
747 {
748 	struct reg_action ra;
749 	u32 value;
750 
751 	ra.reg = rp->priv->reg_unit;
752 	ra.mask = ~0;
753 	if (rp->priv->read_raw(cpu, &ra)) {
754 		pr_err("Failed to read power unit REG 0x%llx on CPU %d, exit.\n",
755 		       rp->priv->reg_unit, cpu);
756 		return -ENODEV;
757 	}
758 
759 	value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET;
760 	rp->energy_unit = ENERGY_UNIT_SCALE * 1 << value;
761 
762 	value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET;
763 	rp->power_unit = (1 << value) * 1000;
764 
765 	value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET;
766 	rp->time_unit = 1000000 / (1 << value);
767 
768 	pr_debug("Atom %s energy=%dpJ, time=%dus, power=%duW\n",
769 		 rp->name, rp->energy_unit, rp->time_unit, rp->power_unit);
770 
771 	return 0;
772 }
773 
774 static void power_limit_irq_save_cpu(void *info)
775 {
776 	u32 l, h = 0;
777 	struct rapl_package *rp = (struct rapl_package *)info;
778 
779 	/* save the state of PLN irq mask bit before disabling it */
780 	rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h);
781 	if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED)) {
782 		rp->power_limit_irq = l & PACKAGE_THERM_INT_PLN_ENABLE;
783 		rp->power_limit_irq |= PACKAGE_PLN_INT_SAVED;
784 	}
785 	l &= ~PACKAGE_THERM_INT_PLN_ENABLE;
786 	wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
787 }
788 
789 /* REVISIT:
790  * When package power limit is set artificially low by RAPL, LVT
791  * thermal interrupt for package power limit should be ignored
792  * since we are not really exceeding the real limit. The intention
793  * is to avoid excessive interrupts while we are trying to save power.
 * A useful feature might be routing the package_power_limit interrupt
 * to userspace via eventfd. Once we have a use case, this is simple
 * to do by adding an atomic notifier.
797  */
798 
799 static void package_power_limit_irq_save(struct rapl_package *rp)
800 {
801 	if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN))
802 		return;
803 
804 	smp_call_function_single(rp->lead_cpu, power_limit_irq_save_cpu, rp, 1);
805 }
806 
807 /*
808  * Restore per package power limit interrupt enable state. Called from cpu
809  * hotplug code on package removal.
810  */
811 static void package_power_limit_irq_restore(struct rapl_package *rp)
812 {
813 	u32 l, h;
814 
815 	if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN))
816 		return;
817 
818 	/* irq enable state not saved, nothing to restore */
819 	if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED))
820 		return;
821 
822 	rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h);
823 
824 	if (rp->power_limit_irq & PACKAGE_THERM_INT_PLN_ENABLE)
825 		l |= PACKAGE_THERM_INT_PLN_ENABLE;
826 	else
827 		l &= ~PACKAGE_THERM_INT_PLN_ENABLE;
828 
829 	wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
830 }
831 
832 static void set_floor_freq_default(struct rapl_domain *rd, bool mode)
833 {
834 	int nr_powerlimit = find_nr_power_limit(rd);
835 
836 	/* always enable clamp such that p-state can go below OS requested
837 	 * range. power capping priority over guranteed frequency.
838 	 */
839 	rapl_write_data_raw(rd, PL1_CLAMP, mode);
840 
841 	/* some domains have pl2 */
842 	if (nr_powerlimit > 1) {
843 		rapl_write_data_raw(rd, PL2_ENABLE, mode);
844 		rapl_write_data_raw(rd, PL2_CLAMP, mode);
845 	}
846 }
847 
848 static void set_floor_freq_atom(struct rapl_domain *rd, bool enable)
849 {
850 	static u32 power_ctrl_orig_val;
851 	u32 mdata;
852 
853 	if (!rapl_defaults->floor_freq_reg_addr) {
854 		pr_err("Invalid floor frequency config register\n");
855 		return;
856 	}
857 
858 	if (!power_ctrl_orig_val)
859 		iosf_mbi_read(BT_MBI_UNIT_PMC, MBI_CR_READ,
860 			      rapl_defaults->floor_freq_reg_addr,
861 			      &power_ctrl_orig_val);
862 	mdata = power_ctrl_orig_val;
863 	if (enable) {
864 		mdata &= ~(0x7f << 8);
865 		mdata |= 1 << 8;
866 	}
867 	iosf_mbi_write(BT_MBI_UNIT_PMC, MBI_CR_WRITE,
868 		       rapl_defaults->floor_freq_reg_addr, mdata);
869 }
870 
871 static u64 rapl_compute_time_window_core(struct rapl_package *rp, u64 value,
872 					 bool to_raw)
873 {
874 	u64 f, y;		/* fraction and exp. used for time unit */
875 
876 	/*
877 	 * Special processing based on 2^Y*(1+F/4), refer
878 	 * to Intel Software Developer's manual Vol.3B: CH 14.9.3.
879 	 */
880 	if (!to_raw) {
881 		f = (value & 0x60) >> 5;
882 		y = value & 0x1f;
883 		value = (1 << y) * (4 + f) * rp->time_unit / 4;
884 	} else {
885 		do_div(value, rp->time_unit);
886 		y = ilog2(value);
887 		f = div64_u64(4 * (value - (1 << y)), 1 << y);
888 		value = (y & 0x1f) | ((f & 0x3) << 5);
889 	}
890 	return value;
891 }
892 
893 static u64 rapl_compute_time_window_atom(struct rapl_package *rp, u64 value,
894 					 bool to_raw)
895 {
896 	/*
897 	 * Atom time unit encoding is straight forward val * time_unit,
898 	 * where time_unit is default to 1 sec. Never 0.
899 	 */
900 	if (!to_raw)
901 		return (value) ? value *= rp->time_unit : rp->time_unit;
902 
903 	value = div64_u64(value, rp->time_unit);
904 
905 	return value;
906 }
907 
/* Core: core unit and time window handling, default floor frequency hook */
static const struct rapl_defaults rapl_defaults_core = {
	.floor_freq_reg_addr = 0,
	.check_unit = rapl_check_unit_core,
	.set_floor_freq = set_floor_freq_default,
	.compute_time_window = rapl_compute_time_window_core,
};

/* same as core, plus a fixed DRAM domain energy unit */
static const struct rapl_defaults rapl_defaults_hsw_server = {
	.check_unit = rapl_check_unit_core,
	.set_floor_freq = set_floor_freq_default,
	.compute_time_window = rapl_compute_time_window_core,
	.dram_domain_energy_unit = 15300,
};

/* Atom with floor frequency control through the BYT sideband register */
static const struct rapl_defaults rapl_defaults_byt = {
	.floor_freq_reg_addr = IOSF_CPU_POWER_BUDGET_CTL_BYT,
	.check_unit = rapl_check_unit_atom,
	.set_floor_freq = set_floor_freq_atom,
	.compute_time_window = rapl_compute_time_window_atom,
};

/* Atom with floor frequency control through the TNG sideband register */
static const struct rapl_defaults rapl_defaults_tng = {
	.floor_freq_reg_addr = IOSF_CPU_POWER_BUDGET_CTL_TNG,
	.check_unit = rapl_check_unit_atom,
	.set_floor_freq = set_floor_freq_atom,
	.compute_time_window = rapl_compute_time_window_atom,
};

/* Atom without a floor frequency hook */
static const struct rapl_defaults rapl_defaults_ann = {
	.floor_freq_reg_addr = 0,
	.check_unit = rapl_check_unit_atom,
	.set_floor_freq = NULL,
	.compute_time_window = rapl_compute_time_window_atom,
};

/* Atom without a floor frequency hook; same contents as rapl_defaults_ann */
static const struct rapl_defaults rapl_defaults_cht = {
	.floor_freq_reg_addr = 0,
	.check_unit = rapl_check_unit_atom,
	.set_floor_freq = NULL,
	.compute_time_window = rapl_compute_time_window_atom,
};
949 
/*
 * CPU models handled by this driver. The second argument of each entry
 * names the rapl_defaults table to use for that model. The list ends with
 * an empty entry.
 */
static const struct x86_cpu_id rapl_ids[] __initconst = {
	X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X,	&rapl_defaults_core),

	X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X,		&rapl_defaults_core),

	X86_MATCH_INTEL_FAM6_MODEL(HASWELL,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_L,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_G,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X,		&rapl_defaults_hsw_server),

	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_G,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X,		&rapl_defaults_hsw_server),

	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_L,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X,		&rapl_defaults_hsw_server),
	X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(CANNONLAKE_L,	&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_NNPI,	&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X,		&rapl_defaults_hsw_server),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D,		&rapl_defaults_hsw_server),
	X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L,		&rapl_defaults_core),

	X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT,	&rapl_defaults_byt),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT,	&rapl_defaults_cht),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT_MID,	&rapl_defaults_tng),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT_MID,	&rapl_defaults_ann),
	/* the following Atom models use the core defaults */
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT,	&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS,	&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_D,	&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT,	&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D,	&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L,	&rapl_defaults_core),

	X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL,	&rapl_defaults_hsw_server),
	X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM,	&rapl_defaults_hsw_server),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, rapl_ids);
998 
999 /* Read once for all raw primitive data for domains */
1000 static void rapl_update_domain_data(struct rapl_package *rp)
1001 {
1002 	int dmn, prim;
1003 	u64 val;
1004 
1005 	for (dmn = 0; dmn < rp->nr_domains; dmn++) {
1006 		pr_debug("update %s domain %s data\n", rp->name,
1007 			 rp->domains[dmn].name);
1008 		/* exclude non-raw primitives */
1009 		for (prim = 0; prim < NR_RAW_PRIMITIVES; prim++) {
1010 			if (!rapl_read_data_raw(&rp->domains[dmn], prim,
1011 						rpi[prim].unit, &val))
1012 				rp->domains[dmn].rdd.primitives[prim] = val;
1013 		}
1014 	}
1015 
1016 }
1017 
1018 static int rapl_package_register_powercap(struct rapl_package *rp)
1019 {
1020 	struct rapl_domain *rd;
1021 	struct powercap_zone *power_zone = NULL;
1022 	int nr_pl, ret;
1023 
1024 	/* Update the domain data of the new package */
1025 	rapl_update_domain_data(rp);
1026 
1027 	/* first we register package domain as the parent zone */
1028 	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
1029 		if (rd->id == RAPL_DOMAIN_PACKAGE) {
1030 			nr_pl = find_nr_power_limit(rd);
1031 			pr_debug("register package domain %s\n", rp->name);
1032 			power_zone = powercap_register_zone(&rd->power_zone,
1033 					    rp->priv->control_type, rp->name,
1034 					    NULL, &zone_ops[rd->id], nr_pl,
1035 					    &constraint_ops);
1036 			if (IS_ERR(power_zone)) {
1037 				pr_debug("failed to register power zone %s\n",
1038 					 rp->name);
1039 				return PTR_ERR(power_zone);
1040 			}
1041 			/* track parent zone in per package/socket data */
1042 			rp->power_zone = power_zone;
1043 			/* done, only one package domain per socket */
1044 			break;
1045 		}
1046 	}
1047 	if (!power_zone) {
1048 		pr_err("no package domain found, unknown topology!\n");
1049 		return -ENODEV;
1050 	}
1051 	/* now register domains as children of the socket/package */
1052 	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
1053 		if (rd->id == RAPL_DOMAIN_PACKAGE)
1054 			continue;
1055 		/* number of power limits per domain varies */
1056 		nr_pl = find_nr_power_limit(rd);
1057 		power_zone = powercap_register_zone(&rd->power_zone,
1058 						    rp->priv->control_type,
1059 						    rd->name, rp->power_zone,
1060 						    &zone_ops[rd->id], nr_pl,
1061 						    &constraint_ops);
1062 
1063 		if (IS_ERR(power_zone)) {
1064 			pr_debug("failed to register power_zone, %s:%s\n",
1065 				 rp->name, rd->name);
1066 			ret = PTR_ERR(power_zone);
1067 			goto err_cleanup;
1068 		}
1069 	}
1070 	return 0;
1071 
1072 err_cleanup:
1073 	/*
1074 	 * Clean up previously initialized domains within the package if we
1075 	 * failed after the first domain setup.
1076 	 */
1077 	while (--rd >= rp->domains) {
1078 		pr_debug("unregister %s domain %s\n", rp->name, rd->name);
1079 		powercap_unregister_zone(rp->priv->control_type,
1080 					 &rd->power_zone);
1081 	}
1082 
1083 	return ret;
1084 }
1085 
1086 int rapl_add_platform_domain(struct rapl_if_priv *priv)
1087 {
1088 	struct rapl_domain *rd;
1089 	struct powercap_zone *power_zone;
1090 	struct reg_action ra;
1091 	int ret;
1092 
1093 	ra.reg = priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_STATUS];
1094 	ra.mask = ~0;
1095 	ret = priv->read_raw(0, &ra);
1096 	if (ret || !ra.value)
1097 		return -ENODEV;
1098 
1099 	ra.reg = priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_LIMIT];
1100 	ra.mask = ~0;
1101 	ret = priv->read_raw(0, &ra);
1102 	if (ret || !ra.value)
1103 		return -ENODEV;
1104 
1105 	rd = kzalloc(sizeof(*rd), GFP_KERNEL);
1106 	if (!rd)
1107 		return -ENOMEM;
1108 
1109 	rd->name = rapl_domain_names[RAPL_DOMAIN_PLATFORM];
1110 	rd->id = RAPL_DOMAIN_PLATFORM;
1111 	rd->regs[RAPL_DOMAIN_REG_LIMIT] =
1112 	    priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_LIMIT];
1113 	rd->regs[RAPL_DOMAIN_REG_STATUS] =
1114 	    priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_STATUS];
1115 	rd->rpl[0].prim_id = PL1_ENABLE;
1116 	rd->rpl[0].name = pl1_name;
1117 	rd->rpl[1].prim_id = PL2_ENABLE;
1118 	rd->rpl[1].name = pl2_name;
1119 	rd->rp = rapl_find_package_domain(0, priv);
1120 
1121 	power_zone = powercap_register_zone(&rd->power_zone, priv->control_type,
1122 					    "psys", NULL,
1123 					    &zone_ops[RAPL_DOMAIN_PLATFORM],
1124 					    2, &constraint_ops);
1125 
1126 	if (IS_ERR(power_zone)) {
1127 		kfree(rd);
1128 		return PTR_ERR(power_zone);
1129 	}
1130 
1131 	priv->platform_rapl_domain = rd;
1132 
1133 	return 0;
1134 }
1135 EXPORT_SYMBOL_GPL(rapl_add_platform_domain);
1136 
1137 void rapl_remove_platform_domain(struct rapl_if_priv *priv)
1138 {
1139 	if (priv->platform_rapl_domain) {
1140 		powercap_unregister_zone(priv->control_type,
1141 				 &priv->platform_rapl_domain->power_zone);
1142 		kfree(priv->platform_rapl_domain);
1143 	}
1144 }
1145 EXPORT_SYMBOL_GPL(rapl_remove_platform_domain);
1146 
1147 static int rapl_check_domain(int cpu, int domain, struct rapl_package *rp)
1148 {
1149 	struct reg_action ra;
1150 
1151 	switch (domain) {
1152 	case RAPL_DOMAIN_PACKAGE:
1153 	case RAPL_DOMAIN_PP0:
1154 	case RAPL_DOMAIN_PP1:
1155 	case RAPL_DOMAIN_DRAM:
1156 		ra.reg = rp->priv->regs[domain][RAPL_DOMAIN_REG_STATUS];
1157 		break;
1158 	case RAPL_DOMAIN_PLATFORM:
1159 		/* PSYS(PLATFORM) is not a CPU domain, so avoid printng error */
1160 		return -EINVAL;
1161 	default:
1162 		pr_err("invalid domain id %d\n", domain);
1163 		return -EINVAL;
1164 	}
1165 	/* make sure domain counters are available and contains non-zero
1166 	 * values, otherwise skip it.
1167 	 */
1168 
1169 	ra.mask = ~0;
1170 	if (rp->priv->read_raw(cpu, &ra) || !ra.value)
1171 		return -ENODEV;
1172 
1173 	return 0;
1174 }
1175 
1176 /*
1177  * Check if power limits are available. Two cases when they are not available:
1178  * 1. Locked by BIOS, in this case we still provide read-only access so that
1179  *    users can see what limit is set by the BIOS.
1180  * 2. Some CPUs make some domains monitoring only which means PLx MSRs may not
1181  *    exist at all. In this case, we do not show the constraints in powercap.
1182  *
1183  * Called after domains are detected and initialized.
1184  */
1185 static void rapl_detect_powerlimit(struct rapl_domain *rd)
1186 {
1187 	u64 val64;
1188 	int i;
1189 
1190 	/* check if the domain is locked by BIOS, ignore if MSR doesn't exist */
1191 	if (!rapl_read_data_raw(rd, FW_LOCK, false, &val64)) {
1192 		if (val64) {
1193 			pr_info("RAPL %s domain %s locked by BIOS\n",
1194 				rd->rp->name, rd->name);
1195 			rd->state |= DOMAIN_STATE_BIOS_LOCKED;
1196 		}
1197 	}
1198 	/* check if power limit MSR exists, otherwise domain is monitoring only */
1199 	for (i = 0; i < NR_POWER_LIMITS; i++) {
1200 		int prim = rd->rpl[i].prim_id;
1201 
1202 		if (rapl_read_data_raw(rd, prim, false, &val64))
1203 			rd->rpl[i].name = NULL;
1204 	}
1205 }
1206 
1207 /* Detect active and valid domains for the given CPU, caller must
1208  * ensure the CPU belongs to the targeted package and CPU hotlug is disabled.
1209  */
1210 static int rapl_detect_domains(struct rapl_package *rp, int cpu)
1211 {
1212 	struct rapl_domain *rd;
1213 	int i;
1214 
1215 	for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
1216 		/* use physical package id to read counters */
1217 		if (!rapl_check_domain(cpu, i, rp)) {
1218 			rp->domain_map |= 1 << i;
1219 			pr_info("Found RAPL domain %s\n", rapl_domain_names[i]);
1220 		}
1221 	}
1222 	rp->nr_domains = bitmap_weight(&rp->domain_map, RAPL_DOMAIN_MAX);
1223 	if (!rp->nr_domains) {
1224 		pr_debug("no valid rapl domains found in %s\n", rp->name);
1225 		return -ENODEV;
1226 	}
1227 	pr_debug("found %d domains on %s\n", rp->nr_domains, rp->name);
1228 
1229 	rp->domains = kcalloc(rp->nr_domains + 1, sizeof(struct rapl_domain),
1230 			      GFP_KERNEL);
1231 	if (!rp->domains)
1232 		return -ENOMEM;
1233 
1234 	rapl_init_domains(rp);
1235 
1236 	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++)
1237 		rapl_detect_powerlimit(rd);
1238 
1239 	return 0;
1240 }
1241 
1242 /* called from CPU hotplug notifier, hotplug lock held */
1243 void rapl_remove_package(struct rapl_package *rp)
1244 {
1245 	struct rapl_domain *rd, *rd_package = NULL;
1246 
1247 	package_power_limit_irq_restore(rp);
1248 
1249 	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
1250 		rapl_write_data_raw(rd, PL1_ENABLE, 0);
1251 		rapl_write_data_raw(rd, PL1_CLAMP, 0);
1252 		if (find_nr_power_limit(rd) > 1) {
1253 			rapl_write_data_raw(rd, PL2_ENABLE, 0);
1254 			rapl_write_data_raw(rd, PL2_CLAMP, 0);
1255 		}
1256 		if (rd->id == RAPL_DOMAIN_PACKAGE) {
1257 			rd_package = rd;
1258 			continue;
1259 		}
1260 		pr_debug("remove package, undo power limit on %s: %s\n",
1261 			 rp->name, rd->name);
1262 		powercap_unregister_zone(rp->priv->control_type,
1263 					 &rd->power_zone);
1264 	}
1265 	/* do parent zone last */
1266 	powercap_unregister_zone(rp->priv->control_type,
1267 				 &rd_package->power_zone);
1268 	list_del(&rp->plist);
1269 	kfree(rp);
1270 }
1271 EXPORT_SYMBOL_GPL(rapl_remove_package);
1272 
1273 /* caller to ensure CPU hotplug lock is held */
1274 struct rapl_package *rapl_find_package_domain(int cpu, struct rapl_if_priv *priv)
1275 {
1276 	int id = topology_logical_die_id(cpu);
1277 	struct rapl_package *rp;
1278 
1279 	list_for_each_entry(rp, &rapl_packages, plist) {
1280 		if (rp->id == id
1281 		    && rp->priv->control_type == priv->control_type)
1282 			return rp;
1283 	}
1284 
1285 	return NULL;
1286 }
1287 EXPORT_SYMBOL_GPL(rapl_find_package_domain);
1288 
1289 /* called from CPU hotplug notifier, hotplug lock held */
1290 struct rapl_package *rapl_add_package(int cpu, struct rapl_if_priv *priv)
1291 {
1292 	int id = topology_logical_die_id(cpu);
1293 	struct rapl_package *rp;
1294 	struct cpuinfo_x86 *c = &cpu_data(cpu);
1295 	int ret;
1296 
1297 	if (!rapl_defaults)
1298 		return ERR_PTR(-ENODEV);
1299 
1300 	rp = kzalloc(sizeof(struct rapl_package), GFP_KERNEL);
1301 	if (!rp)
1302 		return ERR_PTR(-ENOMEM);
1303 
1304 	/* add the new package to the list */
1305 	rp->id = id;
1306 	rp->lead_cpu = cpu;
1307 	rp->priv = priv;
1308 
1309 	if (topology_max_die_per_package() > 1)
1310 		snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH,
1311 			 "package-%d-die-%d", c->phys_proc_id, c->cpu_die_id);
1312 	else
1313 		snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH, "package-%d",
1314 			 c->phys_proc_id);
1315 
1316 	/* check if the package contains valid domains */
1317 	if (rapl_detect_domains(rp, cpu) || rapl_defaults->check_unit(rp, cpu)) {
1318 		ret = -ENODEV;
1319 		goto err_free_package;
1320 	}
1321 	ret = rapl_package_register_powercap(rp);
1322 	if (!ret) {
1323 		INIT_LIST_HEAD(&rp->plist);
1324 		list_add(&rp->plist, &rapl_packages);
1325 		return rp;
1326 	}
1327 
1328 err_free_package:
1329 	kfree(rp->domains);
1330 	kfree(rp);
1331 	return ERR_PTR(ret);
1332 }
1333 EXPORT_SYMBOL_GPL(rapl_add_package);
1334 
1335 static void power_limit_state_save(void)
1336 {
1337 	struct rapl_package *rp;
1338 	struct rapl_domain *rd;
1339 	int nr_pl, ret, i;
1340 
1341 	get_online_cpus();
1342 	list_for_each_entry(rp, &rapl_packages, plist) {
1343 		if (!rp->power_zone)
1344 			continue;
1345 		rd = power_zone_to_rapl_domain(rp->power_zone);
1346 		nr_pl = find_nr_power_limit(rd);
1347 		for (i = 0; i < nr_pl; i++) {
1348 			switch (rd->rpl[i].prim_id) {
1349 			case PL1_ENABLE:
1350 				ret = rapl_read_data_raw(rd,
1351 						 POWER_LIMIT1, true,
1352 						 &rd->rpl[i].last_power_limit);
1353 				if (ret)
1354 					rd->rpl[i].last_power_limit = 0;
1355 				break;
1356 			case PL2_ENABLE:
1357 				ret = rapl_read_data_raw(rd,
1358 						 POWER_LIMIT2, true,
1359 						 &rd->rpl[i].last_power_limit);
1360 				if (ret)
1361 					rd->rpl[i].last_power_limit = 0;
1362 				break;
1363 			}
1364 		}
1365 	}
1366 	put_online_cpus();
1367 }
1368 
1369 static void power_limit_state_restore(void)
1370 {
1371 	struct rapl_package *rp;
1372 	struct rapl_domain *rd;
1373 	int nr_pl, i;
1374 
1375 	get_online_cpus();
1376 	list_for_each_entry(rp, &rapl_packages, plist) {
1377 		if (!rp->power_zone)
1378 			continue;
1379 		rd = power_zone_to_rapl_domain(rp->power_zone);
1380 		nr_pl = find_nr_power_limit(rd);
1381 		for (i = 0; i < nr_pl; i++) {
1382 			switch (rd->rpl[i].prim_id) {
1383 			case PL1_ENABLE:
1384 				if (rd->rpl[i].last_power_limit)
1385 					rapl_write_data_raw(rd, POWER_LIMIT1,
1386 					    rd->rpl[i].last_power_limit);
1387 				break;
1388 			case PL2_ENABLE:
1389 				if (rd->rpl[i].last_power_limit)
1390 					rapl_write_data_raw(rd, POWER_LIMIT2,
1391 					    rd->rpl[i].last_power_limit);
1392 				break;
1393 			}
1394 		}
1395 	}
1396 	put_online_cpus();
1397 }
1398 
1399 static int rapl_pm_callback(struct notifier_block *nb,
1400 			    unsigned long mode, void *_unused)
1401 {
1402 	switch (mode) {
1403 	case PM_SUSPEND_PREPARE:
1404 		power_limit_state_save();
1405 		break;
1406 	case PM_POST_SUSPEND:
1407 		power_limit_state_restore();
1408 		break;
1409 	}
1410 	return NOTIFY_OK;
1411 }
1412 
/* PM notifier block; registered in rapl_init(), removed in rapl_exit() */
static struct notifier_block rapl_pm_notifier = {
	.notifier_call = rapl_pm_callback,
};

/* "intel_rapl_msr" platform device; created in rapl_init() */
static struct platform_device *rapl_msr_platdev;
1418 
1419 static int __init rapl_init(void)
1420 {
1421 	const struct x86_cpu_id *id;
1422 	int ret;
1423 
1424 	id = x86_match_cpu(rapl_ids);
1425 	if (!id) {
1426 		pr_err("driver does not support CPU family %d model %d\n",
1427 		       boot_cpu_data.x86, boot_cpu_data.x86_model);
1428 
1429 		return -ENODEV;
1430 	}
1431 
1432 	rapl_defaults = (struct rapl_defaults *)id->driver_data;
1433 
1434 	ret = register_pm_notifier(&rapl_pm_notifier);
1435 	if (ret)
1436 		return ret;
1437 
1438 	rapl_msr_platdev = platform_device_alloc("intel_rapl_msr", 0);
1439 	if (!rapl_msr_platdev) {
1440 		ret = -ENOMEM;
1441 		goto end;
1442 	}
1443 
1444 	ret = platform_device_add(rapl_msr_platdev);
1445 	if (ret)
1446 		platform_device_put(rapl_msr_platdev);
1447 
1448 end:
1449 	if (ret)
1450 		unregister_pm_notifier(&rapl_pm_notifier);
1451 
1452 	return ret;
1453 }
1454 
/*
 * Module exit: undo rapl_init() in reverse order, i.e. remove the
 * "intel_rapl_msr" platform device first and the PM notifier second.
 */
static void __exit rapl_exit(void)
{
	platform_device_unregister(rapl_msr_platdev);
	unregister_pm_notifier(&rapl_pm_notifier);
}
1460 
/*
 * rapl_init() is hooked up with fs_initcall() rather than module_init().
 * NOTE(review): presumably so that the common code is ready before the RAPL
 * interface drivers that depend on it are initialized - confirm.
 */
fs_initcall(rapl_init);
module_exit(rapl_exit);

MODULE_DESCRIPTION("Intel Runtime Average Power Limit (RAPL) common code");
MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@intel.com>");
MODULE_LICENSE("GPL v2");
1467