1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Common code for Intel Running Average Power Limit (RAPL) support.
4  * Copyright (c) 2019, Intel Corporation.
5  */
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7 
8 #include <linux/kernel.h>
9 #include <linux/module.h>
10 #include <linux/list.h>
11 #include <linux/types.h>
12 #include <linux/device.h>
13 #include <linux/slab.h>
14 #include <linux/log2.h>
15 #include <linux/bitmap.h>
16 #include <linux/delay.h>
17 #include <linux/sysfs.h>
18 #include <linux/cpu.h>
19 #include <linux/powercap.h>
20 #include <linux/suspend.h>
21 #include <linux/intel_rapl.h>
22 #include <linux/processor.h>
23 #include <linux/platform_device.h>
24 
25 #include <asm/iosf_mbi.h>
26 #include <asm/cpu_device_id.h>
27 #include <asm/intel-family.h>
28 
/* Local defines */
#define MSR_PLATFORM_POWER_LIMIT	0x0000065C

/*
 * bitmasks for RAPL MSRs, used by primitive access functions.
 * Each mask selects one field of a 64-bit register value; the right-shift
 * that goes with each field is listed next to the mask in the rpi[] table.
 */
#define ENERGY_STATUS_MASK      0xffffffff

/* power limit #1: limit value, enable bit and clamp bit */
#define POWER_LIMIT1_MASK       0x7FFF
#define POWER_LIMIT1_ENABLE     BIT(15)
#define POWER_LIMIT1_CLAMP      BIT(16)

/* power limit #2: same layout as limit #1, located 32 bits higher */
#define POWER_LIMIT2_MASK       (0x7FFFULL<<32)
#define POWER_LIMIT2_ENABLE     BIT_ULL(47)
#define POWER_LIMIT2_CLAMP      BIT_ULL(48)
/*
 * lock bits: FW_LOCK normally uses POWER_LOW_LOCK; rapl_read_data_raw()
 * switches to POWER_HIGH_LOCK for domains that have two power limits.
 */
#define POWER_HIGH_LOCK         BIT_ULL(63)
#define POWER_LOW_LOCK          BIT(31)

/* time window fields belonging to power limit #1 and #2 */
#define TIME_WINDOW1_MASK       (0x7FULL<<17)
#define TIME_WINDOW2_MASK       (0x7FULL<<49)

/*
 * fields of the unit register decoded by rapl_check_unit_core() and
 * rapl_check_unit_atom(); *_OFFSET is the bit position of the field.
 */
#define POWER_UNIT_OFFSET	0
#define POWER_UNIT_MASK		0x0F

#define ENERGY_UNIT_OFFSET	0x08
#define ENERGY_UNIT_MASK	0x1F00

#define TIME_UNIT_OFFSET	0x10
#define TIME_UNIT_MASK		0xF0000

/* fields of the register selected by RAPL_DOMAIN_REG_INFO (see rpi[]) */
#define POWER_INFO_MAX_MASK     (0x7fffULL<<32)
#define POWER_INFO_MIN_MASK     (0x7fffULL<<16)
#define POWER_INFO_MAX_TIME_WIN_MASK     (0x3fULL<<48)
#define POWER_INFO_THERMAL_SPEC_MASK     0x7fff

/* fields of the RAPL_DOMAIN_REG_PERF and RAPL_DOMAIN_REG_POLICY registers */
#define PERF_STATUS_THROTTLE_TIME_MASK 0xffffffff
#define PP_POLICY_MASK         0x1F
64 
/* Non HW constants */
/* values for rapl_primitive_info.flag */
#define RAPL_PRIMITIVE_DERIVED       BIT(1)	/* not from raw data */
#define RAPL_PRIMITIVE_DUMMY         BIT(2)	/* rapl_read_data_raw() rejects these with -EINVAL */

#define TIME_WINDOW_MAX_MSEC 40000
#define TIME_WINDOW_MIN_MSEC 250
#define ENERGY_UNIT_SCALE    1000	/* scale from driver unit to powercap unit */
/*
 * Kind of unit conversion rapl_unit_xlate() applies to a primitive's raw
 * field value.
 */
enum unit_type {
	ARBITRARY_UNIT,		/* no translation */
	POWER_UNIT,		/* scaled by the package power_unit */
	ENERGY_UNIT,		/* scaled by the domain or package energy unit */
	TIME_UNIT,		/* converted by rapl_defaults->compute_time_window() */
};
78 
/* per domain data, some are optional */
/* upper bound used by rapl_update_domain_data() to skip non-raw primitives */
#define NR_RAW_PRIMITIVES (NR_RAPL_PRIMITIVES - 2)

/* bits of rapl_domain.state */
#define	DOMAIN_STATE_INACTIVE           BIT(0)
#define	DOMAIN_STATE_POWER_LIMIT_SET    BIT(1)
#define DOMAIN_STATE_BIOS_LOCKED        BIT(2)	/* limits are read-only, see set_power_limit() */

/* constraint names assigned to rpl[0] and rpl[1] by rapl_init_domains() */
static const char pl1_name[] = "long_term";
static const char pl2_name[] = "short_term";
88 
/* map an embedded powercap zone back to the rapl_domain that contains it */
#define power_zone_to_rapl_domain(_zone) \
	container_of(_zone, struct rapl_domain, power_zone)
91 
/*
 * Per CPU model behaviour of the driver. One instance is selected through
 * the rapl_ids[] table.
 */
struct rapl_defaults {
	/* register address used by set_floor_freq_atom(); 0 means none */
	u8 floor_freq_reg_addr;
	/* reads the unit register and fills in the package energy/power/time units */
	int (*check_unit)(struct rapl_package *rp, int cpu);
	/* optional (may be NULL); invoked by set_domain_enable() */
	void (*set_floor_freq)(struct rapl_domain *rd, bool mode);
	/* converts a time window between raw encoding and time units */
	u64 (*compute_time_window)(struct rapl_package *rp, u64 val,
				    bool to_raw);
	/* when non-zero, overrides the package energy unit for the DRAM domain */
	unsigned int dram_domain_energy_unit;
};
/* defaults in effect for the running system */
static struct rapl_defaults *rapl_defaults;
101 
/* Sideband MBI registers, used as floor_freq_reg_addr in rapl_defaults */
#define IOSF_CPU_POWER_BUDGET_CTL_BYT (0x2)
#define IOSF_CPU_POWER_BUDGET_CTL_TNG (0xdf)

/* set in rapl_package.power_limit_irq once the PLN enable bit was saved */
#define PACKAGE_PLN_INT_SAVED   BIT(0)
#define MAX_PRIM_NAME (32)
108 
/* per domain data. used to describe individual knobs such that access function
 * can be consolidated into one instead of many inline functions.
 */
struct rapl_primitive_info {
	const char *name;		/* primitive name; NULL marks an unusable entry */
	u64 mask;			/* bits of the register that hold the field */
	int shift;			/* bit position of the field's least significant bit */
	enum rapl_domain_reg_id id;	/* index into rapl_domain.regs[] */
	enum unit_type unit;		/* conversion applied by rapl_unit_xlate() */
	u32 flag;			/* RAPL_PRIMITIVE_DERIVED / RAPL_PRIMITIVE_DUMMY */
};
120 
/*
 * Initializer for one rapl_primitive_info entry.
 * p: primitive identifier, stringified into .name
 * m: mask, s: shift, i: register id, u: unit type, f: flag
 */
#define PRIMITIVE_INFO_INIT(p, m, s, i, u, f) {	\
		.name = #p,			\
		.mask = m,			\
		.shift = s,			\
		.id = i,			\
		.unit = u,			\
		.flag = f			\
	}
129 
/* forward declarations for helpers defined further down in this file */
static void rapl_init_domains(struct rapl_package *rp);
static int rapl_read_data_raw(struct rapl_domain *rd,
			      enum rapl_primitives prim,
			      bool xlate, u64 *data);
static int rapl_write_data_raw(struct rapl_domain *rd,
			       enum rapl_primitives prim,
			       unsigned long long value);
static u64 rapl_unit_xlate(struct rapl_domain *rd,
			   enum unit_type type, u64 value, int to_raw);
static void package_power_limit_irq_save(struct rapl_package *rp);

static LIST_HEAD(rapl_packages);	/* guarded by CPU hotplug lock */
142 
/*
 * Domain names, indexed by enum rapl_domain_type in rapl_init_domains().
 * The comments in zone_ops[] give the matching order: PACKAGE, PP0, PP1,
 * DRAM, PLATFORM.
 */
static const char *const rapl_domain_names[] = {
	"package",
	"core",
	"uncore",
	"dram",
	"psys",
};
150 
151 static int get_energy_counter(struct powercap_zone *power_zone,
152 			      u64 *energy_raw)
153 {
154 	struct rapl_domain *rd;
155 	u64 energy_now;
156 
157 	/* prevent CPU hotplug, make sure the RAPL domain does not go
158 	 * away while reading the counter.
159 	 */
160 	get_online_cpus();
161 	rd = power_zone_to_rapl_domain(power_zone);
162 
163 	if (!rapl_read_data_raw(rd, ENERGY_COUNTER, true, &energy_now)) {
164 		*energy_raw = energy_now;
165 		put_online_cpus();
166 
167 		return 0;
168 	}
169 	put_online_cpus();
170 
171 	return -EIO;
172 }
173 
174 static int get_max_energy_counter(struct powercap_zone *pcd_dev, u64 *energy)
175 {
176 	struct rapl_domain *rd = power_zone_to_rapl_domain(pcd_dev);
177 
178 	*energy = rapl_unit_xlate(rd, ENERGY_UNIT, ENERGY_STATUS_MASK, 0);
179 	return 0;
180 }
181 
182 static int release_zone(struct powercap_zone *power_zone)
183 {
184 	struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
185 	struct rapl_package *rp = rd->rp;
186 
187 	/* package zone is the last zone of a package, we can free
188 	 * memory here since all children has been unregistered.
189 	 */
190 	if (rd->id == RAPL_DOMAIN_PACKAGE) {
191 		kfree(rd);
192 		rp->domains = NULL;
193 	}
194 
195 	return 0;
196 
197 }
198 
199 static int find_nr_power_limit(struct rapl_domain *rd)
200 {
201 	int i, nr_pl = 0;
202 
203 	for (i = 0; i < NR_POWER_LIMITS; i++) {
204 		if (rd->rpl[i].name)
205 			nr_pl++;
206 	}
207 
208 	return nr_pl;
209 }
210 
211 static int set_domain_enable(struct powercap_zone *power_zone, bool mode)
212 {
213 	struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
214 
215 	if (rd->state & DOMAIN_STATE_BIOS_LOCKED)
216 		return -EACCES;
217 
218 	get_online_cpus();
219 	rapl_write_data_raw(rd, PL1_ENABLE, mode);
220 	if (rapl_defaults->set_floor_freq)
221 		rapl_defaults->set_floor_freq(rd, mode);
222 	put_online_cpus();
223 
224 	return 0;
225 }
226 
227 static int get_domain_enable(struct powercap_zone *power_zone, bool *mode)
228 {
229 	struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
230 	u64 val;
231 
232 	if (rd->state & DOMAIN_STATE_BIOS_LOCKED) {
233 		*mode = false;
234 		return 0;
235 	}
236 	get_online_cpus();
237 	if (rapl_read_data_raw(rd, PL1_ENABLE, true, &val)) {
238 		put_online_cpus();
239 		return -EIO;
240 	}
241 	*mode = val;
242 	put_online_cpus();
243 
244 	return 0;
245 }
246 
/*
 * per RAPL domain ops, in the order of rapl_domain_type.
 * rapl_package_register_powercap() picks the entry with &zone_ops[rd->id].
 * All entries currently install the same set of callbacks.
 */
static const struct powercap_zone_ops zone_ops[] = {
	/* RAPL_DOMAIN_PACKAGE */
	{
	 .get_energy_uj = get_energy_counter,
	 .get_max_energy_range_uj = get_max_energy_counter,
	 .release = release_zone,
	 .set_enable = set_domain_enable,
	 .get_enable = get_domain_enable,
	 },
	/* RAPL_DOMAIN_PP0 */
	{
	 .get_energy_uj = get_energy_counter,
	 .get_max_energy_range_uj = get_max_energy_counter,
	 .release = release_zone,
	 .set_enable = set_domain_enable,
	 .get_enable = get_domain_enable,
	 },
	/* RAPL_DOMAIN_PP1 */
	{
	 .get_energy_uj = get_energy_counter,
	 .get_max_energy_range_uj = get_max_energy_counter,
	 .release = release_zone,
	 .set_enable = set_domain_enable,
	 .get_enable = get_domain_enable,
	 },
	/* RAPL_DOMAIN_DRAM */
	{
	 .get_energy_uj = get_energy_counter,
	 .get_max_energy_range_uj = get_max_energy_counter,
	 .release = release_zone,
	 .set_enable = set_domain_enable,
	 .get_enable = get_domain_enable,
	 },
	/* RAPL_DOMAIN_PLATFORM */
	{
	 .get_energy_uj = get_energy_counter,
	 .get_max_energy_range_uj = get_max_energy_counter,
	 .release = release_zone,
	 .set_enable = set_domain_enable,
	 .get_enable = get_domain_enable,
	 },
};
290 
291 /*
292  * Constraint index used by powercap can be different than power limit (PL)
293  * index in that some  PLs maybe missing due to non-existent MSRs. So we
294  * need to convert here by finding the valid PLs only (name populated).
295  */
296 static int contraint_to_pl(struct rapl_domain *rd, int cid)
297 {
298 	int i, j;
299 
300 	for (i = 0, j = 0; i < NR_POWER_LIMITS; i++) {
301 		if ((rd->rpl[i].name) && j++ == cid) {
302 			pr_debug("%s: index %d\n", __func__, i);
303 			return i;
304 		}
305 	}
306 	pr_err("Cannot find matching power limit for constraint %d\n", cid);
307 
308 	return -EINVAL;
309 }
310 
311 static int set_power_limit(struct powercap_zone *power_zone, int cid,
312 			   u64 power_limit)
313 {
314 	struct rapl_domain *rd;
315 	struct rapl_package *rp;
316 	int ret = 0;
317 	int id;
318 
319 	get_online_cpus();
320 	rd = power_zone_to_rapl_domain(power_zone);
321 	id = contraint_to_pl(rd, cid);
322 	if (id < 0) {
323 		ret = id;
324 		goto set_exit;
325 	}
326 
327 	rp = rd->rp;
328 
329 	if (rd->state & DOMAIN_STATE_BIOS_LOCKED) {
330 		dev_warn(&power_zone->dev,
331 			 "%s locked by BIOS, monitoring only\n", rd->name);
332 		ret = -EACCES;
333 		goto set_exit;
334 	}
335 
336 	switch (rd->rpl[id].prim_id) {
337 	case PL1_ENABLE:
338 		rapl_write_data_raw(rd, POWER_LIMIT1, power_limit);
339 		break;
340 	case PL2_ENABLE:
341 		rapl_write_data_raw(rd, POWER_LIMIT2, power_limit);
342 		break;
343 	default:
344 		ret = -EINVAL;
345 	}
346 	if (!ret)
347 		package_power_limit_irq_save(rp);
348 set_exit:
349 	put_online_cpus();
350 	return ret;
351 }
352 
353 static int get_current_power_limit(struct powercap_zone *power_zone, int cid,
354 				   u64 *data)
355 {
356 	struct rapl_domain *rd;
357 	u64 val;
358 	int prim;
359 	int ret = 0;
360 	int id;
361 
362 	get_online_cpus();
363 	rd = power_zone_to_rapl_domain(power_zone);
364 	id = contraint_to_pl(rd, cid);
365 	if (id < 0) {
366 		ret = id;
367 		goto get_exit;
368 	}
369 
370 	switch (rd->rpl[id].prim_id) {
371 	case PL1_ENABLE:
372 		prim = POWER_LIMIT1;
373 		break;
374 	case PL2_ENABLE:
375 		prim = POWER_LIMIT2;
376 		break;
377 	default:
378 		put_online_cpus();
379 		return -EINVAL;
380 	}
381 	if (rapl_read_data_raw(rd, prim, true, &val))
382 		ret = -EIO;
383 	else
384 		*data = val;
385 
386 get_exit:
387 	put_online_cpus();
388 
389 	return ret;
390 }
391 
392 static int set_time_window(struct powercap_zone *power_zone, int cid,
393 			   u64 window)
394 {
395 	struct rapl_domain *rd;
396 	int ret = 0;
397 	int id;
398 
399 	get_online_cpus();
400 	rd = power_zone_to_rapl_domain(power_zone);
401 	id = contraint_to_pl(rd, cid);
402 	if (id < 0) {
403 		ret = id;
404 		goto set_time_exit;
405 	}
406 
407 	switch (rd->rpl[id].prim_id) {
408 	case PL1_ENABLE:
409 		rapl_write_data_raw(rd, TIME_WINDOW1, window);
410 		break;
411 	case PL2_ENABLE:
412 		rapl_write_data_raw(rd, TIME_WINDOW2, window);
413 		break;
414 	default:
415 		ret = -EINVAL;
416 	}
417 
418 set_time_exit:
419 	put_online_cpus();
420 	return ret;
421 }
422 
423 static int get_time_window(struct powercap_zone *power_zone, int cid,
424 			   u64 *data)
425 {
426 	struct rapl_domain *rd;
427 	u64 val;
428 	int ret = 0;
429 	int id;
430 
431 	get_online_cpus();
432 	rd = power_zone_to_rapl_domain(power_zone);
433 	id = contraint_to_pl(rd, cid);
434 	if (id < 0) {
435 		ret = id;
436 		goto get_time_exit;
437 	}
438 
439 	switch (rd->rpl[id].prim_id) {
440 	case PL1_ENABLE:
441 		ret = rapl_read_data_raw(rd, TIME_WINDOW1, true, &val);
442 		break;
443 	case PL2_ENABLE:
444 		ret = rapl_read_data_raw(rd, TIME_WINDOW2, true, &val);
445 		break;
446 	default:
447 		put_online_cpus();
448 		return -EINVAL;
449 	}
450 	if (!ret)
451 		*data = val;
452 
453 get_time_exit:
454 	put_online_cpus();
455 
456 	return ret;
457 }
458 
459 static const char *get_constraint_name(struct powercap_zone *power_zone,
460 				       int cid)
461 {
462 	struct rapl_domain *rd;
463 	int id;
464 
465 	rd = power_zone_to_rapl_domain(power_zone);
466 	id = contraint_to_pl(rd, cid);
467 	if (id >= 0)
468 		return rd->rpl[id].name;
469 
470 	return NULL;
471 }
472 
473 static int get_max_power(struct powercap_zone *power_zone, int id, u64 *data)
474 {
475 	struct rapl_domain *rd;
476 	u64 val;
477 	int prim;
478 	int ret = 0;
479 
480 	get_online_cpus();
481 	rd = power_zone_to_rapl_domain(power_zone);
482 	switch (rd->rpl[id].prim_id) {
483 	case PL1_ENABLE:
484 		prim = THERMAL_SPEC_POWER;
485 		break;
486 	case PL2_ENABLE:
487 		prim = MAX_POWER;
488 		break;
489 	default:
490 		put_online_cpus();
491 		return -EINVAL;
492 	}
493 	if (rapl_read_data_raw(rd, prim, true, &val))
494 		ret = -EIO;
495 	else
496 		*data = val;
497 
498 	put_online_cpus();
499 
500 	return ret;
501 }
502 
/*
 * Constraint callbacks shared by all RAPL zones; passed to
 * powercap_register_zone() in rapl_package_register_powercap().
 */
static const struct powercap_zone_constraint_ops constraint_ops = {
	.set_power_limit_uw = set_power_limit,
	.get_power_limit_uw = get_current_power_limit,
	.set_time_window_us = set_time_window,
	.get_time_window_us = get_time_window,
	.get_max_power_uw = get_max_power,
	.get_name = get_constraint_name,
};
511 
512 /* called after domain detection and package level data are set */
513 static void rapl_init_domains(struct rapl_package *rp)
514 {
515 	enum rapl_domain_type i;
516 	enum rapl_domain_reg_id j;
517 	struct rapl_domain *rd = rp->domains;
518 
519 	for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
520 		unsigned int mask = rp->domain_map & (1 << i);
521 
522 		if (!mask)
523 			continue;
524 
525 		rd->rp = rp;
526 		rd->name = rapl_domain_names[i];
527 		rd->id = i;
528 		rd->rpl[0].prim_id = PL1_ENABLE;
529 		rd->rpl[0].name = pl1_name;
530 		/* some domain may support two power limits */
531 		if (rp->priv->limits[i] == 2) {
532 			rd->rpl[1].prim_id = PL2_ENABLE;
533 			rd->rpl[1].name = pl2_name;
534 		}
535 
536 		for (j = 0; j < RAPL_DOMAIN_REG_MAX; j++)
537 			rd->regs[j] = rp->priv->regs[i][j];
538 
539 		if (i == RAPL_DOMAIN_DRAM) {
540 			rd->domain_energy_unit =
541 			    rapl_defaults->dram_domain_energy_unit;
542 			if (rd->domain_energy_unit)
543 				pr_info("DRAM domain energy unit %dpj\n",
544 					rd->domain_energy_unit);
545 		}
546 		rd++;
547 	}
548 }
549 
550 static u64 rapl_unit_xlate(struct rapl_domain *rd, enum unit_type type,
551 			   u64 value, int to_raw)
552 {
553 	u64 units = 1;
554 	struct rapl_package *rp = rd->rp;
555 	u64 scale = 1;
556 
557 	switch (type) {
558 	case POWER_UNIT:
559 		units = rp->power_unit;
560 		break;
561 	case ENERGY_UNIT:
562 		scale = ENERGY_UNIT_SCALE;
563 		/* per domain unit takes precedence */
564 		if (rd->domain_energy_unit)
565 			units = rd->domain_energy_unit;
566 		else
567 			units = rp->energy_unit;
568 		break;
569 	case TIME_UNIT:
570 		return rapl_defaults->compute_time_window(rp, value, to_raw);
571 	case ARBITRARY_UNIT:
572 	default:
573 		return value;
574 	};
575 
576 	if (to_raw)
577 		return div64_u64(value, units) * scale;
578 
579 	value *= units;
580 
581 	return div64_u64(value, scale);
582 }
583 
/*
 * Primitive description table, in the order of enum rapl_primitives.
 * rapl_read_data_raw() and rapl_write_data_raw() index it directly with
 * the primitive, so the order must not change.
 */
static struct rapl_primitive_info rpi[] = {
	/* name, mask, shift, register id, unit type, flag */
	PRIMITIVE_INFO_INIT(ENERGY_COUNTER, ENERGY_STATUS_MASK, 0,
			    RAPL_DOMAIN_REG_STATUS, ENERGY_UNIT, 0),
	PRIMITIVE_INFO_INIT(POWER_LIMIT1, POWER_LIMIT1_MASK, 0,
			    RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
	PRIMITIVE_INFO_INIT(POWER_LIMIT2, POWER_LIMIT2_MASK, 32,
			    RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
	/* rapl_read_data_raw() uses bit 63 instead for domains with two limits */
	PRIMITIVE_INFO_INIT(FW_LOCK, POWER_LOW_LOCK, 31,
			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
	PRIMITIVE_INFO_INIT(PL1_ENABLE, POWER_LIMIT1_ENABLE, 15,
			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
	PRIMITIVE_INFO_INIT(PL1_CLAMP, POWER_LIMIT1_CLAMP, 16,
			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
	PRIMITIVE_INFO_INIT(PL2_ENABLE, POWER_LIMIT2_ENABLE, 47,
			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
	PRIMITIVE_INFO_INIT(PL2_CLAMP, POWER_LIMIT2_CLAMP, 48,
			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
	PRIMITIVE_INFO_INIT(TIME_WINDOW1, TIME_WINDOW1_MASK, 17,
			    RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0),
	PRIMITIVE_INFO_INIT(TIME_WINDOW2, TIME_WINDOW2_MASK, 49,
			    RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0),
	PRIMITIVE_INFO_INIT(THERMAL_SPEC_POWER, POWER_INFO_THERMAL_SPEC_MASK,
			    0, RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
	PRIMITIVE_INFO_INIT(MAX_POWER, POWER_INFO_MAX_MASK, 32,
			    RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
	PRIMITIVE_INFO_INIT(MIN_POWER, POWER_INFO_MIN_MASK, 16,
			    RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
	PRIMITIVE_INFO_INIT(MAX_TIME_WINDOW, POWER_INFO_MAX_TIME_WIN_MASK, 48,
			    RAPL_DOMAIN_REG_INFO, TIME_UNIT, 0),
	PRIMITIVE_INFO_INIT(THROTTLED_TIME, PERF_STATUS_THROTTLE_TIME_MASK, 0,
			    RAPL_DOMAIN_REG_PERF, TIME_UNIT, 0),
	PRIMITIVE_INFO_INIT(PRIORITY_LEVEL, PP_POLICY_MASK, 0,
			    RAPL_DOMAIN_REG_POLICY, ARBITRARY_UNIT, 0),
	/* non-hardware */
	PRIMITIVE_INFO_INIT(AVERAGE_POWER, 0, 0, 0, POWER_UNIT,
			    RAPL_PRIMITIVE_DERIVED),
	/* terminator: a NULL name makes rapl_read_data_raw() return -EINVAL */
	{NULL, 0, 0, 0},
};
624 
625 /* Read primitive data based on its related struct rapl_primitive_info.
626  * if xlate flag is set, return translated data based on data units, i.e.
627  * time, energy, and power.
628  * RAPL MSRs are non-architectual and are laid out not consistently across
629  * domains. Here we use primitive info to allow writing consolidated access
630  * functions.
631  * For a given primitive, it is processed by MSR mask and shift. Unit conversion
632  * is pre-assigned based on RAPL unit MSRs read at init time.
633  * 63-------------------------- 31--------------------------- 0
634  * |                           xxxxx (mask)                   |
635  * |                                |<- shift ----------------|
636  * 63-------------------------- 31--------------------------- 0
637  */
638 static int rapl_read_data_raw(struct rapl_domain *rd,
639 			      enum rapl_primitives prim, bool xlate, u64 *data)
640 {
641 	u64 value;
642 	struct rapl_primitive_info *rp = &rpi[prim];
643 	struct reg_action ra;
644 	int cpu;
645 
646 	if (!rp->name || rp->flag & RAPL_PRIMITIVE_DUMMY)
647 		return -EINVAL;
648 
649 	ra.reg = rd->regs[rp->id];
650 	if (!ra.reg)
651 		return -EINVAL;
652 
653 	cpu = rd->rp->lead_cpu;
654 
655 	/* domain with 2 limits has different bit */
656 	if (prim == FW_LOCK && rd->rp->priv->limits[rd->id] == 2) {
657 		rp->mask = POWER_HIGH_LOCK;
658 		rp->shift = 63;
659 	}
660 	/* non-hardware data are collected by the polling thread */
661 	if (rp->flag & RAPL_PRIMITIVE_DERIVED) {
662 		*data = rd->rdd.primitives[prim];
663 		return 0;
664 	}
665 
666 	ra.mask = rp->mask;
667 
668 	if (rd->rp->priv->read_raw(cpu, &ra)) {
669 		pr_debug("failed to read reg 0x%llx on cpu %d\n", ra.reg, cpu);
670 		return -EIO;
671 	}
672 
673 	value = ra.value >> rp->shift;
674 
675 	if (xlate)
676 		*data = rapl_unit_xlate(rd, rp->unit, value, 0);
677 	else
678 		*data = value;
679 
680 	return 0;
681 }
682 
683 /* Similar use of primitive info in the read counterpart */
684 static int rapl_write_data_raw(struct rapl_domain *rd,
685 			       enum rapl_primitives prim,
686 			       unsigned long long value)
687 {
688 	struct rapl_primitive_info *rp = &rpi[prim];
689 	int cpu;
690 	u64 bits;
691 	struct reg_action ra;
692 	int ret;
693 
694 	cpu = rd->rp->lead_cpu;
695 	bits = rapl_unit_xlate(rd, rp->unit, value, 1);
696 	bits <<= rp->shift;
697 	bits &= rp->mask;
698 
699 	memset(&ra, 0, sizeof(ra));
700 
701 	ra.reg = rd->regs[rp->id];
702 	ra.mask = rp->mask;
703 	ra.value = bits;
704 
705 	ret = rd->rp->priv->write_raw(cpu, &ra);
706 
707 	return ret;
708 }
709 
710 /*
711  * Raw RAPL data stored in MSRs are in certain scales. We need to
712  * convert them into standard units based on the units reported in
713  * the RAPL unit MSRs. This is specific to CPUs as the method to
714  * calculate units differ on different CPUs.
715  * We convert the units to below format based on CPUs.
716  * i.e.
717  * energy unit: picoJoules  : Represented in picoJoules by default
718  * power unit : microWatts  : Represented in milliWatts by default
719  * time unit  : microseconds: Represented in seconds by default
720  */
721 static int rapl_check_unit_core(struct rapl_package *rp, int cpu)
722 {
723 	struct reg_action ra;
724 	u32 value;
725 
726 	ra.reg = rp->priv->reg_unit;
727 	ra.mask = ~0;
728 	if (rp->priv->read_raw(cpu, &ra)) {
729 		pr_err("Failed to read power unit REG 0x%llx on CPU %d, exit.\n",
730 		       rp->priv->reg_unit, cpu);
731 		return -ENODEV;
732 	}
733 
734 	value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET;
735 	rp->energy_unit = ENERGY_UNIT_SCALE * 1000000 / (1 << value);
736 
737 	value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET;
738 	rp->power_unit = 1000000 / (1 << value);
739 
740 	value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET;
741 	rp->time_unit = 1000000 / (1 << value);
742 
743 	pr_debug("Core CPU %s energy=%dpJ, time=%dus, power=%duW\n",
744 		 rp->name, rp->energy_unit, rp->time_unit, rp->power_unit);
745 
746 	return 0;
747 }
748 
749 static int rapl_check_unit_atom(struct rapl_package *rp, int cpu)
750 {
751 	struct reg_action ra;
752 	u32 value;
753 
754 	ra.reg = rp->priv->reg_unit;
755 	ra.mask = ~0;
756 	if (rp->priv->read_raw(cpu, &ra)) {
757 		pr_err("Failed to read power unit REG 0x%llx on CPU %d, exit.\n",
758 		       rp->priv->reg_unit, cpu);
759 		return -ENODEV;
760 	}
761 
762 	value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET;
763 	rp->energy_unit = ENERGY_UNIT_SCALE * 1 << value;
764 
765 	value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET;
766 	rp->power_unit = (1 << value) * 1000;
767 
768 	value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET;
769 	rp->time_unit = 1000000 / (1 << value);
770 
771 	pr_debug("Atom %s energy=%dpJ, time=%dus, power=%duW\n",
772 		 rp->name, rp->energy_unit, rp->time_unit, rp->power_unit);
773 
774 	return 0;
775 }
776 
777 static void power_limit_irq_save_cpu(void *info)
778 {
779 	u32 l, h = 0;
780 	struct rapl_package *rp = (struct rapl_package *)info;
781 
782 	/* save the state of PLN irq mask bit before disabling it */
783 	rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h);
784 	if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED)) {
785 		rp->power_limit_irq = l & PACKAGE_THERM_INT_PLN_ENABLE;
786 		rp->power_limit_irq |= PACKAGE_PLN_INT_SAVED;
787 	}
788 	l &= ~PACKAGE_THERM_INT_PLN_ENABLE;
789 	wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
790 }
791 
792 /* REVISIT:
793  * When package power limit is set artificially low by RAPL, LVT
794  * thermal interrupt for package power limit should be ignored
795  * since we are not really exceeding the real limit. The intention
796  * is to avoid excessive interrupts while we are trying to save power.
797  * A useful feature might be routing the package_power_limit interrupt
798  * to userspace via eventfd. once we have a usecase, this is simple
799  * to do by adding an atomic notifier.
800  */
801 
802 static void package_power_limit_irq_save(struct rapl_package *rp)
803 {
804 	if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN))
805 		return;
806 
807 	smp_call_function_single(rp->lead_cpu, power_limit_irq_save_cpu, rp, 1);
808 }
809 
810 /*
811  * Restore per package power limit interrupt enable state. Called from cpu
812  * hotplug code on package removal.
813  */
814 static void package_power_limit_irq_restore(struct rapl_package *rp)
815 {
816 	u32 l, h;
817 
818 	if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN))
819 		return;
820 
821 	/* irq enable state not saved, nothing to restore */
822 	if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED))
823 		return;
824 
825 	rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h);
826 
827 	if (rp->power_limit_irq & PACKAGE_THERM_INT_PLN_ENABLE)
828 		l |= PACKAGE_THERM_INT_PLN_ENABLE;
829 	else
830 		l &= ~PACKAGE_THERM_INT_PLN_ENABLE;
831 
832 	wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
833 }
834 
835 static void set_floor_freq_default(struct rapl_domain *rd, bool mode)
836 {
837 	int nr_powerlimit = find_nr_power_limit(rd);
838 
839 	/* always enable clamp such that p-state can go below OS requested
840 	 * range. power capping priority over guranteed frequency.
841 	 */
842 	rapl_write_data_raw(rd, PL1_CLAMP, mode);
843 
844 	/* some domains have pl2 */
845 	if (nr_powerlimit > 1) {
846 		rapl_write_data_raw(rd, PL2_ENABLE, mode);
847 		rapl_write_data_raw(rd, PL2_CLAMP, mode);
848 	}
849 }
850 
851 static void set_floor_freq_atom(struct rapl_domain *rd, bool enable)
852 {
853 	static u32 power_ctrl_orig_val;
854 	u32 mdata;
855 
856 	if (!rapl_defaults->floor_freq_reg_addr) {
857 		pr_err("Invalid floor frequency config register\n");
858 		return;
859 	}
860 
861 	if (!power_ctrl_orig_val)
862 		iosf_mbi_read(BT_MBI_UNIT_PMC, MBI_CR_READ,
863 			      rapl_defaults->floor_freq_reg_addr,
864 			      &power_ctrl_orig_val);
865 	mdata = power_ctrl_orig_val;
866 	if (enable) {
867 		mdata &= ~(0x7f << 8);
868 		mdata |= 1 << 8;
869 	}
870 	iosf_mbi_write(BT_MBI_UNIT_PMC, MBI_CR_WRITE,
871 		       rapl_defaults->floor_freq_reg_addr, mdata);
872 }
873 
874 static u64 rapl_compute_time_window_core(struct rapl_package *rp, u64 value,
875 					 bool to_raw)
876 {
877 	u64 f, y;		/* fraction and exp. used for time unit */
878 
879 	/*
880 	 * Special processing based on 2^Y*(1+F/4), refer
881 	 * to Intel Software Developer's manual Vol.3B: CH 14.9.3.
882 	 */
883 	if (!to_raw) {
884 		f = (value & 0x60) >> 5;
885 		y = value & 0x1f;
886 		value = (1 << y) * (4 + f) * rp->time_unit / 4;
887 	} else {
888 		do_div(value, rp->time_unit);
889 		y = ilog2(value);
890 		f = div64_u64(4 * (value - (1 << y)), 1 << y);
891 		value = (y & 0x1f) | ((f & 0x3) << 5);
892 	}
893 	return value;
894 }
895 
896 static u64 rapl_compute_time_window_atom(struct rapl_package *rp, u64 value,
897 					 bool to_raw)
898 {
899 	/*
900 	 * Atom time unit encoding is straight forward val * time_unit,
901 	 * where time_unit is default to 1 sec. Never 0.
902 	 */
903 	if (!to_raw)
904 		return (value) ? value *= rp->time_unit : rp->time_unit;
905 
906 	value = div64_u64(value, rp->time_unit);
907 
908 	return value;
909 }
910 
/*
 * Per CPU model defaults referenced from rapl_ids[]. The suffix of each
 * instance names the platform it is used for in that table.
 */

/* Core style units and time window, default floor frequency handling */
static const struct rapl_defaults rapl_defaults_core = {
	.floor_freq_reg_addr = 0,
	.check_unit = rapl_check_unit_core,
	.set_floor_freq = set_floor_freq_default,
	.compute_time_window = rapl_compute_time_window_core,
};

/* same as core, but the DRAM domain uses a fixed energy unit */
static const struct rapl_defaults rapl_defaults_hsw_server = {
	.check_unit = rapl_check_unit_core,
	.set_floor_freq = set_floor_freq_default,
	.compute_time_window = rapl_compute_time_window_core,
	.dram_domain_energy_unit = 15300,
};

/* Atom style units; floor frequency set through the IOSF sideband */
static const struct rapl_defaults rapl_defaults_byt = {
	.floor_freq_reg_addr = IOSF_CPU_POWER_BUDGET_CTL_BYT,
	.check_unit = rapl_check_unit_atom,
	.set_floor_freq = set_floor_freq_atom,
	.compute_time_window = rapl_compute_time_window_atom,
};

/* as byt, with a different sideband register address */
static const struct rapl_defaults rapl_defaults_tng = {
	.floor_freq_reg_addr = IOSF_CPU_POWER_BUDGET_CTL_TNG,
	.check_unit = rapl_check_unit_atom,
	.set_floor_freq = set_floor_freq_atom,
	.compute_time_window = rapl_compute_time_window_atom,
};

/* Atom style units, no floor frequency hook */
static const struct rapl_defaults rapl_defaults_ann = {
	.floor_freq_reg_addr = 0,
	.check_unit = rapl_check_unit_atom,
	.set_floor_freq = NULL,
	.compute_time_window = rapl_compute_time_window_atom,
};

/* Atom style units, no floor frequency hook */
static const struct rapl_defaults rapl_defaults_cht = {
	.floor_freq_reg_addr = 0,
	.check_unit = rapl_check_unit_atom,
	.set_floor_freq = NULL,
	.compute_time_window = rapl_compute_time_window_atom,
};
952 
/*
 * CPU models handled by this driver, each paired with the rapl_defaults
 * instance to use for it. The list ends with an empty entry.
 * NOTE(review): presumably INTEL_CPU_FAM6() stores the defaults in the
 * entry's driver data - confirm against its definition.
 */
static const struct x86_cpu_id rapl_ids[] __initconst = {
	INTEL_CPU_FAM6(SANDYBRIDGE, rapl_defaults_core),
	INTEL_CPU_FAM6(SANDYBRIDGE_X, rapl_defaults_core),

	INTEL_CPU_FAM6(IVYBRIDGE, rapl_defaults_core),
	INTEL_CPU_FAM6(IVYBRIDGE_X, rapl_defaults_core),

	INTEL_CPU_FAM6(HASWELL, rapl_defaults_core),
	INTEL_CPU_FAM6(HASWELL_L, rapl_defaults_core),
	INTEL_CPU_FAM6(HASWELL_G, rapl_defaults_core),
	INTEL_CPU_FAM6(HASWELL_X, rapl_defaults_hsw_server),

	INTEL_CPU_FAM6(BROADWELL, rapl_defaults_core),
	INTEL_CPU_FAM6(BROADWELL_G, rapl_defaults_core),
	INTEL_CPU_FAM6(BROADWELL_D, rapl_defaults_core),
	INTEL_CPU_FAM6(BROADWELL_X, rapl_defaults_hsw_server),

	INTEL_CPU_FAM6(SKYLAKE, rapl_defaults_core),
	INTEL_CPU_FAM6(SKYLAKE_L, rapl_defaults_core),
	INTEL_CPU_FAM6(SKYLAKE_X, rapl_defaults_hsw_server),
	INTEL_CPU_FAM6(KABYLAKE_L, rapl_defaults_core),
	INTEL_CPU_FAM6(KABYLAKE, rapl_defaults_core),
	INTEL_CPU_FAM6(CANNONLAKE_L, rapl_defaults_core),
	INTEL_CPU_FAM6(ICELAKE_L, rapl_defaults_core),
	INTEL_CPU_FAM6(ICELAKE, rapl_defaults_core),
	INTEL_CPU_FAM6(ICELAKE_NNPI, rapl_defaults_core),
	INTEL_CPU_FAM6(ICELAKE_X, rapl_defaults_hsw_server),
	INTEL_CPU_FAM6(ICELAKE_D, rapl_defaults_hsw_server),
	INTEL_CPU_FAM6(COMETLAKE_L, rapl_defaults_core),
	INTEL_CPU_FAM6(COMETLAKE, rapl_defaults_core),
	INTEL_CPU_FAM6(TIGERLAKE_L, rapl_defaults_core),

	INTEL_CPU_FAM6(ATOM_SILVERMONT, rapl_defaults_byt),
	INTEL_CPU_FAM6(ATOM_AIRMONT, rapl_defaults_cht),
	INTEL_CPU_FAM6(ATOM_SILVERMONT_MID, rapl_defaults_tng),
	INTEL_CPU_FAM6(ATOM_AIRMONT_MID, rapl_defaults_ann),
	INTEL_CPU_FAM6(ATOM_GOLDMONT, rapl_defaults_core),
	INTEL_CPU_FAM6(ATOM_GOLDMONT_PLUS, rapl_defaults_core),
	INTEL_CPU_FAM6(ATOM_GOLDMONT_D, rapl_defaults_core),
	INTEL_CPU_FAM6(ATOM_TREMONT_D, rapl_defaults_core),
	INTEL_CPU_FAM6(ATOM_TREMONT_L, rapl_defaults_core),

	INTEL_CPU_FAM6(XEON_PHI_KNL, rapl_defaults_hsw_server),
	INTEL_CPU_FAM6(XEON_PHI_KNM, rapl_defaults_hsw_server),
	{}
};

MODULE_DEVICE_TABLE(x86cpu, rapl_ids);
1001 
1002 /* Read once for all raw primitive data for domains */
1003 static void rapl_update_domain_data(struct rapl_package *rp)
1004 {
1005 	int dmn, prim;
1006 	u64 val;
1007 
1008 	for (dmn = 0; dmn < rp->nr_domains; dmn++) {
1009 		pr_debug("update %s domain %s data\n", rp->name,
1010 			 rp->domains[dmn].name);
1011 		/* exclude non-raw primitives */
1012 		for (prim = 0; prim < NR_RAW_PRIMITIVES; prim++) {
1013 			if (!rapl_read_data_raw(&rp->domains[dmn], prim,
1014 						rpi[prim].unit, &val))
1015 				rp->domains[dmn].rdd.primitives[prim] = val;
1016 		}
1017 	}
1018 
1019 }
1020 
/*
 * Register all domains of a package with the powercap framework.
 *
 * The package domain is registered first and becomes the parent zone; all
 * other domains are then registered as its children.
 *
 * Returns 0 on success, -ENODEV when the package has no package domain, or
 * the error reported by powercap_register_zone(). When registering a child
 * fails, the zones registered before it are unregistered again.
 */
static int rapl_package_register_powercap(struct rapl_package *rp)
{
	struct rapl_domain *rd;
	struct powercap_zone *power_zone = NULL;
	int nr_pl, ret;

	/* Update the domain data of the new package */
	rapl_update_domain_data(rp);

	/* first we register package domain as the parent zone */
	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
		if (rd->id == RAPL_DOMAIN_PACKAGE) {
			nr_pl = find_nr_power_limit(rd);
			pr_debug("register package domain %s\n", rp->name);
			power_zone = powercap_register_zone(&rd->power_zone,
					    rp->priv->control_type, rp->name,
					    NULL, &zone_ops[rd->id], nr_pl,
					    &constraint_ops);
			if (IS_ERR(power_zone)) {
				pr_debug("failed to register power zone %s\n",
					 rp->name);
				return PTR_ERR(power_zone);
			}
			/* track parent zone in per package/socket data */
			rp->power_zone = power_zone;
			/* done, only one package domain per socket */
			break;
		}
	}
	/* power_zone is still NULL when the loop found no package domain */
	if (!power_zone) {
		pr_err("no package domain found, unknown topology!\n");
		return -ENODEV;
	}
	/* now register domains as children of the socket/package */
	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
		if (rd->id == RAPL_DOMAIN_PACKAGE)
			continue;
		/* number of power limits per domain varies */
		nr_pl = find_nr_power_limit(rd);
		power_zone = powercap_register_zone(&rd->power_zone,
						    rp->priv->control_type,
						    rd->name, rp->power_zone,
						    &zone_ops[rd->id], nr_pl,
						    &constraint_ops);

		if (IS_ERR(power_zone)) {
			pr_debug("failed to register power_zone, %s:%s\n",
				 rp->name, rd->name);
			ret = PTR_ERR(power_zone);
			goto err_cleanup;
		}
	}
	return 0;

err_cleanup:
	/*
	 * Clean up previously initialized domains within the package if we
	 * failed after the first domain setup.
	 *
	 * rd points at the domain that failed, so the walk starts with the
	 * entry before it and runs down to the first entry of the array.
	 *
	 * NOTE(review): the loop condition re-reads rp->domains, which
	 * release_zone() sets to NULL after freeing the array when the
	 * package zone is released. If that release can happen synchronously
	 * inside powercap_unregister_zone(), the comparison would no longer
	 * terminate the loop as intended - confirm against the powercap core.
	 */
	while (--rd >= rp->domains) {
		pr_debug("unregister %s domain %s\n", rp->name, rd->name);
		powercap_unregister_zone(rp->priv->control_type,
					 &rd->power_zone);
	}

	return ret;
}
1088 
1089 int rapl_add_platform_domain(struct rapl_if_priv *priv)
1090 {
1091 	struct rapl_domain *rd;
1092 	struct powercap_zone *power_zone;
1093 	struct reg_action ra;
1094 	int ret;
1095 
1096 	ra.reg = priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_STATUS];
1097 	ra.mask = ~0;
1098 	ret = priv->read_raw(0, &ra);
1099 	if (ret || !ra.value)
1100 		return -ENODEV;
1101 
1102 	ra.reg = priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_LIMIT];
1103 	ra.mask = ~0;
1104 	ret = priv->read_raw(0, &ra);
1105 	if (ret || !ra.value)
1106 		return -ENODEV;
1107 
1108 	rd = kzalloc(sizeof(*rd), GFP_KERNEL);
1109 	if (!rd)
1110 		return -ENOMEM;
1111 
1112 	rd->name = rapl_domain_names[RAPL_DOMAIN_PLATFORM];
1113 	rd->id = RAPL_DOMAIN_PLATFORM;
1114 	rd->regs[RAPL_DOMAIN_REG_LIMIT] =
1115 	    priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_LIMIT];
1116 	rd->regs[RAPL_DOMAIN_REG_STATUS] =
1117 	    priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_STATUS];
1118 	rd->rpl[0].prim_id = PL1_ENABLE;
1119 	rd->rpl[0].name = pl1_name;
1120 	rd->rpl[1].prim_id = PL2_ENABLE;
1121 	rd->rpl[1].name = pl2_name;
1122 	rd->rp = rapl_find_package_domain(0, priv);
1123 
1124 	power_zone = powercap_register_zone(&rd->power_zone, priv->control_type,
1125 					    "psys", NULL,
1126 					    &zone_ops[RAPL_DOMAIN_PLATFORM],
1127 					    2, &constraint_ops);
1128 
1129 	if (IS_ERR(power_zone)) {
1130 		kfree(rd);
1131 		return PTR_ERR(power_zone);
1132 	}
1133 
1134 	priv->platform_rapl_domain = rd;
1135 
1136 	return 0;
1137 }
1138 EXPORT_SYMBOL_GPL(rapl_add_platform_domain);
1139 
1140 void rapl_remove_platform_domain(struct rapl_if_priv *priv)
1141 {
1142 	if (priv->platform_rapl_domain) {
1143 		powercap_unregister_zone(priv->control_type,
1144 				 &priv->platform_rapl_domain->power_zone);
1145 		kfree(priv->platform_rapl_domain);
1146 	}
1147 }
1148 EXPORT_SYMBOL_GPL(rapl_remove_platform_domain);
1149 
1150 static int rapl_check_domain(int cpu, int domain, struct rapl_package *rp)
1151 {
1152 	struct reg_action ra;
1153 
1154 	switch (domain) {
1155 	case RAPL_DOMAIN_PACKAGE:
1156 	case RAPL_DOMAIN_PP0:
1157 	case RAPL_DOMAIN_PP1:
1158 	case RAPL_DOMAIN_DRAM:
1159 		ra.reg = rp->priv->regs[domain][RAPL_DOMAIN_REG_STATUS];
1160 		break;
1161 	case RAPL_DOMAIN_PLATFORM:
1162 		/* PSYS(PLATFORM) is not a CPU domain, so avoid printng error */
1163 		return -EINVAL;
1164 	default:
1165 		pr_err("invalid domain id %d\n", domain);
1166 		return -EINVAL;
1167 	}
1168 	/* make sure domain counters are available and contains non-zero
1169 	 * values, otherwise skip it.
1170 	 */
1171 
1172 	ra.mask = ~0;
1173 	if (rp->priv->read_raw(cpu, &ra) || !ra.value)
1174 		return -ENODEV;
1175 
1176 	return 0;
1177 }
1178 
1179 /*
1180  * Check if power limits are available. Two cases when they are not available:
1181  * 1. Locked by BIOS, in this case we still provide read-only access so that
1182  *    users can see what limit is set by the BIOS.
1183  * 2. Some CPUs make some domains monitoring only which means PLx MSRs may not
1184  *    exist at all. In this case, we do not show the constraints in powercap.
1185  *
1186  * Called after domains are detected and initialized.
1187  */
1188 static void rapl_detect_powerlimit(struct rapl_domain *rd)
1189 {
1190 	u64 val64;
1191 	int i;
1192 
1193 	/* check if the domain is locked by BIOS, ignore if MSR doesn't exist */
1194 	if (!rapl_read_data_raw(rd, FW_LOCK, false, &val64)) {
1195 		if (val64) {
1196 			pr_info("RAPL %s domain %s locked by BIOS\n",
1197 				rd->rp->name, rd->name);
1198 			rd->state |= DOMAIN_STATE_BIOS_LOCKED;
1199 		}
1200 	}
1201 	/* check if power limit MSR exists, otherwise domain is monitoring only */
1202 	for (i = 0; i < NR_POWER_LIMITS; i++) {
1203 		int prim = rd->rpl[i].prim_id;
1204 
1205 		if (rapl_read_data_raw(rd, prim, false, &val64))
1206 			rd->rpl[i].name = NULL;
1207 	}
1208 }
1209 
1210 /* Detect active and valid domains for the given CPU, caller must
1211  * ensure the CPU belongs to the targeted package and CPU hotlug is disabled.
1212  */
1213 static int rapl_detect_domains(struct rapl_package *rp, int cpu)
1214 {
1215 	struct rapl_domain *rd;
1216 	int i;
1217 
1218 	for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
1219 		/* use physical package id to read counters */
1220 		if (!rapl_check_domain(cpu, i, rp)) {
1221 			rp->domain_map |= 1 << i;
1222 			pr_info("Found RAPL domain %s\n", rapl_domain_names[i]);
1223 		}
1224 	}
1225 	rp->nr_domains = bitmap_weight(&rp->domain_map, RAPL_DOMAIN_MAX);
1226 	if (!rp->nr_domains) {
1227 		pr_debug("no valid rapl domains found in %s\n", rp->name);
1228 		return -ENODEV;
1229 	}
1230 	pr_debug("found %d domains on %s\n", rp->nr_domains, rp->name);
1231 
1232 	rp->domains = kcalloc(rp->nr_domains + 1, sizeof(struct rapl_domain),
1233 			      GFP_KERNEL);
1234 	if (!rp->domains)
1235 		return -ENOMEM;
1236 
1237 	rapl_init_domains(rp);
1238 
1239 	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++)
1240 		rapl_detect_powerlimit(rd);
1241 
1242 	return 0;
1243 }
1244 
1245 /* called from CPU hotplug notifier, hotplug lock held */
1246 void rapl_remove_package(struct rapl_package *rp)
1247 {
1248 	struct rapl_domain *rd, *rd_package = NULL;
1249 
1250 	package_power_limit_irq_restore(rp);
1251 
1252 	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
1253 		rapl_write_data_raw(rd, PL1_ENABLE, 0);
1254 		rapl_write_data_raw(rd, PL1_CLAMP, 0);
1255 		if (find_nr_power_limit(rd) > 1) {
1256 			rapl_write_data_raw(rd, PL2_ENABLE, 0);
1257 			rapl_write_data_raw(rd, PL2_CLAMP, 0);
1258 		}
1259 		if (rd->id == RAPL_DOMAIN_PACKAGE) {
1260 			rd_package = rd;
1261 			continue;
1262 		}
1263 		pr_debug("remove package, undo power limit on %s: %s\n",
1264 			 rp->name, rd->name);
1265 		powercap_unregister_zone(rp->priv->control_type,
1266 					 &rd->power_zone);
1267 	}
1268 	/* do parent zone last */
1269 	powercap_unregister_zone(rp->priv->control_type,
1270 				 &rd_package->power_zone);
1271 	list_del(&rp->plist);
1272 	kfree(rp);
1273 }
1274 EXPORT_SYMBOL_GPL(rapl_remove_package);
1275 
1276 /* caller to ensure CPU hotplug lock is held */
1277 struct rapl_package *rapl_find_package_domain(int cpu, struct rapl_if_priv *priv)
1278 {
1279 	int id = topology_logical_die_id(cpu);
1280 	struct rapl_package *rp;
1281 
1282 	list_for_each_entry(rp, &rapl_packages, plist) {
1283 		if (rp->id == id
1284 		    && rp->priv->control_type == priv->control_type)
1285 			return rp;
1286 	}
1287 
1288 	return NULL;
1289 }
1290 EXPORT_SYMBOL_GPL(rapl_find_package_domain);
1291 
1292 /* called from CPU hotplug notifier, hotplug lock held */
1293 struct rapl_package *rapl_add_package(int cpu, struct rapl_if_priv *priv)
1294 {
1295 	int id = topology_logical_die_id(cpu);
1296 	struct rapl_package *rp;
1297 	struct cpuinfo_x86 *c = &cpu_data(cpu);
1298 	int ret;
1299 
1300 	if (!rapl_defaults)
1301 		return ERR_PTR(-ENODEV);
1302 
1303 	rp = kzalloc(sizeof(struct rapl_package), GFP_KERNEL);
1304 	if (!rp)
1305 		return ERR_PTR(-ENOMEM);
1306 
1307 	/* add the new package to the list */
1308 	rp->id = id;
1309 	rp->lead_cpu = cpu;
1310 	rp->priv = priv;
1311 
1312 	if (topology_max_die_per_package() > 1)
1313 		snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH,
1314 			 "package-%d-die-%d", c->phys_proc_id, c->cpu_die_id);
1315 	else
1316 		snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH, "package-%d",
1317 			 c->phys_proc_id);
1318 
1319 	/* check if the package contains valid domains */
1320 	if (rapl_detect_domains(rp, cpu) || rapl_defaults->check_unit(rp, cpu)) {
1321 		ret = -ENODEV;
1322 		goto err_free_package;
1323 	}
1324 	ret = rapl_package_register_powercap(rp);
1325 	if (!ret) {
1326 		INIT_LIST_HEAD(&rp->plist);
1327 		list_add(&rp->plist, &rapl_packages);
1328 		return rp;
1329 	}
1330 
1331 err_free_package:
1332 	kfree(rp->domains);
1333 	kfree(rp);
1334 	return ERR_PTR(ret);
1335 }
1336 EXPORT_SYMBOL_GPL(rapl_add_package);
1337 
1338 static void power_limit_state_save(void)
1339 {
1340 	struct rapl_package *rp;
1341 	struct rapl_domain *rd;
1342 	int nr_pl, ret, i;
1343 
1344 	get_online_cpus();
1345 	list_for_each_entry(rp, &rapl_packages, plist) {
1346 		if (!rp->power_zone)
1347 			continue;
1348 		rd = power_zone_to_rapl_domain(rp->power_zone);
1349 		nr_pl = find_nr_power_limit(rd);
1350 		for (i = 0; i < nr_pl; i++) {
1351 			switch (rd->rpl[i].prim_id) {
1352 			case PL1_ENABLE:
1353 				ret = rapl_read_data_raw(rd,
1354 						 POWER_LIMIT1, true,
1355 						 &rd->rpl[i].last_power_limit);
1356 				if (ret)
1357 					rd->rpl[i].last_power_limit = 0;
1358 				break;
1359 			case PL2_ENABLE:
1360 				ret = rapl_read_data_raw(rd,
1361 						 POWER_LIMIT2, true,
1362 						 &rd->rpl[i].last_power_limit);
1363 				if (ret)
1364 					rd->rpl[i].last_power_limit = 0;
1365 				break;
1366 			}
1367 		}
1368 	}
1369 	put_online_cpus();
1370 }
1371 
1372 static void power_limit_state_restore(void)
1373 {
1374 	struct rapl_package *rp;
1375 	struct rapl_domain *rd;
1376 	int nr_pl, i;
1377 
1378 	get_online_cpus();
1379 	list_for_each_entry(rp, &rapl_packages, plist) {
1380 		if (!rp->power_zone)
1381 			continue;
1382 		rd = power_zone_to_rapl_domain(rp->power_zone);
1383 		nr_pl = find_nr_power_limit(rd);
1384 		for (i = 0; i < nr_pl; i++) {
1385 			switch (rd->rpl[i].prim_id) {
1386 			case PL1_ENABLE:
1387 				if (rd->rpl[i].last_power_limit)
1388 					rapl_write_data_raw(rd, POWER_LIMIT1,
1389 					    rd->rpl[i].last_power_limit);
1390 				break;
1391 			case PL2_ENABLE:
1392 				if (rd->rpl[i].last_power_limit)
1393 					rapl_write_data_raw(rd, POWER_LIMIT2,
1394 					    rd->rpl[i].last_power_limit);
1395 				break;
1396 			}
1397 		}
1398 	}
1399 	put_online_cpus();
1400 }
1401 
1402 static int rapl_pm_callback(struct notifier_block *nb,
1403 			    unsigned long mode, void *_unused)
1404 {
1405 	switch (mode) {
1406 	case PM_SUSPEND_PREPARE:
1407 		power_limit_state_save();
1408 		break;
1409 	case PM_POST_SUSPEND:
1410 		power_limit_state_restore();
1411 		break;
1412 	}
1413 	return NOTIFY_OK;
1414 }
1415 
/*
 * PM notifier that dispatches to rapl_pm_callback(); registered in
 * rapl_init() and unregistered in rapl_exit().
 */
static struct notifier_block rapl_pm_notifier = {
	.notifier_call = rapl_pm_callback,
};
1419 
/*
 * The "intel_rapl_msr" platform device, allocated and added by rapl_init()
 * and unregistered by rapl_exit().
 */
static struct platform_device *rapl_msr_platdev;
1421 
1422 static int __init rapl_init(void)
1423 {
1424 	const struct x86_cpu_id *id;
1425 	int ret;
1426 
1427 	id = x86_match_cpu(rapl_ids);
1428 	if (!id) {
1429 		pr_err("driver does not support CPU family %d model %d\n",
1430 		       boot_cpu_data.x86, boot_cpu_data.x86_model);
1431 
1432 		return -ENODEV;
1433 	}
1434 
1435 	rapl_defaults = (struct rapl_defaults *)id->driver_data;
1436 
1437 	ret = register_pm_notifier(&rapl_pm_notifier);
1438 	if (ret)
1439 		return ret;
1440 
1441 	rapl_msr_platdev = platform_device_alloc("intel_rapl_msr", 0);
1442 	if (!rapl_msr_platdev) {
1443 		ret = -ENOMEM;
1444 		goto end;
1445 	}
1446 
1447 	ret = platform_device_add(rapl_msr_platdev);
1448 	if (ret)
1449 		platform_device_put(rapl_msr_platdev);
1450 
1451 end:
1452 	if (ret)
1453 		unregister_pm_notifier(&rapl_pm_notifier);
1454 
1455 	return ret;
1456 }
1457 
/*
 * Driver teardown: undo rapl_init() in reverse order by unregistering the
 * platform device first and the PM notifier second.
 */
static void __exit rapl_exit(void)
{
	platform_device_unregister(rapl_msr_platdev);
	unregister_pm_notifier(&rapl_pm_notifier);
}
1463 
/* rapl_init() is hooked in through fs_initcall() rather than module_init() */
fs_initcall(rapl_init);
module_exit(rapl_exit);

/* module metadata */
MODULE_DESCRIPTION("Intel Runtime Average Power Limit (RAPL) common code");
MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@intel.com>");
MODULE_LICENSE("GPL v2");
1470