xref: /openbmc/linux/drivers/thermal/intel/intel_powerclamp.c (revision b97d6790d03b763eca08847a9a5869a4291b9f9a)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * intel_powerclamp.c - package c-state idle injection
4  *
5  * Copyright (c) 2012-2023, Intel Corporation.
6  *
7  * Authors:
8  *     Arjan van de Ven <arjan@linux.intel.com>
9  *     Jacob Pan <jacob.jun.pan@linux.intel.com>
10  *
11  *	TODO:
12  *           1. better handle wakeup from external interrupts, currently a fixed
13  *              compensation is added to clamping duration when excessive amount
14  *              of wakeups are observed during idle time. the reason is that in
15  *              case of external interrupts without need for ack, clamping down
16  *              cpu in non-irq context does not reduce irq. for majority of the
17  *              cases, clamping down cpu does help reduce irq as well, we should
18  *              be able to differentiate the two cases and give a quantitative
19  *              solution for the irqs that we can control. perhaps based on
20  *              get_cpu_iowait_time_us()
21  *
22  *	     2. synchronization with other hw blocks
23  */
24 
25 #define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
26 
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/cpu.h>
#include <linux/math64.h>
#include <linux/thermal.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/idle_inject.h>

#include <asm/msr.h>
#include <asm/mwait.h>
#include <asm/cpu_device_id.h>
39 
/*
 * Idle ratio limit in percent. It is reported as the cooling device's
 * maximum state and sizes the calibration table.
 */
#define MAX_TARGET_RATIO 100U

/*
 * Every clamping period that completes undisturbed (no extra wakeups
 * during idle time) bumps the confidence counter of the ratio being
 * targeted. Runtime calibration results are trusted once that counter
 * reaches CONFIDENCE_OK.
 */
#define CONFIDENCE_OK 3

/*
 * Default idle injection duration. The driver adjusts the run time around
 * it to meet the target idle ratio, similar to frequency modulation.
 */
#define DEFAULT_DURATION_JIFFIES 6
51 
/* MWAIT hint computed by find_target_mwait() */
static unsigned int target_mwait;
/* debugfs directory created by powerclamp_create_debug_files() */
static struct dentry *debug_dir;
/* True while package C-state polling and compensation are in use */
static bool poll_pkg_cstate_enable;

/* Idle ratio observed using package C-state counters */
static unsigned int current_ratio;

/* When true, idle_inject_update() asks the core to skip this injection */
static bool should_skip;
61 
/* Run-time state of the clamping control loop */
struct powerclamp_data {
	/* not referenced by the code visible in this file */
	unsigned int cpu;
	/* number of idle_inject_update() invocations so far */
	unsigned int count;
	/* margin added to target_ratio before injection is skipped */
	unsigned int guard;
	/* window_size sampled by the last get_run_time() call */
	unsigned int window_size_now;
	/* requested idle ratio in percent, 0 when not clamping */
	unsigned int target_ratio;
	/* true between trigger_idle_injection() and remove_idle_injection() */
	bool clamping;
};

static struct powerclamp_data powerclamp_data;
72 
/* Cooling device registered with the thermal core in powerclamp_init() */
static struct thermal_cooling_device *cooling_dev;

/* Serializes parameter updates and clamping start/stop/adjust */
static DEFINE_MUTEX(powerclamp_lock);

/* Idle duration of one injection, in microseconds */
static unsigned int duration;
/* Package C-state ratio measured by the periodic poll_pkg_cstate() work */
static unsigned int pkg_cstate_ratio_cur;
/* Sliding window length, in clamping cycles */
static unsigned int window_size;
81 
duration_set(const char * arg,const struct kernel_param * kp)82 static int duration_set(const char *arg, const struct kernel_param *kp)
83 {
84 	int ret = 0;
85 	unsigned long new_duration;
86 
87 	ret = kstrtoul(arg, 10, &new_duration);
88 	if (ret)
89 		goto exit;
90 	if (new_duration > 25 || new_duration < 6) {
91 		pr_err("Out of recommended range %lu, between 6-25ms\n",
92 			new_duration);
93 		ret = -EINVAL;
94 		goto exit;
95 	}
96 
97 	mutex_lock(&powerclamp_lock);
98 	duration = clamp(new_duration, 6ul, 25ul) * 1000;
99 	mutex_unlock(&powerclamp_lock);
100 exit:
101 
102 	return ret;
103 }
104 
duration_get(char * buf,const struct kernel_param * kp)105 static int duration_get(char *buf, const struct kernel_param *kp)
106 {
107 	int ret;
108 
109 	mutex_lock(&powerclamp_lock);
110 	ret = sysfs_emit(buf, "%d\n", duration / 1000);
111 	mutex_unlock(&powerclamp_lock);
112 
113 	return ret;
114 }
115 
116 static const struct kernel_param_ops duration_ops = {
117 	.set = duration_set,
118 	.get = duration_get,
119 };
120 
121 module_param_cb(duration, &duration_ops, NULL, 0644);
122 MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");
123 
124 #define DEFAULT_MAX_IDLE	50
125 #define MAX_ALL_CPU_IDLE	75
126 
127 static u8 max_idle = DEFAULT_MAX_IDLE;
128 
129 static cpumask_var_t idle_injection_cpu_mask;
130 
allocate_copy_idle_injection_mask(const struct cpumask * copy_mask)131 static int allocate_copy_idle_injection_mask(const struct cpumask *copy_mask)
132 {
133 	if (cpumask_available(idle_injection_cpu_mask))
134 		goto copy_mask;
135 
136 	/* This mask is allocated only one time and freed during module exit */
137 	if (!alloc_cpumask_var(&idle_injection_cpu_mask, GFP_KERNEL))
138 		return -ENOMEM;
139 
140 copy_mask:
141 	cpumask_copy(idle_injection_cpu_mask, copy_mask);
142 
143 	return 0;
144 }
145 
146 /* Return true if the cpumask and idle percent combination is invalid */
check_invalid(cpumask_var_t mask,u8 idle)147 static bool check_invalid(cpumask_var_t mask, u8 idle)
148 {
149 	if (cpumask_equal(cpu_present_mask, mask) && idle > MAX_ALL_CPU_IDLE)
150 		return true;
151 
152 	return false;
153 }
154 
cpumask_set(const char * arg,const struct kernel_param * kp)155 static int cpumask_set(const char *arg, const struct kernel_param *kp)
156 {
157 	cpumask_var_t new_mask;
158 	int ret;
159 
160 	mutex_lock(&powerclamp_lock);
161 
162 	/* Can't set mask when cooling device is in use */
163 	if (powerclamp_data.clamping) {
164 		ret = -EAGAIN;
165 		goto skip_cpumask_set;
166 	}
167 
168 	ret = alloc_cpumask_var(&new_mask, GFP_KERNEL);
169 	if (!ret)
170 		goto skip_cpumask_set;
171 
172 	ret = bitmap_parse(arg, strlen(arg), cpumask_bits(new_mask),
173 			   nr_cpumask_bits);
174 	if (ret)
175 		goto free_cpumask_set;
176 
177 	if (cpumask_empty(new_mask) || check_invalid(new_mask, max_idle)) {
178 		ret = -EINVAL;
179 		goto free_cpumask_set;
180 	}
181 
182 	/*
183 	 * When module parameters are passed from kernel command line
184 	 * during insmod, the module parameter callback is called
185 	 * before powerclamp_init(), so we can't assume that some
186 	 * cpumask can be allocated and copied before here. Also
187 	 * in this case this cpumask is used as the default mask.
188 	 */
189 	ret = allocate_copy_idle_injection_mask(new_mask);
190 
191 free_cpumask_set:
192 	free_cpumask_var(new_mask);
193 skip_cpumask_set:
194 	mutex_unlock(&powerclamp_lock);
195 
196 	return ret;
197 }
198 
cpumask_get(char * buf,const struct kernel_param * kp)199 static int cpumask_get(char *buf, const struct kernel_param *kp)
200 {
201 	if (!cpumask_available(idle_injection_cpu_mask))
202 		return -ENODEV;
203 
204 	return bitmap_print_to_pagebuf(false, buf, cpumask_bits(idle_injection_cpu_mask),
205 				       nr_cpumask_bits);
206 }
207 
208 static const struct kernel_param_ops cpumask_ops = {
209 	.set = cpumask_set,
210 	.get = cpumask_get,
211 };
212 
213 module_param_cb(cpumask, &cpumask_ops, NULL, 0644);
214 MODULE_PARM_DESC(cpumask, "Mask of CPUs to use for idle injection.");
215 
max_idle_set(const char * arg,const struct kernel_param * kp)216 static int max_idle_set(const char *arg, const struct kernel_param *kp)
217 {
218 	u8 new_max_idle;
219 	int ret = 0;
220 
221 	mutex_lock(&powerclamp_lock);
222 
223 	/* Can't set mask when cooling device is in use */
224 	if (powerclamp_data.clamping) {
225 		ret = -EAGAIN;
226 		goto skip_limit_set;
227 	}
228 
229 	ret = kstrtou8(arg, 10, &new_max_idle);
230 	if (ret)
231 		goto skip_limit_set;
232 
233 	if (new_max_idle > MAX_TARGET_RATIO) {
234 		ret = -EINVAL;
235 		goto skip_limit_set;
236 	}
237 
238 	if (!cpumask_available(idle_injection_cpu_mask)) {
239 		ret = allocate_copy_idle_injection_mask(cpu_present_mask);
240 		if (ret)
241 			goto skip_limit_set;
242 	}
243 
244 	if (check_invalid(idle_injection_cpu_mask, new_max_idle)) {
245 		ret = -EINVAL;
246 		goto skip_limit_set;
247 	}
248 
249 	max_idle = new_max_idle;
250 
251 skip_limit_set:
252 	mutex_unlock(&powerclamp_lock);
253 
254 	return ret;
255 }
256 
257 static const struct kernel_param_ops max_idle_ops = {
258 	.set = max_idle_set,
259 	.get = param_get_byte,
260 };
261 
262 module_param_cb(max_idle, &max_idle_ops, &max_idle, 0644);
263 MODULE_PARM_DESC(max_idle, "maximum injected idle time to the total CPU time ratio in percent range:1-100");
264 
265 struct powerclamp_calibration_data {
266 	unsigned long confidence;  /* used for calibration, basically a counter
267 				    * gets incremented each time a clamping
268 				    * period is completed without extra wakeups
269 				    * once that counter is reached given level,
270 				    * compensation is deemed usable.
271 				    */
272 	unsigned long steady_comp; /* steady state compensation used when
273 				    * no extra wakeups occurred.
274 				    */
275 	unsigned long dynamic_comp; /* compensate excessive wakeup from idle
276 				     * mostly from external interrupts.
277 				     */
278 };
279 
280 static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO];
281 
window_size_set(const char * arg,const struct kernel_param * kp)282 static int window_size_set(const char *arg, const struct kernel_param *kp)
283 {
284 	int ret = 0;
285 	unsigned long new_window_size;
286 
287 	ret = kstrtoul(arg, 10, &new_window_size);
288 	if (ret)
289 		goto exit_win;
290 	if (new_window_size > 10 || new_window_size < 2) {
291 		pr_err("Out of recommended window size %lu, between 2-10\n",
292 			new_window_size);
293 		ret = -EINVAL;
294 	}
295 
296 	window_size = clamp(new_window_size, 2ul, 10ul);
297 	smp_mb();
298 
299 exit_win:
300 
301 	return ret;
302 }
303 
304 static const struct kernel_param_ops window_size_ops = {
305 	.set = window_size_set,
306 	.get = param_get_int,
307 };
308 
309 module_param_cb(window_size, &window_size_ops, &window_size, 0644);
310 MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n"
311 	"\tpowerclamp controls idle ratio within this window. larger\n"
312 	"\twindow size results in slower response time but more smooth\n"
313 	"\tclamping results. default to 2.");
314 
find_target_mwait(void)315 static void find_target_mwait(void)
316 {
317 	unsigned int eax, ebx, ecx, edx;
318 	unsigned int highest_cstate = 0;
319 	unsigned int highest_subcstate = 0;
320 	int i;
321 
322 	if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
323 		return;
324 
325 	cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
326 
327 	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
328 	    !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
329 		return;
330 
331 	edx >>= MWAIT_SUBSTATE_SIZE;
332 	for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
333 		if (edx & MWAIT_SUBSTATE_MASK) {
334 			highest_cstate = i;
335 			highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
336 		}
337 	}
338 	target_mwait = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
339 		(highest_subcstate - 1);
340 
341 }
342 
343 struct pkg_cstate_info {
344 	bool skip;
345 	int msr_index;
346 	int cstate_id;
347 };
348 
349 #define PKG_CSTATE_INIT(id) {				\
350 		.msr_index = MSR_PKG_C##id##_RESIDENCY, \
351 		.cstate_id = id				\
352 			}
353 
354 static struct pkg_cstate_info pkg_cstates[] = {
355 	PKG_CSTATE_INIT(2),
356 	PKG_CSTATE_INIT(3),
357 	PKG_CSTATE_INIT(6),
358 	PKG_CSTATE_INIT(7),
359 	PKG_CSTATE_INIT(8),
360 	PKG_CSTATE_INIT(9),
361 	PKG_CSTATE_INIT(10),
362 	{NULL},
363 };
364 
has_pkg_state_counter(void)365 static bool has_pkg_state_counter(void)
366 {
367 	u64 val;
368 	struct pkg_cstate_info *info = pkg_cstates;
369 
370 	/* check if any one of the counter msrs exists */
371 	while (info->msr_index) {
372 		if (!rdmsrl_safe(info->msr_index, &val))
373 			return true;
374 		info++;
375 	}
376 
377 	return false;
378 }
379 
pkg_state_counter(void)380 static u64 pkg_state_counter(void)
381 {
382 	u64 val;
383 	u64 count = 0;
384 	struct pkg_cstate_info *info = pkg_cstates;
385 
386 	while (info->msr_index) {
387 		if (!info->skip) {
388 			if (!rdmsrl_safe(info->msr_index, &val))
389 				count += val;
390 			else
391 				info->skip = true;
392 		}
393 		info++;
394 	}
395 
396 	return count;
397 }
398 
get_compensation(int ratio)399 static unsigned int get_compensation(int ratio)
400 {
401 	unsigned int comp = 0;
402 
403 	if (!poll_pkg_cstate_enable)
404 		return 0;
405 
406 	/* we only use compensation if all adjacent ones are good */
407 	if (ratio == 1 &&
408 		cal_data[ratio].confidence >= CONFIDENCE_OK &&
409 		cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
410 		cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
411 		comp = (cal_data[ratio].steady_comp +
412 			cal_data[ratio + 1].steady_comp +
413 			cal_data[ratio + 2].steady_comp) / 3;
414 	} else if (ratio == MAX_TARGET_RATIO - 1 &&
415 		cal_data[ratio].confidence >= CONFIDENCE_OK &&
416 		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
417 		cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
418 		comp = (cal_data[ratio].steady_comp +
419 			cal_data[ratio - 1].steady_comp +
420 			cal_data[ratio - 2].steady_comp) / 3;
421 	} else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
422 		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
423 		cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
424 		comp = (cal_data[ratio].steady_comp +
425 			cal_data[ratio - 1].steady_comp +
426 			cal_data[ratio + 1].steady_comp) / 3;
427 	}
428 
429 	/* do not exceed limit */
430 	if (comp + ratio >= MAX_TARGET_RATIO)
431 		comp = MAX_TARGET_RATIO - ratio - 1;
432 
433 	return comp;
434 }
435 
adjust_compensation(int target_ratio,unsigned int win)436 static void adjust_compensation(int target_ratio, unsigned int win)
437 {
438 	int delta;
439 	struct powerclamp_calibration_data *d = &cal_data[target_ratio];
440 
441 	/*
442 	 * adjust compensations if confidence level has not been reached.
443 	 */
444 	if (d->confidence >= CONFIDENCE_OK)
445 		return;
446 
447 	delta = powerclamp_data.target_ratio - current_ratio;
448 	/* filter out bad data */
449 	if (delta >= 0 && delta <= (1+target_ratio/10)) {
450 		if (d->steady_comp)
451 			d->steady_comp =
452 				roundup(delta+d->steady_comp, 2)/2;
453 		else
454 			d->steady_comp = delta;
455 		d->confidence++;
456 	}
457 }
458 
powerclamp_adjust_controls(unsigned int target_ratio,unsigned int guard,unsigned int win)459 static bool powerclamp_adjust_controls(unsigned int target_ratio,
460 				unsigned int guard, unsigned int win)
461 {
462 	static u64 msr_last, tsc_last;
463 	u64 msr_now, tsc_now;
464 	u64 val64;
465 
466 	/* check result for the last window */
467 	msr_now = pkg_state_counter();
468 	tsc_now = rdtsc();
469 
470 	/* calculate pkg cstate vs tsc ratio */
471 	if (!msr_last || !tsc_last)
472 		current_ratio = 1;
473 	else if (tsc_now-tsc_last) {
474 		val64 = 100*(msr_now-msr_last);
475 		do_div(val64, (tsc_now-tsc_last));
476 		current_ratio = val64;
477 	}
478 
479 	/* update record */
480 	msr_last = msr_now;
481 	tsc_last = tsc_now;
482 
483 	adjust_compensation(target_ratio, win);
484 
485 	/* if we are above target+guard, skip */
486 	return powerclamp_data.target_ratio + guard <= current_ratio;
487 }
488 
489 /*
490  * This function calculates runtime from the current target ratio.
491  * This function gets called under powerclamp_lock.
492  */
get_run_time(void)493 static unsigned int get_run_time(void)
494 {
495 	unsigned int compensated_ratio;
496 	unsigned int runtime;
497 
498 	/*
499 	 * make sure user selected ratio does not take effect until
500 	 * the next round. adjust target_ratio if user has changed
501 	 * target such that we can converge quickly.
502 	 */
503 	powerclamp_data.guard = 1 + powerclamp_data.target_ratio / 20;
504 	powerclamp_data.window_size_now = window_size;
505 
506 	/*
507 	 * systems may have different ability to enter package level
508 	 * c-states, thus we need to compensate the injected idle ratio
509 	 * to achieve the actual target reported by the HW.
510 	 */
511 	compensated_ratio = powerclamp_data.target_ratio +
512 		get_compensation(powerclamp_data.target_ratio);
513 	if (compensated_ratio <= 0)
514 		compensated_ratio = 1;
515 
516 	runtime = duration * 100 / compensated_ratio - duration;
517 
518 	return runtime;
519 }
520 
521 /*
522  * 1 HZ polling while clamping is active, useful for userspace
523  * to monitor actual idle ratio.
524  */
525 static void poll_pkg_cstate(struct work_struct *dummy);
526 static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate);
poll_pkg_cstate(struct work_struct * dummy)527 static void poll_pkg_cstate(struct work_struct *dummy)
528 {
529 	static u64 msr_last;
530 	static u64 tsc_last;
531 
532 	u64 msr_now;
533 	u64 tsc_now;
534 	u64 val64;
535 
536 	msr_now = pkg_state_counter();
537 	tsc_now = rdtsc();
538 
539 	/* calculate pkg cstate vs tsc ratio */
540 	if (!msr_last || !tsc_last)
541 		pkg_cstate_ratio_cur = 1;
542 	else {
543 		if (tsc_now - tsc_last) {
544 			val64 = 100 * (msr_now - msr_last);
545 			do_div(val64, (tsc_now - tsc_last));
546 			pkg_cstate_ratio_cur = val64;
547 		}
548 	}
549 
550 	/* update record */
551 	msr_last = msr_now;
552 	tsc_last = tsc_now;
553 
554 	mutex_lock(&powerclamp_lock);
555 	if (powerclamp_data.clamping)
556 		schedule_delayed_work(&poll_pkg_cstate_work, HZ);
557 	mutex_unlock(&powerclamp_lock);
558 }
559 
560 static struct idle_inject_device *ii_dev;
561 
562 /*
563  * This function is called from idle injection core on timer expiry
564  * for the run duration. This allows powerclamp to readjust or skip
565  * injecting idle for this cycle.
566  */
idle_inject_update(void)567 static bool idle_inject_update(void)
568 {
569 	bool update = false;
570 
571 	/* We can't sleep in this callback */
572 	if (!mutex_trylock(&powerclamp_lock))
573 		return true;
574 
575 	if (!(powerclamp_data.count % powerclamp_data.window_size_now)) {
576 
577 		should_skip = powerclamp_adjust_controls(powerclamp_data.target_ratio,
578 							 powerclamp_data.guard,
579 							 powerclamp_data.window_size_now);
580 		update = true;
581 	}
582 
583 	if (update) {
584 		unsigned int runtime = get_run_time();
585 
586 		idle_inject_set_duration(ii_dev, runtime, duration);
587 	}
588 
589 	powerclamp_data.count++;
590 
591 	mutex_unlock(&powerclamp_lock);
592 
593 	if (should_skip)
594 		return false;
595 
596 	return true;
597 }
598 
599 /* This function starts idle injection by calling idle_inject_start() */
trigger_idle_injection(void)600 static void trigger_idle_injection(void)
601 {
602 	unsigned int runtime = get_run_time();
603 
604 	idle_inject_set_duration(ii_dev, runtime, duration);
605 	idle_inject_start(ii_dev);
606 	powerclamp_data.clamping = true;
607 }
608 
609 /*
610  * This function is called from start_power_clamp() to register
611  * CPUS with powercap idle injection register and set default
612  * idle duration and latency.
613  */
powerclamp_idle_injection_register(void)614 static int powerclamp_idle_injection_register(void)
615 {
616 	poll_pkg_cstate_enable = false;
617 	if (cpumask_equal(cpu_present_mask, idle_injection_cpu_mask)) {
618 		ii_dev = idle_inject_register_full(idle_injection_cpu_mask, idle_inject_update);
619 		if (topology_max_packages() == 1 && topology_max_die_per_package() == 1)
620 			poll_pkg_cstate_enable = true;
621 	} else {
622 		ii_dev = idle_inject_register(idle_injection_cpu_mask);
623 	}
624 
625 	if (!ii_dev) {
626 		pr_err("powerclamp: idle_inject_register failed\n");
627 		return -EAGAIN;
628 	}
629 
630 	idle_inject_set_duration(ii_dev, TICK_USEC, duration);
631 	idle_inject_set_latency(ii_dev, UINT_MAX);
632 
633 	return 0;
634 }
635 
636 /*
637  * This function is called from end_power_clamp() to stop idle injection
638  * and unregister CPUS from powercap idle injection core.
639  */
remove_idle_injection(void)640 static void remove_idle_injection(void)
641 {
642 	if (!powerclamp_data.clamping)
643 		return;
644 
645 	powerclamp_data.clamping = false;
646 	idle_inject_stop(ii_dev);
647 }
648 
649 /*
650  * This function is called when user change the cooling device
651  * state from zero to some other value.
652  */
start_power_clamp(void)653 static int start_power_clamp(void)
654 {
655 	int ret;
656 
657 	ret = powerclamp_idle_injection_register();
658 	if (!ret) {
659 		trigger_idle_injection();
660 		if (poll_pkg_cstate_enable)
661 			schedule_delayed_work(&poll_pkg_cstate_work, 0);
662 	}
663 
664 	return ret;
665 }
666 
667 /*
668  * This function is called when user change the cooling device
669  * state from non zero value zero.
670  */
end_power_clamp(void)671 static void end_power_clamp(void)
672 {
673 	if (powerclamp_data.clamping) {
674 		remove_idle_injection();
675 		idle_inject_unregister(ii_dev);
676 	}
677 }
678 
powerclamp_get_max_state(struct thermal_cooling_device * cdev,unsigned long * state)679 static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
680 				 unsigned long *state)
681 {
682 	*state = MAX_TARGET_RATIO;
683 
684 	return 0;
685 }
686 
powerclamp_get_cur_state(struct thermal_cooling_device * cdev,unsigned long * state)687 static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
688 				 unsigned long *state)
689 {
690 	mutex_lock(&powerclamp_lock);
691 	*state = powerclamp_data.target_ratio;
692 	mutex_unlock(&powerclamp_lock);
693 
694 	return 0;
695 }
696 
powerclamp_set_cur_state(struct thermal_cooling_device * cdev,unsigned long new_target_ratio)697 static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
698 				 unsigned long new_target_ratio)
699 {
700 	int ret = 0;
701 
702 	mutex_lock(&powerclamp_lock);
703 
704 	new_target_ratio = clamp(new_target_ratio, 0UL,
705 				(unsigned long) (max_idle - 1));
706 
707 	if (powerclamp_data.target_ratio == new_target_ratio)
708 		goto exit_set;
709 
710 	if (!powerclamp_data.target_ratio && new_target_ratio > 0) {
711 		pr_info("Start idle injection to reduce power\n");
712 		powerclamp_data.target_ratio = new_target_ratio;
713 		ret = start_power_clamp();
714 		if (ret)
715 			powerclamp_data.target_ratio = 0;
716 		goto exit_set;
717 	} else	if (powerclamp_data.target_ratio > 0 && new_target_ratio == 0) {
718 		pr_info("Stop forced idle injection\n");
719 		end_power_clamp();
720 		powerclamp_data.target_ratio = 0;
721 	} else	/* adjust currently running */ {
722 		unsigned int runtime;
723 
724 		powerclamp_data.target_ratio = new_target_ratio;
725 		runtime = get_run_time();
726 		idle_inject_set_duration(ii_dev, runtime, duration);
727 	}
728 
729 exit_set:
730 	mutex_unlock(&powerclamp_lock);
731 
732 	return ret;
733 }
734 
735 /* bind to generic thermal layer as cooling device*/
736 static const struct thermal_cooling_device_ops powerclamp_cooling_ops = {
737 	.get_max_state = powerclamp_get_max_state,
738 	.get_cur_state = powerclamp_get_cur_state,
739 	.set_cur_state = powerclamp_set_cur_state,
740 };
741 
742 static const struct x86_cpu_id __initconst intel_powerclamp_ids[] = {
743 	X86_MATCH_VENDOR_FEATURE(INTEL, X86_FEATURE_MWAIT, NULL),
744 	{}
745 };
746 MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);
747 
powerclamp_probe(void)748 static int __init powerclamp_probe(void)
749 {
750 
751 	if (!x86_match_cpu(intel_powerclamp_ids)) {
752 		pr_err("CPU does not support MWAIT\n");
753 		return -ENODEV;
754 	}
755 
756 	/* The goal for idle time alignment is to achieve package cstate. */
757 	if (!has_pkg_state_counter()) {
758 		pr_info("No package C-state available\n");
759 		return -ENODEV;
760 	}
761 
762 	/* find the deepest mwait value */
763 	find_target_mwait();
764 
765 	return 0;
766 }
767 
powerclamp_debug_show(struct seq_file * m,void * unused)768 static int powerclamp_debug_show(struct seq_file *m, void *unused)
769 {
770 	int i = 0;
771 
772 	seq_printf(m, "pct confidence steady dynamic (compensation)\n");
773 	for (i = 0; i < MAX_TARGET_RATIO; i++) {
774 		seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
775 			i,
776 			cal_data[i].confidence,
777 			cal_data[i].steady_comp,
778 			cal_data[i].dynamic_comp);
779 	}
780 
781 	return 0;
782 }
783 
784 DEFINE_SHOW_ATTRIBUTE(powerclamp_debug);
785 
powerclamp_create_debug_files(void)786 static inline void powerclamp_create_debug_files(void)
787 {
788 	debug_dir = debugfs_create_dir("intel_powerclamp", NULL);
789 
790 	debugfs_create_file("powerclamp_calib", S_IRUGO, debug_dir, cal_data,
791 			    &powerclamp_debug_fops);
792 }
793 
powerclamp_init(void)794 static int __init powerclamp_init(void)
795 {
796 	int retval;
797 
798 	/* probe cpu features and ids here */
799 	retval = powerclamp_probe();
800 	if (retval)
801 		return retval;
802 
803 	mutex_lock(&powerclamp_lock);
804 	if (!cpumask_available(idle_injection_cpu_mask))
805 		retval = allocate_copy_idle_injection_mask(cpu_present_mask);
806 	mutex_unlock(&powerclamp_lock);
807 
808 	if (retval)
809 		return retval;
810 
811 	/* set default limit, maybe adjusted during runtime based on feedback */
812 	window_size = 2;
813 
814 	cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
815 						      &powerclamp_cooling_ops);
816 	if (IS_ERR(cooling_dev))
817 		return -ENODEV;
818 
819 	if (!duration)
820 		duration = jiffies_to_usecs(DEFAULT_DURATION_JIFFIES);
821 
822 	powerclamp_create_debug_files();
823 
824 	return 0;
825 }
826 module_init(powerclamp_init);
827 
powerclamp_exit(void)828 static void __exit powerclamp_exit(void)
829 {
830 	mutex_lock(&powerclamp_lock);
831 	end_power_clamp();
832 	mutex_unlock(&powerclamp_lock);
833 
834 	thermal_cooling_device_unregister(cooling_dev);
835 
836 	cancel_delayed_work_sync(&poll_pkg_cstate_work);
837 	debugfs_remove_recursive(debug_dir);
838 
839 	if (cpumask_available(idle_injection_cpu_mask))
840 		free_cpumask_var(idle_injection_cpu_mask);
841 }
842 module_exit(powerclamp_exit);
843 
844 MODULE_IMPORT_NS(IDLE_INJECT);
845 
846 MODULE_LICENSE("GPL");
847 MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
848 MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
849 MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");
850