xref: /openbmc/linux/drivers/thermal/intel/intel_powerclamp.c (revision 498a1cf902c31c3af398082d65cf150b33b367e6)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * intel_powerclamp.c - package c-state idle injection
4  *
5  * Copyright (c) 2012-2023, Intel Corporation.
6  *
7  * Authors:
8  *     Arjan van de Ven <arjan@linux.intel.com>
9  *     Jacob Pan <jacob.jun.pan@linux.intel.com>
10  *
11  *	TODO:
12  *           1. better handle wakeup from external interrupts, currently a fixed
13  *              compensation is added to clamping duration when excessive amount
14  *              of wakeups are observed during idle time. the reason is that in
15  *              case of external interrupts without need for ack, clamping down
16  *              cpu in non-irq context does not reduce irq. for majority of the
17  *              cases, clamping down cpu does help reduce irq as well, we should
18  *              be able to differentiate the two cases and give a quantitative
19  *              solution for the irqs that we can control. perhaps based on
20  *              get_cpu_iowait_time_us()
21  *
22  *	     2. synchronization with other hw blocks
23  */
24 
25 #define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
26 
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/cpu.h>
#include <linux/math64.h>
#include <linux/thermal.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/idle_inject.h>

#include <asm/msr.h>
#include <asm/mwait.h>
#include <asm/cpu_device_id.h>
39 
/* Maximum cooling state reported to the thermal core; also sizes cal_data[]. */
#define MAX_TARGET_RATIO (100U)

/*
 * Every clamping period that completes undisturbed (no extra wakeups during
 * idle time) increments the confidence counter of the target ratio in use.
 * Runtime calibration results are treated as valid once that counter
 * reaches CONFIDENCE_OK.
 */
#define CONFIDENCE_OK (3)

/*
 * Default idle injection duration, in jiffies. The driver adjusts the sleep
 * time to meet the target idle ratio, similar to frequency modulation.
 */
#define DEFAULT_DURATION_JIFFIES (6)
51 
52 static unsigned int target_mwait;
53 static struct dentry *debug_dir;
54 static bool poll_pkg_cstate_enable;
55 
56 /* Idle ratio observed using package C-state counters */
57 static unsigned int current_ratio;
58 
59 /* Skip the idle injection till set to true */
60 static bool should_skip;
61 
62 struct powerclamp_data {
63 	unsigned int cpu;
64 	unsigned int count;
65 	unsigned int guard;
66 	unsigned int window_size_now;
67 	unsigned int target_ratio;
68 	bool clamping;
69 };
70 
71 static struct powerclamp_data powerclamp_data;
72 
/* Cooling device registered with the thermal core in powerclamp_init() */
static struct thermal_cooling_device *cooling_dev;

/* Serializes module parameter updates and clamping state changes */
static DEFINE_MUTEX(powerclamp_lock);

/* Idle injection duration per cycle, in microseconds */
static unsigned int duration;
/* Package C-state residency percentage sampled by poll_pkg_cstate() */
static unsigned int pkg_cstate_ratio_cur;
/* Number of clamping cycles per control window (module parameter) */
static unsigned int window_size;
81 
82 static int duration_set(const char *arg, const struct kernel_param *kp)
83 {
84 	int ret = 0;
85 	unsigned long new_duration;
86 
87 	ret = kstrtoul(arg, 10, &new_duration);
88 	if (ret)
89 		goto exit;
90 	if (new_duration > 25 || new_duration < 6) {
91 		pr_err("Out of recommended range %lu, between 6-25ms\n",
92 			new_duration);
93 		ret = -EINVAL;
94 		goto exit;
95 	}
96 
97 	mutex_lock(&powerclamp_lock);
98 	duration = clamp(new_duration, 6ul, 25ul) * 1000;
99 	mutex_unlock(&powerclamp_lock);
100 exit:
101 
102 	return ret;
103 }
104 
105 static int duration_get(char *buf, const struct kernel_param *kp)
106 {
107 	int ret;
108 
109 	mutex_lock(&powerclamp_lock);
110 	ret = sysfs_emit(buf, "%d\n", duration / 1000);
111 	mutex_unlock(&powerclamp_lock);
112 
113 	return ret;
114 }
115 
116 static const struct kernel_param_ops duration_ops = {
117 	.set = duration_set,
118 	.get = duration_get,
119 };
120 
121 module_param_cb(duration, &duration_ops, NULL, 0644);
122 MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");
123 
124 #define DEFAULT_MAX_IDLE	50
125 #define MAX_ALL_CPU_IDLE	75
126 
127 static u8 max_idle = DEFAULT_MAX_IDLE;
128 
129 static cpumask_var_t idle_injection_cpu_mask;
130 
131 static int allocate_copy_idle_injection_mask(const struct cpumask *copy_mask)
132 {
133 	if (cpumask_available(idle_injection_cpu_mask))
134 		goto copy_mask;
135 
136 	/* This mask is allocated only one time and freed during module exit */
137 	if (!alloc_cpumask_var(&idle_injection_cpu_mask, GFP_KERNEL))
138 		return -ENOMEM;
139 
140 copy_mask:
141 	cpumask_copy(idle_injection_cpu_mask, copy_mask);
142 
143 	return 0;
144 }
145 
146 /* Return true if the cpumask and idle percent combination is invalid */
147 static bool check_invalid(cpumask_var_t mask, u8 idle)
148 {
149 	if (cpumask_equal(cpu_present_mask, mask) && idle > MAX_ALL_CPU_IDLE)
150 		return true;
151 
152 	return false;
153 }
154 
155 static int cpumask_set(const char *arg, const struct kernel_param *kp)
156 {
157 	cpumask_var_t new_mask;
158 	int ret;
159 
160 	mutex_lock(&powerclamp_lock);
161 
162 	/* Can't set mask when cooling device is in use */
163 	if (powerclamp_data.clamping) {
164 		ret = -EAGAIN;
165 		goto skip_cpumask_set;
166 	}
167 
168 	ret = alloc_cpumask_var(&new_mask, GFP_KERNEL);
169 	if (!ret)
170 		goto skip_cpumask_set;
171 
172 	ret = bitmap_parse(arg, strlen(arg), cpumask_bits(new_mask),
173 			   nr_cpumask_bits);
174 	if (ret)
175 		goto free_cpumask_set;
176 
177 	if (cpumask_empty(new_mask) || check_invalid(new_mask, max_idle)) {
178 		ret = -EINVAL;
179 		goto free_cpumask_set;
180 	}
181 
182 	/*
183 	 * When module parameters are passed from kernel command line
184 	 * during insmod, the module parameter callback is called
185 	 * before powerclamp_init(), so we can't assume that some
186 	 * cpumask can be allocated and copied before here. Also
187 	 * in this case this cpumask is used as the default mask.
188 	 */
189 	ret = allocate_copy_idle_injection_mask(new_mask);
190 
191 free_cpumask_set:
192 	free_cpumask_var(new_mask);
193 skip_cpumask_set:
194 	mutex_unlock(&powerclamp_lock);
195 
196 	return ret;
197 }
198 
199 static int cpumask_get(char *buf, const struct kernel_param *kp)
200 {
201 	if (!cpumask_available(idle_injection_cpu_mask))
202 		return -ENODEV;
203 
204 	return bitmap_print_to_pagebuf(false, buf, cpumask_bits(idle_injection_cpu_mask),
205 				       nr_cpumask_bits);
206 }
207 
208 static const struct kernel_param_ops cpumask_ops = {
209 	.set = cpumask_set,
210 	.get = cpumask_get,
211 };
212 
213 module_param_cb(cpumask, &cpumask_ops, NULL, 0644);
214 MODULE_PARM_DESC(cpumask, "Mask of CPUs to use for idle injection.");
215 
216 static int max_idle_set(const char *arg, const struct kernel_param *kp)
217 {
218 	u8 new_max_idle;
219 	int ret = 0;
220 
221 	mutex_lock(&powerclamp_lock);
222 
223 	/* Can't set mask when cooling device is in use */
224 	if (powerclamp_data.clamping) {
225 		ret = -EAGAIN;
226 		goto skip_limit_set;
227 	}
228 
229 	ret = kstrtou8(arg, 10, &new_max_idle);
230 	if (ret)
231 		goto skip_limit_set;
232 
233 	if (new_max_idle > MAX_TARGET_RATIO) {
234 		ret = -EINVAL;
235 		goto skip_limit_set;
236 	}
237 
238 	if (check_invalid(idle_injection_cpu_mask, new_max_idle)) {
239 		ret = -EINVAL;
240 		goto skip_limit_set;
241 	}
242 
243 	max_idle = new_max_idle;
244 
245 skip_limit_set:
246 	mutex_unlock(&powerclamp_lock);
247 
248 	return ret;
249 }
250 
251 static const struct kernel_param_ops max_idle_ops = {
252 	.set = max_idle_set,
253 	.get = param_get_int,
254 };
255 
256 module_param_cb(max_idle, &max_idle_ops, &max_idle, 0644);
257 MODULE_PARM_DESC(max_idle, "maximum injected idle time to the total CPU time ratio in percent range:1-100");
258 
259 struct powerclamp_calibration_data {
260 	unsigned long confidence;  /* used for calibration, basically a counter
261 				    * gets incremented each time a clamping
262 				    * period is completed without extra wakeups
263 				    * once that counter is reached given level,
264 				    * compensation is deemed usable.
265 				    */
266 	unsigned long steady_comp; /* steady state compensation used when
267 				    * no extra wakeups occurred.
268 				    */
269 	unsigned long dynamic_comp; /* compensate excessive wakeup from idle
270 				     * mostly from external interrupts.
271 				     */
272 };
273 
274 static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO];
275 
276 static int window_size_set(const char *arg, const struct kernel_param *kp)
277 {
278 	int ret = 0;
279 	unsigned long new_window_size;
280 
281 	ret = kstrtoul(arg, 10, &new_window_size);
282 	if (ret)
283 		goto exit_win;
284 	if (new_window_size > 10 || new_window_size < 2) {
285 		pr_err("Out of recommended window size %lu, between 2-10\n",
286 			new_window_size);
287 		ret = -EINVAL;
288 	}
289 
290 	window_size = clamp(new_window_size, 2ul, 10ul);
291 	smp_mb();
292 
293 exit_win:
294 
295 	return ret;
296 }
297 
298 static const struct kernel_param_ops window_size_ops = {
299 	.set = window_size_set,
300 	.get = param_get_int,
301 };
302 
303 module_param_cb(window_size, &window_size_ops, &window_size, 0644);
304 MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n"
305 	"\tpowerclamp controls idle ratio within this window. larger\n"
306 	"\twindow size results in slower response time but more smooth\n"
307 	"\tclamping results. default to 2.");
308 
309 static void find_target_mwait(void)
310 {
311 	unsigned int eax, ebx, ecx, edx;
312 	unsigned int highest_cstate = 0;
313 	unsigned int highest_subcstate = 0;
314 	int i;
315 
316 	if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
317 		return;
318 
319 	cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
320 
321 	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
322 	    !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
323 		return;
324 
325 	edx >>= MWAIT_SUBSTATE_SIZE;
326 	for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
327 		if (edx & MWAIT_SUBSTATE_MASK) {
328 			highest_cstate = i;
329 			highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
330 		}
331 	}
332 	target_mwait = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
333 		(highest_subcstate - 1);
334 
335 }
336 
337 struct pkg_cstate_info {
338 	bool skip;
339 	int msr_index;
340 	int cstate_id;
341 };
342 
343 #define PKG_CSTATE_INIT(id) {				\
344 		.msr_index = MSR_PKG_C##id##_RESIDENCY, \
345 		.cstate_id = id				\
346 			}
347 
348 static struct pkg_cstate_info pkg_cstates[] = {
349 	PKG_CSTATE_INIT(2),
350 	PKG_CSTATE_INIT(3),
351 	PKG_CSTATE_INIT(6),
352 	PKG_CSTATE_INIT(7),
353 	PKG_CSTATE_INIT(8),
354 	PKG_CSTATE_INIT(9),
355 	PKG_CSTATE_INIT(10),
356 	{NULL},
357 };
358 
359 static bool has_pkg_state_counter(void)
360 {
361 	u64 val;
362 	struct pkg_cstate_info *info = pkg_cstates;
363 
364 	/* check if any one of the counter msrs exists */
365 	while (info->msr_index) {
366 		if (!rdmsrl_safe(info->msr_index, &val))
367 			return true;
368 		info++;
369 	}
370 
371 	return false;
372 }
373 
374 static u64 pkg_state_counter(void)
375 {
376 	u64 val;
377 	u64 count = 0;
378 	struct pkg_cstate_info *info = pkg_cstates;
379 
380 	while (info->msr_index) {
381 		if (!info->skip) {
382 			if (!rdmsrl_safe(info->msr_index, &val))
383 				count += val;
384 			else
385 				info->skip = true;
386 		}
387 		info++;
388 	}
389 
390 	return count;
391 }
392 
393 static unsigned int get_compensation(int ratio)
394 {
395 	unsigned int comp = 0;
396 
397 	if (!poll_pkg_cstate_enable)
398 		return 0;
399 
400 	/* we only use compensation if all adjacent ones are good */
401 	if (ratio == 1 &&
402 		cal_data[ratio].confidence >= CONFIDENCE_OK &&
403 		cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
404 		cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
405 		comp = (cal_data[ratio].steady_comp +
406 			cal_data[ratio + 1].steady_comp +
407 			cal_data[ratio + 2].steady_comp) / 3;
408 	} else if (ratio == MAX_TARGET_RATIO - 1 &&
409 		cal_data[ratio].confidence >= CONFIDENCE_OK &&
410 		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
411 		cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
412 		comp = (cal_data[ratio].steady_comp +
413 			cal_data[ratio - 1].steady_comp +
414 			cal_data[ratio - 2].steady_comp) / 3;
415 	} else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
416 		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
417 		cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
418 		comp = (cal_data[ratio].steady_comp +
419 			cal_data[ratio - 1].steady_comp +
420 			cal_data[ratio + 1].steady_comp) / 3;
421 	}
422 
423 	/* do not exceed limit */
424 	if (comp + ratio >= MAX_TARGET_RATIO)
425 		comp = MAX_TARGET_RATIO - ratio - 1;
426 
427 	return comp;
428 }
429 
430 static void adjust_compensation(int target_ratio, unsigned int win)
431 {
432 	int delta;
433 	struct powerclamp_calibration_data *d = &cal_data[target_ratio];
434 
435 	/*
436 	 * adjust compensations if confidence level has not been reached.
437 	 */
438 	if (d->confidence >= CONFIDENCE_OK)
439 		return;
440 
441 	delta = powerclamp_data.target_ratio - current_ratio;
442 	/* filter out bad data */
443 	if (delta >= 0 && delta <= (1+target_ratio/10)) {
444 		if (d->steady_comp)
445 			d->steady_comp =
446 				roundup(delta+d->steady_comp, 2)/2;
447 		else
448 			d->steady_comp = delta;
449 		d->confidence++;
450 	}
451 }
452 
453 static bool powerclamp_adjust_controls(unsigned int target_ratio,
454 				unsigned int guard, unsigned int win)
455 {
456 	static u64 msr_last, tsc_last;
457 	u64 msr_now, tsc_now;
458 	u64 val64;
459 
460 	/* check result for the last window */
461 	msr_now = pkg_state_counter();
462 	tsc_now = rdtsc();
463 
464 	/* calculate pkg cstate vs tsc ratio */
465 	if (!msr_last || !tsc_last)
466 		current_ratio = 1;
467 	else if (tsc_now-tsc_last) {
468 		val64 = 100*(msr_now-msr_last);
469 		do_div(val64, (tsc_now-tsc_last));
470 		current_ratio = val64;
471 	}
472 
473 	/* update record */
474 	msr_last = msr_now;
475 	tsc_last = tsc_now;
476 
477 	adjust_compensation(target_ratio, win);
478 
479 	/* if we are above target+guard, skip */
480 	return powerclamp_data.target_ratio + guard <= current_ratio;
481 }
482 
483 /*
484  * This function calculates runtime from the current target ratio.
485  * This function gets called under powerclamp_lock.
486  */
487 static unsigned int get_run_time(void)
488 {
489 	unsigned int compensated_ratio;
490 	unsigned int runtime;
491 
492 	/*
493 	 * make sure user selected ratio does not take effect until
494 	 * the next round. adjust target_ratio if user has changed
495 	 * target such that we can converge quickly.
496 	 */
497 	powerclamp_data.guard = 1 + powerclamp_data.target_ratio / 20;
498 	powerclamp_data.window_size_now = window_size;
499 
500 	/*
501 	 * systems may have different ability to enter package level
502 	 * c-states, thus we need to compensate the injected idle ratio
503 	 * to achieve the actual target reported by the HW.
504 	 */
505 	compensated_ratio = powerclamp_data.target_ratio +
506 		get_compensation(powerclamp_data.target_ratio);
507 	if (compensated_ratio <= 0)
508 		compensated_ratio = 1;
509 
510 	runtime = duration * 100 / compensated_ratio - duration;
511 
512 	return runtime;
513 }
514 
515 /*
516  * 1 HZ polling while clamping is active, useful for userspace
517  * to monitor actual idle ratio.
518  */
519 static void poll_pkg_cstate(struct work_struct *dummy);
520 static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate);
521 static void poll_pkg_cstate(struct work_struct *dummy)
522 {
523 	static u64 msr_last;
524 	static u64 tsc_last;
525 
526 	u64 msr_now;
527 	u64 tsc_now;
528 	u64 val64;
529 
530 	msr_now = pkg_state_counter();
531 	tsc_now = rdtsc();
532 
533 	/* calculate pkg cstate vs tsc ratio */
534 	if (!msr_last || !tsc_last)
535 		pkg_cstate_ratio_cur = 1;
536 	else {
537 		if (tsc_now - tsc_last) {
538 			val64 = 100 * (msr_now - msr_last);
539 			do_div(val64, (tsc_now - tsc_last));
540 			pkg_cstate_ratio_cur = val64;
541 		}
542 	}
543 
544 	/* update record */
545 	msr_last = msr_now;
546 	tsc_last = tsc_now;
547 
548 	mutex_lock(&powerclamp_lock);
549 	if (powerclamp_data.clamping)
550 		schedule_delayed_work(&poll_pkg_cstate_work, HZ);
551 	mutex_unlock(&powerclamp_lock);
552 }
553 
554 static struct idle_inject_device *ii_dev;
555 
556 /*
557  * This function is called from idle injection core on timer expiry
558  * for the run duration. This allows powerclamp to readjust or skip
559  * injecting idle for this cycle.
560  */
561 static bool idle_inject_update(void)
562 {
563 	bool update = false;
564 
565 	/* We can't sleep in this callback */
566 	if (!mutex_trylock(&powerclamp_lock))
567 		return true;
568 
569 	if (!(powerclamp_data.count % powerclamp_data.window_size_now)) {
570 
571 		should_skip = powerclamp_adjust_controls(powerclamp_data.target_ratio,
572 							 powerclamp_data.guard,
573 							 powerclamp_data.window_size_now);
574 		update = true;
575 	}
576 
577 	if (update) {
578 		unsigned int runtime = get_run_time();
579 
580 		idle_inject_set_duration(ii_dev, runtime, duration);
581 	}
582 
583 	powerclamp_data.count++;
584 
585 	mutex_unlock(&powerclamp_lock);
586 
587 	if (should_skip)
588 		return false;
589 
590 	return true;
591 }
592 
593 /* This function starts idle injection by calling idle_inject_start() */
594 static void trigger_idle_injection(void)
595 {
596 	unsigned int runtime = get_run_time();
597 
598 	idle_inject_set_duration(ii_dev, runtime, duration);
599 	idle_inject_start(ii_dev);
600 	powerclamp_data.clamping = true;
601 }
602 
603 /*
604  * This function is called from start_power_clamp() to register
605  * CPUS with powercap idle injection register and set default
606  * idle duration and latency.
607  */
608 static int powerclamp_idle_injection_register(void)
609 {
610 	poll_pkg_cstate_enable = false;
611 	if (cpumask_equal(cpu_present_mask, idle_injection_cpu_mask)) {
612 		ii_dev = idle_inject_register_full(idle_injection_cpu_mask, idle_inject_update);
613 		if (topology_max_packages() == 1 && topology_max_die_per_package() == 1)
614 			poll_pkg_cstate_enable = true;
615 	} else {
616 		ii_dev = idle_inject_register(idle_injection_cpu_mask);
617 	}
618 
619 	if (!ii_dev) {
620 		pr_err("powerclamp: idle_inject_register failed\n");
621 		return -EAGAIN;
622 	}
623 
624 	idle_inject_set_duration(ii_dev, TICK_USEC, duration);
625 	idle_inject_set_latency(ii_dev, UINT_MAX);
626 
627 	return 0;
628 }
629 
630 /*
631  * This function is called from end_power_clamp() to stop idle injection
632  * and unregister CPUS from powercap idle injection core.
633  */
634 static void remove_idle_injection(void)
635 {
636 	if (!powerclamp_data.clamping)
637 		return;
638 
639 	powerclamp_data.clamping = false;
640 	idle_inject_stop(ii_dev);
641 }
642 
643 /*
644  * This function is called when user change the cooling device
645  * state from zero to some other value.
646  */
647 static int start_power_clamp(void)
648 {
649 	int ret;
650 
651 	ret = powerclamp_idle_injection_register();
652 	if (!ret) {
653 		trigger_idle_injection();
654 		if (poll_pkg_cstate_enable)
655 			schedule_delayed_work(&poll_pkg_cstate_work, 0);
656 	}
657 
658 	return ret;
659 }
660 
661 /*
662  * This function is called when user change the cooling device
663  * state from non zero value zero.
664  */
665 static void end_power_clamp(void)
666 {
667 	if (powerclamp_data.clamping) {
668 		remove_idle_injection();
669 		idle_inject_unregister(ii_dev);
670 	}
671 }
672 
673 static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
674 				 unsigned long *state)
675 {
676 	*state = MAX_TARGET_RATIO;
677 
678 	return 0;
679 }
680 
681 static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
682 				 unsigned long *state)
683 {
684 	mutex_lock(&powerclamp_lock);
685 	*state = powerclamp_data.target_ratio;
686 	mutex_unlock(&powerclamp_lock);
687 
688 	return 0;
689 }
690 
691 static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
692 				 unsigned long new_target_ratio)
693 {
694 	int ret = 0;
695 
696 	mutex_lock(&powerclamp_lock);
697 
698 	new_target_ratio = clamp(new_target_ratio, 0UL,
699 				(unsigned long) (max_idle - 1));
700 	if (!powerclamp_data.target_ratio && new_target_ratio > 0) {
701 		pr_info("Start idle injection to reduce power\n");
702 		powerclamp_data.target_ratio = new_target_ratio;
703 		ret = start_power_clamp();
704 		if (ret)
705 			powerclamp_data.target_ratio = 0;
706 		goto exit_set;
707 	} else	if (powerclamp_data.target_ratio > 0 && new_target_ratio == 0) {
708 		pr_info("Stop forced idle injection\n");
709 		end_power_clamp();
710 		powerclamp_data.target_ratio = 0;
711 	} else	/* adjust currently running */ {
712 		unsigned int runtime;
713 
714 		powerclamp_data.target_ratio = new_target_ratio;
715 		runtime = get_run_time();
716 		idle_inject_set_duration(ii_dev, runtime, duration);
717 	}
718 
719 exit_set:
720 	mutex_unlock(&powerclamp_lock);
721 
722 	return ret;
723 }
724 
725 /* bind to generic thermal layer as cooling device*/
726 static const struct thermal_cooling_device_ops powerclamp_cooling_ops = {
727 	.get_max_state = powerclamp_get_max_state,
728 	.get_cur_state = powerclamp_get_cur_state,
729 	.set_cur_state = powerclamp_set_cur_state,
730 };
731 
732 static const struct x86_cpu_id __initconst intel_powerclamp_ids[] = {
733 	X86_MATCH_VENDOR_FEATURE(INTEL, X86_FEATURE_MWAIT, NULL),
734 	{}
735 };
736 MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);
737 
738 static int __init powerclamp_probe(void)
739 {
740 
741 	if (!x86_match_cpu(intel_powerclamp_ids)) {
742 		pr_err("CPU does not support MWAIT\n");
743 		return -ENODEV;
744 	}
745 
746 	/* The goal for idle time alignment is to achieve package cstate. */
747 	if (!has_pkg_state_counter()) {
748 		pr_info("No package C-state available\n");
749 		return -ENODEV;
750 	}
751 
752 	/* find the deepest mwait value */
753 	find_target_mwait();
754 
755 	return 0;
756 }
757 
758 static int powerclamp_debug_show(struct seq_file *m, void *unused)
759 {
760 	int i = 0;
761 
762 	seq_printf(m, "pct confidence steady dynamic (compensation)\n");
763 	for (i = 0; i < MAX_TARGET_RATIO; i++) {
764 		seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
765 			i,
766 			cal_data[i].confidence,
767 			cal_data[i].steady_comp,
768 			cal_data[i].dynamic_comp);
769 	}
770 
771 	return 0;
772 }
773 
774 DEFINE_SHOW_ATTRIBUTE(powerclamp_debug);
775 
776 static inline void powerclamp_create_debug_files(void)
777 {
778 	debug_dir = debugfs_create_dir("intel_powerclamp", NULL);
779 
780 	debugfs_create_file("powerclamp_calib", S_IRUGO, debug_dir, cal_data,
781 			    &powerclamp_debug_fops);
782 }
783 
784 static int __init powerclamp_init(void)
785 {
786 	int retval;
787 
788 	/* probe cpu features and ids here */
789 	retval = powerclamp_probe();
790 	if (retval)
791 		return retval;
792 
793 	mutex_lock(&powerclamp_lock);
794 	retval = allocate_copy_idle_injection_mask(cpu_present_mask);
795 	mutex_unlock(&powerclamp_lock);
796 
797 	if (retval)
798 		return retval;
799 
800 	/* set default limit, maybe adjusted during runtime based on feedback */
801 	window_size = 2;
802 
803 	cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
804 						      &powerclamp_cooling_ops);
805 	if (IS_ERR(cooling_dev))
806 		return -ENODEV;
807 
808 	if (!duration)
809 		duration = jiffies_to_usecs(DEFAULT_DURATION_JIFFIES);
810 
811 	powerclamp_create_debug_files();
812 
813 	return 0;
814 }
815 module_init(powerclamp_init);
816 
817 static void __exit powerclamp_exit(void)
818 {
819 	mutex_lock(&powerclamp_lock);
820 	end_power_clamp();
821 	mutex_unlock(&powerclamp_lock);
822 
823 	thermal_cooling_device_unregister(cooling_dev);
824 
825 	cancel_delayed_work_sync(&poll_pkg_cstate_work);
826 	debugfs_remove_recursive(debug_dir);
827 
828 	if (cpumask_available(idle_injection_cpu_mask))
829 		free_cpumask_var(idle_injection_cpu_mask);
830 }
831 module_exit(powerclamp_exit);
832 
833 MODULE_IMPORT_NS(IDLE_INJECT);
834 
835 MODULE_LICENSE("GPL");
836 MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
837 MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
838 MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");
839