xref: /openbmc/linux/drivers/thermal/intel/intel_powerclamp.c (revision 8f8d5745bb520c76b81abef4a2cb3023d0313bfd)
1 /*
2  * intel_powerclamp.c - package c-state idle injection
3  *
4  * Copyright (c) 2012, Intel Corporation.
5  *
6  * Authors:
7  *     Arjan van de Ven <arjan@linux.intel.com>
8  *     Jacob Pan <jacob.jun.pan@linux.intel.com>
9  *
10  * This program is free software; you can redistribute it and/or modify it
11  * under the terms and conditions of the GNU General Public License,
12  * version 2, as published by the Free Software Foundation.
13  *
14  * This program is distributed in the hope it will be useful, but WITHOUT
15  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
16  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
17  * more details.
18  *
19  * You should have received a copy of the GNU General Public License along with
20  * this program; if not, write to the Free Software Foundation, Inc.,
21  * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
22  *
23  *
24  *	TODO:
25  *           1. better handle wakeup from external interrupts, currently a fixed
26  *              compensation is added to clamping duration when excessive amount
27  *              of wakeups are observed during idle time. the reason is that in
28  *              case of external interrupts without need for ack, clamping down
29  *              cpu in non-irq context does not reduce irq. for majority of the
30  *              cases, clamping down cpu does help reduce irq as well, we should
31  *              be able to differentiate the two cases and give a quantitative
32  *              solution for the irqs that we can control. perhaps based on
33  *              get_cpu_iowait_time_us()
34  *
35  *	     2. synchronization with other hw blocks
36  *
37  *
38  */
39 
40 #define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
41 
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/math64.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/cpu.h>
#include <linux/thermal.h>
#include <linux/slab.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/sched/rt.h>
#include <uapi/linux/sched/types.h>

#include <asm/nmi.h>
#include <asm/msr.h>
#include <asm/mwait.h>
#include <asm/cpu_device_id.h>
#include <asm/hardirq.h>
60 
/*
 * Number of entries in cal_data[]; user supplied target ratios are clamped
 * to MAX_TARGET_RATIO - 1 before they are used.
 */
#define MAX_TARGET_RATIO (50U)

/*
 * Each clamping period that completes undisturbed (no extra wakeups during
 * the idle time) increments the confidence counter of its target ratio.
 * Runtime calibration data is only trusted once that counter has reached
 * CONFIDENCE_OK.
 */
#define CONFIDENCE_OK (3)

/*
 * Default length of one idle injection, in jiffies. The driver varies the
 * time slept between injections to meet the target idle ratio, similar to
 * frequency modulation.
 */
#define DEFAULT_DURATION_JIFFIES (6)
72 
73 static unsigned int target_mwait;
74 static struct dentry *debug_dir;
75 
76 /* user selected target */
77 static unsigned int set_target_ratio;
78 static unsigned int current_ratio;
79 static bool should_skip;
80 static bool reduce_irq;
81 static atomic_t idle_wakeup_counter;
82 static unsigned int control_cpu; /* The cpu assigned to collect stat and update
83 				  * control parameters. default to BSP but BSP
84 				  * can be offlined.
85 				  */
86 static bool clamping;
87 
88 static const struct sched_param sparam = {
89 	.sched_priority = MAX_USER_RT_PRIO / 2,
90 };
91 struct powerclamp_worker_data {
92 	struct kthread_worker *worker;
93 	struct kthread_work balancing_work;
94 	struct kthread_delayed_work idle_injection_work;
95 	unsigned int cpu;
96 	unsigned int count;
97 	unsigned int guard;
98 	unsigned int window_size_now;
99 	unsigned int target_ratio;
100 	unsigned int duration_jiffies;
101 	bool clamping;
102 };
103 
104 static struct powerclamp_worker_data __percpu *worker_data;
105 static struct thermal_cooling_device *cooling_dev;
106 static unsigned long *cpu_clamping_mask;  /* bit map for tracking per cpu
107 					   * clamping kthread worker
108 					   */
109 
110 static unsigned int duration;
111 static unsigned int pkg_cstate_ratio_cur;
112 static unsigned int window_size;
113 
114 static int duration_set(const char *arg, const struct kernel_param *kp)
115 {
116 	int ret = 0;
117 	unsigned long new_duration;
118 
119 	ret = kstrtoul(arg, 10, &new_duration);
120 	if (ret)
121 		goto exit;
122 	if (new_duration > 25 || new_duration < 6) {
123 		pr_err("Out of recommended range %lu, between 6-25ms\n",
124 			new_duration);
125 		ret = -EINVAL;
126 	}
127 
128 	duration = clamp(new_duration, 6ul, 25ul);
129 	smp_mb();
130 
131 exit:
132 
133 	return ret;
134 }
135 
136 static const struct kernel_param_ops duration_ops = {
137 	.set = duration_set,
138 	.get = param_get_int,
139 };
140 
141 
142 module_param_cb(duration, &duration_ops, &duration, 0644);
143 MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");
144 
145 struct powerclamp_calibration_data {
146 	unsigned long confidence;  /* used for calibration, basically a counter
147 				    * gets incremented each time a clamping
148 				    * period is completed without extra wakeups
149 				    * once that counter is reached given level,
150 				    * compensation is deemed usable.
151 				    */
152 	unsigned long steady_comp; /* steady state compensation used when
153 				    * no extra wakeups occurred.
154 				    */
155 	unsigned long dynamic_comp; /* compensate excessive wakeup from idle
156 				     * mostly from external interrupts.
157 				     */
158 };
159 
160 static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO];
161 
162 static int window_size_set(const char *arg, const struct kernel_param *kp)
163 {
164 	int ret = 0;
165 	unsigned long new_window_size;
166 
167 	ret = kstrtoul(arg, 10, &new_window_size);
168 	if (ret)
169 		goto exit_win;
170 	if (new_window_size > 10 || new_window_size < 2) {
171 		pr_err("Out of recommended window size %lu, between 2-10\n",
172 			new_window_size);
173 		ret = -EINVAL;
174 	}
175 
176 	window_size = clamp(new_window_size, 2ul, 10ul);
177 	smp_mb();
178 
179 exit_win:
180 
181 	return ret;
182 }
183 
184 static const struct kernel_param_ops window_size_ops = {
185 	.set = window_size_set,
186 	.get = param_get_int,
187 };
188 
189 module_param_cb(window_size, &window_size_ops, &window_size, 0644);
190 MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n"
191 	"\tpowerclamp controls idle ratio within this window. larger\n"
192 	"\twindow size results in slower response time but more smooth\n"
193 	"\tclamping results. default to 2.");
194 
195 static void find_target_mwait(void)
196 {
197 	unsigned int eax, ebx, ecx, edx;
198 	unsigned int highest_cstate = 0;
199 	unsigned int highest_subcstate = 0;
200 	int i;
201 
202 	if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
203 		return;
204 
205 	cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
206 
207 	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
208 	    !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
209 		return;
210 
211 	edx >>= MWAIT_SUBSTATE_SIZE;
212 	for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
213 		if (edx & MWAIT_SUBSTATE_MASK) {
214 			highest_cstate = i;
215 			highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
216 		}
217 	}
218 	target_mwait = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
219 		(highest_subcstate - 1);
220 
221 }
222 
223 struct pkg_cstate_info {
224 	bool skip;
225 	int msr_index;
226 	int cstate_id;
227 };
228 
229 #define PKG_CSTATE_INIT(id) {				\
230 		.msr_index = MSR_PKG_C##id##_RESIDENCY, \
231 		.cstate_id = id				\
232 			}
233 
234 static struct pkg_cstate_info pkg_cstates[] = {
235 	PKG_CSTATE_INIT(2),
236 	PKG_CSTATE_INIT(3),
237 	PKG_CSTATE_INIT(6),
238 	PKG_CSTATE_INIT(7),
239 	PKG_CSTATE_INIT(8),
240 	PKG_CSTATE_INIT(9),
241 	PKG_CSTATE_INIT(10),
242 	{NULL},
243 };
244 
245 static bool has_pkg_state_counter(void)
246 {
247 	u64 val;
248 	struct pkg_cstate_info *info = pkg_cstates;
249 
250 	/* check if any one of the counter msrs exists */
251 	while (info->msr_index) {
252 		if (!rdmsrl_safe(info->msr_index, &val))
253 			return true;
254 		info++;
255 	}
256 
257 	return false;
258 }
259 
260 static u64 pkg_state_counter(void)
261 {
262 	u64 val;
263 	u64 count = 0;
264 	struct pkg_cstate_info *info = pkg_cstates;
265 
266 	while (info->msr_index) {
267 		if (!info->skip) {
268 			if (!rdmsrl_safe(info->msr_index, &val))
269 				count += val;
270 			else
271 				info->skip = true;
272 		}
273 		info++;
274 	}
275 
276 	return count;
277 }
278 
279 static unsigned int get_compensation(int ratio)
280 {
281 	unsigned int comp = 0;
282 
283 	/* we only use compensation if all adjacent ones are good */
284 	if (ratio == 1 &&
285 		cal_data[ratio].confidence >= CONFIDENCE_OK &&
286 		cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
287 		cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
288 		comp = (cal_data[ratio].steady_comp +
289 			cal_data[ratio + 1].steady_comp +
290 			cal_data[ratio + 2].steady_comp) / 3;
291 	} else if (ratio == MAX_TARGET_RATIO - 1 &&
292 		cal_data[ratio].confidence >= CONFIDENCE_OK &&
293 		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
294 		cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
295 		comp = (cal_data[ratio].steady_comp +
296 			cal_data[ratio - 1].steady_comp +
297 			cal_data[ratio - 2].steady_comp) / 3;
298 	} else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
299 		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
300 		cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
301 		comp = (cal_data[ratio].steady_comp +
302 			cal_data[ratio - 1].steady_comp +
303 			cal_data[ratio + 1].steady_comp) / 3;
304 	}
305 
306 	/* REVISIT: simple penalty of double idle injection */
307 	if (reduce_irq)
308 		comp = ratio;
309 	/* do not exceed limit */
310 	if (comp + ratio >= MAX_TARGET_RATIO)
311 		comp = MAX_TARGET_RATIO - ratio - 1;
312 
313 	return comp;
314 }
315 
316 static void adjust_compensation(int target_ratio, unsigned int win)
317 {
318 	int delta;
319 	struct powerclamp_calibration_data *d = &cal_data[target_ratio];
320 
321 	/*
322 	 * adjust compensations if confidence level has not been reached or
323 	 * there are too many wakeups during the last idle injection period, we
324 	 * cannot trust the data for compensation.
325 	 */
326 	if (d->confidence >= CONFIDENCE_OK ||
327 		atomic_read(&idle_wakeup_counter) >
328 		win * num_online_cpus())
329 		return;
330 
331 	delta = set_target_ratio - current_ratio;
332 	/* filter out bad data */
333 	if (delta >= 0 && delta <= (1+target_ratio/10)) {
334 		if (d->steady_comp)
335 			d->steady_comp =
336 				roundup(delta+d->steady_comp, 2)/2;
337 		else
338 			d->steady_comp = delta;
339 		d->confidence++;
340 	}
341 }
342 
343 static bool powerclamp_adjust_controls(unsigned int target_ratio,
344 				unsigned int guard, unsigned int win)
345 {
346 	static u64 msr_last, tsc_last;
347 	u64 msr_now, tsc_now;
348 	u64 val64;
349 
350 	/* check result for the last window */
351 	msr_now = pkg_state_counter();
352 	tsc_now = rdtsc();
353 
354 	/* calculate pkg cstate vs tsc ratio */
355 	if (!msr_last || !tsc_last)
356 		current_ratio = 1;
357 	else if (tsc_now-tsc_last) {
358 		val64 = 100*(msr_now-msr_last);
359 		do_div(val64, (tsc_now-tsc_last));
360 		current_ratio = val64;
361 	}
362 
363 	/* update record */
364 	msr_last = msr_now;
365 	tsc_last = tsc_now;
366 
367 	adjust_compensation(target_ratio, win);
368 	/*
369 	 * too many external interrupts, set flag such
370 	 * that we can take measure later.
371 	 */
372 	reduce_irq = atomic_read(&idle_wakeup_counter) >=
373 		2 * win * num_online_cpus();
374 
375 	atomic_set(&idle_wakeup_counter, 0);
376 	/* if we are above target+guard, skip */
377 	return set_target_ratio + guard <= current_ratio;
378 }
379 
380 static void clamp_balancing_func(struct kthread_work *work)
381 {
382 	struct powerclamp_worker_data *w_data;
383 	int sleeptime;
384 	unsigned long target_jiffies;
385 	unsigned int compensated_ratio;
386 	int interval; /* jiffies to sleep for each attempt */
387 
388 	w_data = container_of(work, struct powerclamp_worker_data,
389 			      balancing_work);
390 
391 	/*
392 	 * make sure user selected ratio does not take effect until
393 	 * the next round. adjust target_ratio if user has changed
394 	 * target such that we can converge quickly.
395 	 */
396 	w_data->target_ratio = READ_ONCE(set_target_ratio);
397 	w_data->guard = 1 + w_data->target_ratio / 20;
398 	w_data->window_size_now = window_size;
399 	w_data->duration_jiffies = msecs_to_jiffies(duration);
400 	w_data->count++;
401 
402 	/*
403 	 * systems may have different ability to enter package level
404 	 * c-states, thus we need to compensate the injected idle ratio
405 	 * to achieve the actual target reported by the HW.
406 	 */
407 	compensated_ratio = w_data->target_ratio +
408 		get_compensation(w_data->target_ratio);
409 	if (compensated_ratio <= 0)
410 		compensated_ratio = 1;
411 	interval = w_data->duration_jiffies * 100 / compensated_ratio;
412 
413 	/* align idle time */
414 	target_jiffies = roundup(jiffies, interval);
415 	sleeptime = target_jiffies - jiffies;
416 	if (sleeptime <= 0)
417 		sleeptime = 1;
418 
419 	if (clamping && w_data->clamping && cpu_online(w_data->cpu))
420 		kthread_queue_delayed_work(w_data->worker,
421 					   &w_data->idle_injection_work,
422 					   sleeptime);
423 }
424 
425 static void clamp_idle_injection_func(struct kthread_work *work)
426 {
427 	struct powerclamp_worker_data *w_data;
428 
429 	w_data = container_of(work, struct powerclamp_worker_data,
430 			      idle_injection_work.work);
431 
432 	/*
433 	 * only elected controlling cpu can collect stats and update
434 	 * control parameters.
435 	 */
436 	if (w_data->cpu == control_cpu &&
437 	    !(w_data->count % w_data->window_size_now)) {
438 		should_skip =
439 			powerclamp_adjust_controls(w_data->target_ratio,
440 						   w_data->guard,
441 						   w_data->window_size_now);
442 		smp_mb();
443 	}
444 
445 	if (should_skip)
446 		goto balance;
447 
448 	play_idle(jiffies_to_msecs(w_data->duration_jiffies));
449 
450 balance:
451 	if (clamping && w_data->clamping && cpu_online(w_data->cpu))
452 		kthread_queue_work(w_data->worker, &w_data->balancing_work);
453 }
454 
455 /*
456  * 1 HZ polling while clamping is active, useful for userspace
457  * to monitor actual idle ratio.
458  */
459 static void poll_pkg_cstate(struct work_struct *dummy);
460 static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate);
461 static void poll_pkg_cstate(struct work_struct *dummy)
462 {
463 	static u64 msr_last;
464 	static u64 tsc_last;
465 
466 	u64 msr_now;
467 	u64 tsc_now;
468 	u64 val64;
469 
470 	msr_now = pkg_state_counter();
471 	tsc_now = rdtsc();
472 
473 	/* calculate pkg cstate vs tsc ratio */
474 	if (!msr_last || !tsc_last)
475 		pkg_cstate_ratio_cur = 1;
476 	else {
477 		if (tsc_now - tsc_last) {
478 			val64 = 100 * (msr_now - msr_last);
479 			do_div(val64, (tsc_now - tsc_last));
480 			pkg_cstate_ratio_cur = val64;
481 		}
482 	}
483 
484 	/* update record */
485 	msr_last = msr_now;
486 	tsc_last = tsc_now;
487 
488 	if (true == clamping)
489 		schedule_delayed_work(&poll_pkg_cstate_work, HZ);
490 }
491 
492 static void start_power_clamp_worker(unsigned long cpu)
493 {
494 	struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);
495 	struct kthread_worker *worker;
496 
497 	worker = kthread_create_worker_on_cpu(cpu, 0, "kidle_inj/%ld", cpu);
498 	if (IS_ERR(worker))
499 		return;
500 
501 	w_data->worker = worker;
502 	w_data->count = 0;
503 	w_data->cpu = cpu;
504 	w_data->clamping = true;
505 	set_bit(cpu, cpu_clamping_mask);
506 	sched_setscheduler(worker->task, SCHED_FIFO, &sparam);
507 	kthread_init_work(&w_data->balancing_work, clamp_balancing_func);
508 	kthread_init_delayed_work(&w_data->idle_injection_work,
509 				  clamp_idle_injection_func);
510 	kthread_queue_work(w_data->worker, &w_data->balancing_work);
511 }
512 
513 static void stop_power_clamp_worker(unsigned long cpu)
514 {
515 	struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);
516 
517 	if (!w_data->worker)
518 		return;
519 
520 	w_data->clamping = false;
521 	/*
522 	 * Make sure that all works that get queued after this point see
523 	 * the clamping disabled. The counter part is not needed because
524 	 * there is an implicit memory barrier when the queued work
525 	 * is proceed.
526 	 */
527 	smp_wmb();
528 	kthread_cancel_work_sync(&w_data->balancing_work);
529 	kthread_cancel_delayed_work_sync(&w_data->idle_injection_work);
530 	/*
531 	 * The balancing work still might be queued here because
532 	 * the handling of the "clapming" variable, cancel, and queue
533 	 * operations are not synchronized via a lock. But it is not
534 	 * a big deal. The balancing work is fast and destroy kthread
535 	 * will wait for it.
536 	 */
537 	clear_bit(w_data->cpu, cpu_clamping_mask);
538 	kthread_destroy_worker(w_data->worker);
539 
540 	w_data->worker = NULL;
541 }
542 
543 static int start_power_clamp(void)
544 {
545 	unsigned long cpu;
546 
547 	set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1);
548 	/* prevent cpu hotplug */
549 	get_online_cpus();
550 
551 	/* prefer BSP */
552 	control_cpu = 0;
553 	if (!cpu_online(control_cpu))
554 		control_cpu = smp_processor_id();
555 
556 	clamping = true;
557 	schedule_delayed_work(&poll_pkg_cstate_work, 0);
558 
559 	/* start one kthread worker per online cpu */
560 	for_each_online_cpu(cpu) {
561 		start_power_clamp_worker(cpu);
562 	}
563 	put_online_cpus();
564 
565 	return 0;
566 }
567 
568 static void end_power_clamp(void)
569 {
570 	int i;
571 
572 	/*
573 	 * Block requeuing in all the kthread workers. They will flush and
574 	 * stop faster.
575 	 */
576 	clamping = false;
577 	if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) {
578 		for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
579 			pr_debug("clamping worker for cpu %d alive, destroy\n",
580 				 i);
581 			stop_power_clamp_worker(i);
582 		}
583 	}
584 }
585 
586 static int powerclamp_cpu_online(unsigned int cpu)
587 {
588 	if (clamping == false)
589 		return 0;
590 	start_power_clamp_worker(cpu);
591 	/* prefer BSP as controlling CPU */
592 	if (cpu == 0) {
593 		control_cpu = 0;
594 		smp_mb();
595 	}
596 	return 0;
597 }
598 
599 static int powerclamp_cpu_predown(unsigned int cpu)
600 {
601 	if (clamping == false)
602 		return 0;
603 
604 	stop_power_clamp_worker(cpu);
605 	if (cpu != control_cpu)
606 		return 0;
607 
608 	control_cpu = cpumask_first(cpu_online_mask);
609 	if (control_cpu == cpu)
610 		control_cpu = cpumask_next(cpu, cpu_online_mask);
611 	smp_mb();
612 	return 0;
613 }
614 
615 static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
616 				 unsigned long *state)
617 {
618 	*state = MAX_TARGET_RATIO;
619 
620 	return 0;
621 }
622 
623 static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
624 				 unsigned long *state)
625 {
626 	if (true == clamping)
627 		*state = pkg_cstate_ratio_cur;
628 	else
629 		/* to save power, do not poll idle ratio while not clamping */
630 		*state = -1; /* indicates invalid state */
631 
632 	return 0;
633 }
634 
635 static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
636 				 unsigned long new_target_ratio)
637 {
638 	int ret = 0;
639 
640 	new_target_ratio = clamp(new_target_ratio, 0UL,
641 				(unsigned long) (MAX_TARGET_RATIO-1));
642 	if (set_target_ratio == 0 && new_target_ratio > 0) {
643 		pr_info("Start idle injection to reduce power\n");
644 		set_target_ratio = new_target_ratio;
645 		ret = start_power_clamp();
646 		goto exit_set;
647 	} else	if (set_target_ratio > 0 && new_target_ratio == 0) {
648 		pr_info("Stop forced idle injection\n");
649 		end_power_clamp();
650 		set_target_ratio = 0;
651 	} else	/* adjust currently running */ {
652 		set_target_ratio = new_target_ratio;
653 		/* make new set_target_ratio visible to other cpus */
654 		smp_mb();
655 	}
656 
657 exit_set:
658 	return ret;
659 }
660 
661 /* bind to generic thermal layer as cooling device*/
662 static struct thermal_cooling_device_ops powerclamp_cooling_ops = {
663 	.get_max_state = powerclamp_get_max_state,
664 	.get_cur_state = powerclamp_get_cur_state,
665 	.set_cur_state = powerclamp_set_cur_state,
666 };
667 
668 static const struct x86_cpu_id __initconst intel_powerclamp_ids[] = {
669 	{ X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, X86_FEATURE_MWAIT },
670 	{}
671 };
672 MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);
673 
674 static int __init powerclamp_probe(void)
675 {
676 
677 	if (!x86_match_cpu(intel_powerclamp_ids)) {
678 		pr_err("CPU does not support MWAIT\n");
679 		return -ENODEV;
680 	}
681 
682 	/* The goal for idle time alignment is to achieve package cstate. */
683 	if (!has_pkg_state_counter()) {
684 		pr_info("No package C-state available\n");
685 		return -ENODEV;
686 	}
687 
688 	/* find the deepest mwait value */
689 	find_target_mwait();
690 
691 	return 0;
692 }
693 
694 static int powerclamp_debug_show(struct seq_file *m, void *unused)
695 {
696 	int i = 0;
697 
698 	seq_printf(m, "controlling cpu: %d\n", control_cpu);
699 	seq_printf(m, "pct confidence steady dynamic (compensation)\n");
700 	for (i = 0; i < MAX_TARGET_RATIO; i++) {
701 		seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
702 			i,
703 			cal_data[i].confidence,
704 			cal_data[i].steady_comp,
705 			cal_data[i].dynamic_comp);
706 	}
707 
708 	return 0;
709 }
710 
711 DEFINE_SHOW_ATTRIBUTE(powerclamp_debug);
712 
713 static inline void powerclamp_create_debug_files(void)
714 {
715 	debug_dir = debugfs_create_dir("intel_powerclamp", NULL);
716 	if (!debug_dir)
717 		return;
718 
719 	if (!debugfs_create_file("powerclamp_calib", S_IRUGO, debug_dir,
720 					cal_data, &powerclamp_debug_fops))
721 		goto file_error;
722 
723 	return;
724 
725 file_error:
726 	debugfs_remove_recursive(debug_dir);
727 }
728 
729 static enum cpuhp_state hp_state;
730 
731 static int __init powerclamp_init(void)
732 {
733 	int retval;
734 	int bitmap_size;
735 
736 	bitmap_size = BITS_TO_LONGS(num_possible_cpus()) * sizeof(long);
737 	cpu_clamping_mask = kzalloc(bitmap_size, GFP_KERNEL);
738 	if (!cpu_clamping_mask)
739 		return -ENOMEM;
740 
741 	/* probe cpu features and ids here */
742 	retval = powerclamp_probe();
743 	if (retval)
744 		goto exit_free;
745 
746 	/* set default limit, maybe adjusted during runtime based on feedback */
747 	window_size = 2;
748 	retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
749 					   "thermal/intel_powerclamp:online",
750 					   powerclamp_cpu_online,
751 					   powerclamp_cpu_predown);
752 	if (retval < 0)
753 		goto exit_free;
754 
755 	hp_state = retval;
756 
757 	worker_data = alloc_percpu(struct powerclamp_worker_data);
758 	if (!worker_data) {
759 		retval = -ENOMEM;
760 		goto exit_unregister;
761 	}
762 
763 	cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
764 						&powerclamp_cooling_ops);
765 	if (IS_ERR(cooling_dev)) {
766 		retval = -ENODEV;
767 		goto exit_free_thread;
768 	}
769 
770 	if (!duration)
771 		duration = jiffies_to_msecs(DEFAULT_DURATION_JIFFIES);
772 
773 	powerclamp_create_debug_files();
774 
775 	return 0;
776 
777 exit_free_thread:
778 	free_percpu(worker_data);
779 exit_unregister:
780 	cpuhp_remove_state_nocalls(hp_state);
781 exit_free:
782 	kfree(cpu_clamping_mask);
783 	return retval;
784 }
785 module_init(powerclamp_init);
786 
787 static void __exit powerclamp_exit(void)
788 {
789 	end_power_clamp();
790 	cpuhp_remove_state_nocalls(hp_state);
791 	free_percpu(worker_data);
792 	thermal_cooling_device_unregister(cooling_dev);
793 	kfree(cpu_clamping_mask);
794 
795 	cancel_delayed_work_sync(&poll_pkg_cstate_work);
796 	debugfs_remove_recursive(debug_dir);
797 }
798 module_exit(powerclamp_exit);
799 
800 MODULE_LICENSE("GPL");
801 MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
802 MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
803 MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");
804