// SPDX-License-Identifier: GPL-2.0-only
/*
 * intel_powerclamp.c - package c-state idle injection
 *
 * Copyright (c) 2012, Intel Corporation.
 *
 * Authors:
 *     Arjan van de Ven <arjan@linux.intel.com>
 *     Jacob Pan <jacob.jun.pan@linux.intel.com>
 *
 *	TODO:
 *           1. Better handle wakeups from external interrupts. Currently a
 *              fixed compensation is added to the clamping duration when an
 *              excessive number of wakeups is observed during idle time. The
 *              reason is that for external interrupts that need no ack,
 *              clamping down the CPU in non-IRQ context does not reduce the
 *              IRQs. For the majority of cases clamping down the CPU does
 *              help reduce IRQs as well; we should be able to differentiate
 *              the two cases and give a quantitative solution for the IRQs
 *              that we can control, perhaps based on
 *              get_cpu_iowait_time_us().
 *
 *	     2. Synchronization with other HW blocks.
 */

#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/cpu.h>
#include <linux/thermal.h>
#include <linux/slab.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/sched/rt.h>
#include <uapi/linux/sched/types.h>

#include <asm/nmi.h>
#include <asm/msr.h>
#include <asm/mwait.h>
#include <asm/cpu_device_id.h>
#include <asm/hardirq.h>

#define MAX_TARGET_RATIO (50U)
/* For each undisturbed clamping period (no extra wakeups during idle time),
 * we increment the confidence counter for the given target ratio.
 * CONFIDENCE_OK defines the level where runtime calibration results are
 * valid.
 */
#define CONFIDENCE_OK (3)
/* Default idle injection duration; the driver adjusts the sleep time to
 * meet the target idle ratio. Similar to frequency modulation.
 */
#define DEFAULT_DURATION_JIFFIES (6)

static unsigned int target_mwait;
static struct dentry *debug_dir;

/* user selected target */
static unsigned int set_target_ratio;
static unsigned int current_ratio;
static bool should_skip;
static bool reduce_irq;
static atomic_t idle_wakeup_counter;
static unsigned int control_cpu; /* The CPU assigned to collect stats and
				  * update control parameters. Defaults to
				  * the BSP, but the BSP can be offlined.
				  */
static bool clamping;

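/* Per-CPU state for the clamping kthread worker and its two work items. */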
struct powerclamp_worker_data {
	struct kthread_worker *worker;
	struct kthread_work balancing_work;
	struct kthread_delayed_work idle_injection_work;
	unsigned int cpu;
	unsigned int count;
	unsigned int guard;
	unsigned int window_size_now;
	unsigned int target_ratio;
	unsigned int duration_jiffies;
	bool clamping;
};

static struct powerclamp_worker_data __percpu *worker_data;
static struct thermal_cooling_device *cooling_dev;
static unsigned long *cpu_clamping_mask;  /* bitmap for tracking the per-CPU
					   * clamping kthread workers
					   */

static unsigned int duration;
static unsigned int pkg_cstate_ratio_cur;
static unsigned int window_size;

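/*
 * Module parameter setter for "duration": parse the forced idle time in ms.
 * Values outside the recommended 6-25 ms range return -EINVAL; the stored
 * duration is clamped to that range.
 */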
static int duration_set(const char *arg, const struct kernel_param *kp)
{
	int ret = 0;
	unsigned long new_duration;

	ret = kstrtoul(arg, 10, &new_duration);
	if (ret)
		goto exit;
	if (new_duration > 25 || new_duration < 6) {
		pr_err("Out of recommended range %lu, between 6-25ms\n",
			new_duration);
		ret = -EINVAL;
	}

	duration = clamp(new_duration, 6ul, 25ul);
	smp_mb();

exit:

	return ret;
}

static const struct kernel_param_ops duration_ops = {
	.set = duration_set,
	.get = param_get_int,
};

module_param_cb(duration, &duration_ops, &duration, 0644);
MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");

struct powerclamp_calibration_data {
	unsigned long confidence;  /* used for calibration: a counter that is
				    * incremented each time a clamping period
				    * completes without extra wakeups. Once the
				    * counter reaches a given level, the
				    * compensation is deemed usable.
				    */
	unsigned long steady_comp; /* steady state compensation used when
				    * no extra wakeups occurred.
				    */
	unsigned long dynamic_comp; /* compensates for excessive wakeups from
				     * idle, mostly from external interrupts.
				     */
};

static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO];

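/*
 * Module parameter setter for "window_size": values outside the recommended
 * 2-10 cycle range return -EINVAL; the stored window size is clamped to that
 * range.
 */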
static int window_size_set(const char *arg, const struct kernel_param *kp)
{
	int ret = 0;
	unsigned long new_window_size;

	ret = kstrtoul(arg, 10, &new_window_size);
	if (ret)
		goto exit_win;
	if (new_window_size > 10 || new_window_size < 2) {
		pr_err("Out of recommended window size %lu, between 2-10\n",
			new_window_size);
		ret = -EINVAL;
	}

	window_size = clamp(new_window_size, 2ul, 10ul);
	smp_mb();

exit_win:

	return ret;
}

static const struct kernel_param_ops window_size_ops = {
	.set = window_size_set,
	.get = param_get_int,
};

module_param_cb(window_size, &window_size_ops, &window_size, 0644);
MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n"
	"\tpowerclamp controls idle ratio within this window. larger\n"
	"\twindow size results in slower response time but smoother\n"
	"\tclamping results. defaults to 2.");

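/*
 * Query CPUID leaf 5 (MONITOR/MWAIT) and record the deepest supported
 * C-state hint in target_mwait.
 */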
static void find_target_mwait(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int highest_cstate = 0;
	unsigned int highest_subcstate = 0;
	int i;

	if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
		return;

	cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);

	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
	    !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
		return;

	edx >>= MWAIT_SUBSTATE_SIZE;
	for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
		if (edx & MWAIT_SUBSTATE_MASK) {
			highest_cstate = i;
			highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
		}
	}
	target_mwait = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
		(highest_subcstate - 1);
}

struct pkg_cstate_info {
	bool skip;
	int msr_index;
	int cstate_id;
};

#define PKG_CSTATE_INIT(id) {				\
		.msr_index = MSR_PKG_C##id##_RESIDENCY, \
		.cstate_id = id				\
			}

static struct pkg_cstate_info pkg_cstates[] = {
	PKG_CSTATE_INIT(2),
	PKG_CSTATE_INIT(3),
	PKG_CSTATE_INIT(6),
	PKG_CSTATE_INIT(7),
	PKG_CSTATE_INIT(8),
	PKG_CSTATE_INIT(9),
	PKG_CSTATE_INIT(10),
	{},
};

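/* Return true if at least one package C-state residency MSR can be read. */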
static bool has_pkg_state_counter(void)
{
	u64 val;
	struct pkg_cstate_info *info = pkg_cstates;

	/* check if any one of the counter msrs exists */
	while (info->msr_index) {
		if (!rdmsrl_safe(info->msr_index, &val))
			return true;
		info++;
	}

	return false;
}

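/*
 * Sum the residency counters of all readable package C-state MSRs.
 * MSRs that fault on read are marked to be skipped from then on.
 */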
static u64 pkg_state_counter(void)
{
	u64 val;
	u64 count = 0;
	struct pkg_cstate_info *info = pkg_cstates;

	while (info->msr_index) {
		if (!info->skip) {
			if (!rdmsrl_safe(info->msr_index, &val))
				count += val;
			else
				info->skip = true;
		}
		info++;
	}

	return count;
}

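/*
 * Look up the calibrated steady-state compensation for a target ratio.
 * A value is only used when the entry and its neighbors have reached
 * CONFIDENCE_OK; the lowest and highest ratios borrow from the two
 * nearest entries instead.
 */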
static unsigned int get_compensation(int ratio)
{
	unsigned int comp = 0;

	/* we only use compensation if all adjacent ones are good */
	if (ratio == 1 &&
		cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio + 1].steady_comp +
			cal_data[ratio + 2].steady_comp) / 3;
	} else if (ratio == MAX_TARGET_RATIO - 1 &&
		cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio - 2].steady_comp) / 3;
	} else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio + 1].steady_comp) / 3;
	}

	/* REVISIT: simple penalty of double idle injection */
	if (reduce_irq)
		comp = ratio;
	/* do not exceed limit */
	if (comp + ratio >= MAX_TARGET_RATIO)
		comp = MAX_TARGET_RATIO - ratio - 1;

	return comp;
}

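/*
 * Feed the measured idle-ratio error from the last window back into the
 * calibration entry for the given target ratio, unless the data cannot
 * be trusted.
 */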
static void adjust_compensation(int target_ratio, unsigned int win)
{
	int delta;
	struct powerclamp_calibration_data *d = &cal_data[target_ratio];

	/*
	 * Skip the adjustment once the confidence level has been reached,
	 * or when there are too many wakeups during the last idle
	 * injection period; in that case we cannot trust the data for
	 * compensation.
	 */
	if (d->confidence >= CONFIDENCE_OK ||
		atomic_read(&idle_wakeup_counter) >
		win * num_online_cpus())
		return;

	delta = set_target_ratio - current_ratio;
	/* filter out bad data */
	if (delta >= 0 && delta <= (1 + target_ratio / 10)) {
		if (d->steady_comp)
			d->steady_comp =
				roundup(delta + d->steady_comp, 2) / 2;
		else
			d->steady_comp = delta;
		d->confidence++;
	}
}

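/*
 * Evaluate the last clamping window: derive the package C-state residency
 * ratio from the MSR and TSC deltas, update the compensation data, and
 * return true if we are already above target + guard so that the next
 * idle injection should be skipped.
 */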
static bool powerclamp_adjust_controls(unsigned int target_ratio,
				unsigned int guard, unsigned int win)
{
	static u64 msr_last, tsc_last;
	u64 msr_now, tsc_now;
	u64 val64;

	/* check result for the last window */
	msr_now = pkg_state_counter();
	tsc_now = rdtsc();

	/* calculate pkg cstate vs tsc ratio */
	if (!msr_last || !tsc_last)
		current_ratio = 1;
	else if (tsc_now - tsc_last) {
		val64 = 100 * (msr_now - msr_last);
		do_div(val64, (tsc_now - tsc_last));
		current_ratio = val64;
	}

	/* update record */
	msr_last = msr_now;
	tsc_last = tsc_now;

	adjust_compensation(target_ratio, win);
	/*
	 * too many external interrupts, set flag such
	 * that we can take measures later.
	 */
	reduce_irq = atomic_read(&idle_wakeup_counter) >=
		2 * win * num_online_cpus();

	atomic_set(&idle_wakeup_counter, 0);
	/* if we are above target+guard, skip */
	return set_target_ratio + guard <= current_ratio;
}

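/*
 * Balancing work: latch the user-visible control parameters, derive the
 * idle injection period from the duration and the compensated target
 * ratio, and queue the idle injection work aligned to the next period
 * boundary.
 */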
static void clamp_balancing_func(struct kthread_work *work)
{
	struct powerclamp_worker_data *w_data;
	int sleeptime;
	unsigned long target_jiffies;
	unsigned int compensated_ratio;
	int interval; /* jiffies to sleep for each attempt */

	w_data = container_of(work, struct powerclamp_worker_data,
			      balancing_work);

	/*
	 * Make sure the user-selected ratio does not take effect until
	 * the next round. Adjust target_ratio if the user has changed
	 * the target, so that we can converge quickly.
	 */
	w_data->target_ratio = READ_ONCE(set_target_ratio);
	w_data->guard = 1 + w_data->target_ratio / 20;
	w_data->window_size_now = window_size;
	w_data->duration_jiffies = msecs_to_jiffies(duration);
	w_data->count++;

	/*
	 * Systems may have different abilities to enter package-level
	 * C-states, thus we need to compensate the injected idle ratio
	 * to achieve the actual target reported by the HW.
	 */
	compensated_ratio = w_data->target_ratio +
		get_compensation(w_data->target_ratio);
	if (compensated_ratio <= 0)
		compensated_ratio = 1;
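	/*
	 * Example: with the default 6-jiffy duration and a compensated
	 * ratio of 25, the period is 6 * 100 / 25 = 24 jiffies: 6 jiffies
	 * idle, 18 jiffies running.
	 */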
	interval = w_data->duration_jiffies * 100 / compensated_ratio;

	/* align idle time */
	target_jiffies = roundup(jiffies, interval);
	sleeptime = target_jiffies - jiffies;
	if (sleeptime <= 0)
		sleeptime = 1;

	if (clamping && w_data->clamping && cpu_online(w_data->cpu))
		kthread_queue_delayed_work(w_data->worker,
					   &w_data->idle_injection_work,
					   sleeptime);
}

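/*
 * Idle injection work: on the controlling CPU, re-evaluate the controls
 * once per window, then inject idle time with play_idle() unless the last
 * window showed we are already above target, and finally requeue the
 * balancing work.
 */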
static void clamp_idle_injection_func(struct kthread_work *work)
{
	struct powerclamp_worker_data *w_data;

	w_data = container_of(work, struct powerclamp_worker_data,
			      idle_injection_work.work);

	/*
	 * Only the elected controlling CPU can collect stats and update
	 * the control parameters.
	 */
	if (w_data->cpu == control_cpu &&
	    !(w_data->count % w_data->window_size_now)) {
		should_skip =
			powerclamp_adjust_controls(w_data->target_ratio,
						   w_data->guard,
						   w_data->window_size_now);
		smp_mb();
	}

	if (should_skip)
		goto balance;

	play_idle(jiffies_to_usecs(w_data->duration_jiffies));

balance:
	if (clamping && w_data->clamping && cpu_online(w_data->cpu))
		kthread_queue_work(w_data->worker, &w_data->balancing_work);
}

/*
 * 1 HZ polling while clamping is active, useful for userspace
 * to monitor actual idle ratio.
 */
static void poll_pkg_cstate(struct work_struct *dummy);
static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate);
static void poll_pkg_cstate(struct work_struct *dummy)
{
	static u64 msr_last;
	static u64 tsc_last;

	u64 msr_now;
	u64 tsc_now;
	u64 val64;

	msr_now = pkg_state_counter();
	tsc_now = rdtsc();

	/* calculate pkg cstate vs tsc ratio */
	if (!msr_last || !tsc_last)
		pkg_cstate_ratio_cur = 1;
	else {
		if (tsc_now - tsc_last) {
			val64 = 100 * (msr_now - msr_last);
			do_div(val64, (tsc_now - tsc_last));
			pkg_cstate_ratio_cur = val64;
		}
	}

	/* update record */
	msr_last = msr_now;
	tsc_last = tsc_now;

	if (clamping)
		schedule_delayed_work(&poll_pkg_cstate_work, HZ);
}

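/*
 * Create a kthread worker pinned to @cpu, give it FIFO priority and queue
 * the first balancing work on it.
 */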
static void start_power_clamp_worker(unsigned long cpu)
{
	struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);
	struct kthread_worker *worker;

	worker = kthread_create_worker_on_cpu(cpu, 0, "kidle_inj/%ld", cpu);
	if (IS_ERR(worker))
		return;

	w_data->worker = worker;
	w_data->count = 0;
	w_data->cpu = cpu;
	w_data->clamping = true;
	set_bit(cpu, cpu_clamping_mask);
	sched_set_fifo(worker->task);
	kthread_init_work(&w_data->balancing_work, clamp_balancing_func);
	kthread_init_delayed_work(&w_data->idle_injection_work,
				  clamp_idle_injection_func);
	kthread_queue_work(w_data->worker, &w_data->balancing_work);
}

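/*
 * Tell the worker on @cpu to stop requeuing itself, cancel any pending
 * work and destroy the kthread worker.
 */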
static void stop_power_clamp_worker(unsigned long cpu)
{
	struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);

	if (!w_data->worker)
		return;

	w_data->clamping = false;
	/*
	 * Make sure that all work items queued after this point see
	 * clamping disabled. The counterpart is not needed because
	 * there is an implicit memory barrier when the queued work
	 * is processed.
	 */
	smp_wmb();
	kthread_cancel_work_sync(&w_data->balancing_work);
	kthread_cancel_delayed_work_sync(&w_data->idle_injection_work);
	/*
	 * The balancing work might still be queued here because
	 * the handling of the "clamping" variable, cancel, and queue
	 * operations are not synchronized via a lock. But it is not
	 * a big deal. The balancing work is fast and destroying the
	 * kthread worker will wait for it.
	 */
	clear_bit(w_data->cpu, cpu_clamping_mask);
	kthread_destroy_worker(w_data->worker);

	w_data->worker = NULL;
}

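/*
 * Start clamping: pick the controlling CPU, kick off the 1 Hz package
 * C-state poll and start one clamping worker per online CPU with CPU
 * hotplug locked out.
 */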
static int start_power_clamp(void)
{
	unsigned long cpu;

	set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1);
	/* prevent cpu hotplug */
	cpus_read_lock();

	/* prefer BSP */
	control_cpu = 0;
	if (!cpu_online(control_cpu))
		control_cpu = smp_processor_id();

	clamping = true;
	schedule_delayed_work(&poll_pkg_cstate_work, 0);

	/* start one kthread worker per online cpu */
	for_each_online_cpu(cpu) {
		start_power_clamp_worker(cpu);
	}
	cpus_read_unlock();

	return 0;
}

static void end_power_clamp(void)
{
	int i;

	/*
	 * Block requeuing in all the kthread workers. They will flush and
	 * stop faster.
	 */
	clamping = false;
	for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
		pr_debug("clamping worker for cpu %d alive, destroy\n", i);
		stop_power_clamp_worker(i);
	}
}

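/* CPU hotplug callback: start a clamping worker on a newly onlined CPU. */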
static int powerclamp_cpu_online(unsigned int cpu)
{
	if (!clamping)
		return 0;
	start_power_clamp_worker(cpu);
	/* prefer BSP as controlling CPU */
	if (cpu == 0) {
		control_cpu = 0;
		smp_mb();
	}
	return 0;
}

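/*
 * CPU hotplug callback: stop the worker on a CPU going down and, if it was
 * the controlling CPU, hand control over to another online CPU.
 */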
static int powerclamp_cpu_predown(unsigned int cpu)
{
	if (!clamping)
		return 0;

	stop_power_clamp_worker(cpu);
	if (cpu != control_cpu)
		return 0;

	control_cpu = cpumask_first(cpu_online_mask);
	if (control_cpu == cpu)
		control_cpu = cpumask_next(cpu, cpu_online_mask);
	smp_mb();
	return 0;
}

static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
{
	*state = MAX_TARGET_RATIO;

	return 0;
}

static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
{
	if (clamping)
		*state = pkg_cstate_ratio_cur;
	else
		/* to save power, do not poll idle ratio while not clamping */
		*state = -1; /* indicates invalid state */

	return 0;
}

static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long new_target_ratio)
{
	int ret = 0;

	new_target_ratio = clamp(new_target_ratio, 0UL,
				(unsigned long) (MAX_TARGET_RATIO - 1));
	if (set_target_ratio == 0 && new_target_ratio > 0) {
		pr_info("Start idle injection to reduce power\n");
		set_target_ratio = new_target_ratio;
		ret = start_power_clamp();
		goto exit_set;
	} else if (set_target_ratio > 0 && new_target_ratio == 0) {
		pr_info("Stop forced idle injection\n");
		end_power_clamp();
		set_target_ratio = 0;
	} else { /* adjust currently running */
		set_target_ratio = new_target_ratio;
		/* make new set_target_ratio visible to other cpus */
		smp_mb();
	}

exit_set:
	return ret;
}

/* bind to generic thermal layer as cooling device */
static const struct thermal_cooling_device_ops powerclamp_cooling_ops = {
	.get_max_state = powerclamp_get_max_state,
	.get_cur_state = powerclamp_get_cur_state,
	.set_cur_state = powerclamp_set_cur_state,
};

static const struct x86_cpu_id __initconst intel_powerclamp_ids[] = {
	X86_MATCH_VENDOR_FEATURE(INTEL, X86_FEATURE_MWAIT, NULL),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);

static int __init powerclamp_probe(void)
{
	if (!x86_match_cpu(intel_powerclamp_ids)) {
		pr_err("CPU does not support MWAIT\n");
		return -ENODEV;
	}

	/* The goal of idle time alignment is to reach a package C-state. */
	if (!has_pkg_state_counter()) {
		pr_info("No package C-state available\n");
		return -ENODEV;
	}

	/* find the deepest mwait value */
	find_target_mwait();

	return 0;
}

static int powerclamp_debug_show(struct seq_file *m, void *unused)
{
	int i = 0;

	seq_printf(m, "controlling cpu: %d\n", control_cpu);
	seq_puts(m, "pct confidence steady dynamic (compensation)\n");
	for (i = 0; i < MAX_TARGET_RATIO; i++) {
		seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
			i,
			cal_data[i].confidence,
			cal_data[i].steady_comp,
			cal_data[i].dynamic_comp);
	}

	return 0;
}

DEFINE_SHOW_ATTRIBUTE(powerclamp_debug);

static inline void powerclamp_create_debug_files(void)
{
	debug_dir = debugfs_create_dir("intel_powerclamp", NULL);

	debugfs_create_file("powerclamp_calib", S_IRUGO, debug_dir, cal_data,
			    &powerclamp_debug_fops);
}

static enum cpuhp_state hp_state;

static int __init powerclamp_init(void)
{
	int retval;

	cpu_clamping_mask = bitmap_zalloc(num_possible_cpus(), GFP_KERNEL);
	if (!cpu_clamping_mask)
		return -ENOMEM;

	/* probe cpu features and ids here */
	retval = powerclamp_probe();
	if (retval)
		goto exit_free;

	/* set default limit, maybe adjusted during runtime based on feedback */
	window_size = 2;
	retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
					   "thermal/intel_powerclamp:online",
					   powerclamp_cpu_online,
					   powerclamp_cpu_predown);
	if (retval < 0)
		goto exit_free;

	hp_state = retval;

	worker_data = alloc_percpu(struct powerclamp_worker_data);
	if (!worker_data) {
		retval = -ENOMEM;
		goto exit_unregister;
	}

	cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
						&powerclamp_cooling_ops);
	if (IS_ERR(cooling_dev)) {
		retval = -ENODEV;
		goto exit_free_thread;
	}

	if (!duration)
		duration = jiffies_to_msecs(DEFAULT_DURATION_JIFFIES);

	powerclamp_create_debug_files();

	return 0;

exit_free_thread:
	free_percpu(worker_data);
exit_unregister:
	cpuhp_remove_state_nocalls(hp_state);
exit_free:
	bitmap_free(cpu_clamping_mask);
	return retval;
}
module_init(powerclamp_init);

static void __exit powerclamp_exit(void)
{
	end_power_clamp();
	cpuhp_remove_state_nocalls(hp_state);
	free_percpu(worker_data);
	thermal_cooling_device_unregister(cooling_dev);
	bitmap_free(cpu_clamping_mask);

	cancel_delayed_work_sync(&poll_pkg_cstate_work);
	debugfs_remove_recursive(debug_dir);
}
module_exit(powerclamp_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");