// SPDX-License-Identifier: GPL-2.0-only
/*
 * intel_powerclamp.c - package c-state idle injection
 *
 * Copyright (c) 2012, Intel Corporation.
 *
 * Authors:
 *     Arjan van de Ven <arjan@linux.intel.com>
 *     Jacob Pan <jacob.jun.pan@linux.intel.com>
 *
 *	TODO:
 *           1. Better handle wakeups from external interrupts. Currently a
 *              fixed compensation is added to the clamping duration when an
 *              excessive number of wakeups is observed during idle time. The
 *              reason is that for external interrupts which need no ack,
 *              clamping down a CPU in non-irq context does not reduce the irq
 *              rate. In the majority of cases clamping down a CPU does help
 *              reduce irqs as well; we should be able to differentiate the two
 *              cases and give a quantitative solution for the irqs that we
 *              can control, perhaps based on get_cpu_iowait_time_us().
 *
 *	     2. Synchronization with other HW blocks.
 */

#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/cpu.h>
#include <linux/thermal.h>
#include <linux/slab.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/sched/rt.h>
#include <uapi/linux/sched/types.h>

#include <asm/nmi.h>
#include <asm/msr.h>
#include <asm/mwait.h>
#include <asm/cpu_device_id.h>
#include <asm/hardirq.h>

#define MAX_TARGET_RATIO (50U)
/* For each undisturbed clamping period (no extra wakeups during idle time),
 * we increment the confidence counter for the given target ratio.
 * CONFIDENCE_OK defines the level at which runtime calibration results are
 * considered valid.
 */
#define CONFIDENCE_OK (3)
/* Default idle injection duration, the driver adjusts the sleep time to meet
 * the target idle ratio. Similar to frequency modulation.
 */
#define DEFAULT_DURATION_JIFFIES (6)

static unsigned int target_mwait;
static struct dentry *debug_dir;

/* user selected target */
static unsigned int set_target_ratio;
static unsigned int current_ratio;
static bool should_skip;
static bool reduce_irq;
static atomic_t idle_wakeup_counter;
static unsigned int control_cpu; /* The CPU assigned to collect stats and
				  * update control parameters. Defaults to the
				  * BSP, but the BSP can be offlined.
				  */
static bool clamping;

struct powerclamp_worker_data {
	struct kthread_worker *worker;
	struct kthread_work balancing_work;
	struct kthread_delayed_work idle_injection_work;
	unsigned int cpu;
	unsigned int count;
	unsigned int guard;
	unsigned int window_size_now;
	unsigned int target_ratio;
	unsigned int duration_jiffies;
	bool clamping;
};

static struct powerclamp_worker_data __percpu *worker_data;
static struct thermal_cooling_device *cooling_dev;
static unsigned long *cpu_clamping_mask;  /* bitmap for tracking per-CPU
					   * clamping kthread workers
					   */

static unsigned int duration;
static unsigned int pkg_cstate_ratio_cur;
static unsigned int window_size;

static int duration_set(const char *arg, const struct kernel_param *kp)
{
	int ret = 0;
	unsigned long new_duration;

	ret = kstrtoul(arg, 10, &new_duration);
	if (ret)
		goto exit;
	if (new_duration > 25 || new_duration < 6) {
		pr_err("Out of recommended range %lu, between 6-25ms\n",
			new_duration);
		ret = -EINVAL;
		goto exit;
	}

	duration = clamp(new_duration, 6ul, 25ul);
	smp_mb();

exit:
	return ret;
}

static const struct kernel_param_ops duration_ops = {
	.set = duration_set,
	.get = param_get_int,
};

module_param_cb(duration, &duration_ops, &duration, 0644);
MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");

struct powerclamp_calibration_data {
	unsigned long confidence;  /* used for calibration; a counter that gets
				    * incremented each time a clamping period
				    * completes without extra wakeups. Once the
				    * counter reaches a given level, the
				    * compensation is deemed usable.
				    */
	unsigned long steady_comp; /* steady state compensation used when
				    * no extra wakeups occurred.
				    */
	unsigned long dynamic_comp; /* compensation for excessive wakeups from
				     * idle, mostly from external interrupts.
				     */
};

static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO];

static int window_size_set(const char *arg, const struct kernel_param *kp)
{
	int ret = 0;
	unsigned long new_window_size;

	ret = kstrtoul(arg, 10, &new_window_size);
	if (ret)
		goto exit_win;
	if (new_window_size > 10 || new_window_size < 2) {
		pr_err("Out of recommended window size %lu, between 2-10\n",
			new_window_size);
		ret = -EINVAL;
		goto exit_win;
	}

	window_size = clamp(new_window_size, 2ul, 10ul);
	smp_mb();

exit_win:
	return ret;
}

static const struct kernel_param_ops window_size_ops = {
	.set = window_size_set,
	.get = param_get_int,
};

module_param_cb(window_size, &window_size_ops, &window_size, 0644);
MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n"
	"\tpowerclamp controls idle ratio within this window. larger\n"
	"\twindow size results in slower response time but smoother\n"
	"\tclamping results. defaults to 2.");

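/*
 * Find the deepest MWAIT hint supported by this CPU: check that the MWAIT
 * extensions and interrupt-break capability are advertised in CPUID leaf 5,
 * walk the sub-C-state counts and store the resulting MWAIT hint in
 * target_mwait.
 */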
static void find_target_mwait(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int highest_cstate = 0;
	unsigned int highest_subcstate = 0;
	int i;

	if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
		return;

	cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);

	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
	    !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
		return;

	edx >>= MWAIT_SUBSTATE_SIZE;
	for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
		if (edx & MWAIT_SUBSTATE_MASK) {
			highest_cstate = i;
			highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
		}
	}
	target_mwait = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
		(highest_subcstate - 1);
}

struct pkg_cstate_info {
	bool skip;
	int msr_index;
	int cstate_id;
};

#define PKG_CSTATE_INIT(id) {				\
		.msr_index = MSR_PKG_C##id##_RESIDENCY, \
		.cstate_id = id				\
			}

static struct pkg_cstate_info pkg_cstates[] = {
	PKG_CSTATE_INIT(2),
	PKG_CSTATE_INIT(3),
	PKG_CSTATE_INIT(6),
	PKG_CSTATE_INIT(7),
	PKG_CSTATE_INIT(8),
	PKG_CSTATE_INIT(9),
	PKG_CSTATE_INIT(10),
	{NULL},
};

static bool has_pkg_state_counter(void)
{
	u64 val;
	struct pkg_cstate_info *info = pkg_cstates;

	/* check if any one of the counter MSRs exists */
	while (info->msr_index) {
		if (!rdmsrl_safe(info->msr_index, &val))
			return true;
		info++;
	}

	return false;
}

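/*
 * Sum the residency counters of all readable package C-state MSRs. MSRs that
 * fault on the first read are marked ->skip and ignored from then on.
 */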
static u64 pkg_state_counter(void)
{
	u64 val;
	u64 count = 0;
	struct pkg_cstate_info *info = pkg_cstates;

	while (info->msr_index) {
		if (!info->skip) {
			if (!rdmsrl_safe(info->msr_index, &val))
				count += val;
			else
				info->skip = true;
		}
		info++;
	}

	return count;
}

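/*
 * Return the calibrated steady-state compensation for the given target ratio.
 * A value is only used once the ratio and its neighbours have reached
 * CONFIDENCE_OK; at the edges of the table the two nearest in-range
 * neighbours are used instead. When reduce_irq is set, a simple penalty of
 * doubling the injection is applied. The result is capped so that
 * ratio + compensation stays below MAX_TARGET_RATIO.
 */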
static unsigned int get_compensation(int ratio)
{
	unsigned int comp = 0;

	/* we only use compensation if all adjacent ones are good */
	if (ratio == 1 &&
		cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio + 1].steady_comp +
			cal_data[ratio + 2].steady_comp) / 3;
	} else if (ratio == MAX_TARGET_RATIO - 1 &&
		cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio - 2].steady_comp) / 3;
	} else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio + 1].steady_comp) / 3;
	}

	/* REVISIT: simple penalty of double idle injection */
	if (reduce_irq)
		comp = ratio;
	/* do not exceed limit */
	if (comp + ratio >= MAX_TARGET_RATIO)
		comp = MAX_TARGET_RATIO - ratio - 1;

	return comp;
}

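/*
 * Fold the error between the requested and the measured idle ratio into the
 * steady-state compensation for this target ratio and bump its confidence
 * counter, unless the confidence level has already been reached or the sample
 * is unreliable.
 */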
static void adjust_compensation(int target_ratio, unsigned int win)
{
	int delta;
	struct powerclamp_calibration_data *d = &cal_data[target_ratio];

	/*
	 * Only adjust the compensation if the confidence level has not been
	 * reached yet. If there were too many wakeups during the last idle
	 * injection period, the data cannot be trusted for compensation
	 * either.
	 */
	if (d->confidence >= CONFIDENCE_OK ||
		atomic_read(&idle_wakeup_counter) >
		win * num_online_cpus())
		return;

	delta = set_target_ratio - current_ratio;
	/* filter out bad data */
	if (delta >= 0 && delta <= (1 + target_ratio / 10)) {
		if (d->steady_comp)
			d->steady_comp =
				roundup(delta + d->steady_comp, 2) / 2;
		else
			d->steady_comp = delta;
		d->confidence++;
	}
}

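/*
 * Evaluate the last control window: compute the package C-state residency vs
 * TSC ratio, feed it into the calibration data, update the reduce_irq flag
 * and return true when the measured ratio already reaches target + guard,
 * i.e. the next idle injection should be skipped.
 */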
static bool powerclamp_adjust_controls(unsigned int target_ratio,
				unsigned int guard, unsigned int win)
{
	static u64 msr_last, tsc_last;
	u64 msr_now, tsc_now;
	u64 val64;

	/* check result for the last window */
	msr_now = pkg_state_counter();
	tsc_now = rdtsc();

	/* calculate pkg cstate vs tsc ratio */
	if (!msr_last || !tsc_last)
		current_ratio = 1;
	else if (tsc_now - tsc_last) {
		val64 = 100 * (msr_now - msr_last);
		do_div(val64, (tsc_now - tsc_last));
		current_ratio = val64;
	}

	/* update record */
	msr_last = msr_now;
	tsc_last = tsc_now;

	adjust_compensation(target_ratio, win);
	/*
	 * If there are too many external interrupts, set a flag so
	 * that we can take measures later.
	 */
	reduce_irq = atomic_read(&idle_wakeup_counter) >=
		2 * win * num_online_cpus();

	atomic_set(&idle_wakeup_counter, 0);
	/* if we are above target+guard, skip */
	return set_target_ratio + guard <= current_ratio;
}

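/*
 * Per-CPU balancing work: latch the current user settings, add the calibrated
 * compensation to the target ratio, derive the injection interval from the
 * idle duration and the compensated ratio, and queue the delayed idle
 * injection work aligned to that interval.
 */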
static void clamp_balancing_func(struct kthread_work *work)
{
	struct powerclamp_worker_data *w_data;
	int sleeptime;
	unsigned long target_jiffies;
	unsigned int compensated_ratio;
	int interval; /* jiffies to sleep for each attempt */

	w_data = container_of(work, struct powerclamp_worker_data,
			      balancing_work);

	/*
	 * Make sure a user selected ratio does not take effect until the
	 * next round. Adjust target_ratio when the user has changed the
	 * target so that we can converge quickly.
	 */
	w_data->target_ratio = READ_ONCE(set_target_ratio);
	w_data->guard = 1 + w_data->target_ratio / 20;
	w_data->window_size_now = window_size;
	w_data->duration_jiffies = msecs_to_jiffies(duration);
	w_data->count++;

	/*
	 * Systems differ in their ability to enter package level C-states,
	 * thus we need to compensate the injected idle ratio to achieve the
	 * actual target reported by the HW.
	 */
	compensated_ratio = w_data->target_ratio +
		get_compensation(w_data->target_ratio);
	if (compensated_ratio <= 0)
		compensated_ratio = 1;
	interval = w_data->duration_jiffies * 100 / compensated_ratio;

	/* align idle time */
	target_jiffies = roundup(jiffies, interval);
	sleeptime = target_jiffies - jiffies;
	if (sleeptime <= 0)
		sleeptime = 1;

	if (clamping && w_data->clamping && cpu_online(w_data->cpu))
		kthread_queue_delayed_work(w_data->worker,
					   &w_data->idle_injection_work,
					   sleeptime);
}

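/*
 * Per-CPU idle injection work: on the controlling CPU, re-evaluate the
 * controls once per window; inject the idle period via play_idle() unless
 * the last evaluation said to skip it, then requeue the balancing work.
 */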
static void clamp_idle_injection_func(struct kthread_work *work)
{
	struct powerclamp_worker_data *w_data;

	w_data = container_of(work, struct powerclamp_worker_data,
			      idle_injection_work.work);

	/*
	 * Only the elected controlling CPU can collect stats and update
	 * control parameters.
	 */
	if (w_data->cpu == control_cpu &&
	    !(w_data->count % w_data->window_size_now)) {
		should_skip =
			powerclamp_adjust_controls(w_data->target_ratio,
						   w_data->guard,
						   w_data->window_size_now);
		smp_mb();
	}

	if (should_skip)
		goto balance;

	play_idle(jiffies_to_usecs(w_data->duration_jiffies));

balance:
	if (clamping && w_data->clamping && cpu_online(w_data->cpu))
		kthread_queue_work(w_data->worker, &w_data->balancing_work);
}

/*
 * 1 HZ polling while clamping is active, useful for userspace
 * to monitor actual idle ratio.
 */
static void poll_pkg_cstate(struct work_struct *dummy);
static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate);
static void poll_pkg_cstate(struct work_struct *dummy)
{
	static u64 msr_last;
	static u64 tsc_last;

	u64 msr_now;
	u64 tsc_now;
	u64 val64;

	msr_now = pkg_state_counter();
	tsc_now = rdtsc();

	/* calculate pkg cstate vs tsc ratio */
	if (!msr_last || !tsc_last)
		pkg_cstate_ratio_cur = 1;
	else {
		if (tsc_now - tsc_last) {
			val64 = 100 * (msr_now - msr_last);
			do_div(val64, (tsc_now - tsc_last));
			pkg_cstate_ratio_cur = val64;
		}
	}

	/* update record */
	msr_last = msr_now;
	tsc_last = tsc_now;

	if (clamping)
		schedule_delayed_work(&poll_pkg_cstate_work, HZ);
}

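/*
 * Create a kthread worker bound to the given CPU, mark the CPU as clamped in
 * cpu_clamping_mask, give the worker FIFO scheduling and queue its first
 * balancing work.
 */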
static void start_power_clamp_worker(unsigned long cpu)
{
	struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);
	struct kthread_worker *worker;

	worker = kthread_create_worker_on_cpu(cpu, 0, "kidle_inj/%ld", cpu);
	if (IS_ERR(worker))
		return;

	w_data->worker = worker;
	w_data->count = 0;
	w_data->cpu = cpu;
	w_data->clamping = true;
	set_bit(cpu, cpu_clamping_mask);
	sched_set_fifo(worker->task);
	kthread_init_work(&w_data->balancing_work, clamp_balancing_func);
	kthread_init_delayed_work(&w_data->idle_injection_work,
				  clamp_idle_injection_func);
	kthread_queue_work(w_data->worker, &w_data->balancing_work);
}

static void stop_power_clamp_worker(unsigned long cpu)
{
	struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);

	if (!w_data->worker)
		return;

	w_data->clamping = false;
	/*
	 * Make sure that all works that get queued after this point see
	 * the clamping disabled. The counterpart is not needed because
	 * there is an implicit memory barrier when the queued work
	 * is processed.
	 */
	smp_wmb();
	kthread_cancel_work_sync(&w_data->balancing_work);
	kthread_cancel_delayed_work_sync(&w_data->idle_injection_work);
	/*
	 * The balancing work might still be queued here because
	 * the handling of the "clamping" variable, cancel, and queue
	 * operations are not synchronized via a lock. But it is not
	 * a big deal. The balancing work is fast and
	 * kthread_destroy_worker() will wait for it.
	 */
	clear_bit(w_data->cpu, cpu_clamping_mask);
	kthread_destroy_worker(w_data->worker);

	w_data->worker = NULL;
}

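/*
 * Start clamping: elect a controlling CPU (preferably the BSP), kick off the
 * 1 Hz package C-state poll and start one clamping worker per online CPU,
 * with CPU hotplug blocked while the workers are created.
 */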
static int start_power_clamp(void)
{
	unsigned long cpu;

	set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1);
	/* prevent cpu hotplug */
	cpus_read_lock();

	/* prefer BSP */
	control_cpu = 0;
	if (!cpu_online(control_cpu))
		control_cpu = smp_processor_id();

	clamping = true;
	schedule_delayed_work(&poll_pkg_cstate_work, 0);

	/* start one kthread worker per online cpu */
	for_each_online_cpu(cpu) {
		start_power_clamp_worker(cpu);
	}
	cpus_read_unlock();

	return 0;
}

static void end_power_clamp(void)
{
	int i;

	/*
	 * Block requeuing in all the kthread workers. They will flush and
	 * stop faster.
	 */
	clamping = false;
	if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) {
		for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
			pr_debug("clamping worker for cpu %d alive, destroy\n",
				 i);
			stop_power_clamp_worker(i);
		}
	}
}

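/*
 * CPU hotplug callbacks: keep one clamping worker per online CPU while
 * clamping is active and hand the controlling-CPU role to another online CPU
 * when the current one goes down.
 */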
static int powerclamp_cpu_online(unsigned int cpu)
{
	if (!clamping)
		return 0;
	start_power_clamp_worker(cpu);
	/* prefer BSP as controlling CPU */
	if (cpu == 0) {
		control_cpu = 0;
		smp_mb();
	}
	return 0;
}

static int powerclamp_cpu_predown(unsigned int cpu)
{
	if (!clamping)
		return 0;

	stop_power_clamp_worker(cpu);
	if (cpu != control_cpu)
		return 0;

	control_cpu = cpumask_first(cpu_online_mask);
	if (control_cpu == cpu)
		control_cpu = cpumask_next(cpu, cpu_online_mask);
	smp_mb();
	return 0;
}

static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
{
	*state = MAX_TARGET_RATIO;

	return 0;
}

static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
{
	if (clamping)
		*state = pkg_cstate_ratio_cur;
	else
		/* to save power, do not poll idle ratio while not clamping */
		*state = -1; /* indicates invalid state */

	return 0;
}

static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long new_target_ratio)
{
	int ret = 0;

	new_target_ratio = clamp(new_target_ratio, 0UL,
				(unsigned long) (MAX_TARGET_RATIO - 1));
	if (set_target_ratio == 0 && new_target_ratio > 0) {
		pr_info("Start idle injection to reduce power\n");
		set_target_ratio = new_target_ratio;
		ret = start_power_clamp();
		goto exit_set;
	} else if (set_target_ratio > 0 && new_target_ratio == 0) {
		pr_info("Stop forced idle injection\n");
		end_power_clamp();
		set_target_ratio = 0;
	} else { /* adjust currently running */
		set_target_ratio = new_target_ratio;
		/* make new set_target_ratio visible to other cpus */
		smp_mb();
	}

exit_set:
	return ret;
}

/* bind to generic thermal layer as cooling device */
static const struct thermal_cooling_device_ops powerclamp_cooling_ops = {
	.get_max_state = powerclamp_get_max_state,
	.get_cur_state = powerclamp_get_cur_state,
	.set_cur_state = powerclamp_set_cur_state,
};

static const struct x86_cpu_id __initconst intel_powerclamp_ids[] = {
	X86_MATCH_VENDOR_FEATURE(INTEL, X86_FEATURE_MWAIT, NULL),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);

static int __init powerclamp_probe(void)
{
	if (!x86_match_cpu(intel_powerclamp_ids)) {
		pr_err("CPU does not support MWAIT\n");
		return -ENODEV;
	}

	/* The goal for idle time alignment is to achieve package cstate. */
	if (!has_pkg_state_counter()) {
		pr_info("No package C-state available\n");
		return -ENODEV;
	}

	/* find the deepest mwait value */
	find_target_mwait();

	return 0;
}

static int powerclamp_debug_show(struct seq_file *m, void *unused)
{
	int i = 0;

	seq_printf(m, "controlling cpu: %d\n", control_cpu);
	seq_puts(m, "pct confidence steady dynamic (compensation)\n");
	for (i = 0; i < MAX_TARGET_RATIO; i++) {
		seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
			i,
			cal_data[i].confidence,
			cal_data[i].steady_comp,
			cal_data[i].dynamic_comp);
	}

	return 0;
}

DEFINE_SHOW_ATTRIBUTE(powerclamp_debug);

static inline void powerclamp_create_debug_files(void)
{
	debug_dir = debugfs_create_dir("intel_powerclamp", NULL);

	debugfs_create_file("powerclamp_calib", S_IRUGO, debug_dir, cal_data,
			    &powerclamp_debug_fops);
}

static enum cpuhp_state hp_state;

static int __init powerclamp_init(void)
{
	int retval;

	cpu_clamping_mask = bitmap_zalloc(num_possible_cpus(), GFP_KERNEL);
	if (!cpu_clamping_mask)
		return -ENOMEM;

	/* probe cpu features and ids here */
	retval = powerclamp_probe();
	if (retval)
		goto exit_free;

	/* set default limit, maybe adjusted during runtime based on feedback */
	window_size = 2;
	retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
					   "thermal/intel_powerclamp:online",
					   powerclamp_cpu_online,
					   powerclamp_cpu_predown);
	if (retval < 0)
		goto exit_free;

	hp_state = retval;

	worker_data = alloc_percpu(struct powerclamp_worker_data);
	if (!worker_data) {
		retval = -ENOMEM;
		goto exit_unregister;
	}

	cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
						&powerclamp_cooling_ops);
	if (IS_ERR(cooling_dev)) {
		retval = -ENODEV;
		goto exit_free_thread;
	}

	if (!duration)
		duration = jiffies_to_msecs(DEFAULT_DURATION_JIFFIES);

	powerclamp_create_debug_files();

	return 0;

exit_free_thread:
	free_percpu(worker_data);
exit_unregister:
	cpuhp_remove_state_nocalls(hp_state);
exit_free:
	bitmap_free(cpu_clamping_mask);
	return retval;
}
module_init(powerclamp_init);

static void __exit powerclamp_exit(void)
{
	end_power_clamp();
	cpuhp_remove_state_nocalls(hp_state);
	free_percpu(worker_data);
	thermal_cooling_device_unregister(cooling_dev);
	bitmap_free(cpu_clamping_mask);

	cancel_delayed_work_sync(&poll_pkg_cstate_work);
	debugfs_remove_recursive(debug_dir);
}
module_exit(powerclamp_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");