xref: /openbmc/linux/drivers/thermal/intel/x86_pkg_temp_thermal.c (revision 4f727ecefefbd180de10e25b3e74c03dce3f1e75)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * x86_pkg_temp_thermal driver
4  * Copyright (c) 2013, Intel Corporation.
5  */
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7 
8 #include <linux/module.h>
9 #include <linux/init.h>
10 #include <linux/err.h>
11 #include <linux/param.h>
12 #include <linux/device.h>
13 #include <linux/platform_device.h>
14 #include <linux/cpu.h>
15 #include <linux/smp.h>
16 #include <linux/slab.h>
17 #include <linux/pm.h>
18 #include <linux/thermal.h>
19 #include <linux/debugfs.h>
20 #include <asm/cpu_device_id.h>
21 #include <asm/mce.h>
22 
23 /*
24 * Rate control delay: Idea is to introduce denounce effect
25 * This should be long enough to avoid reduce events, when
26 * threshold is set to a temperature, which is constantly
27 * violated, but at the short enough to take any action.
28 * The action can be remove threshold or change it to next
29 * interesting setting. Based on experiments, in around
30 * every 5 seconds under load will give us a significant
31 * temperature change.
32 */
#define PKG_TEMP_THERMAL_NOTIFY_DELAY	5000
/* User-tunable debounce delay (milliseconds) between threshold events */
static int notify_delay_ms = PKG_TEMP_THERMAL_NOTIFY_DELAY;
module_param(notify_delay_ms, int, 0644);
MODULE_PARM_DESC(notify_delay_ms,
	"User space notification delay in milli seconds.");
38 
/* Number of trip points in the thermal zone. Currently it can't
* be more than 2: the MSR can only set and deliver notifications
* for 2 thresholds. This define enforces that, in case cpuid
* returns a wrong value for the number of thresholds.
*/
44 #define MAX_NUMBER_OF_TRIPS	2
45 
struct pkg_device {
	int				cpu;		/* CPU used for MSR access on this package */
	bool				work_scheduled;	/* delayed work currently pending? */
	u32				tj_max;		/* TjMax in millidegrees C (see get_tj_max()) */
	u32				msr_pkg_therm_low;	/* saved THERM_INTERRUPT MSR, low word */
	u32				msr_pkg_therm_high;	/* saved THERM_INTERRUPT MSR, high word */
	struct delayed_work		work;		/* debounced threshold notification work */
	struct thermal_zone_device	*tzone;
	struct cpumask			cpumask;	/* online CPUs belonging to this package */
};
56 
/* Zone parameters: no_hwmon suppresses the parallel hwmon sysfs interface */
static struct thermal_zone_params pkg_temp_tz_params = {
	.no_hwmon	= true,
};
60 
/* Keep track of how many package pointers we allocated in init() */
static int max_packages __read_mostly;
/* Array of package pointers, indexed by logical package id */
static struct pkg_device **packages;
/* Serializes interrupt notification, work and hotplug */
static DEFINE_SPINLOCK(pkg_temp_lock);
/* Protects zone operation in the work function against hotplug removal */
static DEFINE_MUTEX(thermal_zone_mutex);

/* The dynamically assigned cpu hotplug state for module_exit() */
static enum cpuhp_state pkg_thermal_hp_state __read_mostly;

/* Debug counters to show using debugfs */
static struct dentry *debugfs;
static unsigned int pkg_interrupt_cnt;
static unsigned int pkg_work_cnt;
77 
78 static int pkg_temp_debugfs_init(void)
79 {
80 	struct dentry *d;
81 
82 	debugfs = debugfs_create_dir("pkg_temp_thermal", NULL);
83 	if (!debugfs)
84 		return -ENOENT;
85 
86 	d = debugfs_create_u32("pkg_thres_interrupt", S_IRUGO, debugfs,
87 			       &pkg_interrupt_cnt);
88 	if (!d)
89 		goto err_out;
90 
91 	d = debugfs_create_u32("pkg_thres_work", S_IRUGO, debugfs,
92 			       &pkg_work_cnt);
93 	if (!d)
94 		goto err_out;
95 
96 	return 0;
97 
98 err_out:
99 	debugfs_remove_recursive(debugfs);
100 	return -ENOENT;
101 }
102 
103 /*
104  * Protection:
105  *
106  * - cpu hotplug: Read serialized by cpu hotplug lock
107  *		  Write must hold pkg_temp_lock
108  *
109  * - Other callsites: Must hold pkg_temp_lock
110  */
111 static struct pkg_device *pkg_temp_thermal_get_dev(unsigned int cpu)
112 {
113 	int pkgid = topology_logical_package_id(cpu);
114 
115 	if (pkgid >= 0 && pkgid < max_packages)
116 		return packages[pkgid];
117 	return NULL;
118 }
119 
120 /*
121 * tj-max is is interesting because threshold is set relative to this
122 * temperature.
123 */
124 static int get_tj_max(int cpu, u32 *tj_max)
125 {
126 	u32 eax, edx, val;
127 	int err;
128 
129 	err = rdmsr_safe_on_cpu(cpu, MSR_IA32_TEMPERATURE_TARGET, &eax, &edx);
130 	if (err)
131 		return err;
132 
133 	val = (eax >> 16) & 0xff;
134 	*tj_max = val * 1000;
135 
136 	return val ? 0 : -EINVAL;
137 }
138 
139 static int sys_get_curr_temp(struct thermal_zone_device *tzd, int *temp)
140 {
141 	struct pkg_device *pkgdev = tzd->devdata;
142 	u32 eax, edx;
143 
144 	rdmsr_on_cpu(pkgdev->cpu, MSR_IA32_PACKAGE_THERM_STATUS, &eax, &edx);
145 	if (eax & 0x80000000) {
146 		*temp = pkgdev->tj_max - ((eax >> 16) & 0x7f) * 1000;
147 		pr_debug("sys_get_curr_temp %d\n", *temp);
148 		return 0;
149 	}
150 	return -EINVAL;
151 }
152 
153 static int sys_get_trip_temp(struct thermal_zone_device *tzd,
154 			     int trip, int *temp)
155 {
156 	struct pkg_device *pkgdev = tzd->devdata;
157 	unsigned long thres_reg_value;
158 	u32 mask, shift, eax, edx;
159 	int ret;
160 
161 	if (trip >= MAX_NUMBER_OF_TRIPS)
162 		return -EINVAL;
163 
164 	if (trip) {
165 		mask = THERM_MASK_THRESHOLD1;
166 		shift = THERM_SHIFT_THRESHOLD1;
167 	} else {
168 		mask = THERM_MASK_THRESHOLD0;
169 		shift = THERM_SHIFT_THRESHOLD0;
170 	}
171 
172 	ret = rdmsr_on_cpu(pkgdev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
173 			   &eax, &edx);
174 	if (ret < 0)
175 		return ret;
176 
177 	thres_reg_value = (eax & mask) >> shift;
178 	if (thres_reg_value)
179 		*temp = pkgdev->tj_max - thres_reg_value * 1000;
180 	else
181 		*temp = 0;
182 	pr_debug("sys_get_trip_temp %d\n", *temp);
183 
184 	return 0;
185 }
186 
187 static int
188 sys_set_trip_temp(struct thermal_zone_device *tzd, int trip, int temp)
189 {
190 	struct pkg_device *pkgdev = tzd->devdata;
191 	u32 l, h, mask, shift, intr;
192 	int ret;
193 
194 	if (trip >= MAX_NUMBER_OF_TRIPS || temp >= pkgdev->tj_max)
195 		return -EINVAL;
196 
197 	ret = rdmsr_on_cpu(pkgdev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
198 			   &l, &h);
199 	if (ret < 0)
200 		return ret;
201 
202 	if (trip) {
203 		mask = THERM_MASK_THRESHOLD1;
204 		shift = THERM_SHIFT_THRESHOLD1;
205 		intr = THERM_INT_THRESHOLD1_ENABLE;
206 	} else {
207 		mask = THERM_MASK_THRESHOLD0;
208 		shift = THERM_SHIFT_THRESHOLD0;
209 		intr = THERM_INT_THRESHOLD0_ENABLE;
210 	}
211 	l &= ~mask;
212 	/*
213 	* When users space sets a trip temperature == 0, which is indication
214 	* that, it is no longer interested in receiving notifications.
215 	*/
216 	if (!temp) {
217 		l &= ~intr;
218 	} else {
219 		l |= (pkgdev->tj_max - temp)/1000 << shift;
220 		l |= intr;
221 	}
222 
223 	return wrmsr_on_cpu(pkgdev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
224 }
225 
/* Thermal zone get_trip_type callback: both thresholds are passive trips */
static int sys_get_trip_type(struct thermal_zone_device *thermal, int trip,
			     enum thermal_trip_type *type)
{
	*type = THERMAL_TRIP_PASSIVE;
	return 0;
}
232 
/* Thermal zone callback registry */
static struct thermal_zone_device_ops tzone_ops = {
	.get_temp = sys_get_curr_temp,		/* current package temperature */
	.get_trip_temp = sys_get_trip_temp,	/* read threshold from MSR */
	.get_trip_type = sys_get_trip_type,	/* always passive */
	.set_trip_temp = sys_set_trip_temp,	/* program threshold into MSR */
};
240 
/*
 * Tell the thermal core that this driver performs its own notification
 * rate control (the notify_delay_ms debounce), so no extra throttling
 * is required.
 */
static bool pkg_thermal_rate_control(void)
{
	return true;
}
245 
246 /* Enable threshold interrupt on local package/cpu */
247 static inline void enable_pkg_thres_interrupt(void)
248 {
249 	u8 thres_0, thres_1;
250 	u32 l, h;
251 
252 	rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
253 	/* only enable/disable if it had valid threshold value */
254 	thres_0 = (l & THERM_MASK_THRESHOLD0) >> THERM_SHIFT_THRESHOLD0;
255 	thres_1 = (l & THERM_MASK_THRESHOLD1) >> THERM_SHIFT_THRESHOLD1;
256 	if (thres_0)
257 		l |= THERM_INT_THRESHOLD0_ENABLE;
258 	if (thres_1)
259 		l |= THERM_INT_THRESHOLD1_ENABLE;
260 	wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
261 }
262 
263 /* Disable threshold interrupt on local package/cpu */
264 static inline void disable_pkg_thres_interrupt(void)
265 {
266 	u32 l, h;
267 
268 	rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
269 
270 	l &= ~(THERM_INT_THRESHOLD0_ENABLE | THERM_INT_THRESHOLD1_ENABLE);
271 	wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
272 }
273 
/*
 * Debounced handler for a package threshold interrupt: clears the
 * threshold log bits, re-enables the threshold interrupt (disabled by
 * pkg_thermal_notify()) and pokes the thermal core so user space gets
 * notified. Runs on a CPU of the affected package.
 */
static void pkg_temp_thermal_threshold_work_fn(struct work_struct *work)
{
	struct thermal_zone_device *tzone = NULL;
	int cpu = smp_processor_id();
	struct pkg_device *pkgdev;
	u64 msr_val, wr_val;

	mutex_lock(&thermal_zone_mutex);
	spin_lock_irq(&pkg_temp_lock);
	++pkg_work_cnt;

	pkgdev = pkg_temp_thermal_get_dev(cpu);
	if (!pkgdev) {
		/* The package went away under us (hotplug) - nothing to do */
		spin_unlock_irq(&pkg_temp_lock);
		mutex_unlock(&thermal_zone_mutex);
		return;
	}
	pkgdev->work_scheduled = false;

	/* Clear the threshold log bits so the next violation triggers again */
	rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
	wr_val = msr_val & ~(THERM_LOG_THRESHOLD0 | THERM_LOG_THRESHOLD1);
	if (wr_val != msr_val) {
		wrmsrl(MSR_IA32_PACKAGE_THERM_STATUS, wr_val);
		tzone = pkgdev->tzone;
	}

	enable_pkg_thres_interrupt();
	spin_unlock_irq(&pkg_temp_lock);

	/*
	 * If tzone is not NULL, then thermal_zone_mutex will prevent the
	 * concurrent removal in the cpu offline callback.
	 */
	if (tzone)
		thermal_zone_device_update(tzone, THERMAL_EVENT_UNSPECIFIED);

	mutex_unlock(&thermal_zone_mutex);
}
312 
313 static void pkg_thermal_schedule_work(int cpu, struct delayed_work *work)
314 {
315 	unsigned long ms = msecs_to_jiffies(notify_delay_ms);
316 
317 	schedule_delayed_work_on(cpu, work, ms);
318 }
319 
/*
 * Package thermal interrupt callback (installed as
 * platform_thermal_package_notify). Masks further threshold interrupts
 * and defers the real handling to the delayed work function, which also
 * re-enables the interrupt. Runs in interrupt context.
 */
static int pkg_thermal_notify(u64 msr_val)
{
	int cpu = smp_processor_id();
	struct pkg_device *pkgdev;
	unsigned long flags;

	spin_lock_irqsave(&pkg_temp_lock, flags);
	++pkg_interrupt_cnt;

	disable_pkg_thres_interrupt();

	/* Work is per package, so scheduling it once is enough. */
	pkgdev = pkg_temp_thermal_get_dev(cpu);
	if (pkgdev && !pkgdev->work_scheduled) {
		pkgdev->work_scheduled = true;
		pkg_thermal_schedule_work(pkgdev->cpu, &pkgdev->work);
	}

	spin_unlock_irqrestore(&pkg_temp_lock, flags);
	return 0;
}
341 
342 static int pkg_temp_thermal_device_add(unsigned int cpu)
343 {
344 	int pkgid = topology_logical_package_id(cpu);
345 	u32 tj_max, eax, ebx, ecx, edx;
346 	struct pkg_device *pkgdev;
347 	int thres_count, err;
348 
349 	if (pkgid >= max_packages)
350 		return -ENOMEM;
351 
352 	cpuid(6, &eax, &ebx, &ecx, &edx);
353 	thres_count = ebx & 0x07;
354 	if (!thres_count)
355 		return -ENODEV;
356 
357 	thres_count = clamp_val(thres_count, 0, MAX_NUMBER_OF_TRIPS);
358 
359 	err = get_tj_max(cpu, &tj_max);
360 	if (err)
361 		return err;
362 
363 	pkgdev = kzalloc(sizeof(*pkgdev), GFP_KERNEL);
364 	if (!pkgdev)
365 		return -ENOMEM;
366 
367 	INIT_DELAYED_WORK(&pkgdev->work, pkg_temp_thermal_threshold_work_fn);
368 	pkgdev->cpu = cpu;
369 	pkgdev->tj_max = tj_max;
370 	pkgdev->tzone = thermal_zone_device_register("x86_pkg_temp",
371 			thres_count,
372 			(thres_count == MAX_NUMBER_OF_TRIPS) ? 0x03 : 0x01,
373 			pkgdev, &tzone_ops, &pkg_temp_tz_params, 0, 0);
374 	if (IS_ERR(pkgdev->tzone)) {
375 		err = PTR_ERR(pkgdev->tzone);
376 		kfree(pkgdev);
377 		return err;
378 	}
379 	/* Store MSR value for package thermal interrupt, to restore at exit */
380 	rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, pkgdev->msr_pkg_therm_low,
381 	      pkgdev->msr_pkg_therm_high);
382 
383 	cpumask_set_cpu(cpu, &pkgdev->cpumask);
384 	spin_lock_irq(&pkg_temp_lock);
385 	packages[pkgid] = pkgdev;
386 	spin_unlock_irq(&pkg_temp_lock);
387 	return 0;
388 }
389 
/*
 * CPU hotplug offline callback. Moves the MSR/work target to another
 * online CPU of the same package; when the last CPU of the package goes
 * offline, unregisters the thermal zone, restores the saved interrupt
 * MSR and frees the package state.
 */
static int pkg_thermal_cpu_offline(unsigned int cpu)
{
	struct pkg_device *pkgdev = pkg_temp_thermal_get_dev(cpu);
	bool lastcpu, was_target;
	int target;

	if (!pkgdev)
		return 0;

	/* Pick any other CPU in the package as the new MSR target */
	target = cpumask_any_but(&pkgdev->cpumask, cpu);
	cpumask_clear_cpu(cpu, &pkgdev->cpumask);
	lastcpu = target >= nr_cpu_ids;
	/*
	 * Remove the sysfs files, if this is the last cpu in the package
	 * before doing further cleanups.
	 */
	if (lastcpu) {
		struct thermal_zone_device *tzone = pkgdev->tzone;

		/*
		 * We must protect against a work function calling
		 * thermal_zone_update, after/while unregister. We null out
		 * the pointer under the zone mutex, so the worker function
		 * won't try to call.
		 */
		mutex_lock(&thermal_zone_mutex);
		pkgdev->tzone = NULL;
		mutex_unlock(&thermal_zone_mutex);

		thermal_zone_device_unregister(tzone);
	}

	/* Protect against work and interrupts */
	spin_lock_irq(&pkg_temp_lock);

	/*
	 * Check whether this cpu was the current target and store the new
	 * one. When we drop the lock, then the interrupt notify function
	 * will see the new target.
	 */
	was_target = pkgdev->cpu == cpu;
	pkgdev->cpu = target;

	/*
	 * If this is the last CPU in the package remove the package
	 * reference from the array and restore the interrupt MSR. When we
	 * drop the lock neither the interrupt notify function nor the
	 * worker will see the package anymore.
	 */
	if (lastcpu) {
		packages[topology_logical_package_id(cpu)] = NULL;
		/* After this point nothing touches the MSR anymore. */
		wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
		      pkgdev->msr_pkg_therm_low, pkgdev->msr_pkg_therm_high);
	}

	/*
	 * Check whether there is work scheduled and whether the work is
	 * targeted at the outgoing CPU.
	 */
	if (pkgdev->work_scheduled && was_target) {
		/*
		 * To cancel the work we need to drop the lock, otherwise
		 * we might deadlock if the work needs to be flushed.
		 */
		spin_unlock_irq(&pkg_temp_lock);
		cancel_delayed_work_sync(&pkgdev->work);
		spin_lock_irq(&pkg_temp_lock);
		/*
		 * If this is not the last cpu in the package and the work
		 * did not run after we dropped the lock above, then we
		 * need to reschedule the work, otherwise the interrupt
		 * stays disabled forever.
		 */
		if (!lastcpu && pkgdev->work_scheduled)
			pkg_thermal_schedule_work(target, &pkgdev->work);
	}

	spin_unlock_irq(&pkg_temp_lock);

	/* Final cleanup if this is the last cpu */
	if (lastcpu)
		kfree(pkgdev);
	return 0;
}
475 
476 static int pkg_thermal_cpu_online(unsigned int cpu)
477 {
478 	struct pkg_device *pkgdev = pkg_temp_thermal_get_dev(cpu);
479 	struct cpuinfo_x86 *c = &cpu_data(cpu);
480 
481 	/* Paranoia check */
482 	if (!cpu_has(c, X86_FEATURE_DTHERM) || !cpu_has(c, X86_FEATURE_PTS))
483 		return -ENODEV;
484 
485 	/* If the package exists, nothing to do */
486 	if (pkgdev) {
487 		cpumask_set_cpu(cpu, &pkgdev->cpumask);
488 		return 0;
489 	}
490 	return pkg_temp_thermal_device_add(cpu);
491 }
492 
/* Match any Intel CPU advertising Package Thermal Status (PTS) */
static const struct x86_cpu_id __initconst pkg_temp_thermal_ids[] = {
	{ X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, X86_FEATURE_PTS },
	{}
};
MODULE_DEVICE_TABLE(x86cpu, pkg_temp_thermal_ids);
498 
/*
 * Module init: allocate the per-package pointer array, register the CPU
 * hotplug callbacks (which create the thermal zones) and then install
 * the package thermal interrupt handlers.
 */
static int __init pkg_temp_thermal_init(void)
{
	int ret;

	if (!x86_match_cpu(pkg_temp_thermal_ids))
		return -ENODEV;

	max_packages = topology_max_packages();
	packages = kcalloc(max_packages, sizeof(struct pkg_device *),
			   GFP_KERNEL);
	if (!packages)
		return -ENOMEM;

	/* Online callback runs for every already-online CPU as well */
	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "thermal/x86_pkg:online",
				pkg_thermal_cpu_online,	pkg_thermal_cpu_offline);
	if (ret < 0)
		goto err;

	/* Store the state for module exit */
	pkg_thermal_hp_state = ret;

	platform_thermal_package_notify = pkg_thermal_notify;
	platform_thermal_package_rate_control = pkg_thermal_rate_control;

	 /* Don't care if it fails */
	pkg_temp_debugfs_init();
	return 0;

err:
	kfree(packages);
	return ret;
}
module_init(pkg_temp_thermal_init)
532 
/*
 * Module exit: detach the interrupt callbacks first so no new work gets
 * scheduled, then unregister the hotplug callbacks and free the
 * remaining state.
 */
static void __exit pkg_temp_thermal_exit(void)
{
	platform_thermal_package_notify = NULL;
	platform_thermal_package_rate_control = NULL;

	cpuhp_remove_state(pkg_thermal_hp_state);
	debugfs_remove_recursive(debugfs);
	kfree(packages);
}
module_exit(pkg_temp_thermal_exit)
543 
544 MODULE_DESCRIPTION("X86 PKG TEMP Thermal Driver");
545 MODULE_AUTHOR("Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>");
546 MODULE_LICENSE("GPL v2");
547