1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * x86_pkg_temp_thermal driver
4  * Copyright (c) 2013, Intel Corporation.
5  */
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7 
8 #include <linux/module.h>
9 #include <linux/init.h>
10 #include <linux/err.h>
11 #include <linux/param.h>
12 #include <linux/device.h>
13 #include <linux/platform_device.h>
14 #include <linux/cpu.h>
15 #include <linux/smp.h>
16 #include <linux/slab.h>
17 #include <linux/pm.h>
18 #include <linux/thermal.h>
19 #include <linux/debugfs.h>
20 
21 #include <asm/cpu_device_id.h>
22 
23 #include "thermal_interrupt.h"
24 
25 /*
26 * Rate control delay: Idea is to introduce denounce effect
27 * This should be long enough to avoid reduce events, when
28 * threshold is set to a temperature, which is constantly
29 * violated, but at the short enough to take any action.
30 * The action can be remove threshold or change it to next
31 * interesting setting. Based on experiments, in around
32 * every 5 seconds under load will give us a significant
33 * temperature change.
34 */
35 #define PKG_TEMP_THERMAL_NOTIFY_DELAY	5000
36 static int notify_delay_ms = PKG_TEMP_THERMAL_NOTIFY_DELAY;
37 module_param(notify_delay_ms, int, 0644);
38 MODULE_PARM_DESC(notify_delay_ms,
39 	"User space notification delay in milli seconds.");
40 
/* Number of trip points in the thermal zone. Currently it can't
 * be more than 2: the MSR only allows setting and getting
 * notifications for 2 thresholds. This define enforces that limit,
 * in case cpuid returns a wrong value for the number of thresholds.
 */
46 #define MAX_NUMBER_OF_TRIPS	2
47 
48 struct zone_device {
49 	int				cpu;
50 	bool				work_scheduled;
51 	u32				tj_max;
52 	u32				msr_pkg_therm_low;
53 	u32				msr_pkg_therm_high;
54 	struct delayed_work		work;
55 	struct thermal_zone_device	*tzone;
56 	struct thermal_trip		*trips;
57 	struct cpumask			cpumask;
58 };
59 
60 static struct thermal_zone_params pkg_temp_tz_params = {
61 	.no_hwmon	= true,
62 };
63 
64 /* Keep track of how many zone pointers we allocated in init() */
65 static int max_id __read_mostly;
66 /* Array of zone pointers */
67 static struct zone_device **zones;
68 /* Serializes interrupt notification, work and hotplug */
69 static DEFINE_RAW_SPINLOCK(pkg_temp_lock);
70 /* Protects zone operation in the work function against hotplug removal */
71 static DEFINE_MUTEX(thermal_zone_mutex);
72 
73 /* The dynamically assigned cpu hotplug state for module_exit() */
74 static enum cpuhp_state pkg_thermal_hp_state __read_mostly;
75 
76 /* Debug counters to show using debugfs */
77 static struct dentry *debugfs;
78 static unsigned int pkg_interrupt_cnt;
79 static unsigned int pkg_work_cnt;
80 
81 static void pkg_temp_debugfs_init(void)
82 {
83 	debugfs = debugfs_create_dir("pkg_temp_thermal", NULL);
84 
85 	debugfs_create_u32("pkg_thres_interrupt", S_IRUGO, debugfs,
86 			   &pkg_interrupt_cnt);
87 	debugfs_create_u32("pkg_thres_work", S_IRUGO, debugfs,
88 			   &pkg_work_cnt);
89 }
90 
91 /*
92  * Protection:
93  *
94  * - cpu hotplug: Read serialized by cpu hotplug lock
95  *		  Write must hold pkg_temp_lock
96  *
97  * - Other callsites: Must hold pkg_temp_lock
98  */
99 static struct zone_device *pkg_temp_thermal_get_dev(unsigned int cpu)
100 {
101 	int id = topology_logical_die_id(cpu);
102 
103 	if (id >= 0 && id < max_id)
104 		return zones[id];
105 	return NULL;
106 }
107 
108 /*
109 * tj-max is interesting because threshold is set relative to this
110 * temperature.
111 */
112 static int get_tj_max(int cpu, u32 *tj_max)
113 {
114 	u32 eax, edx, val;
115 	int err;
116 
117 	err = rdmsr_safe_on_cpu(cpu, MSR_IA32_TEMPERATURE_TARGET, &eax, &edx);
118 	if (err)
119 		return err;
120 
121 	val = (eax >> 16) & 0xff;
122 	*tj_max = val * 1000;
123 
124 	return val ? 0 : -EINVAL;
125 }
126 
127 static int sys_get_curr_temp(struct thermal_zone_device *tzd, int *temp)
128 {
129 	struct zone_device *zonedev = tzd->devdata;
130 	u32 eax, edx;
131 
132 	rdmsr_on_cpu(zonedev->cpu, MSR_IA32_PACKAGE_THERM_STATUS,
133 			&eax, &edx);
134 	if (eax & 0x80000000) {
135 		*temp = zonedev->tj_max - ((eax >> 16) & 0x7f) * 1000;
136 		pr_debug("sys_get_curr_temp %d\n", *temp);
137 		return 0;
138 	}
139 	return -EINVAL;
140 }
141 
142 static int
143 sys_set_trip_temp(struct thermal_zone_device *tzd, int trip, int temp)
144 {
145 	struct zone_device *zonedev = tzd->devdata;
146 	u32 l, h, mask, shift, intr;
147 	int ret;
148 
149 	if (trip >= MAX_NUMBER_OF_TRIPS || temp >= zonedev->tj_max)
150 		return -EINVAL;
151 
152 	ret = rdmsr_on_cpu(zonedev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
153 			   &l, &h);
154 	if (ret < 0)
155 		return ret;
156 
157 	if (trip) {
158 		mask = THERM_MASK_THRESHOLD1;
159 		shift = THERM_SHIFT_THRESHOLD1;
160 		intr = THERM_INT_THRESHOLD1_ENABLE;
161 	} else {
162 		mask = THERM_MASK_THRESHOLD0;
163 		shift = THERM_SHIFT_THRESHOLD0;
164 		intr = THERM_INT_THRESHOLD0_ENABLE;
165 	}
166 	l &= ~mask;
167 	/*
168 	* When users space sets a trip temperature == 0, which is indication
169 	* that, it is no longer interested in receiving notifications.
170 	*/
171 	if (!temp) {
172 		l &= ~intr;
173 	} else {
174 		l |= (zonedev->tj_max - temp)/1000 << shift;
175 		l |= intr;
176 	}
177 
178 	return wrmsr_on_cpu(zonedev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
179 			l, h);
180 }
181 
/* Thermal zone callback registry */
static struct thermal_zone_device_ops tzone_ops = {
	.get_temp = sys_get_curr_temp,		/* read current package temperature */
	.set_trip_temp = sys_set_trip_temp,	/* program a threshold interrupt */
};
187 
/*
 * Tell the thermal interrupt core that this driver does its own rate
 * limiting of threshold notifications (via the delayed work, which is
 * scheduled notify_delay_ms after an interrupt).
 */
static bool pkg_thermal_rate_control(void)
{
	return true;
}
192 
193 /* Enable threshold interrupt on local package/cpu */
194 static inline void enable_pkg_thres_interrupt(void)
195 {
196 	u8 thres_0, thres_1;
197 	u32 l, h;
198 
199 	rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
200 	/* only enable/disable if it had valid threshold value */
201 	thres_0 = (l & THERM_MASK_THRESHOLD0) >> THERM_SHIFT_THRESHOLD0;
202 	thres_1 = (l & THERM_MASK_THRESHOLD1) >> THERM_SHIFT_THRESHOLD1;
203 	if (thres_0)
204 		l |= THERM_INT_THRESHOLD0_ENABLE;
205 	if (thres_1)
206 		l |= THERM_INT_THRESHOLD1_ENABLE;
207 	wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
208 }
209 
210 /* Disable threshold interrupt on local package/cpu */
211 static inline void disable_pkg_thres_interrupt(void)
212 {
213 	u32 l, h;
214 
215 	rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
216 
217 	l &= ~(THERM_INT_THRESHOLD0_ENABLE | THERM_INT_THRESHOLD1_ENABLE);
218 	wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
219 }
220 
/*
 * Deferred work, scheduled notify_delay_ms after a threshold interrupt.
 * Acks the threshold log bits, re-enables the threshold interrupt and
 * notifies the thermal core so user space can react.
 */
static void pkg_temp_thermal_threshold_work_fn(struct work_struct *work)
{
	struct thermal_zone_device *tzone = NULL;
	int cpu = smp_processor_id();
	struct zone_device *zonedev;

	/*
	 * thermal_zone_mutex keeps zonedev->tzone stable against the cpu
	 * offline callback; pkg_temp_lock serializes against the interrupt
	 * notifier and hotplug.
	 */
	mutex_lock(&thermal_zone_mutex);
	raw_spin_lock_irq(&pkg_temp_lock);
	++pkg_work_cnt;

	zonedev = pkg_temp_thermal_get_dev(cpu);
	if (!zonedev) {
		/* Zone was removed by hotplug while the work was queued */
		raw_spin_unlock_irq(&pkg_temp_lock);
		mutex_unlock(&thermal_zone_mutex);
		return;
	}
	zonedev->work_scheduled = false;

	/* Clear the threshold log bits so new events can be latched */
	thermal_clear_package_intr_status(PACKAGE_LEVEL, THERM_LOG_THRESHOLD0 | THERM_LOG_THRESHOLD1);
	tzone = zonedev->tzone;

	enable_pkg_thres_interrupt();
	raw_spin_unlock_irq(&pkg_temp_lock);

	/*
	 * If tzone is not NULL, then thermal_zone_mutex will prevent the
	 * concurrent removal in the cpu offline callback.
	 */
	if (tzone)
		thermal_zone_device_update(tzone, THERMAL_EVENT_UNSPECIFIED);

	mutex_unlock(&thermal_zone_mutex);
}
254 
255 static void pkg_thermal_schedule_work(int cpu, struct delayed_work *work)
256 {
257 	unsigned long ms = msecs_to_jiffies(notify_delay_ms);
258 
259 	schedule_delayed_work_on(cpu, work, ms);
260 }
261 
/*
 * Package thermal interrupt notifier, hooked up via
 * platform_thermal_package_notify in module init. Masks further
 * threshold interrupts and schedules the delayed work (at most once
 * per package) which does the actual handling after the debounce
 * delay.
 */
static int pkg_thermal_notify(u64 msr_val)
{
	int cpu = smp_processor_id();
	struct zone_device *zonedev;
	unsigned long flags;

	raw_spin_lock_irqsave(&pkg_temp_lock, flags);
	++pkg_interrupt_cnt;

	/* Keep the interrupt masked until the work function has run */
	disable_pkg_thres_interrupt();

	/* Work is per package, so scheduling it once is enough. */
	zonedev = pkg_temp_thermal_get_dev(cpu);
	if (zonedev && !zonedev->work_scheduled) {
		zonedev->work_scheduled = true;
		pkg_thermal_schedule_work(zonedev->cpu, &zonedev->work);
	}

	raw_spin_unlock_irqrestore(&pkg_temp_lock, flags);
	return 0;
}
283 
284 static struct thermal_trip *pkg_temp_thermal_trips_init(int cpu, int tj_max, int num_trips)
285 {
286 	struct thermal_trip *trips;
287 	unsigned long thres_reg_value;
288 	u32 mask, shift, eax, edx;
289 	int ret, i;
290 
291 	trips = kzalloc(sizeof(*trips) * num_trips, GFP_KERNEL);
292 	if (!trips)
293 		return ERR_PTR(-ENOMEM);
294 
295 	for (i = 0; i < num_trips; i++) {
296 
297 		if (i) {
298 			mask = THERM_MASK_THRESHOLD1;
299 			shift = THERM_SHIFT_THRESHOLD1;
300 		} else {
301 			mask = THERM_MASK_THRESHOLD0;
302 			shift = THERM_SHIFT_THRESHOLD0;
303 		}
304 
305 		ret = rdmsr_on_cpu(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
306 				   &eax, &edx);
307 		if (ret < 0) {
308 			kfree(trips);
309 			return ERR_PTR(ret);
310 		}
311 
312 		thres_reg_value = (eax & mask) >> shift;
313 
314 		trips[i].temperature = thres_reg_value ?
315 			tj_max - thres_reg_value * 1000 : THERMAL_TEMP_INVALID;
316 
317 		trips[i].type = THERMAL_TRIP_PASSIVE;
318 
319 		pr_debug("%s: cpu=%d, trip=%d, temp=%d\n",
320 			 __func__, cpu, i, trips[i].temperature);
321 	}
322 
323 	return trips;
324 }
325 
/*
 * Create and register the thermal zone for the package/die containing
 * @cpu. Called from the online callback for the first CPU of a
 * package that comes up. Returns 0 on success or a negative errno.
 */
static int pkg_temp_thermal_device_add(unsigned int cpu)
{
	int id = topology_logical_die_id(cpu);
	u32 tj_max, eax, ebx, ecx, edx;
	struct zone_device *zonedev;
	int thres_count, err;

	/* Paranoia: id must fit in the zones[] array sized in init() */
	if (id >= max_id)
		return -ENOMEM;

	/* CPUID leaf 6, EBX bits 2:0: number of thermal thresholds */
	cpuid(6, &eax, &ebx, &ecx, &edx);
	thres_count = ebx & 0x07;
	if (!thres_count)
		return -ENODEV;

	/* MSR supports at most MAX_NUMBER_OF_TRIPS (2) thresholds */
	thres_count = clamp_val(thres_count, 0, MAX_NUMBER_OF_TRIPS);

	err = get_tj_max(cpu, &tj_max);
	if (err)
		return err;

	zonedev = kzalloc(sizeof(*zonedev), GFP_KERNEL);
	if (!zonedev)
		return -ENOMEM;

	zonedev->trips = pkg_temp_thermal_trips_init(cpu, tj_max, thres_count);
	if (IS_ERR(zonedev->trips)) {
		err = PTR_ERR(zonedev->trips);
		goto out_kfree_zonedev;
	}

	INIT_DELAYED_WORK(&zonedev->work, pkg_temp_thermal_threshold_work_fn);
	zonedev->cpu = cpu;
	zonedev->tj_max = tj_max;
	/* 0x03 / 0x01: one writable-trip mask bit per supported threshold */
	zonedev->tzone = thermal_zone_device_register_with_trips("x86_pkg_temp",
			zonedev->trips, thres_count,
			(thres_count == MAX_NUMBER_OF_TRIPS) ? 0x03 : 0x01,
			zonedev, &tzone_ops, &pkg_temp_tz_params, 0, 0);
	if (IS_ERR(zonedev->tzone)) {
		err = PTR_ERR(zonedev->tzone);
		goto out_kfree_trips;
	}
	err = thermal_zone_device_enable(zonedev->tzone);
	if (err)
		goto out_unregister_tz;

	/* Store MSR value for package thermal interrupt, to restore at exit */
	rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, zonedev->msr_pkg_therm_low,
	      zonedev->msr_pkg_therm_high);

	/* Publish the zone under the lock so notify/offline can see it */
	cpumask_set_cpu(cpu, &zonedev->cpumask);
	raw_spin_lock_irq(&pkg_temp_lock);
	zones[id] = zonedev;
	raw_spin_unlock_irq(&pkg_temp_lock);

	return 0;

out_unregister_tz:
	thermal_zone_device_unregister(zonedev->tzone);
out_kfree_trips:
	kfree(zonedev->trips);
out_kfree_zonedev:
	kfree(zonedev);
	return err;
}
391 
/*
 * CPU hotplug offline callback: detach @cpu from its package zone.
 * If it is the last CPU of the package, unregister the thermal zone,
 * restore the saved interrupt MSR and free the zone; otherwise migrate
 * interrupt handling and any pending work to another CPU of the
 * package.
 */
static int pkg_thermal_cpu_offline(unsigned int cpu)
{
	struct zone_device *zonedev = pkg_temp_thermal_get_dev(cpu);
	bool lastcpu, was_target;
	int target;

	if (!zonedev)
		return 0;

	/* Pick any remaining CPU of the package as the new target */
	target = cpumask_any_but(&zonedev->cpumask, cpu);
	cpumask_clear_cpu(cpu, &zonedev->cpumask);
	lastcpu = target >= nr_cpu_ids;
	/*
	 * Remove the sysfs files, if this is the last cpu in the package
	 * before doing further cleanups.
	 */
	if (lastcpu) {
		struct thermal_zone_device *tzone = zonedev->tzone;

		/*
		 * We must protect against a work function calling
		 * thermal_zone_update, after/while unregister. We null out
		 * the pointer under the zone mutex, so the worker function
		 * won't try to call.
		 */
		mutex_lock(&thermal_zone_mutex);
		zonedev->tzone = NULL;
		mutex_unlock(&thermal_zone_mutex);

		thermal_zone_device_unregister(tzone);
	}

	/* Protect against work and interrupts */
	raw_spin_lock_irq(&pkg_temp_lock);

	/*
	 * Check whether this cpu was the current target and store the new
	 * one. When we drop the lock, then the interrupt notify function
	 * will see the new target.
	 */
	was_target = zonedev->cpu == cpu;
	zonedev->cpu = target;

	/*
	 * If this is the last CPU in the package remove the package
	 * reference from the array and restore the interrupt MSR. When we
	 * drop the lock neither the interrupt notify function nor the
	 * worker will see the package anymore.
	 */
	if (lastcpu) {
		zones[topology_logical_die_id(cpu)] = NULL;
		/* After this point nothing touches the MSR anymore. */
		wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
		      zonedev->msr_pkg_therm_low, zonedev->msr_pkg_therm_high);
	}

	/*
	 * Check whether there is work scheduled and whether the work is
	 * targeted at the outgoing CPU.
	 */
	if (zonedev->work_scheduled && was_target) {
		/*
		 * To cancel the work we need to drop the lock, otherwise
		 * we might deadlock if the work needs to be flushed.
		 */
		raw_spin_unlock_irq(&pkg_temp_lock);
		cancel_delayed_work_sync(&zonedev->work);
		raw_spin_lock_irq(&pkg_temp_lock);
		/*
		 * If this is not the last cpu in the package and the work
		 * did not run after we dropped the lock above, then we
		 * need to reschedule the work, otherwise the interrupt
		 * stays disabled forever.
		 */
		if (!lastcpu && zonedev->work_scheduled)
			pkg_thermal_schedule_work(target, &zonedev->work);
	}

	raw_spin_unlock_irq(&pkg_temp_lock);

	/* Final cleanup if this is the last cpu */
	if (lastcpu) {
		kfree(zonedev->trips);
		kfree(zonedev);
	}
	return 0;
}
479 
480 static int pkg_thermal_cpu_online(unsigned int cpu)
481 {
482 	struct zone_device *zonedev = pkg_temp_thermal_get_dev(cpu);
483 	struct cpuinfo_x86 *c = &cpu_data(cpu);
484 
485 	/* Paranoia check */
486 	if (!cpu_has(c, X86_FEATURE_DTHERM) || !cpu_has(c, X86_FEATURE_PTS))
487 		return -ENODEV;
488 
489 	/* If the package exists, nothing to do */
490 	if (zonedev) {
491 		cpumask_set_cpu(cpu, &zonedev->cpumask);
492 		return 0;
493 	}
494 	return pkg_temp_thermal_device_add(cpu);
495 }
496 
/* Match any Intel CPU with the Package Thermal Status (PTS) feature */
static const struct x86_cpu_id __initconst pkg_temp_thermal_ids[] = {
	X86_MATCH_VENDOR_FEATURE(INTEL, X86_FEATURE_PTS, NULL),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, pkg_temp_thermal_ids);
502 
/*
 * Module init: allocate the zone pointer array, register the hotplug
 * callbacks (which create the zones for already-online CPUs) and hook
 * into the package thermal interrupt.
 */
static int __init pkg_temp_thermal_init(void)
{
	int ret;

	if (!x86_match_cpu(pkg_temp_thermal_ids))
		return -ENODEV;

	/* One potential zone per die across all packages */
	max_id = topology_max_packages() * topology_max_die_per_package();
	zones = kcalloc(max_id, sizeof(struct zone_device *),
			   GFP_KERNEL);
	if (!zones)
		return -ENOMEM;

	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "thermal/x86_pkg:online",
				pkg_thermal_cpu_online,	pkg_thermal_cpu_offline);
	if (ret < 0)
		goto err;

	/* Store the state for module exit */
	pkg_thermal_hp_state = ret;

	/* Hook into the package thermal interrupt handling */
	platform_thermal_package_notify = pkg_thermal_notify;
	platform_thermal_package_rate_control = pkg_thermal_rate_control;

	 /* Don't care if it fails */
	pkg_temp_debugfs_init();
	return 0;

err:
	kfree(zones);
	return ret;
}
module_init(pkg_temp_thermal_init)
536 
/*
 * Module exit: unhook the interrupt notifier first so no new work is
 * scheduled, then remove the hotplug state (which runs the offline
 * callback on every online CPU and tears down the zones) before
 * freeing the remaining resources.
 */
static void __exit pkg_temp_thermal_exit(void)
{
	platform_thermal_package_notify = NULL;
	platform_thermal_package_rate_control = NULL;

	cpuhp_remove_state(pkg_thermal_hp_state);
	debugfs_remove_recursive(debugfs);
	kfree(zones);
}
module_exit(pkg_temp_thermal_exit)
547 
548 MODULE_DESCRIPTION("X86 PKG TEMP Thermal Driver");
549 MODULE_AUTHOR("Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>");
550 MODULE_LICENSE("GPL v2");
551