// SPDX-License-Identifier: GPL-2.0-only
/*
 * x86_pkg_temp_thermal driver
 * Copyright (c) 2013, Intel Corporation.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/init.h>
#include <linux/err.h>
#include <linux/param.h>
#include <linux/device.h>
#include <linux/platform_device.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/pm.h>
#include <linux/thermal.h>
#include <linux/debugfs.h>

#include <asm/cpu_device_id.h>

#include "thermal_interrupt.h"

/*
 * Rate control delay: the idea is to introduce a debounce effect. The
 * delay should be long enough to avoid redundant events while a
 * threshold is set to a temperature that is constantly violated, yet
 * short enough that user space can still act in time. The action can be
 * removing the threshold or changing it to the next interesting setting.
 * Based on experiments, roughly every 5 seconds under load produces a
 * significant temperature change.
 */
#define PKG_TEMP_THERMAL_NOTIFY_DELAY	5000
static int notify_delay_ms = PKG_TEMP_THERMAL_NOTIFY_DELAY;
module_param(notify_delay_ms, int, 0644);
MODULE_PARM_DESC(notify_delay_ms,
	"User space notification delay in milliseconds.");

/*
 * Number of trip points in the thermal zone. Currently it can't be more
 * than 2: the MSR only allows setting and getting notifications for 2
 * thresholds. This define enforces that limit even if cpuid returns a
 * bogus number of thresholds.
 */
#define MAX_NUMBER_OF_TRIPS	2

struct zone_device {
	int				cpu;
	bool				work_scheduled;
	u32				tj_max;
	u32				msr_pkg_therm_low;
	u32				msr_pkg_therm_high;
	struct delayed_work		work;
	struct thermal_zone_device	*tzone;
	struct thermal_trip		*trips;
	struct cpumask			cpumask;
};

static struct thermal_zone_params pkg_temp_tz_params = {
	.no_hwmon	= true,
};

/* Keep track of how many zone pointers we allocated in init() */
static int max_id __read_mostly;
/* Array of zone pointers */
static struct zone_device **zones;
/* Serializes interrupt notification, work and hotplug */
static DEFINE_RAW_SPINLOCK(pkg_temp_lock);
/* Protects zone operation in the work function against hotplug removal */
static DEFINE_MUTEX(thermal_zone_mutex);

/* The dynamically assigned cpu hotplug state for module_exit() */
static enum cpuhp_state pkg_thermal_hp_state __read_mostly;

/* Debug counters to show using debugfs */
static struct dentry *debugfs;
static unsigned int pkg_interrupt_cnt;
static unsigned int pkg_work_cnt;

static void pkg_temp_debugfs_init(void)
{
	debugfs = debugfs_create_dir("pkg_temp_thermal", NULL);

	debugfs_create_u32("pkg_thres_interrupt", S_IRUGO, debugfs,
			   &pkg_interrupt_cnt);
	debugfs_create_u32("pkg_thres_work", S_IRUGO, debugfs,
			   &pkg_work_cnt);
}

/*
 * Protection:
 *
 * - cpu hotplug: Read serialized by cpu hotplug lock
 *		  Write must hold pkg_temp_lock
 *
 * - Other callsites: Must hold pkg_temp_lock
 */
static struct zone_device *pkg_temp_thermal_get_dev(unsigned int cpu)
{
	int id = topology_logical_die_id(cpu);

	if (id >= 0 && id < max_id)
		return zones[id];
	return NULL;
}
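/*
 * Worked example of the temperature encoding used by get_tj_max() and
 * sys_get_curr_temp() below (illustrative numbers, not taken from any
 * particular part): bits 23:16 of MSR_IA32_TEMPERATURE_TARGET hold TjMax
 * in degrees C, so a raw value of 0x64 (100) yields tj_max = 100 * 1000 =
 * 100000 millidegrees. The status and threshold registers store offsets
 * below TjMax; a digital readout of 20 with bit 31 (reading valid) set
 * thus decodes to 100000 - 20 * 1000 = 80000 millidegrees, i.e. 80 C.
 */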
/*
 * tj-max is interesting because thresholds are set relative to this
 * temperature.
 */
static int get_tj_max(int cpu, u32 *tj_max)
{
	u32 eax, edx, val;
	int err;

	err = rdmsr_safe_on_cpu(cpu, MSR_IA32_TEMPERATURE_TARGET, &eax, &edx);
	if (err)
		return err;

	val = (eax >> 16) & 0xff;
	*tj_max = val * 1000;

	return val ? 0 : -EINVAL;
}

static int sys_get_curr_temp(struct thermal_zone_device *tzd, int *temp)
{
	struct zone_device *zonedev = tzd->devdata;
	u32 eax, edx;

	rdmsr_on_cpu(zonedev->cpu, MSR_IA32_PACKAGE_THERM_STATUS,
		     &eax, &edx);
	if (eax & 0x80000000) {
		*temp = zonedev->tj_max - ((eax >> 16) & 0x7f) * 1000;
		pr_debug("sys_get_curr_temp %d\n", *temp);
		return 0;
	}
	return -EINVAL;
}

static int
sys_set_trip_temp(struct thermal_zone_device *tzd, int trip, int temp)
{
	struct zone_device *zonedev = tzd->devdata;
	u32 l, h, mask, shift, intr;
	int ret;

	if (trip >= MAX_NUMBER_OF_TRIPS || temp >= zonedev->tj_max)
		return -EINVAL;

	ret = rdmsr_on_cpu(zonedev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
			   &l, &h);
	if (ret < 0)
		return ret;

	if (trip) {
		mask = THERM_MASK_THRESHOLD1;
		shift = THERM_SHIFT_THRESHOLD1;
		intr = THERM_INT_THRESHOLD1_ENABLE;
	} else {
		mask = THERM_MASK_THRESHOLD0;
		shift = THERM_SHIFT_THRESHOLD0;
		intr = THERM_INT_THRESHOLD0_ENABLE;
	}
	l &= ~mask;
	/*
	 * When user space sets a trip temperature of 0, it indicates that
	 * it is no longer interested in receiving notifications.
	 */
	if (!temp) {
		l &= ~intr;
	} else {
		l |= (zonedev->tj_max - temp)/1000 << shift;
		l |= intr;
	}

	return wrmsr_on_cpu(zonedev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
			    l, h);
}

/* Thermal zone callback registry */
static struct thermal_zone_device_ops tzone_ops = {
	.get_temp = sys_get_curr_temp,
	.set_trip_temp = sys_set_trip_temp,
};

static bool pkg_thermal_rate_control(void)
{
	return true;
}

/* Enable threshold interrupt on local package/cpu */
static inline void enable_pkg_thres_interrupt(void)
{
	u8 thres_0, thres_1;
	u32 l, h;

	rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
	/* Only enable the interrupt if a valid threshold value is programmed */
	thres_0 = (l & THERM_MASK_THRESHOLD0) >> THERM_SHIFT_THRESHOLD0;
	thres_1 = (l & THERM_MASK_THRESHOLD1) >> THERM_SHIFT_THRESHOLD1;
	if (thres_0)
		l |= THERM_INT_THRESHOLD0_ENABLE;
	if (thres_1)
		l |= THERM_INT_THRESHOLD1_ENABLE;
	wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
}

/* Disable threshold interrupt on local package/cpu */
static inline void disable_pkg_thres_interrupt(void)
{
	u32 l, h;

	rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);

	l &= ~(THERM_INT_THRESHOLD0_ENABLE | THERM_INT_THRESHOLD1_ENABLE);
	wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
}
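/*
 * Sketch of how the pieces above and below fit together (a summary of
 * this file's flow, not additional mechanism): the package thermal
 * interrupt invokes pkg_thermal_notify(), which bumps the debug counter,
 * disables the threshold interrupts and schedules delayed work
 * notify_delay_ms later. The work function clears the threshold log
 * bits, re-enables the interrupts and calls thermal_zone_device_update()
 * to notify user space. Keeping the interrupts masked until the delayed
 * work runs implements the debounce described at the top of this file;
 * pkg_thermal_rate_control() returning true tells the interrupt core
 * that this driver does its own rate limiting.
 */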
static void pkg_temp_thermal_threshold_work_fn(struct work_struct *work)
{
	struct thermal_zone_device *tzone = NULL;
	int cpu = smp_processor_id();
	struct zone_device *zonedev;

	mutex_lock(&thermal_zone_mutex);
	raw_spin_lock_irq(&pkg_temp_lock);
	++pkg_work_cnt;

	zonedev = pkg_temp_thermal_get_dev(cpu);
	if (!zonedev) {
		raw_spin_unlock_irq(&pkg_temp_lock);
		mutex_unlock(&thermal_zone_mutex);
		return;
	}
	zonedev->work_scheduled = false;

	thermal_clear_package_intr_status(PACKAGE_LEVEL, THERM_LOG_THRESHOLD0 | THERM_LOG_THRESHOLD1);
	tzone = zonedev->tzone;

	enable_pkg_thres_interrupt();
	raw_spin_unlock_irq(&pkg_temp_lock);

	/*
	 * If tzone is not NULL, then thermal_zone_mutex will prevent the
	 * concurrent removal in the cpu offline callback.
	 */
	if (tzone)
		thermal_zone_device_update(tzone, THERMAL_EVENT_UNSPECIFIED);

	mutex_unlock(&thermal_zone_mutex);
}

static void pkg_thermal_schedule_work(int cpu, struct delayed_work *work)
{
	unsigned long ms = msecs_to_jiffies(notify_delay_ms);

	schedule_delayed_work_on(cpu, work, ms);
}

static int pkg_thermal_notify(u64 msr_val)
{
	int cpu = smp_processor_id();
	struct zone_device *zonedev;
	unsigned long flags;

	raw_spin_lock_irqsave(&pkg_temp_lock, flags);
	++pkg_interrupt_cnt;

	disable_pkg_thres_interrupt();

	/* Work is per package, so scheduling it once is enough. */
	zonedev = pkg_temp_thermal_get_dev(cpu);
	if (zonedev && !zonedev->work_scheduled) {
		zonedev->work_scheduled = true;
		pkg_thermal_schedule_work(zonedev->cpu, &zonedev->work);
	}

	raw_spin_unlock_irqrestore(&pkg_temp_lock, flags);
	return 0;
}

static struct thermal_trip *pkg_temp_thermal_trips_init(int cpu, int tj_max, int num_trips)
{
	struct thermal_trip *trips;
	unsigned long thres_reg_value;
	u32 mask, shift, eax, edx;
	int ret, i;

	trips = kzalloc(sizeof(*trips) * num_trips, GFP_KERNEL);
	if (!trips)
		return ERR_PTR(-ENOMEM);

	for (i = 0; i < num_trips; i++) {

		if (i) {
			mask = THERM_MASK_THRESHOLD1;
			shift = THERM_SHIFT_THRESHOLD1;
		} else {
			mask = THERM_MASK_THRESHOLD0;
			shift = THERM_SHIFT_THRESHOLD0;
		}

		ret = rdmsr_on_cpu(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
				   &eax, &edx);
		if (ret < 0) {
			kfree(trips);
			return ERR_PTR(ret);
		}

		thres_reg_value = (eax & mask) >> shift;

		trips[i].temperature = thres_reg_value ?
			tj_max - thres_reg_value * 1000 : THERMAL_TEMP_INVALID;

		trips[i].type = THERMAL_TRIP_PASSIVE;

		pr_debug("%s: cpu=%d, trip=%d, temp=%d\n",
			 __func__, cpu, i, trips[i].temperature);
	}

	return trips;
}
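/*
 * Note on the registration below: the mask argument of
 * thermal_zone_device_register_with_trips() is a bitmask of trip points
 * writable from user space. 0x03 marks both trips writable when the CPU
 * reports two thresholds, otherwise 0x01 marks only trip 0 writable;
 * writes land in sys_set_trip_temp() above.
 */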
static int pkg_temp_thermal_device_add(unsigned int cpu)
{
	int id = topology_logical_die_id(cpu);
	u32 tj_max, eax, ebx, ecx, edx;
	struct zone_device *zonedev;
	int thres_count, err;

	if (id >= max_id)
		return -ENOMEM;

	cpuid(6, &eax, &ebx, &ecx, &edx);
	thres_count = ebx & 0x07;
	if (!thres_count)
		return -ENODEV;

	thres_count = clamp_val(thres_count, 0, MAX_NUMBER_OF_TRIPS);

	err = get_tj_max(cpu, &tj_max);
	if (err)
		return err;

	zonedev = kzalloc(sizeof(*zonedev), GFP_KERNEL);
	if (!zonedev)
		return -ENOMEM;

	zonedev->trips = pkg_temp_thermal_trips_init(cpu, tj_max, thres_count);
	if (IS_ERR(zonedev->trips)) {
		err = PTR_ERR(zonedev->trips);
		goto out_kfree_zonedev;
	}

	INIT_DELAYED_WORK(&zonedev->work, pkg_temp_thermal_threshold_work_fn);
	zonedev->cpu = cpu;
	zonedev->tj_max = tj_max;
	zonedev->tzone = thermal_zone_device_register_with_trips("x86_pkg_temp",
			zonedev->trips, thres_count,
			(thres_count == MAX_NUMBER_OF_TRIPS) ? 0x03 : 0x01,
			zonedev, &tzone_ops, &pkg_temp_tz_params, 0, 0);
	if (IS_ERR(zonedev->tzone)) {
		err = PTR_ERR(zonedev->tzone);
		goto out_kfree_trips;
	}
	err = thermal_zone_device_enable(zonedev->tzone);
	if (err)
		goto out_unregister_tz;

	/* Store MSR value for package thermal interrupt, to restore at exit */
	rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, zonedev->msr_pkg_therm_low,
	      zonedev->msr_pkg_therm_high);

	cpumask_set_cpu(cpu, &zonedev->cpumask);
	raw_spin_lock_irq(&pkg_temp_lock);
	zones[id] = zonedev;
	raw_spin_unlock_irq(&pkg_temp_lock);

	return 0;

out_unregister_tz:
	thermal_zone_device_unregister(zonedev->tzone);
out_kfree_trips:
	kfree(zonedev->trips);
out_kfree_zonedev:
	kfree(zonedev);
	return err;
}
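/*
 * Teardown ordering in the offline callback below, in brief: the thermal
 * zone is unregistered first, with zonedev->tzone NULLed under
 * thermal_zone_mutex so a concurrent work function cannot use a stale
 * pointer. The interrupt target CPU and the zones[] slot are then updated
 * under pkg_temp_lock. Finally, pending work is cancelled with the lock
 * dropped, because cancel_delayed_work_sync() may have to flush the work
 * and the work function takes pkg_temp_lock itself.
 */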
static int pkg_thermal_cpu_offline(unsigned int cpu)
{
	struct zone_device *zonedev = pkg_temp_thermal_get_dev(cpu);
	bool lastcpu, was_target;
	int target;

	if (!zonedev)
		return 0;

	target = cpumask_any_but(&zonedev->cpumask, cpu);
	cpumask_clear_cpu(cpu, &zonedev->cpumask);
	lastcpu = target >= nr_cpu_ids;
	/*
	 * Remove the sysfs files, if this is the last cpu in the package,
	 * before doing further cleanups.
	 */
	if (lastcpu) {
		struct thermal_zone_device *tzone = zonedev->tzone;

		/*
		 * We must protect against a work function calling
		 * thermal_zone_device_update() after/while unregistering.
		 * We NULL out the pointer under the zone mutex, so the
		 * worker function won't try to call it.
		 */
		mutex_lock(&thermal_zone_mutex);
		zonedev->tzone = NULL;
		mutex_unlock(&thermal_zone_mutex);

		thermal_zone_device_unregister(tzone);
	}

	/* Protect against work and interrupts */
	raw_spin_lock_irq(&pkg_temp_lock);

	/*
	 * Check whether this cpu was the current target and store the new
	 * one. When we drop the lock, then the interrupt notify function
	 * will see the new target.
	 */
	was_target = zonedev->cpu == cpu;
	zonedev->cpu = target;

	/*
	 * If this is the last CPU in the package, remove the package
	 * reference from the array and restore the interrupt MSR. When we
	 * drop the lock, neither the interrupt notify function nor the
	 * worker will see the package anymore.
	 */
	if (lastcpu) {
		zones[topology_logical_die_id(cpu)] = NULL;
		/* After this point nothing touches the MSR anymore. */
		wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
		      zonedev->msr_pkg_therm_low, zonedev->msr_pkg_therm_high);
	}

	/*
	 * Check whether there is work scheduled and whether the work is
	 * targeted at the outgoing CPU.
	 */
	if (zonedev->work_scheduled && was_target) {
		/*
		 * To cancel the work we need to drop the lock, otherwise
		 * we might deadlock if the work needs to be flushed.
		 */
		raw_spin_unlock_irq(&pkg_temp_lock);
		cancel_delayed_work_sync(&zonedev->work);
		raw_spin_lock_irq(&pkg_temp_lock);
		/*
		 * If this is not the last cpu in the package and the work
		 * did not run after we dropped the lock above, then we
		 * need to reschedule the work, otherwise the interrupt
		 * stays disabled forever.
		 */
		if (!lastcpu && zonedev->work_scheduled)
			pkg_thermal_schedule_work(target, &zonedev->work);
	}

	raw_spin_unlock_irq(&pkg_temp_lock);

	/* Final cleanup if this is the last cpu */
	if (lastcpu) {
		kfree(zonedev->trips);
		kfree(zonedev);
	}
	return 0;
}

static int pkg_thermal_cpu_online(unsigned int cpu)
{
	struct zone_device *zonedev = pkg_temp_thermal_get_dev(cpu);
	struct cpuinfo_x86 *c = &cpu_data(cpu);

	/* Paranoia check */
	if (!cpu_has(c, X86_FEATURE_DTHERM) || !cpu_has(c, X86_FEATURE_PTS))
		return -ENODEV;

	/* If the package exists, nothing to do */
	if (zonedev) {
		cpumask_set_cpu(cpu, &zonedev->cpumask);
		return 0;
	}
	return pkg_temp_thermal_device_add(cpu);
}

static const struct x86_cpu_id __initconst pkg_temp_thermal_ids[] = {
	X86_MATCH_VENDOR_FEATURE(INTEL, X86_FEATURE_PTS, NULL),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, pkg_temp_thermal_ids);

static int __init pkg_temp_thermal_init(void)
{
	int ret;

	if (!x86_match_cpu(pkg_temp_thermal_ids))
		return -ENODEV;

	max_id = topology_max_packages() * topology_max_die_per_package();
	zones = kcalloc(max_id, sizeof(struct zone_device *),
			GFP_KERNEL);
	if (!zones)
		return -ENOMEM;

	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "thermal/x86_pkg:online",
				pkg_thermal_cpu_online, pkg_thermal_cpu_offline);
	if (ret < 0)
		goto err;

	/* Store the state for module exit */
	pkg_thermal_hp_state = ret;

	platform_thermal_package_notify = pkg_thermal_notify;
	platform_thermal_package_rate_control = pkg_thermal_rate_control;

	/* Don't care if it fails */
	pkg_temp_debugfs_init();
	return 0;

err:
	kfree(zones);
	return ret;
}
module_init(pkg_temp_thermal_init)

static void __exit pkg_temp_thermal_exit(void)
{
	platform_thermal_package_notify = NULL;
	platform_thermal_package_rate_control = NULL;

	cpuhp_remove_state(pkg_thermal_hp_state);
	debugfs_remove_recursive(debugfs);
	kfree(zones);
}
module_exit(pkg_temp_thermal_exit)

MODULE_DESCRIPTION("X86 PKG TEMP Thermal Driver");
MODULE_AUTHOR("Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>");
MODULE_LICENSE("GPL v2");
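/*
 * Example usage, as a sketch only: zone numbering, the debugfs mount
 * point and the availability of writable trip files vary by system and
 * kernel configuration.
 *
 *   # modprobe x86_pkg_temp_thermal notify_delay_ms=2000
 *   # cat /sys/class/thermal/thermal_zone0/type    (expect "x86_pkg_temp")
 *   # cat /sys/class/thermal/thermal_zone0/temp    (millidegrees C)
 *   # echo 80000 > /sys/class/thermal/thermal_zone0/trip_point_0_temp
 *   # cat /sys/kernel/debug/pkg_temp_thermal/pkg_thres_interrupt
 */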