// SPDX-License-Identifier: GPL-2.0-only
/*
 * x86_pkg_temp_thermal driver
 * Copyright (c) 2013, Intel Corporation.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/init.h>
#include <linux/err.h>
#include <linux/param.h>
#include <linux/device.h>
#include <linux/platform_device.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/pm.h>
#include <linux/thermal.h>
#include <linux/debugfs.h>
#include <asm/cpu_device_id.h>
#include <asm/mce.h>

/*
 * Rate control delay: the idea is to introduce a debounce effect.
 * The delay should be long enough to avoid a flood of events when a
 * threshold is set to a temperature which is constantly violated, but
 * short enough that user space can still react in time. The reaction
 * can be to remove the threshold or to change it to the next
 * interesting setting. Based on experiments, under load a significant
 * temperature change occurs roughly every 5 seconds.
 */
#define PKG_TEMP_THERMAL_NOTIFY_DELAY	5000
static int notify_delay_ms = PKG_TEMP_THERMAL_NOTIFY_DELAY;
module_param(notify_delay_ms, int, 0644);
MODULE_PARM_DESC(notify_delay_ms,
	"User space notification delay in milliseconds.");

/*
 * Number of trip points in the thermal zone. Currently it can't be more
 * than 2: the MSR only supports setting and getting notifications for
 * two thresholds. This define enforces that limit in case cpuid returns
 * a bogus number of thresholds.
 */
#define MAX_NUMBER_OF_TRIPS	2

struct pkg_device {
	int				cpu;
	bool				work_scheduled;
	u32				tj_max;
	u32				msr_pkg_therm_low;
	u32				msr_pkg_therm_high;
	struct delayed_work		work;
	struct thermal_zone_device	*tzone;
	struct cpumask			cpumask;
};

static struct thermal_zone_params pkg_temp_tz_params = {
	.no_hwmon	= true,
};

/* Keep track of how many package pointers we allocated in init() */
static int max_packages __read_mostly;
/* Array of package pointers */
static struct pkg_device **packages;
/* Serializes interrupt notification, work and hotplug */
static DEFINE_SPINLOCK(pkg_temp_lock);
/* Protects zone operation in the work function against hotplug removal */
static DEFINE_MUTEX(thermal_zone_mutex);

/* The dynamically assigned cpu hotplug state for module_exit() */
static enum cpuhp_state pkg_thermal_hp_state __read_mostly;

/* Debug counters to show using debugfs */
static struct dentry *debugfs;
static unsigned int pkg_interrupt_cnt;
static unsigned int pkg_work_cnt;

static int pkg_temp_debugfs_init(void)
{
	struct dentry *d;

	debugfs = debugfs_create_dir("pkg_temp_thermal", NULL);
	if (!debugfs)
		return -ENOENT;

	d = debugfs_create_u32("pkg_thres_interrupt", S_IRUGO, debugfs,
			       &pkg_interrupt_cnt);
	if (!d)
		goto err_out;

	d = debugfs_create_u32("pkg_thres_work", S_IRUGO, debugfs,
			       &pkg_work_cnt);
	if (!d)
		goto err_out;

	return 0;

err_out:
	debugfs_remove_recursive(debugfs);
	return -ENOENT;
}

/*
 * Protection:
 *
 * - cpu hotplug: Read serialized by cpu hotplug lock
 *		  Write must hold pkg_temp_lock
 *
 * - Other callsites: Must hold pkg_temp_lock
 */
static struct pkg_device *pkg_temp_thermal_get_dev(unsigned int cpu)
{
	int pkgid = topology_logical_package_id(cpu);

	if (pkgid >= 0 && pkgid < max_packages)
		return packages[pkgid];
	return NULL;
}

/*
 * tj_max is interesting because thresholds are set relative to this
 * temperature.
 */
static int get_tj_max(int cpu, u32 *tj_max)
{
	u32 eax, edx, val;
	int err;

	err = rdmsr_safe_on_cpu(cpu, MSR_IA32_TEMPERATURE_TARGET, &eax, &edx);
	if (err)
		return err;

	val = (eax >> 16) & 0xff;
	*tj_max = val * 1000;

	return val ? 0 : -EINVAL;
}
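/*
 * Worked example (illustrative register value, not from real hardware):
 * if MSR_IA32_TEMPERATURE_TARGET reads eax = 0x00640000, bits 23:16
 * yield val = 0x64 = 100 degrees Celsius, so *tj_max becomes 100000 in
 * the millidegree units used by the thermal core. All trip and current
 * temperatures below are derived by subtracting an offset from tj_max.
 */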
static int sys_get_curr_temp(struct thermal_zone_device *tzd, int *temp)
{
	struct pkg_device *pkgdev = tzd->devdata;
	u32 eax, edx;

	rdmsr_on_cpu(pkgdev->cpu, MSR_IA32_PACKAGE_THERM_STATUS, &eax, &edx);
	if (eax & 0x80000000) {
		*temp = pkgdev->tj_max - ((eax >> 16) & 0x7f) * 1000;
		pr_debug("sys_get_curr_temp %d\n", *temp);
		return 0;
	}
	return -EINVAL;
}

static int sys_get_trip_temp(struct thermal_zone_device *tzd,
			     int trip, int *temp)
{
	struct pkg_device *pkgdev = tzd->devdata;
	unsigned long thres_reg_value;
	u32 mask, shift, eax, edx;
	int ret;

	if (trip >= MAX_NUMBER_OF_TRIPS)
		return -EINVAL;

	if (trip) {
		mask = THERM_MASK_THRESHOLD1;
		shift = THERM_SHIFT_THRESHOLD1;
	} else {
		mask = THERM_MASK_THRESHOLD0;
		shift = THERM_SHIFT_THRESHOLD0;
	}

	ret = rdmsr_on_cpu(pkgdev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
			   &eax, &edx);
	if (ret < 0)
		return ret;

	thres_reg_value = (eax & mask) >> shift;
	if (thres_reg_value)
		*temp = pkgdev->tj_max - thres_reg_value * 1000;
	else
		*temp = 0;
	pr_debug("sys_get_trip_temp %d\n", *temp);

	return 0;
}
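/*
 * Worked example for the threshold encoding (illustrative numbers):
 * with tj_max = 100000 (100 degrees Celsius) and a threshold field of
 * 15 in MSR_IA32_PACKAGE_THERM_INTERRUPT, the reported trip temperature
 * is 100000 - 15 * 1000 = 85000 millidegrees Celsius. The setter below
 * performs the inverse, storing (tj_max - temp) / 1000 into the same
 * field.
 */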
static int
sys_set_trip_temp(struct thermal_zone_device *tzd, int trip, int temp)
{
	struct pkg_device *pkgdev = tzd->devdata;
	u32 l, h, mask, shift, intr;
	int ret;

	if (trip >= MAX_NUMBER_OF_TRIPS || temp >= pkgdev->tj_max)
		return -EINVAL;

	ret = rdmsr_on_cpu(pkgdev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
			   &l, &h);
	if (ret < 0)
		return ret;

	if (trip) {
		mask = THERM_MASK_THRESHOLD1;
		shift = THERM_SHIFT_THRESHOLD1;
		intr = THERM_INT_THRESHOLD1_ENABLE;
	} else {
		mask = THERM_MASK_THRESHOLD0;
		shift = THERM_SHIFT_THRESHOLD0;
		intr = THERM_INT_THRESHOLD0_ENABLE;
	}
	l &= ~mask;
	/*
	 * A trip temperature of 0 is an indication from user space that
	 * it is no longer interested in receiving notifications.
	 */
	if (!temp) {
		l &= ~intr;
	} else {
		l |= (pkgdev->tj_max - temp) / 1000 << shift;
		l |= intr;
	}

	return wrmsr_on_cpu(pkgdev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
}

static int sys_get_trip_type(struct thermal_zone_device *thermal, int trip,
			     enum thermal_trip_type *type)
{
	*type = THERMAL_TRIP_PASSIVE;
	return 0;
}

/* Thermal zone callback registry */
static struct thermal_zone_device_ops tzone_ops = {
	.get_temp = sys_get_curr_temp,
	.get_trip_temp = sys_get_trip_temp,
	.get_trip_type = sys_get_trip_type,
	.set_trip_temp = sys_set_trip_temp,
};

static bool pkg_thermal_rate_control(void)
{
	return true;
}

/* Enable threshold interrupt on local package/cpu */
static inline void enable_pkg_thres_interrupt(void)
{
	u8 thres_0, thres_1;
	u32 l, h;

	rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
	/* Only enable the interrupt if the threshold has a valid value */
	thres_0 = (l & THERM_MASK_THRESHOLD0) >> THERM_SHIFT_THRESHOLD0;
	thres_1 = (l & THERM_MASK_THRESHOLD1) >> THERM_SHIFT_THRESHOLD1;
	if (thres_0)
		l |= THERM_INT_THRESHOLD0_ENABLE;
	if (thres_1)
		l |= THERM_INT_THRESHOLD1_ENABLE;
	wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
}

/* Disable threshold interrupt on local package/cpu */
static inline void disable_pkg_thres_interrupt(void)
{
	u32 l, h;

	rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);

	l &= ~(THERM_INT_THRESHOLD0_ENABLE | THERM_INT_THRESHOLD1_ENABLE);
	wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
}

static void pkg_temp_thermal_threshold_work_fn(struct work_struct *work)
{
	struct thermal_zone_device *tzone = NULL;
	int cpu = smp_processor_id();
	struct pkg_device *pkgdev;
	u64 msr_val, wr_val;

	mutex_lock(&thermal_zone_mutex);
	spin_lock_irq(&pkg_temp_lock);
	++pkg_work_cnt;

	pkgdev = pkg_temp_thermal_get_dev(cpu);
	if (!pkgdev) {
		spin_unlock_irq(&pkg_temp_lock);
		mutex_unlock(&thermal_zone_mutex);
		return;
	}
	pkgdev->work_scheduled = false;

	rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
	wr_val = msr_val & ~(THERM_LOG_THRESHOLD0 | THERM_LOG_THRESHOLD1);
	if (wr_val != msr_val) {
		wrmsrl(MSR_IA32_PACKAGE_THERM_STATUS, wr_val);
		tzone = pkgdev->tzone;
	}

	enable_pkg_thres_interrupt();
	spin_unlock_irq(&pkg_temp_lock);

	/*
	 * If tzone is not NULL, then thermal_zone_mutex will prevent the
	 * concurrent removal in the cpu offline callback.
	 */
	if (tzone)
		thermal_zone_device_update(tzone, THERMAL_EVENT_UNSPECIFIED);

	mutex_unlock(&thermal_zone_mutex);
}

static void pkg_thermal_schedule_work(int cpu, struct delayed_work *work)
{
	unsigned long ms = msecs_to_jiffies(notify_delay_ms);

	schedule_delayed_work_on(cpu, work, ms);
}
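/*
 * Summary of the notification flow implemented above and below: the
 * thermal vector handler invokes pkg_thermal_notify(), which disables
 * further threshold interrupts and schedules the delayed work on a CPU
 * of the affected package. After notify_delay_ms the work function
 * clears the threshold log bits, re-enables the interrupt and kicks the
 * thermal core. This provides the debounce effect described at the top
 * of this file.
 */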
static int pkg_thermal_notify(u64 msr_val)
{
	int cpu = smp_processor_id();
	struct pkg_device *pkgdev;
	unsigned long flags;

	spin_lock_irqsave(&pkg_temp_lock, flags);
	++pkg_interrupt_cnt;

	disable_pkg_thres_interrupt();

	/* The work is per package, so scheduling it once is enough. */
	pkgdev = pkg_temp_thermal_get_dev(cpu);
	if (pkgdev && !pkgdev->work_scheduled) {
		pkgdev->work_scheduled = true;
		pkg_thermal_schedule_work(pkgdev->cpu, &pkgdev->work);
	}

	spin_unlock_irqrestore(&pkg_temp_lock, flags);
	return 0;
}

static int pkg_temp_thermal_device_add(unsigned int cpu)
{
	int pkgid = topology_logical_package_id(cpu);
	u32 tj_max, eax, ebx, ecx, edx;
	struct pkg_device *pkgdev;
	int thres_count, err;

	if (pkgid >= max_packages)
		return -ENOMEM;

	cpuid(6, &eax, &ebx, &ecx, &edx);
	thres_count = ebx & 0x07;
	if (!thres_count)
		return -ENODEV;

	thres_count = clamp_val(thres_count, 0, MAX_NUMBER_OF_TRIPS);

	err = get_tj_max(cpu, &tj_max);
	if (err)
		return err;

	pkgdev = kzalloc(sizeof(*pkgdev), GFP_KERNEL);
	if (!pkgdev)
		return -ENOMEM;

	INIT_DELAYED_WORK(&pkgdev->work, pkg_temp_thermal_threshold_work_fn);
	pkgdev->cpu = cpu;
	pkgdev->tj_max = tj_max;
	pkgdev->tzone = thermal_zone_device_register("x86_pkg_temp",
			thres_count,
			(thres_count == MAX_NUMBER_OF_TRIPS) ? 0x03 : 0x01,
			pkgdev, &tzone_ops, &pkg_temp_tz_params, 0, 0);
	if (IS_ERR(pkgdev->tzone)) {
		err = PTR_ERR(pkgdev->tzone);
		kfree(pkgdev);
		return err;
	}
	/* Store MSR value for package thermal interrupt, to restore at exit */
	rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, pkgdev->msr_pkg_therm_low,
	      pkgdev->msr_pkg_therm_high);

	cpumask_set_cpu(cpu, &pkgdev->cpumask);
	spin_lock_irq(&pkg_temp_lock);
	packages[pkgid] = pkgdev;
	spin_unlock_irq(&pkg_temp_lock);
	return 0;
}
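/*
 * For illustration: the zone registered above shows up in the generic
 * thermal sysfs ABI. Assuming it was enumerated as thermal_zone1 (the
 * number varies between systems), a user space session could look like:
 *
 *	# cat /sys/class/thermal/thermal_zone1/type
 *	x86_pkg_temp
 *	# cat /sys/class/thermal/thermal_zone1/temp
 *	49000
 *	# echo 85000 > /sys/class/thermal/thermal_zone1/trip_point_0_temp
 *
 * Writing 0 to a trip_point_*_temp file disables notifications for that
 * threshold, as handled in sys_set_trip_temp().
 */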
static int pkg_thermal_cpu_offline(unsigned int cpu)
{
	struct pkg_device *pkgdev = pkg_temp_thermal_get_dev(cpu);
	bool lastcpu, was_target;
	int target;

	if (!pkgdev)
		return 0;

	target = cpumask_any_but(&pkgdev->cpumask, cpu);
	cpumask_clear_cpu(cpu, &pkgdev->cpumask);
	lastcpu = target >= nr_cpu_ids;
	/*
	 * If this is the last cpu in the package, remove the sysfs files
	 * before doing further cleanups.
	 */
	if (lastcpu) {
		struct thermal_zone_device *tzone = pkgdev->tzone;

		/*
		 * We must protect against a work function calling
		 * thermal_zone_device_update() after/while the zone is
		 * unregistered. We null out the pointer under the zone
		 * mutex, so the work function won't try to call it.
		 */
		mutex_lock(&thermal_zone_mutex);
		pkgdev->tzone = NULL;
		mutex_unlock(&thermal_zone_mutex);

		thermal_zone_device_unregister(tzone);
	}

	/* Protect against work and interrupts */
	spin_lock_irq(&pkg_temp_lock);

	/*
	 * Check whether this cpu was the current target and store the new
	 * one. When we drop the lock, then the interrupt notify function
	 * will see the new target.
	 */
	was_target = pkgdev->cpu == cpu;
	pkgdev->cpu = target;

	/*
	 * If this is the last CPU in the package, remove the package
	 * reference from the array and restore the interrupt MSR. When we
	 * drop the lock, neither the interrupt notify function nor the
	 * work function will see the package anymore.
	 */
	if (lastcpu) {
		packages[topology_logical_package_id(cpu)] = NULL;
		/* After this point nothing touches the MSR anymore. */
		wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
		      pkgdev->msr_pkg_therm_low, pkgdev->msr_pkg_therm_high);
	}

	/*
	 * Check whether there is work scheduled and whether the work is
	 * targeted at the outgoing CPU.
	 */
	if (pkgdev->work_scheduled && was_target) {
		/*
		 * To cancel the work we need to drop the lock, otherwise
		 * we might deadlock if the work needs to be flushed.
		 */
		spin_unlock_irq(&pkg_temp_lock);
		cancel_delayed_work_sync(&pkgdev->work);
		spin_lock_irq(&pkg_temp_lock);
		/*
		 * If this is not the last cpu in the package and the work
		 * did not run after we dropped the lock above, then we
		 * need to reschedule the work, otherwise the interrupt
		 * stays disabled forever.
		 */
		if (!lastcpu && pkgdev->work_scheduled)
			pkg_thermal_schedule_work(target, &pkgdev->work);
	}

	spin_unlock_irq(&pkg_temp_lock);

	/* Final cleanup if this is the last cpu */
	if (lastcpu)
		kfree(pkgdev);
	return 0;
}

static int pkg_thermal_cpu_online(unsigned int cpu)
{
	struct pkg_device *pkgdev = pkg_temp_thermal_get_dev(cpu);
	struct cpuinfo_x86 *c = &cpu_data(cpu);

	/* Paranoia check */
	if (!cpu_has(c, X86_FEATURE_DTHERM) || !cpu_has(c, X86_FEATURE_PTS))
		return -ENODEV;

	/* If the package exists, nothing to do */
	if (pkgdev) {
		cpumask_set_cpu(cpu, &pkgdev->cpumask);
		return 0;
	}
	return pkg_temp_thermal_device_add(cpu);
}

static const struct x86_cpu_id __initconst pkg_temp_thermal_ids[] = {
	{ X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, X86_FEATURE_PTS },
	{}
};
MODULE_DEVICE_TABLE(x86cpu, pkg_temp_thermal_ids);

static int __init pkg_temp_thermal_init(void)
{
	int ret;

	if (!x86_match_cpu(pkg_temp_thermal_ids))
		return -ENODEV;

	max_packages = topology_max_packages();
	packages = kcalloc(max_packages, sizeof(struct pkg_device *),
			   GFP_KERNEL);
	if (!packages)
		return -ENOMEM;

	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "thermal/x86_pkg:online",
				pkg_thermal_cpu_online, pkg_thermal_cpu_offline);
	if (ret < 0)
		goto err;

	/* Store the state for module exit */
	pkg_thermal_hp_state = ret;

	platform_thermal_package_notify = pkg_thermal_notify;
	platform_thermal_package_rate_control = pkg_thermal_rate_control;

	/* Don't care if it fails */
	pkg_temp_debugfs_init();
	return 0;

err:
	kfree(packages);
	return ret;
}
module_init(pkg_temp_thermal_init)

static void __exit pkg_temp_thermal_exit(void)
{
	platform_thermal_package_notify = NULL;
	platform_thermal_package_rate_control = NULL;

	cpuhp_remove_state(pkg_thermal_hp_state);
	debugfs_remove_recursive(debugfs);
	kfree(packages);
}
module_exit(pkg_temp_thermal_exit)

MODULE_DESCRIPTION("X86 PKG TEMP Thermal Driver");
MODULE_AUTHOR("Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>");
MODULE_LICENSE("GPL v2");
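/*
 * Usage notes (illustrative paths, depending on the running system):
 * the notification debounce can be tuned at runtime via
 * /sys/module/x86_pkg_temp_thermal/parameters/notify_delay_ms, and the
 * counters registered in pkg_temp_debugfs_init() appear under
 * /sys/kernel/debug/pkg_temp_thermal/ when debugfs is mounted.
 */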