1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * fam15h_power.c - AMD Family 15h processor power monitoring 4 * 5 * Copyright (c) 2011-2016 Advanced Micro Devices, Inc. 6 * Author: Andreas Herrmann <herrmann.der.user@googlemail.com> 7 */ 8 9 #include <linux/err.h> 10 #include <linux/hwmon.h> 11 #include <linux/hwmon-sysfs.h> 12 #include <linux/init.h> 13 #include <linux/module.h> 14 #include <linux/pci.h> 15 #include <linux/bitops.h> 16 #include <linux/cpu.h> 17 #include <linux/cpumask.h> 18 #include <linux/time.h> 19 #include <linux/sched.h> 20 #include <asm/processor.h> 21 #include <asm/msr.h> 22 23 MODULE_DESCRIPTION("AMD Family 15h CPU processor power monitor"); 24 MODULE_AUTHOR("Andreas Herrmann <herrmann.der.user@googlemail.com>"); 25 MODULE_LICENSE("GPL"); 26 27 /* D18F3 */ 28 #define REG_NORTHBRIDGE_CAP 0xe8 29 30 /* D18F4 */ 31 #define REG_PROCESSOR_TDP 0x1b8 32 33 /* D18F5 */ 34 #define REG_TDP_RUNNING_AVERAGE 0xe0 35 #define REG_TDP_LIMIT3 0xe8 36 37 #define FAM15H_MIN_NUM_ATTRS 2 38 #define FAM15H_NUM_GROUPS 2 39 #define MAX_CUS 8 40 41 /* set maximum interval as 1 second */ 42 #define MAX_INTERVAL 1000 43 44 #define MSR_F15H_CU_PWR_ACCUMULATOR 0xc001007a 45 #define MSR_F15H_CU_MAX_PWR_ACCUMULATOR 0xc001007b 46 #define MSR_F15H_PTSC 0xc0010280 47 48 #define PCI_DEVICE_ID_AMD_15H_M70H_NB_F4 0x15b4 49 50 struct fam15h_power_data { 51 struct pci_dev *pdev; 52 unsigned int tdp_to_watts; 53 unsigned int base_tdp; 54 unsigned int processor_pwr_watts; 55 unsigned int cpu_pwr_sample_ratio; 56 const struct attribute_group *groups[FAM15H_NUM_GROUPS]; 57 struct attribute_group group; 58 /* maximum accumulated power of a compute unit */ 59 u64 max_cu_acc_power; 60 /* accumulated power of the compute units */ 61 u64 cu_acc_power[MAX_CUS]; 62 /* performance timestamp counter */ 63 u64 cpu_sw_pwr_ptsc[MAX_CUS]; 64 /* online/offline status of current compute unit */ 65 int cu_on[MAX_CUS]; 66 unsigned long power_period; 67 }; 68 69 static bool is_carrizo_or_later(void) 70 { 71 return boot_cpu_data.x86 == 0x15 && boot_cpu_data.x86_model >= 0x60; 72 } 73 74 static ssize_t power1_input_show(struct device *dev, 75 struct device_attribute *attr, char *buf) 76 { 77 u32 val, tdp_limit, running_avg_range; 78 s32 running_avg_capture; 79 u64 curr_pwr_watts; 80 struct fam15h_power_data *data = dev_get_drvdata(dev); 81 struct pci_dev *f4 = data->pdev; 82 83 pci_bus_read_config_dword(f4->bus, PCI_DEVFN(PCI_SLOT(f4->devfn), 5), 84 REG_TDP_RUNNING_AVERAGE, &val); 85 86 /* 87 * On Carrizo and later platforms, TdpRunAvgAccCap bit field 88 * is extended to 4:31 from 4:25. 89 */ 90 if (is_carrizo_or_later()) { 91 running_avg_capture = val >> 4; 92 running_avg_capture = sign_extend32(running_avg_capture, 27); 93 } else { 94 running_avg_capture = (val >> 4) & 0x3fffff; 95 running_avg_capture = sign_extend32(running_avg_capture, 21); 96 } 97 98 running_avg_range = (val & 0xf) + 1; 99 100 pci_bus_read_config_dword(f4->bus, PCI_DEVFN(PCI_SLOT(f4->devfn), 5), 101 REG_TDP_LIMIT3, &val); 102 103 /* 104 * On Carrizo and later platforms, ApmTdpLimit bit field 105 * is extended to 16:31 from 16:28. 106 */ 107 if (is_carrizo_or_later()) 108 tdp_limit = val >> 16; 109 else 110 tdp_limit = (val >> 16) & 0x1fff; 111 112 curr_pwr_watts = ((u64)(tdp_limit + 113 data->base_tdp)) << running_avg_range; 114 curr_pwr_watts -= running_avg_capture; 115 curr_pwr_watts *= data->tdp_to_watts; 116 117 /* 118 * Convert to microWatt 119 * 120 * power is in Watt provided as fixed point integer with 121 * scaling factor 1/(2^16). For conversion we use 122 * (10^6)/(2^16) = 15625/(2^10) 123 */ 124 curr_pwr_watts = (curr_pwr_watts * 15625) >> (10 + running_avg_range); 125 return sprintf(buf, "%u\n", (unsigned int) curr_pwr_watts); 126 } 127 static DEVICE_ATTR_RO(power1_input); 128 129 static ssize_t power1_crit_show(struct device *dev, 130 struct device_attribute *attr, char *buf) 131 { 132 struct fam15h_power_data *data = dev_get_drvdata(dev); 133 134 return sprintf(buf, "%u\n", data->processor_pwr_watts); 135 } 136 static DEVICE_ATTR_RO(power1_crit); 137 138 static void do_read_registers_on_cu(void *_data) 139 { 140 struct fam15h_power_data *data = _data; 141 int cpu, cu; 142 143 cpu = smp_processor_id(); 144 145 /* 146 * With the new x86 topology modelling, cpu core id actually 147 * is compute unit id. 148 */ 149 cu = cpu_data(cpu).cpu_core_id; 150 151 rdmsrl_safe(MSR_F15H_CU_PWR_ACCUMULATOR, &data->cu_acc_power[cu]); 152 rdmsrl_safe(MSR_F15H_PTSC, &data->cpu_sw_pwr_ptsc[cu]); 153 154 data->cu_on[cu] = 1; 155 } 156 157 /* 158 * This function is only able to be called when CPUID 159 * Fn8000_0007:EDX[12] is set. 160 */ 161 static int read_registers(struct fam15h_power_data *data) 162 { 163 int core, this_core; 164 cpumask_var_t mask; 165 int ret, cpu; 166 167 ret = zalloc_cpumask_var(&mask, GFP_KERNEL); 168 if (!ret) 169 return -ENOMEM; 170 171 memset(data->cu_on, 0, sizeof(int) * MAX_CUS); 172 173 get_online_cpus(); 174 175 /* 176 * Choose the first online core of each compute unit, and then 177 * read their MSR value of power and ptsc in a single IPI, 178 * because the MSR value of CPU core represent the compute 179 * unit's. 180 */ 181 core = -1; 182 183 for_each_online_cpu(cpu) { 184 this_core = topology_core_id(cpu); 185 186 if (this_core == core) 187 continue; 188 189 core = this_core; 190 191 /* get any CPU on this compute unit */ 192 cpumask_set_cpu(cpumask_any(topology_sibling_cpumask(cpu)), mask); 193 } 194 195 on_each_cpu_mask(mask, do_read_registers_on_cu, data, true); 196 197 put_online_cpus(); 198 free_cpumask_var(mask); 199 200 return 0; 201 } 202 203 static ssize_t power1_average_show(struct device *dev, 204 struct device_attribute *attr, char *buf) 205 { 206 struct fam15h_power_data *data = dev_get_drvdata(dev); 207 u64 prev_cu_acc_power[MAX_CUS], prev_ptsc[MAX_CUS], 208 jdelta[MAX_CUS]; 209 u64 tdelta, avg_acc; 210 int cu, cu_num, ret; 211 signed long leftover; 212 213 /* 214 * With the new x86 topology modelling, x86_max_cores is the 215 * compute unit number. 216 */ 217 cu_num = boot_cpu_data.x86_max_cores; 218 219 ret = read_registers(data); 220 if (ret) 221 return 0; 222 223 for (cu = 0; cu < cu_num; cu++) { 224 prev_cu_acc_power[cu] = data->cu_acc_power[cu]; 225 prev_ptsc[cu] = data->cpu_sw_pwr_ptsc[cu]; 226 } 227 228 leftover = schedule_timeout_interruptible(msecs_to_jiffies(data->power_period)); 229 if (leftover) 230 return 0; 231 232 ret = read_registers(data); 233 if (ret) 234 return 0; 235 236 for (cu = 0, avg_acc = 0; cu < cu_num; cu++) { 237 /* check if current compute unit is online */ 238 if (data->cu_on[cu] == 0) 239 continue; 240 241 if (data->cu_acc_power[cu] < prev_cu_acc_power[cu]) { 242 jdelta[cu] = data->max_cu_acc_power + data->cu_acc_power[cu]; 243 jdelta[cu] -= prev_cu_acc_power[cu]; 244 } else { 245 jdelta[cu] = data->cu_acc_power[cu] - prev_cu_acc_power[cu]; 246 } 247 tdelta = data->cpu_sw_pwr_ptsc[cu] - prev_ptsc[cu]; 248 jdelta[cu] *= data->cpu_pwr_sample_ratio * 1000; 249 do_div(jdelta[cu], tdelta); 250 251 /* the unit is microWatt */ 252 avg_acc += jdelta[cu]; 253 } 254 255 return sprintf(buf, "%llu\n", (unsigned long long)avg_acc); 256 } 257 static DEVICE_ATTR_RO(power1_average); 258 259 static ssize_t power1_average_interval_show(struct device *dev, 260 struct device_attribute *attr, 261 char *buf) 262 { 263 struct fam15h_power_data *data = dev_get_drvdata(dev); 264 265 return sprintf(buf, "%lu\n", data->power_period); 266 } 267 268 static ssize_t power1_average_interval_store(struct device *dev, 269 struct device_attribute *attr, 270 const char *buf, size_t count) 271 { 272 struct fam15h_power_data *data = dev_get_drvdata(dev); 273 unsigned long temp; 274 int ret; 275 276 ret = kstrtoul(buf, 10, &temp); 277 if (ret) 278 return ret; 279 280 if (temp > MAX_INTERVAL) 281 return -EINVAL; 282 283 /* the interval value should be greater than 0 */ 284 if (temp <= 0) 285 return -EINVAL; 286 287 data->power_period = temp; 288 289 return count; 290 } 291 static DEVICE_ATTR_RW(power1_average_interval); 292 293 static int fam15h_power_init_attrs(struct pci_dev *pdev, 294 struct fam15h_power_data *data) 295 { 296 int n = FAM15H_MIN_NUM_ATTRS; 297 struct attribute **fam15h_power_attrs; 298 struct cpuinfo_x86 *c = &boot_cpu_data; 299 300 if (c->x86 == 0x15 && 301 (c->x86_model <= 0xf || 302 (c->x86_model >= 0x60 && c->x86_model <= 0x7f))) 303 n += 1; 304 305 /* check if processor supports accumulated power */ 306 if (boot_cpu_has(X86_FEATURE_ACC_POWER)) 307 n += 2; 308 309 fam15h_power_attrs = devm_kcalloc(&pdev->dev, n, 310 sizeof(*fam15h_power_attrs), 311 GFP_KERNEL); 312 313 if (!fam15h_power_attrs) 314 return -ENOMEM; 315 316 n = 0; 317 fam15h_power_attrs[n++] = &dev_attr_power1_crit.attr; 318 if (c->x86 == 0x15 && 319 (c->x86_model <= 0xf || 320 (c->x86_model >= 0x60 && c->x86_model <= 0x7f))) 321 fam15h_power_attrs[n++] = &dev_attr_power1_input.attr; 322 323 if (boot_cpu_has(X86_FEATURE_ACC_POWER)) { 324 fam15h_power_attrs[n++] = &dev_attr_power1_average.attr; 325 fam15h_power_attrs[n++] = &dev_attr_power1_average_interval.attr; 326 } 327 328 data->group.attrs = fam15h_power_attrs; 329 330 return 0; 331 } 332 333 static bool should_load_on_this_node(struct pci_dev *f4) 334 { 335 u32 val; 336 337 pci_bus_read_config_dword(f4->bus, PCI_DEVFN(PCI_SLOT(f4->devfn), 3), 338 REG_NORTHBRIDGE_CAP, &val); 339 if ((val & BIT(29)) && ((val >> 30) & 3)) 340 return false; 341 342 return true; 343 } 344 345 /* 346 * Newer BKDG versions have an updated recommendation on how to properly 347 * initialize the running average range (was: 0xE, now: 0x9). This avoids 348 * counter saturations resulting in bogus power readings. 349 * We correct this value ourselves to cope with older BIOSes. 350 */ 351 static const struct pci_device_id affected_device[] = { 352 { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) }, 353 { 0 } 354 }; 355 356 static void tweak_runavg_range(struct pci_dev *pdev) 357 { 358 u32 val; 359 360 /* 361 * let this quirk apply only to the current version of the 362 * northbridge, since future versions may change the behavior 363 */ 364 if (!pci_match_id(affected_device, pdev)) 365 return; 366 367 pci_bus_read_config_dword(pdev->bus, 368 PCI_DEVFN(PCI_SLOT(pdev->devfn), 5), 369 REG_TDP_RUNNING_AVERAGE, &val); 370 if ((val & 0xf) != 0xe) 371 return; 372 373 val &= ~0xf; 374 val |= 0x9; 375 pci_bus_write_config_dword(pdev->bus, 376 PCI_DEVFN(PCI_SLOT(pdev->devfn), 5), 377 REG_TDP_RUNNING_AVERAGE, val); 378 } 379 380 #ifdef CONFIG_PM 381 static int fam15h_power_resume(struct pci_dev *pdev) 382 { 383 tweak_runavg_range(pdev); 384 return 0; 385 } 386 #else 387 #define fam15h_power_resume NULL 388 #endif 389 390 static int fam15h_power_init_data(struct pci_dev *f4, 391 struct fam15h_power_data *data) 392 { 393 u32 val; 394 u64 tmp; 395 int ret; 396 397 pci_read_config_dword(f4, REG_PROCESSOR_TDP, &val); 398 data->base_tdp = val >> 16; 399 tmp = val & 0xffff; 400 401 pci_bus_read_config_dword(f4->bus, PCI_DEVFN(PCI_SLOT(f4->devfn), 5), 402 REG_TDP_LIMIT3, &val); 403 404 data->tdp_to_watts = ((val & 0x3ff) << 6) | ((val >> 10) & 0x3f); 405 tmp *= data->tdp_to_watts; 406 407 /* result not allowed to be >= 256W */ 408 if ((tmp >> 16) >= 256) 409 dev_warn(&f4->dev, 410 "Bogus value for ProcessorPwrWatts (processor_pwr_watts>=%u)\n", 411 (unsigned int) (tmp >> 16)); 412 413 /* convert to microWatt */ 414 data->processor_pwr_watts = (tmp * 15625) >> 10; 415 416 ret = fam15h_power_init_attrs(f4, data); 417 if (ret) 418 return ret; 419 420 421 /* CPUID Fn8000_0007:EDX[12] indicates to support accumulated power */ 422 if (!boot_cpu_has(X86_FEATURE_ACC_POWER)) 423 return 0; 424 425 /* 426 * determine the ratio of the compute unit power accumulator 427 * sample period to the PTSC counter period by executing CPUID 428 * Fn8000_0007:ECX 429 */ 430 data->cpu_pwr_sample_ratio = cpuid_ecx(0x80000007); 431 432 if (rdmsrl_safe(MSR_F15H_CU_MAX_PWR_ACCUMULATOR, &tmp)) { 433 pr_err("Failed to read max compute unit power accumulator MSR\n"); 434 return -ENODEV; 435 } 436 437 data->max_cu_acc_power = tmp; 438 439 /* 440 * Milliseconds are a reasonable interval for the measurement. 441 * But it shouldn't set too long here, because several seconds 442 * would cause the read function to hang. So set default 443 * interval as 10 ms. 444 */ 445 data->power_period = 10; 446 447 return read_registers(data); 448 } 449 450 static int fam15h_power_probe(struct pci_dev *pdev, 451 const struct pci_device_id *id) 452 { 453 struct fam15h_power_data *data; 454 struct device *dev = &pdev->dev; 455 struct device *hwmon_dev; 456 int ret; 457 458 /* 459 * though we ignore every other northbridge, we still have to 460 * do the tweaking on _each_ node in MCM processors as the counters 461 * are working hand-in-hand 462 */ 463 tweak_runavg_range(pdev); 464 465 if (!should_load_on_this_node(pdev)) 466 return -ENODEV; 467 468 data = devm_kzalloc(dev, sizeof(struct fam15h_power_data), GFP_KERNEL); 469 if (!data) 470 return -ENOMEM; 471 472 ret = fam15h_power_init_data(pdev, data); 473 if (ret) 474 return ret; 475 476 data->pdev = pdev; 477 478 data->groups[0] = &data->group; 479 480 hwmon_dev = devm_hwmon_device_register_with_groups(dev, "fam15h_power", 481 data, 482 &data->groups[0]); 483 return PTR_ERR_OR_ZERO(hwmon_dev); 484 } 485 486 static const struct pci_device_id fam15h_power_id_table[] = { 487 { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) }, 488 { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F4) }, 489 { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F4) }, 490 { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_15H_M70H_NB_F4) }, 491 { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) }, 492 { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) }, 493 {} 494 }; 495 MODULE_DEVICE_TABLE(pci, fam15h_power_id_table); 496 497 static struct pci_driver fam15h_power_driver = { 498 .name = "fam15h_power", 499 .id_table = fam15h_power_id_table, 500 .probe = fam15h_power_probe, 501 .resume = fam15h_power_resume, 502 }; 503 504 module_pci_driver(fam15h_power_driver); 505