1 /* 2 * fam15h_power.c - AMD Family 15h processor power monitoring 3 * 4 * Copyright (c) 2011-2016 Advanced Micro Devices, Inc. 5 * Author: Andreas Herrmann <herrmann.der.user@googlemail.com> 6 * 7 * 8 * This driver is free software; you can redistribute it and/or 9 * modify it under the terms of the GNU General Public License; either 10 * version 2 of the License, or (at your option) any later version. 11 * 12 * This driver is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 15 * See the GNU General Public License for more details. 16 * 17 * You should have received a copy of the GNU General Public License 18 * along with this driver; if not, see <http://www.gnu.org/licenses/>. 19 */ 20 21 #include <linux/err.h> 22 #include <linux/hwmon.h> 23 #include <linux/hwmon-sysfs.h> 24 #include <linux/init.h> 25 #include <linux/module.h> 26 #include <linux/pci.h> 27 #include <linux/bitops.h> 28 #include <linux/cpu.h> 29 #include <linux/cpumask.h> 30 #include <linux/time.h> 31 #include <linux/sched.h> 32 #include <asm/processor.h> 33 #include <asm/msr.h> 34 35 MODULE_DESCRIPTION("AMD Family 15h CPU processor power monitor"); 36 MODULE_AUTHOR("Andreas Herrmann <herrmann.der.user@googlemail.com>"); 37 MODULE_LICENSE("GPL"); 38 39 /* D18F3 */ 40 #define REG_NORTHBRIDGE_CAP 0xe8 41 42 /* D18F4 */ 43 #define REG_PROCESSOR_TDP 0x1b8 44 45 /* D18F5 */ 46 #define REG_TDP_RUNNING_AVERAGE 0xe0 47 #define REG_TDP_LIMIT3 0xe8 48 49 #define FAM15H_MIN_NUM_ATTRS 2 50 #define FAM15H_NUM_GROUPS 2 51 #define MAX_CUS 8 52 53 /* set maximum interval as 1 second */ 54 #define MAX_INTERVAL 1000 55 56 #define MSR_F15H_CU_PWR_ACCUMULATOR 0xc001007a 57 #define MSR_F15H_CU_MAX_PWR_ACCUMULATOR 0xc001007b 58 #define MSR_F15H_PTSC 0xc0010280 59 60 #define PCI_DEVICE_ID_AMD_15H_M70H_NB_F4 0x15b4 61 62 struct fam15h_power_data { 63 struct pci_dev *pdev; 64 unsigned int tdp_to_watts; 65 unsigned int base_tdp; 66 unsigned int processor_pwr_watts; 67 unsigned int cpu_pwr_sample_ratio; 68 const struct attribute_group *groups[FAM15H_NUM_GROUPS]; 69 struct attribute_group group; 70 /* maximum accumulated power of a compute unit */ 71 u64 max_cu_acc_power; 72 /* accumulated power of the compute units */ 73 u64 cu_acc_power[MAX_CUS]; 74 /* performance timestamp counter */ 75 u64 cpu_sw_pwr_ptsc[MAX_CUS]; 76 /* online/offline status of current compute unit */ 77 int cu_on[MAX_CUS]; 78 unsigned long power_period; 79 }; 80 81 static bool is_carrizo_or_later(void) 82 { 83 return boot_cpu_data.x86 == 0x15 && boot_cpu_data.x86_model >= 0x60; 84 } 85 86 static ssize_t show_power(struct device *dev, 87 struct device_attribute *attr, char *buf) 88 { 89 u32 val, tdp_limit, running_avg_range; 90 s32 running_avg_capture; 91 u64 curr_pwr_watts; 92 struct fam15h_power_data *data = dev_get_drvdata(dev); 93 struct pci_dev *f4 = data->pdev; 94 95 pci_bus_read_config_dword(f4->bus, PCI_DEVFN(PCI_SLOT(f4->devfn), 5), 96 REG_TDP_RUNNING_AVERAGE, &val); 97 98 /* 99 * On Carrizo and later platforms, TdpRunAvgAccCap bit field 100 * is extended to 4:31 from 4:25. 101 */ 102 if (is_carrizo_or_later()) { 103 running_avg_capture = val >> 4; 104 running_avg_capture = sign_extend32(running_avg_capture, 27); 105 } else { 106 running_avg_capture = (val >> 4) & 0x3fffff; 107 running_avg_capture = sign_extend32(running_avg_capture, 21); 108 } 109 110 running_avg_range = (val & 0xf) + 1; 111 112 pci_bus_read_config_dword(f4->bus, PCI_DEVFN(PCI_SLOT(f4->devfn), 5), 113 REG_TDP_LIMIT3, &val); 114 115 /* 116 * On Carrizo and later platforms, ApmTdpLimit bit field 117 * is extended to 16:31 from 16:28. 118 */ 119 if (is_carrizo_or_later()) 120 tdp_limit = val >> 16; 121 else 122 tdp_limit = (val >> 16) & 0x1fff; 123 124 curr_pwr_watts = ((u64)(tdp_limit + 125 data->base_tdp)) << running_avg_range; 126 curr_pwr_watts -= running_avg_capture; 127 curr_pwr_watts *= data->tdp_to_watts; 128 129 /* 130 * Convert to microWatt 131 * 132 * power is in Watt provided as fixed point integer with 133 * scaling factor 1/(2^16). For conversion we use 134 * (10^6)/(2^16) = 15625/(2^10) 135 */ 136 curr_pwr_watts = (curr_pwr_watts * 15625) >> (10 + running_avg_range); 137 return sprintf(buf, "%u\n", (unsigned int) curr_pwr_watts); 138 } 139 static DEVICE_ATTR(power1_input, S_IRUGO, show_power, NULL); 140 141 static ssize_t show_power_crit(struct device *dev, 142 struct device_attribute *attr, char *buf) 143 { 144 struct fam15h_power_data *data = dev_get_drvdata(dev); 145 146 return sprintf(buf, "%u\n", data->processor_pwr_watts); 147 } 148 static DEVICE_ATTR(power1_crit, S_IRUGO, show_power_crit, NULL); 149 150 static void do_read_registers_on_cu(void *_data) 151 { 152 struct fam15h_power_data *data = _data; 153 int cpu, cu; 154 155 cpu = smp_processor_id(); 156 157 /* 158 * With the new x86 topology modelling, cpu core id actually 159 * is compute unit id. 160 */ 161 cu = cpu_data(cpu).cpu_core_id; 162 163 rdmsrl_safe(MSR_F15H_CU_PWR_ACCUMULATOR, &data->cu_acc_power[cu]); 164 rdmsrl_safe(MSR_F15H_PTSC, &data->cpu_sw_pwr_ptsc[cu]); 165 166 data->cu_on[cu] = 1; 167 } 168 169 /* 170 * This function is only able to be called when CPUID 171 * Fn8000_0007:EDX[12] is set. 172 */ 173 static int read_registers(struct fam15h_power_data *data) 174 { 175 int this_cpu, ret, cpu; 176 int core, this_core; 177 cpumask_var_t mask; 178 179 ret = zalloc_cpumask_var(&mask, GFP_KERNEL); 180 if (!ret) 181 return -ENOMEM; 182 183 memset(data->cu_on, 0, sizeof(int) * MAX_CUS); 184 185 get_online_cpus(); 186 this_cpu = smp_processor_id(); 187 188 /* 189 * Choose the first online core of each compute unit, and then 190 * read their MSR value of power and ptsc in a single IPI, 191 * because the MSR value of CPU core represent the compute 192 * unit's. 193 */ 194 core = -1; 195 196 for_each_online_cpu(cpu) { 197 this_core = topology_core_id(cpu); 198 199 if (this_core == core) 200 continue; 201 202 core = this_core; 203 204 /* get any CPU on this compute unit */ 205 cpumask_set_cpu(cpumask_any(topology_sibling_cpumask(cpu)), mask); 206 } 207 208 if (cpumask_test_cpu(this_cpu, mask)) 209 do_read_registers_on_cu(data); 210 211 smp_call_function_many(mask, do_read_registers_on_cu, data, true); 212 put_online_cpus(); 213 214 free_cpumask_var(mask); 215 216 return 0; 217 } 218 219 static ssize_t acc_show_power(struct device *dev, 220 struct device_attribute *attr, 221 char *buf) 222 { 223 struct fam15h_power_data *data = dev_get_drvdata(dev); 224 u64 prev_cu_acc_power[MAX_CUS], prev_ptsc[MAX_CUS], 225 jdelta[MAX_CUS]; 226 u64 tdelta, avg_acc; 227 int cu, cu_num, ret; 228 signed long leftover; 229 230 /* 231 * With the new x86 topology modelling, x86_max_cores is the 232 * compute unit number. 233 */ 234 cu_num = boot_cpu_data.x86_max_cores; 235 236 ret = read_registers(data); 237 if (ret) 238 return 0; 239 240 for (cu = 0; cu < cu_num; cu++) { 241 prev_cu_acc_power[cu] = data->cu_acc_power[cu]; 242 prev_ptsc[cu] = data->cpu_sw_pwr_ptsc[cu]; 243 } 244 245 leftover = schedule_timeout_interruptible(msecs_to_jiffies(data->power_period)); 246 if (leftover) 247 return 0; 248 249 ret = read_registers(data); 250 if (ret) 251 return 0; 252 253 for (cu = 0, avg_acc = 0; cu < cu_num; cu++) { 254 /* check if current compute unit is online */ 255 if (data->cu_on[cu] == 0) 256 continue; 257 258 if (data->cu_acc_power[cu] < prev_cu_acc_power[cu]) { 259 jdelta[cu] = data->max_cu_acc_power + data->cu_acc_power[cu]; 260 jdelta[cu] -= prev_cu_acc_power[cu]; 261 } else { 262 jdelta[cu] = data->cu_acc_power[cu] - prev_cu_acc_power[cu]; 263 } 264 tdelta = data->cpu_sw_pwr_ptsc[cu] - prev_ptsc[cu]; 265 jdelta[cu] *= data->cpu_pwr_sample_ratio * 1000; 266 do_div(jdelta[cu], tdelta); 267 268 /* the unit is microWatt */ 269 avg_acc += jdelta[cu]; 270 } 271 272 return sprintf(buf, "%llu\n", (unsigned long long)avg_acc); 273 } 274 static DEVICE_ATTR(power1_average, S_IRUGO, acc_show_power, NULL); 275 276 static ssize_t acc_show_power_period(struct device *dev, 277 struct device_attribute *attr, 278 char *buf) 279 { 280 struct fam15h_power_data *data = dev_get_drvdata(dev); 281 282 return sprintf(buf, "%lu\n", data->power_period); 283 } 284 285 static ssize_t acc_set_power_period(struct device *dev, 286 struct device_attribute *attr, 287 const char *buf, size_t count) 288 { 289 struct fam15h_power_data *data = dev_get_drvdata(dev); 290 unsigned long temp; 291 int ret; 292 293 ret = kstrtoul(buf, 10, &temp); 294 if (ret) 295 return ret; 296 297 if (temp > MAX_INTERVAL) 298 return -EINVAL; 299 300 /* the interval value should be greater than 0 */ 301 if (temp <= 0) 302 return -EINVAL; 303 304 data->power_period = temp; 305 306 return count; 307 } 308 static DEVICE_ATTR(power1_average_interval, S_IRUGO | S_IWUSR, 309 acc_show_power_period, acc_set_power_period); 310 311 static int fam15h_power_init_attrs(struct pci_dev *pdev, 312 struct fam15h_power_data *data) 313 { 314 int n = FAM15H_MIN_NUM_ATTRS; 315 struct attribute **fam15h_power_attrs; 316 struct cpuinfo_x86 *c = &boot_cpu_data; 317 318 if (c->x86 == 0x15 && 319 (c->x86_model <= 0xf || 320 (c->x86_model >= 0x60 && c->x86_model <= 0x7f))) 321 n += 1; 322 323 /* check if processor supports accumulated power */ 324 if (boot_cpu_has(X86_FEATURE_ACC_POWER)) 325 n += 2; 326 327 fam15h_power_attrs = devm_kcalloc(&pdev->dev, n, 328 sizeof(*fam15h_power_attrs), 329 GFP_KERNEL); 330 331 if (!fam15h_power_attrs) 332 return -ENOMEM; 333 334 n = 0; 335 fam15h_power_attrs[n++] = &dev_attr_power1_crit.attr; 336 if (c->x86 == 0x15 && 337 (c->x86_model <= 0xf || 338 (c->x86_model >= 0x60 && c->x86_model <= 0x7f))) 339 fam15h_power_attrs[n++] = &dev_attr_power1_input.attr; 340 341 if (boot_cpu_has(X86_FEATURE_ACC_POWER)) { 342 fam15h_power_attrs[n++] = &dev_attr_power1_average.attr; 343 fam15h_power_attrs[n++] = &dev_attr_power1_average_interval.attr; 344 } 345 346 data->group.attrs = fam15h_power_attrs; 347 348 return 0; 349 } 350 351 static bool should_load_on_this_node(struct pci_dev *f4) 352 { 353 u32 val; 354 355 pci_bus_read_config_dword(f4->bus, PCI_DEVFN(PCI_SLOT(f4->devfn), 3), 356 REG_NORTHBRIDGE_CAP, &val); 357 if ((val & BIT(29)) && ((val >> 30) & 3)) 358 return false; 359 360 return true; 361 } 362 363 /* 364 * Newer BKDG versions have an updated recommendation on how to properly 365 * initialize the running average range (was: 0xE, now: 0x9). This avoids 366 * counter saturations resulting in bogus power readings. 367 * We correct this value ourselves to cope with older BIOSes. 368 */ 369 static const struct pci_device_id affected_device[] = { 370 { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) }, 371 { 0 } 372 }; 373 374 static void tweak_runavg_range(struct pci_dev *pdev) 375 { 376 u32 val; 377 378 /* 379 * let this quirk apply only to the current version of the 380 * northbridge, since future versions may change the behavior 381 */ 382 if (!pci_match_id(affected_device, pdev)) 383 return; 384 385 pci_bus_read_config_dword(pdev->bus, 386 PCI_DEVFN(PCI_SLOT(pdev->devfn), 5), 387 REG_TDP_RUNNING_AVERAGE, &val); 388 if ((val & 0xf) != 0xe) 389 return; 390 391 val &= ~0xf; 392 val |= 0x9; 393 pci_bus_write_config_dword(pdev->bus, 394 PCI_DEVFN(PCI_SLOT(pdev->devfn), 5), 395 REG_TDP_RUNNING_AVERAGE, val); 396 } 397 398 #ifdef CONFIG_PM 399 static int fam15h_power_resume(struct pci_dev *pdev) 400 { 401 tweak_runavg_range(pdev); 402 return 0; 403 } 404 #else 405 #define fam15h_power_resume NULL 406 #endif 407 408 static int fam15h_power_init_data(struct pci_dev *f4, 409 struct fam15h_power_data *data) 410 { 411 u32 val; 412 u64 tmp; 413 int ret; 414 415 pci_read_config_dword(f4, REG_PROCESSOR_TDP, &val); 416 data->base_tdp = val >> 16; 417 tmp = val & 0xffff; 418 419 pci_bus_read_config_dword(f4->bus, PCI_DEVFN(PCI_SLOT(f4->devfn), 5), 420 REG_TDP_LIMIT3, &val); 421 422 data->tdp_to_watts = ((val & 0x3ff) << 6) | ((val >> 10) & 0x3f); 423 tmp *= data->tdp_to_watts; 424 425 /* result not allowed to be >= 256W */ 426 if ((tmp >> 16) >= 256) 427 dev_warn(&f4->dev, 428 "Bogus value for ProcessorPwrWatts (processor_pwr_watts>=%u)\n", 429 (unsigned int) (tmp >> 16)); 430 431 /* convert to microWatt */ 432 data->processor_pwr_watts = (tmp * 15625) >> 10; 433 434 ret = fam15h_power_init_attrs(f4, data); 435 if (ret) 436 return ret; 437 438 439 /* CPUID Fn8000_0007:EDX[12] indicates to support accumulated power */ 440 if (!boot_cpu_has(X86_FEATURE_ACC_POWER)) 441 return 0; 442 443 /* 444 * determine the ratio of the compute unit power accumulator 445 * sample period to the PTSC counter period by executing CPUID 446 * Fn8000_0007:ECX 447 */ 448 data->cpu_pwr_sample_ratio = cpuid_ecx(0x80000007); 449 450 if (rdmsrl_safe(MSR_F15H_CU_MAX_PWR_ACCUMULATOR, &tmp)) { 451 pr_err("Failed to read max compute unit power accumulator MSR\n"); 452 return -ENODEV; 453 } 454 455 data->max_cu_acc_power = tmp; 456 457 /* 458 * Milliseconds are a reasonable interval for the measurement. 459 * But it shouldn't set too long here, because several seconds 460 * would cause the read function to hang. So set default 461 * interval as 10 ms. 462 */ 463 data->power_period = 10; 464 465 return read_registers(data); 466 } 467 468 static int fam15h_power_probe(struct pci_dev *pdev, 469 const struct pci_device_id *id) 470 { 471 struct fam15h_power_data *data; 472 struct device *dev = &pdev->dev; 473 struct device *hwmon_dev; 474 int ret; 475 476 /* 477 * though we ignore every other northbridge, we still have to 478 * do the tweaking on _each_ node in MCM processors as the counters 479 * are working hand-in-hand 480 */ 481 tweak_runavg_range(pdev); 482 483 if (!should_load_on_this_node(pdev)) 484 return -ENODEV; 485 486 data = devm_kzalloc(dev, sizeof(struct fam15h_power_data), GFP_KERNEL); 487 if (!data) 488 return -ENOMEM; 489 490 ret = fam15h_power_init_data(pdev, data); 491 if (ret) 492 return ret; 493 494 data->pdev = pdev; 495 496 data->groups[0] = &data->group; 497 498 hwmon_dev = devm_hwmon_device_register_with_groups(dev, "fam15h_power", 499 data, 500 &data->groups[0]); 501 return PTR_ERR_OR_ZERO(hwmon_dev); 502 } 503 504 static const struct pci_device_id fam15h_power_id_table[] = { 505 { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) }, 506 { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F4) }, 507 { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F4) }, 508 { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_15H_M70H_NB_F4) }, 509 { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) }, 510 { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) }, 511 {} 512 }; 513 MODULE_DEVICE_TABLE(pci, fam15h_power_id_table); 514 515 static struct pci_driver fam15h_power_driver = { 516 .name = "fam15h_power", 517 .id_table = fam15h_power_id_table, 518 .probe = fam15h_power_probe, 519 .resume = fam15h_power_resume, 520 }; 521 522 module_pci_driver(fam15h_power_driver); 523