1 /* 2 * fam15h_power.c - AMD Family 15h processor power monitoring 3 * 4 * Copyright (c) 2011-2016 Advanced Micro Devices, Inc. 5 * Author: Andreas Herrmann <herrmann.der.user@googlemail.com> 6 * 7 * 8 * This driver is free software; you can redistribute it and/or 9 * modify it under the terms of the GNU General Public License; either 10 * version 2 of the License, or (at your option) any later version. 11 * 12 * This driver is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 15 * See the GNU General Public License for more details. 16 * 17 * You should have received a copy of the GNU General Public License 18 * along with this driver; if not, see <http://www.gnu.org/licenses/>. 19 */ 20 21 #include <linux/err.h> 22 #include <linux/hwmon.h> 23 #include <linux/hwmon-sysfs.h> 24 #include <linux/init.h> 25 #include <linux/module.h> 26 #include <linux/pci.h> 27 #include <linux/bitops.h> 28 #include <linux/cpu.h> 29 #include <linux/cpumask.h> 30 #include <linux/time.h> 31 #include <linux/sched.h> 32 #include <asm/processor.h> 33 #include <asm/msr.h> 34 35 MODULE_DESCRIPTION("AMD Family 15h CPU processor power monitor"); 36 MODULE_AUTHOR("Andreas Herrmann <herrmann.der.user@googlemail.com>"); 37 MODULE_LICENSE("GPL"); 38 39 /* D18F3 */ 40 #define REG_NORTHBRIDGE_CAP 0xe8 41 42 /* D18F4 */ 43 #define REG_PROCESSOR_TDP 0x1b8 44 45 /* D18F5 */ 46 #define REG_TDP_RUNNING_AVERAGE 0xe0 47 #define REG_TDP_LIMIT3 0xe8 48 49 #define FAM15H_MIN_NUM_ATTRS 2 50 #define FAM15H_NUM_GROUPS 2 51 #define MAX_CUS 8 52 53 /* set maximum interval as 1 second */ 54 #define MAX_INTERVAL 1000 55 56 #define MSR_F15H_CU_PWR_ACCUMULATOR 0xc001007a 57 #define MSR_F15H_CU_MAX_PWR_ACCUMULATOR 0xc001007b 58 #define MSR_F15H_PTSC 0xc0010280 59 60 #define PCI_DEVICE_ID_AMD_15H_M70H_NB_F4 0x15b4 61 62 struct fam15h_power_data { 63 struct pci_dev *pdev; 64 unsigned int tdp_to_watts; 65 unsigned int base_tdp; 66 unsigned int processor_pwr_watts; 67 unsigned int cpu_pwr_sample_ratio; 68 const struct attribute_group *groups[FAM15H_NUM_GROUPS]; 69 struct attribute_group group; 70 /* maximum accumulated power of a compute unit */ 71 u64 max_cu_acc_power; 72 /* accumulated power of the compute units */ 73 u64 cu_acc_power[MAX_CUS]; 74 /* performance timestamp counter */ 75 u64 cpu_sw_pwr_ptsc[MAX_CUS]; 76 /* online/offline status of current compute unit */ 77 int cu_on[MAX_CUS]; 78 unsigned long power_period; 79 }; 80 81 static bool is_carrizo_or_later(void) 82 { 83 return boot_cpu_data.x86 == 0x15 && boot_cpu_data.x86_model >= 0x60; 84 } 85 86 static ssize_t show_power(struct device *dev, 87 struct device_attribute *attr, char *buf) 88 { 89 u32 val, tdp_limit, running_avg_range; 90 s32 running_avg_capture; 91 u64 curr_pwr_watts; 92 struct fam15h_power_data *data = dev_get_drvdata(dev); 93 struct pci_dev *f4 = data->pdev; 94 95 pci_bus_read_config_dword(f4->bus, PCI_DEVFN(PCI_SLOT(f4->devfn), 5), 96 REG_TDP_RUNNING_AVERAGE, &val); 97 98 /* 99 * On Carrizo and later platforms, TdpRunAvgAccCap bit field 100 * is extended to 4:31 from 4:25. 101 */ 102 if (is_carrizo_or_later()) { 103 running_avg_capture = val >> 4; 104 running_avg_capture = sign_extend32(running_avg_capture, 27); 105 } else { 106 running_avg_capture = (val >> 4) & 0x3fffff; 107 running_avg_capture = sign_extend32(running_avg_capture, 21); 108 } 109 110 running_avg_range = (val & 0xf) + 1; 111 112 pci_bus_read_config_dword(f4->bus, PCI_DEVFN(PCI_SLOT(f4->devfn), 5), 113 REG_TDP_LIMIT3, &val); 114 115 /* 116 * On Carrizo and later platforms, ApmTdpLimit bit field 117 * is extended to 16:31 from 16:28. 118 */ 119 if (is_carrizo_or_later()) 120 tdp_limit = val >> 16; 121 else 122 tdp_limit = (val >> 16) & 0x1fff; 123 124 curr_pwr_watts = ((u64)(tdp_limit + 125 data->base_tdp)) << running_avg_range; 126 curr_pwr_watts -= running_avg_capture; 127 curr_pwr_watts *= data->tdp_to_watts; 128 129 /* 130 * Convert to microWatt 131 * 132 * power is in Watt provided as fixed point integer with 133 * scaling factor 1/(2^16). For conversion we use 134 * (10^6)/(2^16) = 15625/(2^10) 135 */ 136 curr_pwr_watts = (curr_pwr_watts * 15625) >> (10 + running_avg_range); 137 return sprintf(buf, "%u\n", (unsigned int) curr_pwr_watts); 138 } 139 static DEVICE_ATTR(power1_input, S_IRUGO, show_power, NULL); 140 141 static ssize_t show_power_crit(struct device *dev, 142 struct device_attribute *attr, char *buf) 143 { 144 struct fam15h_power_data *data = dev_get_drvdata(dev); 145 146 return sprintf(buf, "%u\n", data->processor_pwr_watts); 147 } 148 static DEVICE_ATTR(power1_crit, S_IRUGO, show_power_crit, NULL); 149 150 static void do_read_registers_on_cu(void *_data) 151 { 152 struct fam15h_power_data *data = _data; 153 int cpu, cu; 154 155 cpu = smp_processor_id(); 156 157 /* 158 * With the new x86 topology modelling, cpu core id actually 159 * is compute unit id. 160 */ 161 cu = cpu_data(cpu).cpu_core_id; 162 163 rdmsrl_safe(MSR_F15H_CU_PWR_ACCUMULATOR, &data->cu_acc_power[cu]); 164 rdmsrl_safe(MSR_F15H_PTSC, &data->cpu_sw_pwr_ptsc[cu]); 165 166 data->cu_on[cu] = 1; 167 } 168 169 /* 170 * This function is only able to be called when CPUID 171 * Fn8000_0007:EDX[12] is set. 172 */ 173 static int read_registers(struct fam15h_power_data *data) 174 { 175 int core, this_core; 176 cpumask_var_t mask; 177 int ret, cpu; 178 179 ret = zalloc_cpumask_var(&mask, GFP_KERNEL); 180 if (!ret) 181 return -ENOMEM; 182 183 memset(data->cu_on, 0, sizeof(int) * MAX_CUS); 184 185 get_online_cpus(); 186 187 /* 188 * Choose the first online core of each compute unit, and then 189 * read their MSR value of power and ptsc in a single IPI, 190 * because the MSR value of CPU core represent the compute 191 * unit's. 192 */ 193 core = -1; 194 195 for_each_online_cpu(cpu) { 196 this_core = topology_core_id(cpu); 197 198 if (this_core == core) 199 continue; 200 201 core = this_core; 202 203 /* get any CPU on this compute unit */ 204 cpumask_set_cpu(cpumask_any(topology_sibling_cpumask(cpu)), mask); 205 } 206 207 on_each_cpu_mask(mask, do_read_registers_on_cu, data, true); 208 209 put_online_cpus(); 210 free_cpumask_var(mask); 211 212 return 0; 213 } 214 215 static ssize_t acc_show_power(struct device *dev, 216 struct device_attribute *attr, 217 char *buf) 218 { 219 struct fam15h_power_data *data = dev_get_drvdata(dev); 220 u64 prev_cu_acc_power[MAX_CUS], prev_ptsc[MAX_CUS], 221 jdelta[MAX_CUS]; 222 u64 tdelta, avg_acc; 223 int cu, cu_num, ret; 224 signed long leftover; 225 226 /* 227 * With the new x86 topology modelling, x86_max_cores is the 228 * compute unit number. 229 */ 230 cu_num = boot_cpu_data.x86_max_cores; 231 232 ret = read_registers(data); 233 if (ret) 234 return 0; 235 236 for (cu = 0; cu < cu_num; cu++) { 237 prev_cu_acc_power[cu] = data->cu_acc_power[cu]; 238 prev_ptsc[cu] = data->cpu_sw_pwr_ptsc[cu]; 239 } 240 241 leftover = schedule_timeout_interruptible(msecs_to_jiffies(data->power_period)); 242 if (leftover) 243 return 0; 244 245 ret = read_registers(data); 246 if (ret) 247 return 0; 248 249 for (cu = 0, avg_acc = 0; cu < cu_num; cu++) { 250 /* check if current compute unit is online */ 251 if (data->cu_on[cu] == 0) 252 continue; 253 254 if (data->cu_acc_power[cu] < prev_cu_acc_power[cu]) { 255 jdelta[cu] = data->max_cu_acc_power + data->cu_acc_power[cu]; 256 jdelta[cu] -= prev_cu_acc_power[cu]; 257 } else { 258 jdelta[cu] = data->cu_acc_power[cu] - prev_cu_acc_power[cu]; 259 } 260 tdelta = data->cpu_sw_pwr_ptsc[cu] - prev_ptsc[cu]; 261 jdelta[cu] *= data->cpu_pwr_sample_ratio * 1000; 262 do_div(jdelta[cu], tdelta); 263 264 /* the unit is microWatt */ 265 avg_acc += jdelta[cu]; 266 } 267 268 return sprintf(buf, "%llu\n", (unsigned long long)avg_acc); 269 } 270 static DEVICE_ATTR(power1_average, S_IRUGO, acc_show_power, NULL); 271 272 static ssize_t acc_show_power_period(struct device *dev, 273 struct device_attribute *attr, 274 char *buf) 275 { 276 struct fam15h_power_data *data = dev_get_drvdata(dev); 277 278 return sprintf(buf, "%lu\n", data->power_period); 279 } 280 281 static ssize_t acc_set_power_period(struct device *dev, 282 struct device_attribute *attr, 283 const char *buf, size_t count) 284 { 285 struct fam15h_power_data *data = dev_get_drvdata(dev); 286 unsigned long temp; 287 int ret; 288 289 ret = kstrtoul(buf, 10, &temp); 290 if (ret) 291 return ret; 292 293 if (temp > MAX_INTERVAL) 294 return -EINVAL; 295 296 /* the interval value should be greater than 0 */ 297 if (temp <= 0) 298 return -EINVAL; 299 300 data->power_period = temp; 301 302 return count; 303 } 304 static DEVICE_ATTR(power1_average_interval, S_IRUGO | S_IWUSR, 305 acc_show_power_period, acc_set_power_period); 306 307 static int fam15h_power_init_attrs(struct pci_dev *pdev, 308 struct fam15h_power_data *data) 309 { 310 int n = FAM15H_MIN_NUM_ATTRS; 311 struct attribute **fam15h_power_attrs; 312 struct cpuinfo_x86 *c = &boot_cpu_data; 313 314 if (c->x86 == 0x15 && 315 (c->x86_model <= 0xf || 316 (c->x86_model >= 0x60 && c->x86_model <= 0x7f))) 317 n += 1; 318 319 /* check if processor supports accumulated power */ 320 if (boot_cpu_has(X86_FEATURE_ACC_POWER)) 321 n += 2; 322 323 fam15h_power_attrs = devm_kcalloc(&pdev->dev, n, 324 sizeof(*fam15h_power_attrs), 325 GFP_KERNEL); 326 327 if (!fam15h_power_attrs) 328 return -ENOMEM; 329 330 n = 0; 331 fam15h_power_attrs[n++] = &dev_attr_power1_crit.attr; 332 if (c->x86 == 0x15 && 333 (c->x86_model <= 0xf || 334 (c->x86_model >= 0x60 && c->x86_model <= 0x7f))) 335 fam15h_power_attrs[n++] = &dev_attr_power1_input.attr; 336 337 if (boot_cpu_has(X86_FEATURE_ACC_POWER)) { 338 fam15h_power_attrs[n++] = &dev_attr_power1_average.attr; 339 fam15h_power_attrs[n++] = &dev_attr_power1_average_interval.attr; 340 } 341 342 data->group.attrs = fam15h_power_attrs; 343 344 return 0; 345 } 346 347 static bool should_load_on_this_node(struct pci_dev *f4) 348 { 349 u32 val; 350 351 pci_bus_read_config_dword(f4->bus, PCI_DEVFN(PCI_SLOT(f4->devfn), 3), 352 REG_NORTHBRIDGE_CAP, &val); 353 if ((val & BIT(29)) && ((val >> 30) & 3)) 354 return false; 355 356 return true; 357 } 358 359 /* 360 * Newer BKDG versions have an updated recommendation on how to properly 361 * initialize the running average range (was: 0xE, now: 0x9). This avoids 362 * counter saturations resulting in bogus power readings. 363 * We correct this value ourselves to cope with older BIOSes. 364 */ 365 static const struct pci_device_id affected_device[] = { 366 { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) }, 367 { 0 } 368 }; 369 370 static void tweak_runavg_range(struct pci_dev *pdev) 371 { 372 u32 val; 373 374 /* 375 * let this quirk apply only to the current version of the 376 * northbridge, since future versions may change the behavior 377 */ 378 if (!pci_match_id(affected_device, pdev)) 379 return; 380 381 pci_bus_read_config_dword(pdev->bus, 382 PCI_DEVFN(PCI_SLOT(pdev->devfn), 5), 383 REG_TDP_RUNNING_AVERAGE, &val); 384 if ((val & 0xf) != 0xe) 385 return; 386 387 val &= ~0xf; 388 val |= 0x9; 389 pci_bus_write_config_dword(pdev->bus, 390 PCI_DEVFN(PCI_SLOT(pdev->devfn), 5), 391 REG_TDP_RUNNING_AVERAGE, val); 392 } 393 394 #ifdef CONFIG_PM 395 static int fam15h_power_resume(struct pci_dev *pdev) 396 { 397 tweak_runavg_range(pdev); 398 return 0; 399 } 400 #else 401 #define fam15h_power_resume NULL 402 #endif 403 404 static int fam15h_power_init_data(struct pci_dev *f4, 405 struct fam15h_power_data *data) 406 { 407 u32 val; 408 u64 tmp; 409 int ret; 410 411 pci_read_config_dword(f4, REG_PROCESSOR_TDP, &val); 412 data->base_tdp = val >> 16; 413 tmp = val & 0xffff; 414 415 pci_bus_read_config_dword(f4->bus, PCI_DEVFN(PCI_SLOT(f4->devfn), 5), 416 REG_TDP_LIMIT3, &val); 417 418 data->tdp_to_watts = ((val & 0x3ff) << 6) | ((val >> 10) & 0x3f); 419 tmp *= data->tdp_to_watts; 420 421 /* result not allowed to be >= 256W */ 422 if ((tmp >> 16) >= 256) 423 dev_warn(&f4->dev, 424 "Bogus value for ProcessorPwrWatts (processor_pwr_watts>=%u)\n", 425 (unsigned int) (tmp >> 16)); 426 427 /* convert to microWatt */ 428 data->processor_pwr_watts = (tmp * 15625) >> 10; 429 430 ret = fam15h_power_init_attrs(f4, data); 431 if (ret) 432 return ret; 433 434 435 /* CPUID Fn8000_0007:EDX[12] indicates to support accumulated power */ 436 if (!boot_cpu_has(X86_FEATURE_ACC_POWER)) 437 return 0; 438 439 /* 440 * determine the ratio of the compute unit power accumulator 441 * sample period to the PTSC counter period by executing CPUID 442 * Fn8000_0007:ECX 443 */ 444 data->cpu_pwr_sample_ratio = cpuid_ecx(0x80000007); 445 446 if (rdmsrl_safe(MSR_F15H_CU_MAX_PWR_ACCUMULATOR, &tmp)) { 447 pr_err("Failed to read max compute unit power accumulator MSR\n"); 448 return -ENODEV; 449 } 450 451 data->max_cu_acc_power = tmp; 452 453 /* 454 * Milliseconds are a reasonable interval for the measurement. 455 * But it shouldn't set too long here, because several seconds 456 * would cause the read function to hang. So set default 457 * interval as 10 ms. 458 */ 459 data->power_period = 10; 460 461 return read_registers(data); 462 } 463 464 static int fam15h_power_probe(struct pci_dev *pdev, 465 const struct pci_device_id *id) 466 { 467 struct fam15h_power_data *data; 468 struct device *dev = &pdev->dev; 469 struct device *hwmon_dev; 470 int ret; 471 472 /* 473 * though we ignore every other northbridge, we still have to 474 * do the tweaking on _each_ node in MCM processors as the counters 475 * are working hand-in-hand 476 */ 477 tweak_runavg_range(pdev); 478 479 if (!should_load_on_this_node(pdev)) 480 return -ENODEV; 481 482 data = devm_kzalloc(dev, sizeof(struct fam15h_power_data), GFP_KERNEL); 483 if (!data) 484 return -ENOMEM; 485 486 ret = fam15h_power_init_data(pdev, data); 487 if (ret) 488 return ret; 489 490 data->pdev = pdev; 491 492 data->groups[0] = &data->group; 493 494 hwmon_dev = devm_hwmon_device_register_with_groups(dev, "fam15h_power", 495 data, 496 &data->groups[0]); 497 return PTR_ERR_OR_ZERO(hwmon_dev); 498 } 499 500 static const struct pci_device_id fam15h_power_id_table[] = { 501 { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) }, 502 { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F4) }, 503 { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F4) }, 504 { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_15H_M70H_NB_F4) }, 505 { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) }, 506 { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) }, 507 {} 508 }; 509 MODULE_DEVICE_TABLE(pci, fam15h_power_id_table); 510 511 static struct pci_driver fam15h_power_driver = { 512 .name = "fam15h_power", 513 .id_table = fam15h_power_id_table, 514 .probe = fam15h_power_probe, 515 .resume = fam15h_power_resume, 516 }; 517 518 module_pci_driver(fam15h_power_driver); 519