1 /* 2 * POWERNV cpufreq driver for the IBM POWER processors 3 * 4 * (C) Copyright IBM 2014 5 * 6 * Author: Vaidyanathan Srinivasan <svaidy at linux.vnet.ibm.com> 7 * 8 * This program is free software; you can redistribute it and/or modify 9 * it under the terms of the GNU General Public License as published by 10 * the Free Software Foundation; either version 2, or (at your option) 11 * any later version. 12 * 13 * This program is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 * GNU General Public License for more details. 17 * 18 */ 19 20 #define pr_fmt(fmt) "powernv-cpufreq: " fmt 21 22 #include <linux/kernel.h> 23 #include <linux/sysfs.h> 24 #include <linux/cpumask.h> 25 #include <linux/module.h> 26 #include <linux/cpufreq.h> 27 #include <linux/smp.h> 28 #include <linux/of.h> 29 #include <linux/reboot.h> 30 #include <linux/slab.h> 31 #include <linux/cpu.h> 32 #include <trace/events/power.h> 33 34 #include <asm/cputhreads.h> 35 #include <asm/firmware.h> 36 #include <asm/reg.h> 37 #include <asm/smp.h> /* Required for cpu_sibling_mask() in UP configs */ 38 #include <asm/opal.h> 39 40 #define POWERNV_MAX_PSTATES 256 41 #define PMSR_PSAFE_ENABLE (1UL << 30) 42 #define PMSR_SPR_EM_DISABLE (1UL << 31) 43 #define PMSR_MAX(x) ((x >> 32) & 0xFF) 44 45 static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1]; 46 static bool rebooting, throttled, occ_reset; 47 48 static const char * const throttle_reason[] = { 49 "No throttling", 50 "Power Cap", 51 "Processor Over Temperature", 52 "Power Supply Failure", 53 "Over Current", 54 "OCC Reset" 55 }; 56 57 enum throttle_reason_type { 58 NO_THROTTLE = 0, 59 POWERCAP, 60 CPU_OVERTEMP, 61 POWER_SUPPLY_FAILURE, 62 OVERCURRENT, 63 OCC_RESET_THROTTLE, 64 OCC_MAX_REASON 65 }; 66 67 static struct chip { 68 unsigned int id; 69 bool throttled; 70 bool restore; 71 u8 throttle_reason; 72 cpumask_t mask; 73 struct work_struct throttle; 74 int throttle_turbo; 75 int throttle_sub_turbo; 76 int reason[OCC_MAX_REASON]; 77 } *chips; 78 79 static int nr_chips; 80 static DEFINE_PER_CPU(struct chip *, chip_info); 81 82 /* 83 * Note: The set of pstates consists of contiguous integers, the 84 * smallest of which is indicated by powernv_pstate_info.min, the 85 * largest of which is indicated by powernv_pstate_info.max. 86 * 87 * The nominal pstate is the highest non-turbo pstate in this 88 * platform. This is indicated by powernv_pstate_info.nominal. 89 */ 90 static struct powernv_pstate_info { 91 int min; 92 int max; 93 int nominal; 94 int nr_pstates; 95 } powernv_pstate_info; 96 97 /* 98 * Initialize the freq table based on data obtained 99 * from the firmware passed via device-tree 100 */ 101 static int init_powernv_pstates(void) 102 { 103 struct device_node *power_mgt; 104 int i, pstate_min, pstate_max, pstate_nominal, nr_pstates = 0; 105 const __be32 *pstate_ids, *pstate_freqs; 106 u32 len_ids, len_freqs; 107 108 power_mgt = of_find_node_by_path("/ibm,opal/power-mgt"); 109 if (!power_mgt) { 110 pr_warn("power-mgt node not found\n"); 111 return -ENODEV; 112 } 113 114 if (of_property_read_u32(power_mgt, "ibm,pstate-min", &pstate_min)) { 115 pr_warn("ibm,pstate-min node not found\n"); 116 return -ENODEV; 117 } 118 119 if (of_property_read_u32(power_mgt, "ibm,pstate-max", &pstate_max)) { 120 pr_warn("ibm,pstate-max node not found\n"); 121 return -ENODEV; 122 } 123 124 if (of_property_read_u32(power_mgt, "ibm,pstate-nominal", 125 &pstate_nominal)) { 126 pr_warn("ibm,pstate-nominal not found\n"); 127 return -ENODEV; 128 } 129 pr_info("cpufreq pstate min %d nominal %d max %d\n", pstate_min, 130 pstate_nominal, pstate_max); 131 132 pstate_ids = of_get_property(power_mgt, "ibm,pstate-ids", &len_ids); 133 if (!pstate_ids) { 134 pr_warn("ibm,pstate-ids not found\n"); 135 return -ENODEV; 136 } 137 138 pstate_freqs = of_get_property(power_mgt, "ibm,pstate-frequencies-mhz", 139 &len_freqs); 140 if (!pstate_freqs) { 141 pr_warn("ibm,pstate-frequencies-mhz not found\n"); 142 return -ENODEV; 143 } 144 145 if (len_ids != len_freqs) { 146 pr_warn("Entries in ibm,pstate-ids and " 147 "ibm,pstate-frequencies-mhz does not match\n"); 148 } 149 150 nr_pstates = min(len_ids, len_freqs) / sizeof(u32); 151 if (!nr_pstates) { 152 pr_warn("No PStates found\n"); 153 return -ENODEV; 154 } 155 156 pr_debug("NR PStates %d\n", nr_pstates); 157 for (i = 0; i < nr_pstates; i++) { 158 u32 id = be32_to_cpu(pstate_ids[i]); 159 u32 freq = be32_to_cpu(pstate_freqs[i]); 160 161 pr_debug("PState id %d freq %d MHz\n", id, freq); 162 powernv_freqs[i].frequency = freq * 1000; /* kHz */ 163 powernv_freqs[i].driver_data = id; 164 } 165 /* End of list marker entry */ 166 powernv_freqs[i].frequency = CPUFREQ_TABLE_END; 167 168 powernv_pstate_info.min = pstate_min; 169 powernv_pstate_info.max = pstate_max; 170 powernv_pstate_info.nominal = pstate_nominal; 171 powernv_pstate_info.nr_pstates = nr_pstates; 172 173 return 0; 174 } 175 176 /* Returns the CPU frequency corresponding to the pstate_id. */ 177 static unsigned int pstate_id_to_freq(int pstate_id) 178 { 179 int i; 180 181 i = powernv_pstate_info.max - pstate_id; 182 if (i >= powernv_pstate_info.nr_pstates || i < 0) { 183 pr_warn("PState id %d outside of PState table, " 184 "reporting nominal id %d instead\n", 185 pstate_id, powernv_pstate_info.nominal); 186 i = powernv_pstate_info.max - powernv_pstate_info.nominal; 187 } 188 189 return powernv_freqs[i].frequency; 190 } 191 192 /* 193 * cpuinfo_nominal_freq_show - Show the nominal CPU frequency as indicated by 194 * the firmware 195 */ 196 static ssize_t cpuinfo_nominal_freq_show(struct cpufreq_policy *policy, 197 char *buf) 198 { 199 return sprintf(buf, "%u\n", 200 pstate_id_to_freq(powernv_pstate_info.nominal)); 201 } 202 203 struct freq_attr cpufreq_freq_attr_cpuinfo_nominal_freq = 204 __ATTR_RO(cpuinfo_nominal_freq); 205 206 static struct freq_attr *powernv_cpu_freq_attr[] = { 207 &cpufreq_freq_attr_scaling_available_freqs, 208 &cpufreq_freq_attr_cpuinfo_nominal_freq, 209 NULL, 210 }; 211 212 #define throttle_attr(name, member) \ 213 static ssize_t name##_show(struct cpufreq_policy *policy, char *buf) \ 214 { \ 215 struct chip *chip = per_cpu(chip_info, policy->cpu); \ 216 \ 217 return sprintf(buf, "%u\n", chip->member); \ 218 } \ 219 \ 220 static struct freq_attr throttle_attr_##name = __ATTR_RO(name) \ 221 222 throttle_attr(unthrottle, reason[NO_THROTTLE]); 223 throttle_attr(powercap, reason[POWERCAP]); 224 throttle_attr(overtemp, reason[CPU_OVERTEMP]); 225 throttle_attr(supply_fault, reason[POWER_SUPPLY_FAILURE]); 226 throttle_attr(overcurrent, reason[OVERCURRENT]); 227 throttle_attr(occ_reset, reason[OCC_RESET_THROTTLE]); 228 throttle_attr(turbo_stat, throttle_turbo); 229 throttle_attr(sub_turbo_stat, throttle_sub_turbo); 230 231 static struct attribute *throttle_attrs[] = { 232 &throttle_attr_unthrottle.attr, 233 &throttle_attr_powercap.attr, 234 &throttle_attr_overtemp.attr, 235 &throttle_attr_supply_fault.attr, 236 &throttle_attr_overcurrent.attr, 237 &throttle_attr_occ_reset.attr, 238 &throttle_attr_turbo_stat.attr, 239 &throttle_attr_sub_turbo_stat.attr, 240 NULL, 241 }; 242 243 static const struct attribute_group throttle_attr_grp = { 244 .name = "throttle_stats", 245 .attrs = throttle_attrs, 246 }; 247 248 /* Helper routines */ 249 250 /* Access helpers to power mgt SPR */ 251 252 static inline unsigned long get_pmspr(unsigned long sprn) 253 { 254 switch (sprn) { 255 case SPRN_PMCR: 256 return mfspr(SPRN_PMCR); 257 258 case SPRN_PMICR: 259 return mfspr(SPRN_PMICR); 260 261 case SPRN_PMSR: 262 return mfspr(SPRN_PMSR); 263 } 264 BUG(); 265 } 266 267 static inline void set_pmspr(unsigned long sprn, unsigned long val) 268 { 269 switch (sprn) { 270 case SPRN_PMCR: 271 mtspr(SPRN_PMCR, val); 272 return; 273 274 case SPRN_PMICR: 275 mtspr(SPRN_PMICR, val); 276 return; 277 } 278 BUG(); 279 } 280 281 /* 282 * Use objects of this type to query/update 283 * pstates on a remote CPU via smp_call_function. 284 */ 285 struct powernv_smp_call_data { 286 unsigned int freq; 287 int pstate_id; 288 }; 289 290 /* 291 * powernv_read_cpu_freq: Reads the current frequency on this CPU. 292 * 293 * Called via smp_call_function. 294 * 295 * Note: The caller of the smp_call_function should pass an argument of 296 * the type 'struct powernv_smp_call_data *' along with this function. 297 * 298 * The current frequency on this CPU will be returned via 299 * ((struct powernv_smp_call_data *)arg)->freq; 300 */ 301 static void powernv_read_cpu_freq(void *arg) 302 { 303 unsigned long pmspr_val; 304 s8 local_pstate_id; 305 struct powernv_smp_call_data *freq_data = arg; 306 307 pmspr_val = get_pmspr(SPRN_PMSR); 308 309 /* 310 * The local pstate id corresponds bits 48..55 in the PMSR. 311 * Note: Watch out for the sign! 312 */ 313 local_pstate_id = (pmspr_val >> 48) & 0xFF; 314 freq_data->pstate_id = local_pstate_id; 315 freq_data->freq = pstate_id_to_freq(freq_data->pstate_id); 316 317 pr_debug("cpu %d pmsr %016lX pstate_id %d frequency %d kHz\n", 318 raw_smp_processor_id(), pmspr_val, freq_data->pstate_id, 319 freq_data->freq); 320 } 321 322 /* 323 * powernv_cpufreq_get: Returns the CPU frequency as reported by the 324 * firmware for CPU 'cpu'. This value is reported through the sysfs 325 * file cpuinfo_cur_freq. 326 */ 327 static unsigned int powernv_cpufreq_get(unsigned int cpu) 328 { 329 struct powernv_smp_call_data freq_data; 330 331 smp_call_function_any(cpu_sibling_mask(cpu), powernv_read_cpu_freq, 332 &freq_data, 1); 333 334 return freq_data.freq; 335 } 336 337 /* 338 * set_pstate: Sets the pstate on this CPU. 339 * 340 * This is called via an smp_call_function. 341 * 342 * The caller must ensure that freq_data is of the type 343 * (struct powernv_smp_call_data *) and the pstate_id which needs to be set 344 * on this CPU should be present in freq_data->pstate_id. 345 */ 346 static void set_pstate(void *freq_data) 347 { 348 unsigned long val; 349 unsigned long pstate_ul = 350 ((struct powernv_smp_call_data *) freq_data)->pstate_id; 351 352 val = get_pmspr(SPRN_PMCR); 353 val = val & 0x0000FFFFFFFFFFFFULL; 354 355 pstate_ul = pstate_ul & 0xFF; 356 357 /* Set both global(bits 56..63) and local(bits 48..55) PStates */ 358 val = val | (pstate_ul << 56) | (pstate_ul << 48); 359 360 pr_debug("Setting cpu %d pmcr to %016lX\n", 361 raw_smp_processor_id(), val); 362 set_pmspr(SPRN_PMCR, val); 363 } 364 365 /* 366 * get_nominal_index: Returns the index corresponding to the nominal 367 * pstate in the cpufreq table 368 */ 369 static inline unsigned int get_nominal_index(void) 370 { 371 return powernv_pstate_info.max - powernv_pstate_info.nominal; 372 } 373 374 static void powernv_cpufreq_throttle_check(void *data) 375 { 376 struct chip *chip; 377 unsigned int cpu = smp_processor_id(); 378 unsigned long pmsr; 379 int pmsr_pmax; 380 381 pmsr = get_pmspr(SPRN_PMSR); 382 chip = this_cpu_read(chip_info); 383 384 /* Check for Pmax Capping */ 385 pmsr_pmax = (s8)PMSR_MAX(pmsr); 386 if (pmsr_pmax != powernv_pstate_info.max) { 387 if (chip->throttled) 388 goto next; 389 chip->throttled = true; 390 if (pmsr_pmax < powernv_pstate_info.nominal) { 391 pr_warn_once("CPU %d on Chip %u has Pmax reduced below nominal frequency (%d < %d)\n", 392 cpu, chip->id, pmsr_pmax, 393 powernv_pstate_info.nominal); 394 chip->throttle_sub_turbo++; 395 } else { 396 chip->throttle_turbo++; 397 } 398 trace_powernv_throttle(chip->id, 399 throttle_reason[chip->throttle_reason], 400 pmsr_pmax); 401 } else if (chip->throttled) { 402 chip->throttled = false; 403 trace_powernv_throttle(chip->id, 404 throttle_reason[chip->throttle_reason], 405 pmsr_pmax); 406 } 407 408 /* Check if Psafe_mode_active is set in PMSR. */ 409 next: 410 if (pmsr & PMSR_PSAFE_ENABLE) { 411 throttled = true; 412 pr_info("Pstate set to safe frequency\n"); 413 } 414 415 /* Check if SPR_EM_DISABLE is set in PMSR */ 416 if (pmsr & PMSR_SPR_EM_DISABLE) { 417 throttled = true; 418 pr_info("Frequency Control disabled from OS\n"); 419 } 420 421 if (throttled) { 422 pr_info("PMSR = %16lx\n", pmsr); 423 pr_warn("CPU Frequency could be throttled\n"); 424 } 425 } 426 427 /* 428 * powernv_cpufreq_target_index: Sets the frequency corresponding to 429 * the cpufreq table entry indexed by new_index on the cpus in the 430 * mask policy->cpus 431 */ 432 static int powernv_cpufreq_target_index(struct cpufreq_policy *policy, 433 unsigned int new_index) 434 { 435 struct powernv_smp_call_data freq_data; 436 437 if (unlikely(rebooting) && new_index != get_nominal_index()) 438 return 0; 439 440 if (!throttled) 441 powernv_cpufreq_throttle_check(NULL); 442 443 freq_data.pstate_id = powernv_freqs[new_index].driver_data; 444 445 /* 446 * Use smp_call_function to send IPI and execute the 447 * mtspr on target CPU. We could do that without IPI 448 * if current CPU is within policy->cpus (core) 449 */ 450 smp_call_function_any(policy->cpus, set_pstate, &freq_data, 1); 451 452 return 0; 453 } 454 455 static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy) 456 { 457 int base, i; 458 459 base = cpu_first_thread_sibling(policy->cpu); 460 461 for (i = 0; i < threads_per_core; i++) 462 cpumask_set_cpu(base + i, policy->cpus); 463 464 if (!policy->driver_data) { 465 int ret; 466 467 ret = sysfs_create_group(&policy->kobj, &throttle_attr_grp); 468 if (ret) { 469 pr_info("Failed to create throttle stats directory for cpu %d\n", 470 policy->cpu); 471 return ret; 472 } 473 /* 474 * policy->driver_data is used as a flag for one-time 475 * creation of throttle sysfs files. 476 */ 477 policy->driver_data = policy; 478 } 479 return cpufreq_table_validate_and_show(policy, powernv_freqs); 480 } 481 482 static int powernv_cpufreq_reboot_notifier(struct notifier_block *nb, 483 unsigned long action, void *unused) 484 { 485 int cpu; 486 struct cpufreq_policy cpu_policy; 487 488 rebooting = true; 489 for_each_online_cpu(cpu) { 490 cpufreq_get_policy(&cpu_policy, cpu); 491 powernv_cpufreq_target_index(&cpu_policy, get_nominal_index()); 492 } 493 494 return NOTIFY_DONE; 495 } 496 497 static struct notifier_block powernv_cpufreq_reboot_nb = { 498 .notifier_call = powernv_cpufreq_reboot_notifier, 499 }; 500 501 void powernv_cpufreq_work_fn(struct work_struct *work) 502 { 503 struct chip *chip = container_of(work, struct chip, throttle); 504 unsigned int cpu; 505 cpumask_t mask; 506 507 get_online_cpus(); 508 cpumask_and(&mask, &chip->mask, cpu_online_mask); 509 smp_call_function_any(&mask, 510 powernv_cpufreq_throttle_check, NULL, 0); 511 512 if (!chip->restore) 513 goto out; 514 515 chip->restore = false; 516 for_each_cpu(cpu, &mask) { 517 int index; 518 struct cpufreq_policy policy; 519 520 cpufreq_get_policy(&policy, cpu); 521 cpufreq_frequency_table_target(&policy, policy.freq_table, 522 policy.cur, 523 CPUFREQ_RELATION_C, &index); 524 powernv_cpufreq_target_index(&policy, index); 525 cpumask_andnot(&mask, &mask, policy.cpus); 526 } 527 out: 528 put_online_cpus(); 529 } 530 531 static int powernv_cpufreq_occ_msg(struct notifier_block *nb, 532 unsigned long msg_type, void *_msg) 533 { 534 struct opal_msg *msg = _msg; 535 struct opal_occ_msg omsg; 536 int i; 537 538 if (msg_type != OPAL_MSG_OCC) 539 return 0; 540 541 omsg.type = be64_to_cpu(msg->params[0]); 542 543 switch (omsg.type) { 544 case OCC_RESET: 545 occ_reset = true; 546 pr_info("OCC (On Chip Controller - enforces hard thermal/power limits) Resetting\n"); 547 /* 548 * powernv_cpufreq_throttle_check() is called in 549 * target() callback which can detect the throttle state 550 * for governors like ondemand. 551 * But static governors will not call target() often thus 552 * report throttling here. 553 */ 554 if (!throttled) { 555 throttled = true; 556 pr_warn("CPU frequency is throttled for duration\n"); 557 } 558 559 break; 560 case OCC_LOAD: 561 pr_info("OCC Loading, CPU frequency is throttled until OCC is started\n"); 562 break; 563 case OCC_THROTTLE: 564 omsg.chip = be64_to_cpu(msg->params[1]); 565 omsg.throttle_status = be64_to_cpu(msg->params[2]); 566 567 if (occ_reset) { 568 occ_reset = false; 569 throttled = false; 570 pr_info("OCC Active, CPU frequency is no longer throttled\n"); 571 572 for (i = 0; i < nr_chips; i++) { 573 chips[i].restore = true; 574 schedule_work(&chips[i].throttle); 575 } 576 577 return 0; 578 } 579 580 for (i = 0; i < nr_chips; i++) 581 if (chips[i].id == omsg.chip) 582 break; 583 584 if (omsg.throttle_status >= 0 && 585 omsg.throttle_status <= OCC_MAX_THROTTLE_STATUS) { 586 chips[i].throttle_reason = omsg.throttle_status; 587 chips[i].reason[omsg.throttle_status]++; 588 } 589 590 if (!omsg.throttle_status) 591 chips[i].restore = true; 592 593 schedule_work(&chips[i].throttle); 594 } 595 return 0; 596 } 597 598 static struct notifier_block powernv_cpufreq_opal_nb = { 599 .notifier_call = powernv_cpufreq_occ_msg, 600 .next = NULL, 601 .priority = 0, 602 }; 603 604 static void powernv_cpufreq_stop_cpu(struct cpufreq_policy *policy) 605 { 606 struct powernv_smp_call_data freq_data; 607 608 freq_data.pstate_id = powernv_pstate_info.min; 609 smp_call_function_single(policy->cpu, set_pstate, &freq_data, 1); 610 } 611 612 static struct cpufreq_driver powernv_cpufreq_driver = { 613 .name = "powernv-cpufreq", 614 .flags = CPUFREQ_CONST_LOOPS, 615 .init = powernv_cpufreq_cpu_init, 616 .verify = cpufreq_generic_frequency_table_verify, 617 .target_index = powernv_cpufreq_target_index, 618 .get = powernv_cpufreq_get, 619 .stop_cpu = powernv_cpufreq_stop_cpu, 620 .attr = powernv_cpu_freq_attr, 621 }; 622 623 static int init_chip_info(void) 624 { 625 unsigned int chip[256]; 626 unsigned int cpu, i; 627 unsigned int prev_chip_id = UINT_MAX; 628 629 for_each_possible_cpu(cpu) { 630 unsigned int id = cpu_to_chip_id(cpu); 631 632 if (prev_chip_id != id) { 633 prev_chip_id = id; 634 chip[nr_chips++] = id; 635 } 636 } 637 638 chips = kcalloc(nr_chips, sizeof(struct chip), GFP_KERNEL); 639 if (!chips) 640 return -ENOMEM; 641 642 for (i = 0; i < nr_chips; i++) { 643 chips[i].id = chip[i]; 644 cpumask_copy(&chips[i].mask, cpumask_of_node(chip[i])); 645 INIT_WORK(&chips[i].throttle, powernv_cpufreq_work_fn); 646 for_each_cpu(cpu, &chips[i].mask) 647 per_cpu(chip_info, cpu) = &chips[i]; 648 } 649 650 return 0; 651 } 652 653 static inline void clean_chip_info(void) 654 { 655 kfree(chips); 656 } 657 658 static inline void unregister_all_notifiers(void) 659 { 660 opal_message_notifier_unregister(OPAL_MSG_OCC, 661 &powernv_cpufreq_opal_nb); 662 unregister_reboot_notifier(&powernv_cpufreq_reboot_nb); 663 } 664 665 static int __init powernv_cpufreq_init(void) 666 { 667 int rc = 0; 668 669 /* Don't probe on pseries (guest) platforms */ 670 if (!firmware_has_feature(FW_FEATURE_OPAL)) 671 return -ENODEV; 672 673 /* Discover pstates from device tree and init */ 674 rc = init_powernv_pstates(); 675 if (rc) 676 goto out; 677 678 /* Populate chip info */ 679 rc = init_chip_info(); 680 if (rc) 681 goto out; 682 683 register_reboot_notifier(&powernv_cpufreq_reboot_nb); 684 opal_message_notifier_register(OPAL_MSG_OCC, &powernv_cpufreq_opal_nb); 685 686 rc = cpufreq_register_driver(&powernv_cpufreq_driver); 687 if (!rc) 688 return 0; 689 690 pr_info("Failed to register the cpufreq driver (%d)\n", rc); 691 unregister_all_notifiers(); 692 clean_chip_info(); 693 out: 694 pr_info("Platform driver disabled. System does not support PState control\n"); 695 return rc; 696 } 697 module_init(powernv_cpufreq_init); 698 699 static void __exit powernv_cpufreq_exit(void) 700 { 701 cpufreq_unregister_driver(&powernv_cpufreq_driver); 702 unregister_all_notifiers(); 703 clean_chip_info(); 704 } 705 module_exit(powernv_cpufreq_exit); 706 707 MODULE_LICENSE("GPL"); 708 MODULE_AUTHOR("Vaidyanathan Srinivasan <svaidy at linux.vnet.ibm.com>"); 709