1 /* 2 * POWERNV cpufreq driver for the IBM POWER processors 3 * 4 * (C) Copyright IBM 2014 5 * 6 * Author: Vaidyanathan Srinivasan <svaidy at linux.vnet.ibm.com> 7 * 8 * This program is free software; you can redistribute it and/or modify 9 * it under the terms of the GNU General Public License as published by 10 * the Free Software Foundation; either version 2, or (at your option) 11 * any later version. 12 * 13 * This program is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 * GNU General Public License for more details. 17 * 18 */ 19 20 #define pr_fmt(fmt) "powernv-cpufreq: " fmt 21 22 #include <linux/kernel.h> 23 #include <linux/sysfs.h> 24 #include <linux/cpumask.h> 25 #include <linux/module.h> 26 #include <linux/cpufreq.h> 27 #include <linux/smp.h> 28 #include <linux/of.h> 29 #include <linux/reboot.h> 30 #include <linux/slab.h> 31 32 #include <asm/cputhreads.h> 33 #include <asm/firmware.h> 34 #include <asm/reg.h> 35 #include <asm/smp.h> /* Required for cpu_sibling_mask() in UP configs */ 36 #include <asm/opal.h> 37 38 #define POWERNV_MAX_PSTATES 256 39 #define PMSR_PSAFE_ENABLE (1UL << 30) 40 #define PMSR_SPR_EM_DISABLE (1UL << 31) 41 #define PMSR_MAX(x) ((x >> 32) & 0xFF) 42 43 static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1]; 44 static bool rebooting, throttled, occ_reset; 45 46 static struct chip { 47 unsigned int id; 48 bool throttled; 49 cpumask_t mask; 50 struct work_struct throttle; 51 bool restore; 52 } *chips; 53 54 static int nr_chips; 55 56 /* 57 * Note: The set of pstates consists of contiguous integers, the 58 * smallest of which is indicated by powernv_pstate_info.min, the 59 * largest of which is indicated by powernv_pstate_info.max. 60 * 61 * The nominal pstate is the highest non-turbo pstate in this 62 * platform. This is indicated by powernv_pstate_info.nominal. 63 */ 64 static struct powernv_pstate_info { 65 int min; 66 int max; 67 int nominal; 68 int nr_pstates; 69 } powernv_pstate_info; 70 71 /* 72 * Initialize the freq table based on data obtained 73 * from the firmware passed via device-tree 74 */ 75 static int init_powernv_pstates(void) 76 { 77 struct device_node *power_mgt; 78 int i, pstate_min, pstate_max, pstate_nominal, nr_pstates = 0; 79 const __be32 *pstate_ids, *pstate_freqs; 80 u32 len_ids, len_freqs; 81 82 power_mgt = of_find_node_by_path("/ibm,opal/power-mgt"); 83 if (!power_mgt) { 84 pr_warn("power-mgt node not found\n"); 85 return -ENODEV; 86 } 87 88 if (of_property_read_u32(power_mgt, "ibm,pstate-min", &pstate_min)) { 89 pr_warn("ibm,pstate-min node not found\n"); 90 return -ENODEV; 91 } 92 93 if (of_property_read_u32(power_mgt, "ibm,pstate-max", &pstate_max)) { 94 pr_warn("ibm,pstate-max node not found\n"); 95 return -ENODEV; 96 } 97 98 if (of_property_read_u32(power_mgt, "ibm,pstate-nominal", 99 &pstate_nominal)) { 100 pr_warn("ibm,pstate-nominal not found\n"); 101 return -ENODEV; 102 } 103 pr_info("cpufreq pstate min %d nominal %d max %d\n", pstate_min, 104 pstate_nominal, pstate_max); 105 106 pstate_ids = of_get_property(power_mgt, "ibm,pstate-ids", &len_ids); 107 if (!pstate_ids) { 108 pr_warn("ibm,pstate-ids not found\n"); 109 return -ENODEV; 110 } 111 112 pstate_freqs = of_get_property(power_mgt, "ibm,pstate-frequencies-mhz", 113 &len_freqs); 114 if (!pstate_freqs) { 115 pr_warn("ibm,pstate-frequencies-mhz not found\n"); 116 return -ENODEV; 117 } 118 119 if (len_ids != len_freqs) { 120 pr_warn("Entries in ibm,pstate-ids and " 121 "ibm,pstate-frequencies-mhz does not match\n"); 122 } 123 124 nr_pstates = min(len_ids, len_freqs) / sizeof(u32); 125 if (!nr_pstates) { 126 pr_warn("No PStates found\n"); 127 return -ENODEV; 128 } 129 130 pr_debug("NR PStates %d\n", nr_pstates); 131 for (i = 0; i < nr_pstates; i++) { 132 u32 id = be32_to_cpu(pstate_ids[i]); 133 u32 freq = be32_to_cpu(pstate_freqs[i]); 134 135 pr_debug("PState id %d freq %d MHz\n", id, freq); 136 powernv_freqs[i].frequency = freq * 1000; /* kHz */ 137 powernv_freqs[i].driver_data = id; 138 } 139 /* End of list marker entry */ 140 powernv_freqs[i].frequency = CPUFREQ_TABLE_END; 141 142 powernv_pstate_info.min = pstate_min; 143 powernv_pstate_info.max = pstate_max; 144 powernv_pstate_info.nominal = pstate_nominal; 145 powernv_pstate_info.nr_pstates = nr_pstates; 146 147 return 0; 148 } 149 150 /* Returns the CPU frequency corresponding to the pstate_id. */ 151 static unsigned int pstate_id_to_freq(int pstate_id) 152 { 153 int i; 154 155 i = powernv_pstate_info.max - pstate_id; 156 if (i >= powernv_pstate_info.nr_pstates || i < 0) { 157 pr_warn("PState id %d outside of PState table, " 158 "reporting nominal id %d instead\n", 159 pstate_id, powernv_pstate_info.nominal); 160 i = powernv_pstate_info.max - powernv_pstate_info.nominal; 161 } 162 163 return powernv_freqs[i].frequency; 164 } 165 166 /* 167 * cpuinfo_nominal_freq_show - Show the nominal CPU frequency as indicated by 168 * the firmware 169 */ 170 static ssize_t cpuinfo_nominal_freq_show(struct cpufreq_policy *policy, 171 char *buf) 172 { 173 return sprintf(buf, "%u\n", 174 pstate_id_to_freq(powernv_pstate_info.nominal)); 175 } 176 177 struct freq_attr cpufreq_freq_attr_cpuinfo_nominal_freq = 178 __ATTR_RO(cpuinfo_nominal_freq); 179 180 static struct freq_attr *powernv_cpu_freq_attr[] = { 181 &cpufreq_freq_attr_scaling_available_freqs, 182 &cpufreq_freq_attr_cpuinfo_nominal_freq, 183 NULL, 184 }; 185 186 /* Helper routines */ 187 188 /* Access helpers to power mgt SPR */ 189 190 static inline unsigned long get_pmspr(unsigned long sprn) 191 { 192 switch (sprn) { 193 case SPRN_PMCR: 194 return mfspr(SPRN_PMCR); 195 196 case SPRN_PMICR: 197 return mfspr(SPRN_PMICR); 198 199 case SPRN_PMSR: 200 return mfspr(SPRN_PMSR); 201 } 202 BUG(); 203 } 204 205 static inline void set_pmspr(unsigned long sprn, unsigned long val) 206 { 207 switch (sprn) { 208 case SPRN_PMCR: 209 mtspr(SPRN_PMCR, val); 210 return; 211 212 case SPRN_PMICR: 213 mtspr(SPRN_PMICR, val); 214 return; 215 } 216 BUG(); 217 } 218 219 /* 220 * Use objects of this type to query/update 221 * pstates on a remote CPU via smp_call_function. 222 */ 223 struct powernv_smp_call_data { 224 unsigned int freq; 225 int pstate_id; 226 }; 227 228 /* 229 * powernv_read_cpu_freq: Reads the current frequency on this CPU. 230 * 231 * Called via smp_call_function. 232 * 233 * Note: The caller of the smp_call_function should pass an argument of 234 * the type 'struct powernv_smp_call_data *' along with this function. 235 * 236 * The current frequency on this CPU will be returned via 237 * ((struct powernv_smp_call_data *)arg)->freq; 238 */ 239 static void powernv_read_cpu_freq(void *arg) 240 { 241 unsigned long pmspr_val; 242 s8 local_pstate_id; 243 struct powernv_smp_call_data *freq_data = arg; 244 245 pmspr_val = get_pmspr(SPRN_PMSR); 246 247 /* 248 * The local pstate id corresponds bits 48..55 in the PMSR. 249 * Note: Watch out for the sign! 250 */ 251 local_pstate_id = (pmspr_val >> 48) & 0xFF; 252 freq_data->pstate_id = local_pstate_id; 253 freq_data->freq = pstate_id_to_freq(freq_data->pstate_id); 254 255 pr_debug("cpu %d pmsr %016lX pstate_id %d frequency %d kHz\n", 256 raw_smp_processor_id(), pmspr_val, freq_data->pstate_id, 257 freq_data->freq); 258 } 259 260 /* 261 * powernv_cpufreq_get: Returns the CPU frequency as reported by the 262 * firmware for CPU 'cpu'. This value is reported through the sysfs 263 * file cpuinfo_cur_freq. 264 */ 265 static unsigned int powernv_cpufreq_get(unsigned int cpu) 266 { 267 struct powernv_smp_call_data freq_data; 268 269 smp_call_function_any(cpu_sibling_mask(cpu), powernv_read_cpu_freq, 270 &freq_data, 1); 271 272 return freq_data.freq; 273 } 274 275 /* 276 * set_pstate: Sets the pstate on this CPU. 277 * 278 * This is called via an smp_call_function. 279 * 280 * The caller must ensure that freq_data is of the type 281 * (struct powernv_smp_call_data *) and the pstate_id which needs to be set 282 * on this CPU should be present in freq_data->pstate_id. 283 */ 284 static void set_pstate(void *freq_data) 285 { 286 unsigned long val; 287 unsigned long pstate_ul = 288 ((struct powernv_smp_call_data *) freq_data)->pstate_id; 289 290 val = get_pmspr(SPRN_PMCR); 291 val = val & 0x0000FFFFFFFFFFFFULL; 292 293 pstate_ul = pstate_ul & 0xFF; 294 295 /* Set both global(bits 56..63) and local(bits 48..55) PStates */ 296 val = val | (pstate_ul << 56) | (pstate_ul << 48); 297 298 pr_debug("Setting cpu %d pmcr to %016lX\n", 299 raw_smp_processor_id(), val); 300 set_pmspr(SPRN_PMCR, val); 301 } 302 303 /* 304 * get_nominal_index: Returns the index corresponding to the nominal 305 * pstate in the cpufreq table 306 */ 307 static inline unsigned int get_nominal_index(void) 308 { 309 return powernv_pstate_info.max - powernv_pstate_info.nominal; 310 } 311 312 static void powernv_cpufreq_throttle_check(void *data) 313 { 314 unsigned int cpu = smp_processor_id(); 315 unsigned long pmsr; 316 int pmsr_pmax, i; 317 318 pmsr = get_pmspr(SPRN_PMSR); 319 320 for (i = 0; i < nr_chips; i++) 321 if (chips[i].id == cpu_to_chip_id(cpu)) 322 break; 323 324 /* Check for Pmax Capping */ 325 pmsr_pmax = (s8)PMSR_MAX(pmsr); 326 if (pmsr_pmax != powernv_pstate_info.max) { 327 if (chips[i].throttled) 328 goto next; 329 chips[i].throttled = true; 330 if (pmsr_pmax < powernv_pstate_info.nominal) 331 pr_crit("CPU %d on Chip %u has Pmax reduced below nominal frequency (%d < %d)\n", 332 cpu, chips[i].id, pmsr_pmax, 333 powernv_pstate_info.nominal); 334 else 335 pr_info("CPU %d on Chip %u has Pmax reduced below turbo frequency (%d < %d)\n", 336 cpu, chips[i].id, pmsr_pmax, 337 powernv_pstate_info.max); 338 } else if (chips[i].throttled) { 339 chips[i].throttled = false; 340 pr_info("CPU %d on Chip %u has Pmax restored to %d\n", cpu, 341 chips[i].id, pmsr_pmax); 342 } 343 344 /* Check if Psafe_mode_active is set in PMSR. */ 345 next: 346 if (pmsr & PMSR_PSAFE_ENABLE) { 347 throttled = true; 348 pr_info("Pstate set to safe frequency\n"); 349 } 350 351 /* Check if SPR_EM_DISABLE is set in PMSR */ 352 if (pmsr & PMSR_SPR_EM_DISABLE) { 353 throttled = true; 354 pr_info("Frequency Control disabled from OS\n"); 355 } 356 357 if (throttled) { 358 pr_info("PMSR = %16lx\n", pmsr); 359 pr_crit("CPU Frequency could be throttled\n"); 360 } 361 } 362 363 /* 364 * powernv_cpufreq_target_index: Sets the frequency corresponding to 365 * the cpufreq table entry indexed by new_index on the cpus in the 366 * mask policy->cpus 367 */ 368 static int powernv_cpufreq_target_index(struct cpufreq_policy *policy, 369 unsigned int new_index) 370 { 371 struct powernv_smp_call_data freq_data; 372 373 if (unlikely(rebooting) && new_index != get_nominal_index()) 374 return 0; 375 376 if (!throttled) 377 powernv_cpufreq_throttle_check(NULL); 378 379 freq_data.pstate_id = powernv_freqs[new_index].driver_data; 380 381 /* 382 * Use smp_call_function to send IPI and execute the 383 * mtspr on target CPU. We could do that without IPI 384 * if current CPU is within policy->cpus (core) 385 */ 386 smp_call_function_any(policy->cpus, set_pstate, &freq_data, 1); 387 388 return 0; 389 } 390 391 static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy) 392 { 393 int base, i; 394 395 base = cpu_first_thread_sibling(policy->cpu); 396 397 for (i = 0; i < threads_per_core; i++) 398 cpumask_set_cpu(base + i, policy->cpus); 399 400 return cpufreq_table_validate_and_show(policy, powernv_freqs); 401 } 402 403 static int powernv_cpufreq_reboot_notifier(struct notifier_block *nb, 404 unsigned long action, void *unused) 405 { 406 int cpu; 407 struct cpufreq_policy cpu_policy; 408 409 rebooting = true; 410 for_each_online_cpu(cpu) { 411 cpufreq_get_policy(&cpu_policy, cpu); 412 powernv_cpufreq_target_index(&cpu_policy, get_nominal_index()); 413 } 414 415 return NOTIFY_DONE; 416 } 417 418 static struct notifier_block powernv_cpufreq_reboot_nb = { 419 .notifier_call = powernv_cpufreq_reboot_notifier, 420 }; 421 422 void powernv_cpufreq_work_fn(struct work_struct *work) 423 { 424 struct chip *chip = container_of(work, struct chip, throttle); 425 unsigned int cpu; 426 cpumask_var_t mask; 427 428 smp_call_function_any(&chip->mask, 429 powernv_cpufreq_throttle_check, NULL, 0); 430 431 if (!chip->restore) 432 return; 433 434 chip->restore = false; 435 cpumask_copy(mask, &chip->mask); 436 for_each_cpu_and(cpu, mask, cpu_online_mask) { 437 int index, tcpu; 438 struct cpufreq_policy policy; 439 440 cpufreq_get_policy(&policy, cpu); 441 cpufreq_frequency_table_target(&policy, policy.freq_table, 442 policy.cur, 443 CPUFREQ_RELATION_C, &index); 444 powernv_cpufreq_target_index(&policy, index); 445 for_each_cpu(tcpu, policy.cpus) 446 cpumask_clear_cpu(tcpu, mask); 447 } 448 } 449 450 static char throttle_reason[][30] = { 451 "No throttling", 452 "Power Cap", 453 "Processor Over Temperature", 454 "Power Supply Failure", 455 "Over Current", 456 "OCC Reset" 457 }; 458 459 static int powernv_cpufreq_occ_msg(struct notifier_block *nb, 460 unsigned long msg_type, void *_msg) 461 { 462 struct opal_msg *msg = _msg; 463 struct opal_occ_msg omsg; 464 int i; 465 466 if (msg_type != OPAL_MSG_OCC) 467 return 0; 468 469 omsg.type = be64_to_cpu(msg->params[0]); 470 471 switch (omsg.type) { 472 case OCC_RESET: 473 occ_reset = true; 474 pr_info("OCC (On Chip Controller - enforces hard thermal/power limits) Resetting\n"); 475 /* 476 * powernv_cpufreq_throttle_check() is called in 477 * target() callback which can detect the throttle state 478 * for governors like ondemand. 479 * But static governors will not call target() often thus 480 * report throttling here. 481 */ 482 if (!throttled) { 483 throttled = true; 484 pr_crit("CPU frequency is throttled for duration\n"); 485 } 486 487 break; 488 case OCC_LOAD: 489 pr_info("OCC Loading, CPU frequency is throttled until OCC is started\n"); 490 break; 491 case OCC_THROTTLE: 492 omsg.chip = be64_to_cpu(msg->params[1]); 493 omsg.throttle_status = be64_to_cpu(msg->params[2]); 494 495 if (occ_reset) { 496 occ_reset = false; 497 throttled = false; 498 pr_info("OCC Active, CPU frequency is no longer throttled\n"); 499 500 for (i = 0; i < nr_chips; i++) { 501 chips[i].restore = true; 502 schedule_work(&chips[i].throttle); 503 } 504 505 return 0; 506 } 507 508 if (omsg.throttle_status && 509 omsg.throttle_status <= OCC_MAX_THROTTLE_STATUS) 510 pr_info("OCC: Chip %u Pmax reduced due to %s\n", 511 (unsigned int)omsg.chip, 512 throttle_reason[omsg.throttle_status]); 513 else if (!omsg.throttle_status) 514 pr_info("OCC: Chip %u %s\n", (unsigned int)omsg.chip, 515 throttle_reason[omsg.throttle_status]); 516 else 517 return 0; 518 519 for (i = 0; i < nr_chips; i++) 520 if (chips[i].id == omsg.chip) { 521 if (!omsg.throttle_status) 522 chips[i].restore = true; 523 schedule_work(&chips[i].throttle); 524 } 525 } 526 return 0; 527 } 528 529 static struct notifier_block powernv_cpufreq_opal_nb = { 530 .notifier_call = powernv_cpufreq_occ_msg, 531 .next = NULL, 532 .priority = 0, 533 }; 534 535 static void powernv_cpufreq_stop_cpu(struct cpufreq_policy *policy) 536 { 537 struct powernv_smp_call_data freq_data; 538 539 freq_data.pstate_id = powernv_pstate_info.min; 540 smp_call_function_single(policy->cpu, set_pstate, &freq_data, 1); 541 } 542 543 static struct cpufreq_driver powernv_cpufreq_driver = { 544 .name = "powernv-cpufreq", 545 .flags = CPUFREQ_CONST_LOOPS, 546 .init = powernv_cpufreq_cpu_init, 547 .verify = cpufreq_generic_frequency_table_verify, 548 .target_index = powernv_cpufreq_target_index, 549 .get = powernv_cpufreq_get, 550 .stop_cpu = powernv_cpufreq_stop_cpu, 551 .attr = powernv_cpu_freq_attr, 552 }; 553 554 static int init_chip_info(void) 555 { 556 unsigned int chip[256]; 557 unsigned int cpu, i; 558 unsigned int prev_chip_id = UINT_MAX; 559 560 for_each_possible_cpu(cpu) { 561 unsigned int id = cpu_to_chip_id(cpu); 562 563 if (prev_chip_id != id) { 564 prev_chip_id = id; 565 chip[nr_chips++] = id; 566 } 567 } 568 569 chips = kmalloc_array(nr_chips, sizeof(struct chip), GFP_KERNEL); 570 if (!chips) 571 return -ENOMEM; 572 573 for (i = 0; i < nr_chips; i++) { 574 chips[i].id = chip[i]; 575 chips[i].throttled = false; 576 cpumask_copy(&chips[i].mask, cpumask_of_node(chip[i])); 577 INIT_WORK(&chips[i].throttle, powernv_cpufreq_work_fn); 578 chips[i].restore = false; 579 } 580 581 return 0; 582 } 583 584 static int __init powernv_cpufreq_init(void) 585 { 586 int rc = 0; 587 588 /* Don't probe on pseries (guest) platforms */ 589 if (!firmware_has_feature(FW_FEATURE_OPALv3)) 590 return -ENODEV; 591 592 /* Discover pstates from device tree and init */ 593 rc = init_powernv_pstates(); 594 if (rc) { 595 pr_info("powernv-cpufreq disabled. System does not support PState control\n"); 596 return rc; 597 } 598 599 /* Populate chip info */ 600 rc = init_chip_info(); 601 if (rc) 602 return rc; 603 604 register_reboot_notifier(&powernv_cpufreq_reboot_nb); 605 opal_message_notifier_register(OPAL_MSG_OCC, &powernv_cpufreq_opal_nb); 606 return cpufreq_register_driver(&powernv_cpufreq_driver); 607 } 608 module_init(powernv_cpufreq_init); 609 610 static void __exit powernv_cpufreq_exit(void) 611 { 612 unregister_reboot_notifier(&powernv_cpufreq_reboot_nb); 613 opal_message_notifier_unregister(OPAL_MSG_OCC, 614 &powernv_cpufreq_opal_nb); 615 cpufreq_unregister_driver(&powernv_cpufreq_driver); 616 } 617 module_exit(powernv_cpufreq_exit); 618 619 MODULE_LICENSE("GPL"); 620 MODULE_AUTHOR("Vaidyanathan Srinivasan <svaidy at linux.vnet.ibm.com>"); 621