1 /* 2 * POWERNV cpufreq driver for the IBM POWER processors 3 * 4 * (C) Copyright IBM 2014 5 * 6 * Author: Vaidyanathan Srinivasan <svaidy at linux.vnet.ibm.com> 7 * 8 * This program is free software; you can redistribute it and/or modify 9 * it under the terms of the GNU General Public License as published by 10 * the Free Software Foundation; either version 2, or (at your option) 11 * any later version. 12 * 13 * This program is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 * GNU General Public License for more details. 17 * 18 */ 19 20 #define pr_fmt(fmt) "powernv-cpufreq: " fmt 21 22 #include <linux/kernel.h> 23 #include <linux/sysfs.h> 24 #include <linux/cpumask.h> 25 #include <linux/module.h> 26 #include <linux/cpufreq.h> 27 #include <linux/smp.h> 28 #include <linux/of.h> 29 #include <linux/reboot.h> 30 #include <linux/slab.h> 31 32 #include <asm/cputhreads.h> 33 #include <asm/firmware.h> 34 #include <asm/reg.h> 35 #include <asm/smp.h> /* Required for cpu_sibling_mask() in UP configs */ 36 #include <asm/opal.h> 37 38 #define POWERNV_MAX_PSTATES 256 39 #define PMSR_PSAFE_ENABLE (1UL << 30) 40 #define PMSR_SPR_EM_DISABLE (1UL << 31) 41 #define PMSR_MAX(x) ((x >> 32) & 0xFF) 42 43 static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1]; 44 static bool rebooting, throttled, occ_reset; 45 46 static struct chip { 47 unsigned int id; 48 bool throttled; 49 cpumask_t mask; 50 struct work_struct throttle; 51 bool restore; 52 } *chips; 53 54 static int nr_chips; 55 56 /* 57 * Note: The set of pstates consists of contiguous integers, the 58 * smallest of which is indicated by powernv_pstate_info.min, the 59 * largest of which is indicated by powernv_pstate_info.max. 60 * 61 * The nominal pstate is the highest non-turbo pstate in this 62 * platform. This is indicated by powernv_pstate_info.nominal. 63 */ 64 static struct powernv_pstate_info { 65 int min; 66 int max; 67 int nominal; 68 int nr_pstates; 69 } powernv_pstate_info; 70 71 /* 72 * Initialize the freq table based on data obtained 73 * from the firmware passed via device-tree 74 */ 75 static int init_powernv_pstates(void) 76 { 77 struct device_node *power_mgt; 78 int i, pstate_min, pstate_max, pstate_nominal, nr_pstates = 0; 79 const __be32 *pstate_ids, *pstate_freqs; 80 u32 len_ids, len_freqs; 81 82 power_mgt = of_find_node_by_path("/ibm,opal/power-mgt"); 83 if (!power_mgt) { 84 pr_warn("power-mgt node not found\n"); 85 return -ENODEV; 86 } 87 88 if (of_property_read_u32(power_mgt, "ibm,pstate-min", &pstate_min)) { 89 pr_warn("ibm,pstate-min node not found\n"); 90 return -ENODEV; 91 } 92 93 if (of_property_read_u32(power_mgt, "ibm,pstate-max", &pstate_max)) { 94 pr_warn("ibm,pstate-max node not found\n"); 95 return -ENODEV; 96 } 97 98 if (of_property_read_u32(power_mgt, "ibm,pstate-nominal", 99 &pstate_nominal)) { 100 pr_warn("ibm,pstate-nominal not found\n"); 101 return -ENODEV; 102 } 103 pr_info("cpufreq pstate min %d nominal %d max %d\n", pstate_min, 104 pstate_nominal, pstate_max); 105 106 pstate_ids = of_get_property(power_mgt, "ibm,pstate-ids", &len_ids); 107 if (!pstate_ids) { 108 pr_warn("ibm,pstate-ids not found\n"); 109 return -ENODEV; 110 } 111 112 pstate_freqs = of_get_property(power_mgt, "ibm,pstate-frequencies-mhz", 113 &len_freqs); 114 if (!pstate_freqs) { 115 pr_warn("ibm,pstate-frequencies-mhz not found\n"); 116 return -ENODEV; 117 } 118 119 if (len_ids != len_freqs) { 120 pr_warn("Entries in ibm,pstate-ids and " 121 "ibm,pstate-frequencies-mhz does not match\n"); 122 } 123 124 nr_pstates = min(len_ids, len_freqs) / sizeof(u32); 125 if (!nr_pstates) { 126 pr_warn("No PStates found\n"); 127 return -ENODEV; 128 } 129 130 pr_debug("NR PStates %d\n", nr_pstates); 131 for (i = 0; i < nr_pstates; i++) { 132 u32 id = be32_to_cpu(pstate_ids[i]); 133 u32 freq = be32_to_cpu(pstate_freqs[i]); 134 135 pr_debug("PState id %d freq %d MHz\n", id, freq); 136 powernv_freqs[i].frequency = freq * 1000; /* kHz */ 137 powernv_freqs[i].driver_data = id; 138 } 139 /* End of list marker entry */ 140 powernv_freqs[i].frequency = CPUFREQ_TABLE_END; 141 142 powernv_pstate_info.min = pstate_min; 143 powernv_pstate_info.max = pstate_max; 144 powernv_pstate_info.nominal = pstate_nominal; 145 powernv_pstate_info.nr_pstates = nr_pstates; 146 147 return 0; 148 } 149 150 /* Returns the CPU frequency corresponding to the pstate_id. */ 151 static unsigned int pstate_id_to_freq(int pstate_id) 152 { 153 int i; 154 155 i = powernv_pstate_info.max - pstate_id; 156 if (i >= powernv_pstate_info.nr_pstates || i < 0) { 157 pr_warn("PState id %d outside of PState table, " 158 "reporting nominal id %d instead\n", 159 pstate_id, powernv_pstate_info.nominal); 160 i = powernv_pstate_info.max - powernv_pstate_info.nominal; 161 } 162 163 return powernv_freqs[i].frequency; 164 } 165 166 /* 167 * cpuinfo_nominal_freq_show - Show the nominal CPU frequency as indicated by 168 * the firmware 169 */ 170 static ssize_t cpuinfo_nominal_freq_show(struct cpufreq_policy *policy, 171 char *buf) 172 { 173 return sprintf(buf, "%u\n", 174 pstate_id_to_freq(powernv_pstate_info.nominal)); 175 } 176 177 struct freq_attr cpufreq_freq_attr_cpuinfo_nominal_freq = 178 __ATTR_RO(cpuinfo_nominal_freq); 179 180 static struct freq_attr *powernv_cpu_freq_attr[] = { 181 &cpufreq_freq_attr_scaling_available_freqs, 182 &cpufreq_freq_attr_cpuinfo_nominal_freq, 183 NULL, 184 }; 185 186 /* Helper routines */ 187 188 /* Access helpers to power mgt SPR */ 189 190 static inline unsigned long get_pmspr(unsigned long sprn) 191 { 192 switch (sprn) { 193 case SPRN_PMCR: 194 return mfspr(SPRN_PMCR); 195 196 case SPRN_PMICR: 197 return mfspr(SPRN_PMICR); 198 199 case SPRN_PMSR: 200 return mfspr(SPRN_PMSR); 201 } 202 BUG(); 203 } 204 205 static inline void set_pmspr(unsigned long sprn, unsigned long val) 206 { 207 switch (sprn) { 208 case SPRN_PMCR: 209 mtspr(SPRN_PMCR, val); 210 return; 211 212 case SPRN_PMICR: 213 mtspr(SPRN_PMICR, val); 214 return; 215 } 216 BUG(); 217 } 218 219 /* 220 * Use objects of this type to query/update 221 * pstates on a remote CPU via smp_call_function. 222 */ 223 struct powernv_smp_call_data { 224 unsigned int freq; 225 int pstate_id; 226 }; 227 228 /* 229 * powernv_read_cpu_freq: Reads the current frequency on this CPU. 230 * 231 * Called via smp_call_function. 232 * 233 * Note: The caller of the smp_call_function should pass an argument of 234 * the type 'struct powernv_smp_call_data *' along with this function. 235 * 236 * The current frequency on this CPU will be returned via 237 * ((struct powernv_smp_call_data *)arg)->freq; 238 */ 239 static void powernv_read_cpu_freq(void *arg) 240 { 241 unsigned long pmspr_val; 242 s8 local_pstate_id; 243 struct powernv_smp_call_data *freq_data = arg; 244 245 pmspr_val = get_pmspr(SPRN_PMSR); 246 247 /* 248 * The local pstate id corresponds bits 48..55 in the PMSR. 249 * Note: Watch out for the sign! 250 */ 251 local_pstate_id = (pmspr_val >> 48) & 0xFF; 252 freq_data->pstate_id = local_pstate_id; 253 freq_data->freq = pstate_id_to_freq(freq_data->pstate_id); 254 255 pr_debug("cpu %d pmsr %016lX pstate_id %d frequency %d kHz\n", 256 raw_smp_processor_id(), pmspr_val, freq_data->pstate_id, 257 freq_data->freq); 258 } 259 260 /* 261 * powernv_cpufreq_get: Returns the CPU frequency as reported by the 262 * firmware for CPU 'cpu'. This value is reported through the sysfs 263 * file cpuinfo_cur_freq. 264 */ 265 static unsigned int powernv_cpufreq_get(unsigned int cpu) 266 { 267 struct powernv_smp_call_data freq_data; 268 269 smp_call_function_any(cpu_sibling_mask(cpu), powernv_read_cpu_freq, 270 &freq_data, 1); 271 272 return freq_data.freq; 273 } 274 275 /* 276 * set_pstate: Sets the pstate on this CPU. 277 * 278 * This is called via an smp_call_function. 279 * 280 * The caller must ensure that freq_data is of the type 281 * (struct powernv_smp_call_data *) and the pstate_id which needs to be set 282 * on this CPU should be present in freq_data->pstate_id. 283 */ 284 static void set_pstate(void *freq_data) 285 { 286 unsigned long val; 287 unsigned long pstate_ul = 288 ((struct powernv_smp_call_data *) freq_data)->pstate_id; 289 290 val = get_pmspr(SPRN_PMCR); 291 val = val & 0x0000FFFFFFFFFFFFULL; 292 293 pstate_ul = pstate_ul & 0xFF; 294 295 /* Set both global(bits 56..63) and local(bits 48..55) PStates */ 296 val = val | (pstate_ul << 56) | (pstate_ul << 48); 297 298 pr_debug("Setting cpu %d pmcr to %016lX\n", 299 raw_smp_processor_id(), val); 300 set_pmspr(SPRN_PMCR, val); 301 } 302 303 /* 304 * get_nominal_index: Returns the index corresponding to the nominal 305 * pstate in the cpufreq table 306 */ 307 static inline unsigned int get_nominal_index(void) 308 { 309 return powernv_pstate_info.max - powernv_pstate_info.nominal; 310 } 311 312 static void powernv_cpufreq_throttle_check(void *data) 313 { 314 unsigned int cpu = smp_processor_id(); 315 unsigned long pmsr; 316 int pmsr_pmax, i; 317 318 pmsr = get_pmspr(SPRN_PMSR); 319 320 for (i = 0; i < nr_chips; i++) 321 if (chips[i].id == cpu_to_chip_id(cpu)) 322 break; 323 324 /* Check for Pmax Capping */ 325 pmsr_pmax = (s8)PMSR_MAX(pmsr); 326 if (pmsr_pmax != powernv_pstate_info.max) { 327 if (chips[i].throttled) 328 goto next; 329 chips[i].throttled = true; 330 pr_info("CPU %d on Chip %u has Pmax reduced to %d\n", cpu, 331 chips[i].id, pmsr_pmax); 332 } else if (chips[i].throttled) { 333 chips[i].throttled = false; 334 pr_info("CPU %d on Chip %u has Pmax restored to %d\n", cpu, 335 chips[i].id, pmsr_pmax); 336 } 337 338 /* Check if Psafe_mode_active is set in PMSR. */ 339 next: 340 if (pmsr & PMSR_PSAFE_ENABLE) { 341 throttled = true; 342 pr_info("Pstate set to safe frequency\n"); 343 } 344 345 /* Check if SPR_EM_DISABLE is set in PMSR */ 346 if (pmsr & PMSR_SPR_EM_DISABLE) { 347 throttled = true; 348 pr_info("Frequency Control disabled from OS\n"); 349 } 350 351 if (throttled) { 352 pr_info("PMSR = %16lx\n", pmsr); 353 pr_crit("CPU Frequency could be throttled\n"); 354 } 355 } 356 357 /* 358 * powernv_cpufreq_target_index: Sets the frequency corresponding to 359 * the cpufreq table entry indexed by new_index on the cpus in the 360 * mask policy->cpus 361 */ 362 static int powernv_cpufreq_target_index(struct cpufreq_policy *policy, 363 unsigned int new_index) 364 { 365 struct powernv_smp_call_data freq_data; 366 367 if (unlikely(rebooting) && new_index != get_nominal_index()) 368 return 0; 369 370 if (!throttled) 371 powernv_cpufreq_throttle_check(NULL); 372 373 freq_data.pstate_id = powernv_freqs[new_index].driver_data; 374 375 /* 376 * Use smp_call_function to send IPI and execute the 377 * mtspr on target CPU. We could do that without IPI 378 * if current CPU is within policy->cpus (core) 379 */ 380 smp_call_function_any(policy->cpus, set_pstate, &freq_data, 1); 381 382 return 0; 383 } 384 385 static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy) 386 { 387 int base, i; 388 389 base = cpu_first_thread_sibling(policy->cpu); 390 391 for (i = 0; i < threads_per_core; i++) 392 cpumask_set_cpu(base + i, policy->cpus); 393 394 return cpufreq_table_validate_and_show(policy, powernv_freqs); 395 } 396 397 static int powernv_cpufreq_reboot_notifier(struct notifier_block *nb, 398 unsigned long action, void *unused) 399 { 400 int cpu; 401 struct cpufreq_policy cpu_policy; 402 403 rebooting = true; 404 for_each_online_cpu(cpu) { 405 cpufreq_get_policy(&cpu_policy, cpu); 406 powernv_cpufreq_target_index(&cpu_policy, get_nominal_index()); 407 } 408 409 return NOTIFY_DONE; 410 } 411 412 static struct notifier_block powernv_cpufreq_reboot_nb = { 413 .notifier_call = powernv_cpufreq_reboot_notifier, 414 }; 415 416 void powernv_cpufreq_work_fn(struct work_struct *work) 417 { 418 struct chip *chip = container_of(work, struct chip, throttle); 419 unsigned int cpu; 420 cpumask_var_t mask; 421 422 smp_call_function_any(&chip->mask, 423 powernv_cpufreq_throttle_check, NULL, 0); 424 425 if (!chip->restore) 426 return; 427 428 chip->restore = false; 429 cpumask_copy(mask, &chip->mask); 430 for_each_cpu_and(cpu, mask, cpu_online_mask) { 431 int index, tcpu; 432 struct cpufreq_policy policy; 433 434 cpufreq_get_policy(&policy, cpu); 435 cpufreq_frequency_table_target(&policy, policy.freq_table, 436 policy.cur, 437 CPUFREQ_RELATION_C, &index); 438 powernv_cpufreq_target_index(&policy, index); 439 for_each_cpu(tcpu, policy.cpus) 440 cpumask_clear_cpu(tcpu, mask); 441 } 442 } 443 444 static char throttle_reason[][30] = { 445 "No throttling", 446 "Power Cap", 447 "Processor Over Temperature", 448 "Power Supply Failure", 449 "Over Current", 450 "OCC Reset" 451 }; 452 453 static int powernv_cpufreq_occ_msg(struct notifier_block *nb, 454 unsigned long msg_type, void *_msg) 455 { 456 struct opal_msg *msg = _msg; 457 struct opal_occ_msg omsg; 458 int i; 459 460 if (msg_type != OPAL_MSG_OCC) 461 return 0; 462 463 omsg.type = be64_to_cpu(msg->params[0]); 464 465 switch (omsg.type) { 466 case OCC_RESET: 467 occ_reset = true; 468 pr_info("OCC (On Chip Controller - enforces hard thermal/power limits) Resetting\n"); 469 /* 470 * powernv_cpufreq_throttle_check() is called in 471 * target() callback which can detect the throttle state 472 * for governors like ondemand. 473 * But static governors will not call target() often thus 474 * report throttling here. 475 */ 476 if (!throttled) { 477 throttled = true; 478 pr_crit("CPU frequency is throttled for duration\n"); 479 } 480 481 break; 482 case OCC_LOAD: 483 pr_info("OCC Loading, CPU frequency is throttled until OCC is started\n"); 484 break; 485 case OCC_THROTTLE: 486 omsg.chip = be64_to_cpu(msg->params[1]); 487 omsg.throttle_status = be64_to_cpu(msg->params[2]); 488 489 if (occ_reset) { 490 occ_reset = false; 491 throttled = false; 492 pr_info("OCC Active, CPU frequency is no longer throttled\n"); 493 494 for (i = 0; i < nr_chips; i++) { 495 chips[i].restore = true; 496 schedule_work(&chips[i].throttle); 497 } 498 499 return 0; 500 } 501 502 if (omsg.throttle_status && 503 omsg.throttle_status <= OCC_MAX_THROTTLE_STATUS) 504 pr_info("OCC: Chip %u Pmax reduced due to %s\n", 505 (unsigned int)omsg.chip, 506 throttle_reason[omsg.throttle_status]); 507 else if (!omsg.throttle_status) 508 pr_info("OCC: Chip %u %s\n", (unsigned int)omsg.chip, 509 throttle_reason[omsg.throttle_status]); 510 else 511 return 0; 512 513 for (i = 0; i < nr_chips; i++) 514 if (chips[i].id == omsg.chip) { 515 if (!omsg.throttle_status) 516 chips[i].restore = true; 517 schedule_work(&chips[i].throttle); 518 } 519 } 520 return 0; 521 } 522 523 static struct notifier_block powernv_cpufreq_opal_nb = { 524 .notifier_call = powernv_cpufreq_occ_msg, 525 .next = NULL, 526 .priority = 0, 527 }; 528 529 static void powernv_cpufreq_stop_cpu(struct cpufreq_policy *policy) 530 { 531 struct powernv_smp_call_data freq_data; 532 533 freq_data.pstate_id = powernv_pstate_info.min; 534 smp_call_function_single(policy->cpu, set_pstate, &freq_data, 1); 535 } 536 537 static struct cpufreq_driver powernv_cpufreq_driver = { 538 .name = "powernv-cpufreq", 539 .flags = CPUFREQ_CONST_LOOPS, 540 .init = powernv_cpufreq_cpu_init, 541 .verify = cpufreq_generic_frequency_table_verify, 542 .target_index = powernv_cpufreq_target_index, 543 .get = powernv_cpufreq_get, 544 .stop_cpu = powernv_cpufreq_stop_cpu, 545 .attr = powernv_cpu_freq_attr, 546 }; 547 548 static int init_chip_info(void) 549 { 550 unsigned int chip[256]; 551 unsigned int cpu, i; 552 unsigned int prev_chip_id = UINT_MAX; 553 554 for_each_possible_cpu(cpu) { 555 unsigned int id = cpu_to_chip_id(cpu); 556 557 if (prev_chip_id != id) { 558 prev_chip_id = id; 559 chip[nr_chips++] = id; 560 } 561 } 562 563 chips = kmalloc_array(nr_chips, sizeof(struct chip), GFP_KERNEL); 564 if (!chips) 565 return -ENOMEM; 566 567 for (i = 0; i < nr_chips; i++) { 568 chips[i].id = chip[i]; 569 chips[i].throttled = false; 570 cpumask_copy(&chips[i].mask, cpumask_of_node(chip[i])); 571 INIT_WORK(&chips[i].throttle, powernv_cpufreq_work_fn); 572 chips[i].restore = false; 573 } 574 575 return 0; 576 } 577 578 static int __init powernv_cpufreq_init(void) 579 { 580 int rc = 0; 581 582 /* Don't probe on pseries (guest) platforms */ 583 if (!firmware_has_feature(FW_FEATURE_OPALv3)) 584 return -ENODEV; 585 586 /* Discover pstates from device tree and init */ 587 rc = init_powernv_pstates(); 588 if (rc) { 589 pr_info("powernv-cpufreq disabled. System does not support PState control\n"); 590 return rc; 591 } 592 593 /* Populate chip info */ 594 rc = init_chip_info(); 595 if (rc) 596 return rc; 597 598 register_reboot_notifier(&powernv_cpufreq_reboot_nb); 599 opal_message_notifier_register(OPAL_MSG_OCC, &powernv_cpufreq_opal_nb); 600 return cpufreq_register_driver(&powernv_cpufreq_driver); 601 } 602 module_init(powernv_cpufreq_init); 603 604 static void __exit powernv_cpufreq_exit(void) 605 { 606 unregister_reboot_notifier(&powernv_cpufreq_reboot_nb); 607 opal_message_notifier_unregister(OPAL_MSG_OCC, 608 &powernv_cpufreq_opal_nb); 609 cpufreq_unregister_driver(&powernv_cpufreq_driver); 610 } 611 module_exit(powernv_cpufreq_exit); 612 613 MODULE_LICENSE("GPL"); 614 MODULE_AUTHOR("Vaidyanathan Srinivasan <svaidy at linux.vnet.ibm.com>"); 615