/*
 * POWERNV cpufreq driver for the IBM POWER processors
 *
 * (C) Copyright IBM 2014
 *
 * Author: Vaidyanathan Srinivasan <svaidy at linux.vnet.ibm.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 */

#define pr_fmt(fmt)	"powernv-cpufreq: " fmt

#include <linux/kernel.h>
#include <linux/sysfs.h>
#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/cpufreq.h>
#include <linux/smp.h>
#include <linux/of.h>
#include <linux/reboot.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <trace/events/power.h>

#include <asm/cputhreads.h>
#include <asm/firmware.h>
#include <asm/reg.h>
#include <asm/smp.h> /* Required for cpu_sibling_mask() in UP configs */
#include <asm/opal.h>
#include <linux/timer.h>

#define POWERNV_MAX_PSTATES	256
#define PMSR_PSAFE_ENABLE	(1UL << 30)
#define PMSR_SPR_EM_DISABLE	(1UL << 31)
#define PMSR_MAX(x)		((x >> 32) & 0xFF)
#define LPSTATE_SHIFT		48
#define GPSTATE_SHIFT		56
#define GET_LPSTATE(x)		(((x) >> LPSTATE_SHIFT) & 0xFF)
#define GET_GPSTATE(x)		(((x) >> GPSTATE_SHIFT) & 0xFF)

#define MAX_RAMP_DOWN_TIME	5120
/*
 * On an idle system we want the global pstate to ramp down from the max
 * value to min over a span of ~5 secs. Also we want it to initially ramp
 * down slowly and then ramp down rapidly later on.
 *
 * This gives a percentage rampdown for time elapsed in milliseconds.
 * ramp_down_percentage = ((ms * ms) >> 18)
 *			~= 3.8 * (sec * sec)
 *
 * At 0 ms	ramp_down_percent = 0
 * At 5120 ms	ramp_down_percent = 100
 */
#define ramp_down_percent(time)		((time * time) >> 18)

/* Interval after which the timer is queued to bring down global pstate */
#define GPSTATE_TIMER_INTERVAL	2000

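/*
 * Worked example (illustrative arithmetic only, derived from
 * ramp_down_percent() above): ramp_down_percent(1024) = (1024 * 1024) >> 18
 * = 4, i.e. ~4% of the rampdown is done after the first second;
 * ramp_down_percent(2560) = 25; and ramp_down_percent(5120) = 100, which is
 * why MAX_RAMP_DOWN_TIME is 5120 ms: the quadratic curve reaches 100%
 * exactly at that point.
 */
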
/**
 * struct global_pstate_info -	Per policy data structure to maintain history
 *				of global pstates
 * @highest_lpstate_idx:	The local pstate index from which we are
 *				ramping down
 * @elapsed_time:		Time in ms spent in ramping down from
 *				highest_lpstate_idx
 * @last_sampled_time:		Time from boot in ms when global pstates were
 *				last set
 * @last_lpstate_idx:		Last set value of the local pstate, as a
 *				cpufreq table index
 * @last_gpstate_idx:		Last set value of the global pstate, as a
 *				cpufreq table index
 * @timer:			Is used for ramping down if cpu goes idle for
 *				a long time with global pstate held high
 * @gpstate_lock:		A spinlock to maintain synchronization between
 *				routines called by the timer handler and
 *				governor's target_index calls
 */
struct global_pstate_info {
	int highest_lpstate_idx;
	unsigned int elapsed_time;
	unsigned int last_sampled_time;
	int last_lpstate_idx;
	int last_gpstate_idx;
	spinlock_t gpstate_lock;
	struct timer_list timer;
};

static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1];
static bool rebooting, throttled, occ_reset;

static const char * const throttle_reason[] = {
	"No throttling",
	"Power Cap",
	"Processor Over Temperature",
	"Power Supply Failure",
	"Over Current",
	"OCC Reset"
};

enum throttle_reason_type {
	NO_THROTTLE = 0,
	POWERCAP,
	CPU_OVERTEMP,
	POWER_SUPPLY_FAILURE,
	OVERCURRENT,
	OCC_RESET_THROTTLE,
	OCC_MAX_REASON
};

static struct chip {
	unsigned int id;
	bool throttled;
	bool restore;
	u8 throttle_reason;
	cpumask_t mask;
	struct work_struct throttle;
	int throttle_turbo;
	int throttle_sub_turbo;
	int reason[OCC_MAX_REASON];
} *chips;

static int nr_chips;
static DEFINE_PER_CPU(struct chip *, chip_info);

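/*
 * Illustration (hypothetical topology): on a two-socket system with chip
 * ids 0 and 8, init_chip_info() below sets nr_chips = 2 and points
 * per_cpu(chip_info, cpu) of every CPU on chip 8 at chips[1], so the
 * throttle bookkeeping above is kept once per chip rather than per CPU.
 */
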
/*
 * Note:
 * The set of pstates consists of contiguous integers.
 * powernv_pstate_info stores the index of the frequency table for
 * max, min and nominal frequencies. It also stores the number of
 * available frequencies.
 *
 * powernv_pstate_info.nominal indicates the index to the highest
 * non-turbo frequency.
 */
static struct powernv_pstate_info {
	unsigned int min;
	unsigned int max;
	unsigned int nominal;
	unsigned int nr_pstates;
} powernv_pstate_info;

/* Use following macros for conversions between pstate_id and index */
static inline int idx_to_pstate(unsigned int i)
{
	if (unlikely(i >= powernv_pstate_info.nr_pstates)) {
		pr_warn_once("index %u is out of bounds\n", i);
		return powernv_freqs[powernv_pstate_info.nominal].driver_data;
	}

	return powernv_freqs[i].driver_data;
}

static inline unsigned int pstate_to_idx(int pstate)
{
	int min = powernv_freqs[powernv_pstate_info.min].driver_data;
	int max = powernv_freqs[powernv_pstate_info.max].driver_data;

	if (min > 0) {
		if (unlikely((pstate < max) || (pstate > min))) {
			pr_warn_once("pstate %d is out of bounds\n", pstate);
			return powernv_pstate_info.nominal;
		}
	} else {
		if (unlikely((pstate > max) || (pstate < min))) {
			pr_warn_once("pstate %d is out of bounds\n", pstate);
			return powernv_pstate_info.nominal;
		}
	}
	/*
	 * abs() is deliberately used so that it works with
	 * both monotonically increasing and decreasing
	 * pstate values
	 */
	return abs(pstate - idx_to_pstate(powernv_pstate_info.max));
}

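/*
 * Worked example (hypothetical pstate ids): on a platform whose pstate ids
 * decrease monotonically, say max = 0, nominal = -5, min = -47, the
 * frequency table is indexed 0..47 from max to min, and
 * pstate_to_idx(-5) = abs(-5 - 0) = 5. The abs() in pstate_to_idx() makes
 * the same expression work when pstate ids increase instead.
 */
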
static inline void reset_gpstates(struct cpufreq_policy *policy)
{
	struct global_pstate_info *gpstates = policy->driver_data;

	gpstates->highest_lpstate_idx = 0;
	gpstates->elapsed_time = 0;
	gpstates->last_sampled_time = 0;
	gpstates->last_lpstate_idx = 0;
	gpstates->last_gpstate_idx = 0;
}

/*
 * Initialize the freq table based on data obtained
 * from the firmware passed via device-tree
 */
static int init_powernv_pstates(void)
{
	struct device_node *power_mgt;
	int i, nr_pstates = 0;
	const __be32 *pstate_ids, *pstate_freqs;
	u32 len_ids, len_freqs;
	u32 pstate_min, pstate_max, pstate_nominal;

	power_mgt = of_find_node_by_path("/ibm,opal/power-mgt");
	if (!power_mgt) {
		pr_warn("power-mgt node not found\n");
		return -ENODEV;
	}

	if (of_property_read_u32(power_mgt, "ibm,pstate-min", &pstate_min)) {
		pr_warn("ibm,pstate-min node not found\n");
		return -ENODEV;
	}

	if (of_property_read_u32(power_mgt, "ibm,pstate-max", &pstate_max)) {
		pr_warn("ibm,pstate-max node not found\n");
		return -ENODEV;
	}

	if (of_property_read_u32(power_mgt, "ibm,pstate-nominal",
				 &pstate_nominal)) {
		pr_warn("ibm,pstate-nominal not found\n");
		return -ENODEV;
	}
	pr_info("cpufreq pstate min %d nominal %d max %d\n", pstate_min,
		pstate_nominal, pstate_max);

	pstate_ids = of_get_property(power_mgt, "ibm,pstate-ids", &len_ids);
	if (!pstate_ids) {
		pr_warn("ibm,pstate-ids not found\n");
		return -ENODEV;
	}

	pstate_freqs = of_get_property(power_mgt, "ibm,pstate-frequencies-mhz",
				       &len_freqs);
	if (!pstate_freqs) {
		pr_warn("ibm,pstate-frequencies-mhz not found\n");
		return -ENODEV;
	}

	if (len_ids != len_freqs) {
		pr_warn("Entries in ibm,pstate-ids and "
			"ibm,pstate-frequencies-mhz do not match\n");
	}

	nr_pstates = min(len_ids, len_freqs) / sizeof(u32);
	if (!nr_pstates) {
		pr_warn("No PStates found\n");
		return -ENODEV;
	}

	powernv_pstate_info.nr_pstates = nr_pstates;
	pr_debug("NR PStates %d\n", nr_pstates);
	for (i = 0; i < nr_pstates; i++) {
		u32 id = be32_to_cpu(pstate_ids[i]);
		u32 freq = be32_to_cpu(pstate_freqs[i]);

		pr_debug("PState id %d freq %d MHz\n", id, freq);
		powernv_freqs[i].frequency = freq * 1000; /* kHz */
		powernv_freqs[i].driver_data = id;

		if (id == pstate_max)
			powernv_pstate_info.max = i;
		else if (id == pstate_nominal)
			powernv_pstate_info.nominal = i;
		else if (id == pstate_min)
			powernv_pstate_info.min = i;
	}

	/* End of list marker entry */
	powernv_freqs[i].frequency = CPUFREQ_TABLE_END;
	return 0;
}

/* Returns the CPU frequency corresponding to the pstate_id. */
static unsigned int pstate_id_to_freq(int pstate_id)
{
	int i;

	i = pstate_to_idx(pstate_id);
	if (i >= powernv_pstate_info.nr_pstates || i < 0) {
		pr_warn("PState id %d outside of PState table, "
			"reporting nominal id %d instead\n",
			pstate_id, idx_to_pstate(powernv_pstate_info.nominal));
		i = powernv_pstate_info.nominal;
	}

	return powernv_freqs[i].frequency;
}

/*
 * cpuinfo_nominal_freq_show - Show the nominal CPU frequency as indicated by
 * the firmware
 */
static ssize_t cpuinfo_nominal_freq_show(struct cpufreq_policy *policy,
					 char *buf)
{
	return sprintf(buf, "%u\n",
		       powernv_freqs[powernv_pstate_info.nominal].frequency);
}

struct freq_attr cpufreq_freq_attr_cpuinfo_nominal_freq =
	__ATTR_RO(cpuinfo_nominal_freq);

static struct freq_attr *powernv_cpu_freq_attr[] = {
	&cpufreq_freq_attr_scaling_available_freqs,
	&cpufreq_freq_attr_cpuinfo_nominal_freq,
	NULL,
};

#define throttle_attr(name, member)					\
static ssize_t name##_show(struct cpufreq_policy *policy, char *buf)	\
{									\
	struct chip *chip = per_cpu(chip_info, policy->cpu);		\
									\
	return sprintf(buf, "%u\n", chip->member);			\
}									\
									\
static struct freq_attr throttle_attr_##name = __ATTR_RO(name)		\

throttle_attr(unthrottle, reason[NO_THROTTLE]);
throttle_attr(powercap, reason[POWERCAP]);
throttle_attr(overtemp, reason[CPU_OVERTEMP]);
throttle_attr(supply_fault, reason[POWER_SUPPLY_FAILURE]);
throttle_attr(overcurrent, reason[OVERCURRENT]);
throttle_attr(occ_reset, reason[OCC_RESET_THROTTLE]);
throttle_attr(turbo_stat, throttle_turbo);
throttle_attr(sub_turbo_stat, throttle_sub_turbo);

static struct attribute *throttle_attrs[] = {
	&throttle_attr_unthrottle.attr,
	&throttle_attr_powercap.attr,
	&throttle_attr_overtemp.attr,
	&throttle_attr_supply_fault.attr,
	&throttle_attr_overcurrent.attr,
	&throttle_attr_occ_reset.attr,
	&throttle_attr_turbo_stat.attr,
	&throttle_attr_sub_turbo_stat.attr,
	NULL,
};

static const struct attribute_group throttle_attr_grp = {
	.name	= "throttle_stats",
	.attrs	= throttle_attrs,
};

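/*
 * For illustration, throttle_attr(powercap, reason[POWERCAP]) above expands
 * to roughly:
 *
 *	static ssize_t powercap_show(struct cpufreq_policy *policy, char *buf)
 *	{
 *		struct chip *chip = per_cpu(chip_info, policy->cpu);
 *
 *		return sprintf(buf, "%u\n", chip->reason[POWERCAP]);
 *	}
 *	static struct freq_attr throttle_attr_powercap = __ATTR_RO(powercap);
 *
 * i.e. one read-only sysfs file per throttle counter, grouped under the
 * "throttle_stats" directory by throttle_attr_grp.
 */
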
/* Helper routines */

/* Access helpers to power mgt SPR */

static inline unsigned long get_pmspr(unsigned long sprn)
{
	switch (sprn) {
	case SPRN_PMCR:
		return mfspr(SPRN_PMCR);

	case SPRN_PMICR:
		return mfspr(SPRN_PMICR);

	case SPRN_PMSR:
		return mfspr(SPRN_PMSR);
	}
	BUG();
}

static inline void set_pmspr(unsigned long sprn, unsigned long val)
{
	switch (sprn) {
	case SPRN_PMCR:
		mtspr(SPRN_PMCR, val);
		return;

	case SPRN_PMICR:
		mtspr(SPRN_PMICR, val);
		return;
	}
	BUG();
}

/*
 * Use objects of this type to query/update
 * pstates on a remote CPU via smp_call_function.
 */
struct powernv_smp_call_data {
	unsigned int freq;
	int pstate_id;
	int gpstate_id;
};

/*
 * powernv_read_cpu_freq: Reads the current frequency on this CPU.
 *
 * Called via smp_call_function.
 *
 * Note: The caller of the smp_call_function should pass an argument of
 * the type 'struct powernv_smp_call_data *' along with this function.
 *
 * The current frequency on this CPU will be returned via
 * ((struct powernv_smp_call_data *)arg)->freq;
 */
static void powernv_read_cpu_freq(void *arg)
{
	unsigned long pmspr_val;
	s8 local_pstate_id;
	struct powernv_smp_call_data *freq_data = arg;

	pmspr_val = get_pmspr(SPRN_PMSR);

	/*
	 * The local pstate id corresponds to bits 48..55 in the PMSR.
	 * Note: Watch out for the sign!
	 */
	local_pstate_id = (pmspr_val >> 48) & 0xFF;
	freq_data->pstate_id = local_pstate_id;
	freq_data->freq = pstate_id_to_freq(freq_data->pstate_id);

	pr_debug("cpu %d pmsr %016lX pstate_id %d frequency %d kHz\n",
		 raw_smp_processor_id(), pmspr_val, freq_data->pstate_id,
		 freq_data->freq);
}

/*
 * powernv_cpufreq_get: Returns the CPU frequency as reported by the
 * firmware for CPU 'cpu'. This value is reported through the sysfs
 * file cpuinfo_cur_freq.
 */
static unsigned int powernv_cpufreq_get(unsigned int cpu)
{
	struct powernv_smp_call_data freq_data;

	smp_call_function_any(cpu_sibling_mask(cpu), powernv_read_cpu_freq,
			      &freq_data, 1);

	return freq_data.freq;
}

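/*
 * Note on the sign handling in powernv_read_cpu_freq() above: pstate ids
 * may be negative, so bits 48..55 of the PMSR are read into an s8. For
 * example, a raw field value of 0xFB sign-extends to pstate id -5, whereas
 * keeping it unsigned would yield the bogus id 251.
 */
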
/*
 * set_pstate: Sets the pstate on this CPU.
 *
 * This is called via an smp_call_function.
 *
 * The caller must ensure that freq_data is of the type
 * (struct powernv_smp_call_data *) and the pstate_id which needs to be set
 * on this CPU should be present in freq_data->pstate_id.
 */
static void set_pstate(void *data)
{
	unsigned long val;
	struct powernv_smp_call_data *freq_data = data;
	unsigned long pstate_ul = freq_data->pstate_id;
	unsigned long gpstate_ul = freq_data->gpstate_id;

	val = get_pmspr(SPRN_PMCR);
	val = val & 0x0000FFFFFFFFFFFFULL;

	pstate_ul = pstate_ul & 0xFF;
	gpstate_ul = gpstate_ul & 0xFF;

	/* Set both global(bits 56..63) and local(bits 48..55) PStates */
	val = val | (gpstate_ul << 56) | (pstate_ul << 48);

	pr_debug("Setting cpu %d pmcr to %016lX\n",
		 raw_smp_processor_id(), val);
	set_pmspr(SPRN_PMCR, val);
}

/*
 * get_nominal_index: Returns the index corresponding to the nominal
 * pstate in the cpufreq table
 */
static inline unsigned int get_nominal_index(void)
{
	return powernv_pstate_info.nominal;
}

static void powernv_cpufreq_throttle_check(void *data)
{
	struct chip *chip;
	unsigned int cpu = smp_processor_id();
	unsigned long pmsr;
	int pmsr_pmax;
	unsigned int pmsr_pmax_idx;

	pmsr = get_pmspr(SPRN_PMSR);
	chip = this_cpu_read(chip_info);

	/* Check for Pmax Capping */
	pmsr_pmax = (s8)PMSR_MAX(pmsr);
	pmsr_pmax_idx = pstate_to_idx(pmsr_pmax);
	if (pmsr_pmax_idx != powernv_pstate_info.max) {
		if (chip->throttled)
			goto next;
		chip->throttled = true;
		if (pmsr_pmax_idx > powernv_pstate_info.nominal) {
			pr_warn_once("CPU %d on Chip %u has Pmax(%d) reduced below nominal frequency(%d)\n",
				     cpu, chip->id, pmsr_pmax,
				     idx_to_pstate(powernv_pstate_info.nominal));
			chip->throttle_sub_turbo++;
		} else {
			chip->throttle_turbo++;
		}
		trace_powernv_throttle(chip->id,
				       throttle_reason[chip->throttle_reason],
				       pmsr_pmax);
	} else if (chip->throttled) {
		chip->throttled = false;
		trace_powernv_throttle(chip->id,
				       throttle_reason[chip->throttle_reason],
				       pmsr_pmax);
	}

	/* Check if Psafe_mode_active is set in PMSR. */
next:
	if (pmsr & PMSR_PSAFE_ENABLE) {
		throttled = true;
		pr_info("Pstate set to safe frequency\n");
	}

	/* Check if SPR_EM_DISABLE is set in PMSR */
	if (pmsr & PMSR_SPR_EM_DISABLE) {
		throttled = true;
		pr_info("Frequency Control disabled from OS\n");
	}

	if (throttled) {
		pr_info("PMSR = %16lx\n", pmsr);
		pr_warn("CPU Frequency could be throttled\n");
	}
}

/**
 * calc_global_pstate - Calculate global pstate
 * @elapsed_time:	Elapsed time in milliseconds
 * @local_pstate_idx:	New local pstate
 * @highest_lpstate_idx: pstate from which it is ramping down
 *
 * Finds the appropriate global pstate based on the pstate from which it is
 * ramping down and the time elapsed in ramping down. It follows a quadratic
 * equation which ensures that the rampdown to pmin completes in 5 seconds.
 */
static inline int calc_global_pstate(unsigned int elapsed_time,
				     int highest_lpstate_idx,
				     int local_pstate_idx)
{
	int index_diff;

	/*
	 * Using ramp_down_percent we get the percentage of rampdown
	 * that we are expecting to be dropping. The difference between
	 * highest_lpstate_idx and powernv_pstate_info.min gives the absolute
	 * number of pstates we will eventually drop by the end of 5 seconds;
	 * scale it to get the number of pstates to drop now.
	 */
	index_diff = ((int)ramp_down_percent(elapsed_time) *
		      (powernv_pstate_info.min - highest_lpstate_idx)) / 100;

	/* Ensure that global pstate is >= to local pstate */
	if (highest_lpstate_idx + index_diff >= local_pstate_idx)
		return local_pstate_idx;
	else
		return highest_lpstate_idx + index_diff;
}

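/*
 * Worked example for calc_global_pstate() (hypothetical indices): with
 * highest_lpstate_idx = 0, powernv_pstate_info.min = 40 and
 * elapsed_time = 2000 ms, ramp_down_percent(2000) = 15, so
 * index_diff = (15 * (40 - 0)) / 100 = 6. The resulting global index is
 * min(local index, 6): the global pstate never sits at a lower frequency
 * than the local pstate.
 */
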
static inline void queue_gpstate_timer(struct global_pstate_info *gpstates)
{
	unsigned int timer_interval;

	/*
	 * Set the timer to fire after GPSTATE_TIMER_INTERVAL ms, but clamp
	 * it so that the total ramp-down time never exceeds
	 * MAX_RAMP_DOWN_TIME ms: in that case the timer fires exactly when
	 * MAX_RAMP_DOWN_TIME ms of ramp-down time have elapsed.
	 */
	if ((gpstates->elapsed_time + GPSTATE_TIMER_INTERVAL)
	     > MAX_RAMP_DOWN_TIME)
		timer_interval = MAX_RAMP_DOWN_TIME - gpstates->elapsed_time;
	else
		timer_interval = GPSTATE_TIMER_INTERVAL;

	mod_timer(&gpstates->timer, jiffies + msecs_to_jiffies(timer_interval));
}

/**
 * gpstate_timer_handler
 *
 * @data: pointer to cpufreq_policy on which timer was queued
 *
 * This handler brings down the global pstate closer to the local pstate
 * according to the quadratic equation. Queues a new timer if the global
 * pstate is still not equal to the local pstate.
 */
void gpstate_timer_handler(unsigned long data)
{
	struct cpufreq_policy *policy = (struct cpufreq_policy *)data;
	struct global_pstate_info *gpstates = policy->driver_data;
	int gpstate_idx, lpstate_idx;
	unsigned long val;
	unsigned int time_diff = jiffies_to_msecs(jiffies)
				 - gpstates->last_sampled_time;
	struct powernv_smp_call_data freq_data;

	if (!spin_trylock(&gpstates->gpstate_lock))
		return;

	/*
	 * If the last PMCR update was done using fast_switch(), the value
	 * cached in gpstates->last_lpstate_idx may be stale. Hence, read
	 * from PMCR to get correct data.
	 */
	val = get_pmspr(SPRN_PMCR);
	freq_data.gpstate_id = (s8)GET_GPSTATE(val);
	freq_data.pstate_id = (s8)GET_LPSTATE(val);
	if (freq_data.gpstate_id == freq_data.pstate_id) {
		reset_gpstates(policy);
		spin_unlock(&gpstates->gpstate_lock);
		return;
	}

	gpstates->last_sampled_time += time_diff;
	gpstates->elapsed_time += time_diff;

	if (gpstates->elapsed_time > MAX_RAMP_DOWN_TIME) {
		gpstate_idx = pstate_to_idx(freq_data.pstate_id);
		lpstate_idx = gpstate_idx;
		reset_gpstates(policy);
		gpstates->highest_lpstate_idx = gpstate_idx;
	} else {
		lpstate_idx = pstate_to_idx(freq_data.pstate_id);
		gpstate_idx = calc_global_pstate(gpstates->elapsed_time,
						 gpstates->highest_lpstate_idx,
						 lpstate_idx);
	}
	freq_data.gpstate_id = idx_to_pstate(gpstate_idx);
	gpstates->last_gpstate_idx = gpstate_idx;
	gpstates->last_lpstate_idx = lpstate_idx;
	/*
	 * If the local pstate is equal to the global pstate, rampdown is
	 * over, so the timer need not be queued.
	 */
	if (gpstate_idx != gpstates->last_lpstate_idx)
		queue_gpstate_timer(gpstates);

	spin_unlock(&gpstates->gpstate_lock);

	/* Timer may get migrated to a different cpu on cpu hot unplug */
	smp_call_function_any(policy->cpus, set_pstate, &freq_data, 1);
}

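/*
 * Illustrative timeline for the rampdown machinery above: after a drop in
 * the local pstate, gpstate_timer_handler() fires roughly every
 * GPSTATE_TIMER_INTERVAL (2000) ms, each time pulling the global pstate
 * closer to the local one via calc_global_pstate(). Once elapsed_time
 * crosses MAX_RAMP_DOWN_TIME (5120 ms), or the two pstates meet, the
 * global pstate snaps to the local pstate and the timer is not requeued.
 */
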
/*
 * powernv_cpufreq_target_index: Sets the frequency corresponding to
 * the cpufreq table entry indexed by new_index on the cpus in the
 * mask policy->cpus
 */
static int powernv_cpufreq_target_index(struct cpufreq_policy *policy,
					unsigned int new_index)
{
	struct powernv_smp_call_data freq_data;
	unsigned int cur_msec, gpstate_idx;
	struct global_pstate_info *gpstates = policy->driver_data;

	if (unlikely(rebooting) && new_index != get_nominal_index())
		return 0;

	if (!throttled) {
		/*
		 * We don't want to be preempted while checking if the
		 * CPU frequency has been throttled.
		 */
		preempt_disable();
		powernv_cpufreq_throttle_check(NULL);
		preempt_enable();
	}

	cur_msec = jiffies_to_msecs(get_jiffies_64());

	spin_lock(&gpstates->gpstate_lock);
	freq_data.pstate_id = idx_to_pstate(new_index);

	if (!gpstates->last_sampled_time) {
		gpstate_idx = new_index;
		gpstates->highest_lpstate_idx = new_index;
		goto gpstates_done;
	}

	if (gpstates->last_gpstate_idx < new_index) {
		gpstates->elapsed_time += cur_msec -
					  gpstates->last_sampled_time;

		/*
		 * If it has been ramping down for more than
		 * MAX_RAMP_DOWN_TIME we should be resetting all global
		 * pstate related data. Set it equal to the local pstate to
		 * start fresh.
		 */
		if (gpstates->elapsed_time > MAX_RAMP_DOWN_TIME) {
			reset_gpstates(policy);
			gpstates->highest_lpstate_idx = new_index;
			gpstate_idx = new_index;
		} else {
			/* Elapsed time is less than 5 seconds, continue to rampdown */
			gpstate_idx = calc_global_pstate(gpstates->elapsed_time,
							 gpstates->highest_lpstate_idx,
							 new_index);
		}
	} else {
		reset_gpstates(policy);
		gpstates->highest_lpstate_idx = new_index;
		gpstate_idx = new_index;
	}

	/*
	 * If the local pstate is equal to the global pstate, rampdown is
	 * over, so the timer need not be queued.
	 */
	if (gpstate_idx != new_index)
		queue_gpstate_timer(gpstates);
	else
		del_timer_sync(&gpstates->timer);

gpstates_done:
	freq_data.gpstate_id = idx_to_pstate(gpstate_idx);
	gpstates->last_sampled_time = cur_msec;
	gpstates->last_gpstate_idx = gpstate_idx;
	gpstates->last_lpstate_idx = new_index;

	spin_unlock(&gpstates->gpstate_lock);

	/*
	 * Use smp_call_function to send IPI and execute the
	 * mtspr on target CPU. We could do that without IPI
	 * if current CPU is within policy->cpus (core).
	 */
	smp_call_function_any(policy->cpus, set_pstate, &freq_data, 1);
	return 0;
}

static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy)
{
	int base, i, ret;
	struct kernfs_node *kn;
	struct global_pstate_info *gpstates;

	base = cpu_first_thread_sibling(policy->cpu);

	for (i = 0; i < threads_per_core; i++)
		cpumask_set_cpu(base + i, policy->cpus);

	kn = kernfs_find_and_get(policy->kobj.sd, throttle_attr_grp.name);
	if (!kn) {
		ret = sysfs_create_group(&policy->kobj, &throttle_attr_grp);
		if (ret) {
			pr_info("Failed to create throttle stats directory for cpu %d\n",
				policy->cpu);
			return ret;
		}
	} else {
		kernfs_put(kn);
	}

	gpstates = kzalloc(sizeof(*gpstates), GFP_KERNEL);
	if (!gpstates)
		return -ENOMEM;

	policy->driver_data = gpstates;

	/* initialize timer */
	init_timer_pinned_deferrable(&gpstates->timer);
	gpstates->timer.data = (unsigned long)policy;
	gpstates->timer.function = gpstate_timer_handler;
	gpstates->timer.expires = jiffies +
				  msecs_to_jiffies(GPSTATE_TIMER_INTERVAL);
	spin_lock_init(&gpstates->gpstate_lock);
	ret = cpufreq_table_validate_and_show(policy, powernv_freqs);

	if (ret < 0) {
		kfree(policy->driver_data);
		return ret;
	}

	policy->fast_switch_possible = true;
	return ret;
}

static int powernv_cpufreq_cpu_exit(struct cpufreq_policy *policy)
{
	/* timer is deleted in powernv_cpufreq_stop_cpu() */
	kfree(policy->driver_data);

	return 0;
}

static int powernv_cpufreq_reboot_notifier(struct notifier_block *nb,
					   unsigned long action, void *unused)
{
	int cpu;
	struct cpufreq_policy cpu_policy;

	rebooting = true;
	for_each_online_cpu(cpu) {
		cpufreq_get_policy(&cpu_policy, cpu);
		powernv_cpufreq_target_index(&cpu_policy, get_nominal_index());
	}

	return NOTIFY_DONE;
}

static struct notifier_block powernv_cpufreq_reboot_nb = {
	.notifier_call = powernv_cpufreq_reboot_notifier,
};

void powernv_cpufreq_work_fn(struct work_struct *work)
{
	struct chip *chip = container_of(work, struct chip, throttle);
	unsigned int cpu;
	cpumask_t mask;

	get_online_cpus();
	cpumask_and(&mask, &chip->mask, cpu_online_mask);
	smp_call_function_any(&mask,
			      powernv_cpufreq_throttle_check, NULL, 0);

	if (!chip->restore)
		goto out;

	chip->restore = false;
	for_each_cpu(cpu, &mask) {
		int index;
		struct cpufreq_policy policy;

		cpufreq_get_policy(&policy, cpu);
		index = cpufreq_table_find_index_c(&policy, policy.cur);
		powernv_cpufreq_target_index(&policy, index);
		cpumask_andnot(&mask, &mask, policy.cpus);
	}
out:
	put_online_cpus();
}

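/*
 * Example OCC_THROTTLE flow through powernv_cpufreq_occ_msg() below
 * (hypothetical values): a message with omsg.chip = 8 and
 * omsg.throttle_status = 2 bumps chips[i].reason[2], i.e. the
 * "Processor Over Temperature" counter for chip 8, and schedules that
 * chip's throttle work. A throttle_status of 0 additionally sets
 * chips[i].restore so the work function re-applies the frequency.
 */
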
static int powernv_cpufreq_occ_msg(struct notifier_block *nb,
				   unsigned long msg_type, void *_msg)
{
	struct opal_msg *msg = _msg;
	struct opal_occ_msg omsg;
	int i;

	if (msg_type != OPAL_MSG_OCC)
		return 0;

	omsg.type = be64_to_cpu(msg->params[0]);

	switch (omsg.type) {
	case OCC_RESET:
		occ_reset = true;
		pr_info("OCC (On Chip Controller - enforces hard thermal/power limits) Resetting\n");
		/*
		 * powernv_cpufreq_throttle_check() is called in
		 * target() callback which can detect the throttle state
		 * for governors like ondemand.
		 * But static governors will not call target() often, thus
		 * we report throttling here.
		 */
		if (!throttled) {
			throttled = true;
			pr_warn("CPU frequency is throttled for duration\n");
		}

		break;
	case OCC_LOAD:
		pr_info("OCC Loading, CPU frequency is throttled until OCC is started\n");
		break;
	case OCC_THROTTLE:
		omsg.chip = be64_to_cpu(msg->params[1]);
		omsg.throttle_status = be64_to_cpu(msg->params[2]);

		if (occ_reset) {
			occ_reset = false;
			throttled = false;
			pr_info("OCC Active, CPU frequency is no longer throttled\n");

			for (i = 0; i < nr_chips; i++) {
				chips[i].restore = true;
				schedule_work(&chips[i].throttle);
			}

			return 0;
		}

		for (i = 0; i < nr_chips; i++)
			if (chips[i].id == omsg.chip)
				break;

		if (omsg.throttle_status >= 0 &&
		    omsg.throttle_status <= OCC_MAX_THROTTLE_STATUS) {
			chips[i].throttle_reason = omsg.throttle_status;
			chips[i].reason[omsg.throttle_status]++;
		}

		if (!omsg.throttle_status)
			chips[i].restore = true;

		schedule_work(&chips[i].throttle);
	}
	return 0;
}

static struct notifier_block powernv_cpufreq_opal_nb = {
	.notifier_call	= powernv_cpufreq_occ_msg,
	.next		= NULL,
	.priority	= 0,
};

static void powernv_cpufreq_stop_cpu(struct cpufreq_policy *policy)
{
	struct powernv_smp_call_data freq_data;
	struct global_pstate_info *gpstates = policy->driver_data;

	freq_data.pstate_id = idx_to_pstate(powernv_pstate_info.min);
	freq_data.gpstate_id = idx_to_pstate(powernv_pstate_info.min);
	smp_call_function_single(policy->cpu, set_pstate, &freq_data, 1);
	del_timer_sync(&gpstates->timer);
}

static unsigned int powernv_fast_switch(struct cpufreq_policy *policy,
					unsigned int target_freq)
{
	int index;
	struct powernv_smp_call_data freq_data;

	index = cpufreq_table_find_index_dl(policy, target_freq);
	freq_data.pstate_id = powernv_freqs[index].driver_data;
	freq_data.gpstate_id = powernv_freqs[index].driver_data;
	set_pstate(&freq_data);

	return powernv_freqs[index].frequency;
}

static struct cpufreq_driver powernv_cpufreq_driver = {
	.name		= "powernv-cpufreq",
	.flags		= CPUFREQ_CONST_LOOPS,
	.init		= powernv_cpufreq_cpu_init,
	.exit		= powernv_cpufreq_cpu_exit,
	.verify		= cpufreq_generic_frequency_table_verify,
	.target_index	= powernv_cpufreq_target_index,
	.fast_switch	= powernv_fast_switch,
	.get		= powernv_cpufreq_get,
	.stop_cpu	= powernv_cpufreq_stop_cpu,
	.attr		= powernv_cpu_freq_attr,
};

static int init_chip_info(void)
{
	unsigned int chip[256];
	unsigned int cpu, i;
	unsigned int prev_chip_id = UINT_MAX;

	for_each_possible_cpu(cpu) {
		unsigned int id = cpu_to_chip_id(cpu);

		if (prev_chip_id != id) {
			prev_chip_id = id;
			chip[nr_chips++] = id;
		}
	}

	chips = kcalloc(nr_chips, sizeof(struct chip), GFP_KERNEL);
	if (!chips)
		return -ENOMEM;

	for (i = 0; i < nr_chips; i++) {
		chips[i].id = chip[i];
		cpumask_copy(&chips[i].mask, cpumask_of_node(chip[i]));
		INIT_WORK(&chips[i].throttle, powernv_cpufreq_work_fn);
		for_each_cpu(cpu, &chips[i].mask)
			per_cpu(chip_info, cpu) = &chips[i];
	}

	return 0;
}

static inline void clean_chip_info(void)
{
	kfree(chips);
}

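/*
 * Note on teardown ordering (both in the init failure path and in module
 * exit below): the notifiers are unregistered before clean_chip_info()
 * frees the chips array, since an OCC message arriving in between would
 * schedule work that dereferences chips[].
 */
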
static inline void unregister_all_notifiers(void)
{
	opal_message_notifier_unregister(OPAL_MSG_OCC,
					 &powernv_cpufreq_opal_nb);
	unregister_reboot_notifier(&powernv_cpufreq_reboot_nb);
}

static int __init powernv_cpufreq_init(void)
{
	int rc = 0;

	/* Don't probe on pseries (guest) platforms */
	if (!firmware_has_feature(FW_FEATURE_OPAL))
		return -ENODEV;

	/* Discover pstates from device tree and init */
	rc = init_powernv_pstates();
	if (rc)
		goto out;

	/* Populate chip info */
	rc = init_chip_info();
	if (rc)
		goto out;

	register_reboot_notifier(&powernv_cpufreq_reboot_nb);
	opal_message_notifier_register(OPAL_MSG_OCC, &powernv_cpufreq_opal_nb);

	rc = cpufreq_register_driver(&powernv_cpufreq_driver);
	if (!rc)
		return 0;

	pr_info("Failed to register the cpufreq driver (%d)\n", rc);
	unregister_all_notifiers();
	clean_chip_info();
out:
	pr_info("Platform driver disabled. System does not support PState control\n");
	return rc;
}
module_init(powernv_cpufreq_init);

static void __exit powernv_cpufreq_exit(void)
{
	cpufreq_unregister_driver(&powernv_cpufreq_driver);
	unregister_all_notifiers();
	clean_chip_info();
}
module_exit(powernv_cpufreq_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Vaidyanathan Srinivasan <svaidy at linux.vnet.ibm.com>");
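
/*
 * Expected sysfs layout (illustrative; paths follow the standard cpufreq
 * convention, assuming the driver registered successfully):
 *
 *	/sys/devices/system/cpu/cpuN/cpufreq/cpuinfo_nominal_freq
 *	/sys/devices/system/cpu/cpuN/cpufreq/throttle_stats/powercap
 *	/sys/devices/system/cpu/cpuN/cpufreq/throttle_stats/overtemp
 *	...
 *
 * e.g. "cat .../throttle_stats/overtemp" reports how often the chip was
 * throttled for "Processor Over Temperature".
 */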