1 /*
2  * POWERNV cpufreq driver for the IBM POWER processors
3  *
4  * (C) Copyright IBM 2014
5  *
6  * Author: Vaidyanathan Srinivasan <svaidy at linux.vnet.ibm.com>
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2, or (at your option)
11  * any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU General Public License for more details.
17  *
18  */
19 
20 #define pr_fmt(fmt)	"powernv-cpufreq: " fmt
21 
22 #include <linux/kernel.h>
23 #include <linux/sysfs.h>
24 #include <linux/cpumask.h>
25 #include <linux/module.h>
26 #include <linux/cpufreq.h>
27 #include <linux/smp.h>
28 #include <linux/of.h>
29 #include <linux/reboot.h>
30 #include <linux/slab.h>
31 #include <linux/cpu.h>
32 #include <trace/events/power.h>
33 
34 #include <asm/cputhreads.h>
35 #include <asm/firmware.h>
36 #include <asm/reg.h>
37 #include <asm/smp.h> /* Required for cpu_sibling_mask() in UP configs */
38 #include <asm/opal.h>
39 #include <linux/timer.h>
40 
41 #define POWERNV_MAX_PSTATES	256
42 #define PMSR_PSAFE_ENABLE	(1UL << 30)
43 #define PMSR_SPR_EM_DISABLE	(1UL << 31)
#define PMSR_MAX(x)		(((x) >> 32) & 0xFF)
45 #define LPSTATE_SHIFT		48
46 #define GPSTATE_SHIFT		56
47 #define GET_LPSTATE(x)		(((x) >> LPSTATE_SHIFT) & 0xFF)
48 #define GET_GPSTATE(x)		(((x) >> GPSTATE_SHIFT) & 0xFF)
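
/*
 * The pstate fields in the PMSR and PMCR are 8 bits wide and hold signed
 * pstate ids, which is why values extracted with the macros above are cast
 * to s8 before use (see powernv_read_cpu_freq() and
 * powernv_cpufreq_throttle_check()).
 */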
49 
50 #define MAX_RAMP_DOWN_TIME				5120
/*
 * On an idle system we want the global pstate to ramp down from its max value
 * to its min value over a span of ~5 seconds. We also want the ramp-down to
 * start off slowly and become steeper as time passes.
 *
 * This gives the ramp-down percentage for the time elapsed in milliseconds:
 * ramp_down_percent = ((ms * ms) >> 18)
 *		      ~= 3.8 * (sec * sec)
 *
 * At    0 ms	ramp_down_percent = 0
 * At 2560 ms	ramp_down_percent = 25
 * At 5120 ms	ramp_down_percent = 100
 */
#define ramp_down_percent(time)		(((time) * (time)) >> 18)
64 
65 /* Interval after which the timer is queued to bring down global pstate */
66 #define GPSTATE_TIMER_INTERVAL				2000
67 
68 /**
69  * struct global_pstate_info -	Per policy data structure to maintain history of
70  *				global pstates
71  * @highest_lpstate_idx:	The local pstate index from which we are
72  *				ramping down
73  * @elapsed_time:		Time in ms spent in ramping down from
74  *				highest_lpstate_idx
75  * @last_sampled_time:		Time from boot in ms when global pstates were
76  *				last set
 * @last_lpstate_idx:		Last set value of the local pstate, as a
 *				cpufreq table index
 * @last_gpstate_idx:		Last set value of the global pstate, as a
 *				cpufreq table index
 * @timer:			Used for ramping down if the cpu goes idle for
 *				a long time with the global pstate held high
 * @gpstate_lock:		A spinlock to maintain synchronization between
 *				routines called by the timer handler and
 *				governor's target_index calls
84  */
85 struct global_pstate_info {
86 	int highest_lpstate_idx;
87 	unsigned int elapsed_time;
88 	unsigned int last_sampled_time;
89 	int last_lpstate_idx;
90 	int last_gpstate_idx;
91 	spinlock_t gpstate_lock;
92 	struct timer_list timer;
93 };
94 
95 static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1];
96 static bool rebooting, throttled, occ_reset;
97 
98 static const char * const throttle_reason[] = {
99 	"No throttling",
100 	"Power Cap",
101 	"Processor Over Temperature",
102 	"Power Supply Failure",
103 	"Over Current",
104 	"OCC Reset"
105 };
106 
107 enum throttle_reason_type {
108 	NO_THROTTLE = 0,
109 	POWERCAP,
110 	CPU_OVERTEMP,
111 	POWER_SUPPLY_FAILURE,
112 	OVERCURRENT,
113 	OCC_RESET_THROTTLE,
114 	OCC_MAX_REASON
115 };
116 
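/*
 * struct chip - Per-chip throttling state
 * @id:			Hardware chip id
 * @throttled:		True while Pmax capping is in effect on this chip
 * @restore:		Set when the frequency should be restored once the
 *			throttle is lifted
 * @throttle_reason:	Most recent OCC throttle reason reported for this chip
 * @mask:		CPUs belonging to this chip
 * @throttle:		Work item that re-checks the throttle state and
 *			restores the frequency
 * @throttle_turbo:	Number of Pmax cappings within the turbo range
 * @throttle_sub_turbo:	Number of Pmax cappings below the nominal pstate
 * @reason:		Per-reason counts of OCC throttle events
 */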
117 static struct chip {
118 	unsigned int id;
119 	bool throttled;
120 	bool restore;
121 	u8 throttle_reason;
122 	cpumask_t mask;
123 	struct work_struct throttle;
124 	int throttle_turbo;
125 	int throttle_sub_turbo;
126 	int reason[OCC_MAX_REASON];
127 } *chips;
128 
129 static int nr_chips;
130 static DEFINE_PER_CPU(struct chip *, chip_info);
131 
132 /*
133  * Note:
134  * The set of pstates consists of contiguous integers.
135  * powernv_pstate_info stores the index of the frequency table for
136  * max, min and nominal frequencies. It also stores number of
137  * available frequencies.
138  *
139  * powernv_pstate_info.nominal indicates the index to the highest
140  * non-turbo frequency.
141  */
142 static struct powernv_pstate_info {
143 	unsigned int min;
144 	unsigned int max;
145 	unsigned int nominal;
146 	unsigned int nr_pstates;
147 } powernv_pstate_info;
148 
/* Use the following functions for conversions between pstate_id and index */
150 static inline int idx_to_pstate(unsigned int i)
151 {
152 	if (unlikely(i >= powernv_pstate_info.nr_pstates)) {
		pr_warn_once("index %u is out of bounds\n", i);
154 		return powernv_freqs[powernv_pstate_info.nominal].driver_data;
155 	}
156 
157 	return powernv_freqs[i].driver_data;
158 }
159 
160 static inline unsigned int pstate_to_idx(int pstate)
161 {
162 	int min = powernv_freqs[powernv_pstate_info.min].driver_data;
163 	int max = powernv_freqs[powernv_pstate_info.max].driver_data;
164 
165 	if (min > 0) {
166 		if (unlikely((pstate < max) || (pstate > min))) {
			pr_warn_once("pstate %d is out of bounds\n", pstate);
168 			return powernv_pstate_info.nominal;
169 		}
170 	} else {
171 		if (unlikely((pstate > max) || (pstate < min))) {
			pr_warn_once("pstate %d is out of bounds\n", pstate);
173 			return powernv_pstate_info.nominal;
174 		}
175 	}
	/*
	 * abs() is deliberately used so that this works with
	 * both monotonically increasing and decreasing
	 * pstate values, e.g. with pstate ids running from 0 (max)
	 * down to -7 (min), pstate -3 maps to index 3.
	 */
181 	return abs(pstate - idx_to_pstate(powernv_pstate_info.max));
182 }
183 
184 static inline void reset_gpstates(struct cpufreq_policy *policy)
185 {
186 	struct global_pstate_info *gpstates = policy->driver_data;
187 
188 	gpstates->highest_lpstate_idx = 0;
189 	gpstates->elapsed_time = 0;
190 	gpstates->last_sampled_time = 0;
191 	gpstates->last_lpstate_idx = 0;
192 	gpstates->last_gpstate_idx = 0;
193 }
194 
195 /*
196  * Initialize the freq table based on data obtained
197  * from the firmware passed via device-tree
198  */
199 static int init_powernv_pstates(void)
200 {
201 	struct device_node *power_mgt;
202 	int i, nr_pstates = 0;
203 	const __be32 *pstate_ids, *pstate_freqs;
204 	u32 len_ids, len_freqs;
205 	u32 pstate_min, pstate_max, pstate_nominal;
206 
207 	power_mgt = of_find_node_by_path("/ibm,opal/power-mgt");
208 	if (!power_mgt) {
209 		pr_warn("power-mgt node not found\n");
210 		return -ENODEV;
211 	}
212 
213 	if (of_property_read_u32(power_mgt, "ibm,pstate-min", &pstate_min)) {
214 		pr_warn("ibm,pstate-min node not found\n");
215 		return -ENODEV;
216 	}
217 
218 	if (of_property_read_u32(power_mgt, "ibm,pstate-max", &pstate_max)) {
219 		pr_warn("ibm,pstate-max node not found\n");
220 		return -ENODEV;
221 	}
222 
223 	if (of_property_read_u32(power_mgt, "ibm,pstate-nominal",
224 				 &pstate_nominal)) {
225 		pr_warn("ibm,pstate-nominal not found\n");
226 		return -ENODEV;
227 	}
228 	pr_info("cpufreq pstate min %d nominal %d max %d\n", pstate_min,
229 		pstate_nominal, pstate_max);
230 
231 	pstate_ids = of_get_property(power_mgt, "ibm,pstate-ids", &len_ids);
232 	if (!pstate_ids) {
233 		pr_warn("ibm,pstate-ids not found\n");
234 		return -ENODEV;
235 	}
236 
237 	pstate_freqs = of_get_property(power_mgt, "ibm,pstate-frequencies-mhz",
238 				      &len_freqs);
239 	if (!pstate_freqs) {
240 		pr_warn("ibm,pstate-frequencies-mhz not found\n");
241 		return -ENODEV;
242 	}
243 
244 	if (len_ids != len_freqs) {
		pr_warn("Entries in ibm,pstate-ids and "
			"ibm,pstate-frequencies-mhz do not match\n");
247 	}
248 
249 	nr_pstates = min(len_ids, len_freqs) / sizeof(u32);
250 	if (!nr_pstates) {
251 		pr_warn("No PStates found\n");
252 		return -ENODEV;
253 	}
254 
255 	powernv_pstate_info.nr_pstates = nr_pstates;
256 	pr_debug("NR PStates %d\n", nr_pstates);
257 	for (i = 0; i < nr_pstates; i++) {
258 		u32 id = be32_to_cpu(pstate_ids[i]);
259 		u32 freq = be32_to_cpu(pstate_freqs[i]);
260 
261 		pr_debug("PState id %d freq %d MHz\n", id, freq);
262 		powernv_freqs[i].frequency = freq * 1000; /* kHz */
263 		powernv_freqs[i].driver_data = id;
264 
265 		if (id == pstate_max)
266 			powernv_pstate_info.max = i;
267 		else if (id == pstate_nominal)
268 			powernv_pstate_info.nominal = i;
269 		else if (id == pstate_min)
270 			powernv_pstate_info.min = i;
271 	}
272 
273 	/* End of list marker entry */
274 	powernv_freqs[i].frequency = CPUFREQ_TABLE_END;
275 	return 0;
276 }
277 
278 /* Returns the CPU frequency corresponding to the pstate_id. */
279 static unsigned int pstate_id_to_freq(int pstate_id)
280 {
281 	int i;
282 
283 	i = pstate_to_idx(pstate_id);
284 	if (i >= powernv_pstate_info.nr_pstates || i < 0) {
285 		pr_warn("PState id %d outside of PState table, "
286 			"reporting nominal id %d instead\n",
287 			pstate_id, idx_to_pstate(powernv_pstate_info.nominal));
288 		i = powernv_pstate_info.nominal;
289 	}
290 
291 	return powernv_freqs[i].frequency;
292 }
293 
294 /*
295  * cpuinfo_nominal_freq_show - Show the nominal CPU frequency as indicated by
296  * the firmware
297  */
298 static ssize_t cpuinfo_nominal_freq_show(struct cpufreq_policy *policy,
299 					char *buf)
300 {
301 	return sprintf(buf, "%u\n",
302 		powernv_freqs[powernv_pstate_info.nominal].frequency);
303 }
304 
305 struct freq_attr cpufreq_freq_attr_cpuinfo_nominal_freq =
306 	__ATTR_RO(cpuinfo_nominal_freq);
307 
308 static struct freq_attr *powernv_cpu_freq_attr[] = {
309 	&cpufreq_freq_attr_scaling_available_freqs,
310 	&cpufreq_freq_attr_cpuinfo_nominal_freq,
311 	NULL,
312 };
313 
314 #define throttle_attr(name, member)					\
315 static ssize_t name##_show(struct cpufreq_policy *policy, char *buf)	\
316 {									\
317 	struct chip *chip = per_cpu(chip_info, policy->cpu);		\
318 									\
319 	return sprintf(buf, "%u\n", chip->member);			\
320 }									\
321 									\
322 static struct freq_attr throttle_attr_##name = __ATTR_RO(name)		\
323 
324 throttle_attr(unthrottle, reason[NO_THROTTLE]);
325 throttle_attr(powercap, reason[POWERCAP]);
326 throttle_attr(overtemp, reason[CPU_OVERTEMP]);
327 throttle_attr(supply_fault, reason[POWER_SUPPLY_FAILURE]);
328 throttle_attr(overcurrent, reason[OVERCURRENT]);
329 throttle_attr(occ_reset, reason[OCC_RESET_THROTTLE]);
330 throttle_attr(turbo_stat, throttle_turbo);
331 throttle_attr(sub_turbo_stat, throttle_sub_turbo);
332 
333 static struct attribute *throttle_attrs[] = {
334 	&throttle_attr_unthrottle.attr,
335 	&throttle_attr_powercap.attr,
336 	&throttle_attr_overtemp.attr,
337 	&throttle_attr_supply_fault.attr,
338 	&throttle_attr_overcurrent.attr,
339 	&throttle_attr_occ_reset.attr,
340 	&throttle_attr_turbo_stat.attr,
341 	&throttle_attr_sub_turbo_stat.attr,
342 	NULL,
343 };
344 
345 static const struct attribute_group throttle_attr_grp = {
346 	.name	= "throttle_stats",
347 	.attrs	= throttle_attrs,
348 };
349 
350 /* Helper routines */
351 
352 /* Access helpers to power mgt SPR */
353 
354 static inline unsigned long get_pmspr(unsigned long sprn)
355 {
356 	switch (sprn) {
357 	case SPRN_PMCR:
358 		return mfspr(SPRN_PMCR);
359 
360 	case SPRN_PMICR:
361 		return mfspr(SPRN_PMICR);
362 
363 	case SPRN_PMSR:
364 		return mfspr(SPRN_PMSR);
365 	}
366 	BUG();
367 }
368 
369 static inline void set_pmspr(unsigned long sprn, unsigned long val)
370 {
371 	switch (sprn) {
372 	case SPRN_PMCR:
373 		mtspr(SPRN_PMCR, val);
374 		return;
375 
376 	case SPRN_PMICR:
377 		mtspr(SPRN_PMICR, val);
378 		return;
379 	}
380 	BUG();
381 }
382 
383 /*
384  * Use objects of this type to query/update
385  * pstates on a remote CPU via smp_call_function.
386  */
387 struct powernv_smp_call_data {
388 	unsigned int freq;
389 	int pstate_id;
390 	int gpstate_id;
391 };
392 
393 /*
394  * powernv_read_cpu_freq: Reads the current frequency on this CPU.
395  *
396  * Called via smp_call_function.
397  *
398  * Note: The caller of the smp_call_function should pass an argument of
399  * the type 'struct powernv_smp_call_data *' along with this function.
400  *
401  * The current frequency on this CPU will be returned via
402  * ((struct powernv_smp_call_data *)arg)->freq;
403  */
404 static void powernv_read_cpu_freq(void *arg)
405 {
406 	unsigned long pmspr_val;
407 	s8 local_pstate_id;
408 	struct powernv_smp_call_data *freq_data = arg;
409 
410 	pmspr_val = get_pmspr(SPRN_PMSR);
411 
412 	/*
	 * The local pstate id corresponds to bits 48..55 in the PMSR.
414 	 * Note: Watch out for the sign!
415 	 */
416 	local_pstate_id = (pmspr_val >> 48) & 0xFF;
417 	freq_data->pstate_id = local_pstate_id;
418 	freq_data->freq = pstate_id_to_freq(freq_data->pstate_id);
419 
420 	pr_debug("cpu %d pmsr %016lX pstate_id %d frequency %d kHz\n",
421 		raw_smp_processor_id(), pmspr_val, freq_data->pstate_id,
422 		freq_data->freq);
423 }
424 
425 /*
426  * powernv_cpufreq_get: Returns the CPU frequency as reported by the
427  * firmware for CPU 'cpu'. This value is reported through the sysfs
428  * file cpuinfo_cur_freq.
429  */
430 static unsigned int powernv_cpufreq_get(unsigned int cpu)
431 {
432 	struct powernv_smp_call_data freq_data;
433 
434 	smp_call_function_any(cpu_sibling_mask(cpu), powernv_read_cpu_freq,
435 			&freq_data, 1);
436 
437 	return freq_data.freq;
438 }
439 
440 /*
441  * set_pstate: Sets the pstate on this CPU.
442  *
443  * This is called via an smp_call_function.
444  *
445  * The caller must ensure that freq_data is of the type
446  * (struct powernv_smp_call_data *) and the pstate_id which needs to be set
447  * on this CPU should be present in freq_data->pstate_id.
448  */
449 static void set_pstate(void *data)
450 {
451 	unsigned long val;
452 	struct powernv_smp_call_data *freq_data = data;
453 	unsigned long pstate_ul = freq_data->pstate_id;
454 	unsigned long gpstate_ul = freq_data->gpstate_id;
455 
456 	val = get_pmspr(SPRN_PMCR);
457 	val = val & 0x0000FFFFFFFFFFFFULL;
458 
459 	pstate_ul = pstate_ul & 0xFF;
460 	gpstate_ul = gpstate_ul & 0xFF;
461 
462 	/* Set both global(bits 56..63) and local(bits 48..55) PStates */
463 	val = val | (gpstate_ul << 56) | (pstate_ul << 48);
464 
465 	pr_debug("Setting cpu %d pmcr to %016lX\n",
466 			raw_smp_processor_id(), val);
467 	set_pmspr(SPRN_PMCR, val);
468 }
469 
470 /*
471  * get_nominal_index: Returns the index corresponding to the nominal
472  * pstate in the cpufreq table
473  */
474 static inline unsigned int get_nominal_index(void)
475 {
476 	return powernv_pstate_info.nominal;
477 }
478 
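/*
 * powernv_cpufreq_throttle_check() reads the PMSR on the current CPU and
 * records and reports any throttling: Pmax capping (within the turbo range
 * or below nominal), Psafe mode and SPR_EM_DISABLE. It runs either via
 * smp_call_function or directly with preemption disabled.
 */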
479 static void powernv_cpufreq_throttle_check(void *data)
480 {
481 	struct chip *chip;
482 	unsigned int cpu = smp_processor_id();
483 	unsigned long pmsr;
484 	int pmsr_pmax;
485 	unsigned int pmsr_pmax_idx;
486 
487 	pmsr = get_pmspr(SPRN_PMSR);
488 	chip = this_cpu_read(chip_info);
489 
490 	/* Check for Pmax Capping */
491 	pmsr_pmax = (s8)PMSR_MAX(pmsr);
492 	pmsr_pmax_idx = pstate_to_idx(pmsr_pmax);
493 	if (pmsr_pmax_idx != powernv_pstate_info.max) {
494 		if (chip->throttled)
495 			goto next;
496 		chip->throttled = true;
497 		if (pmsr_pmax_idx > powernv_pstate_info.nominal) {
498 			pr_warn_once("CPU %d on Chip %u has Pmax(%d) reduced below nominal frequency(%d)\n",
499 				     cpu, chip->id, pmsr_pmax,
500 				     idx_to_pstate(powernv_pstate_info.nominal));
501 			chip->throttle_sub_turbo++;
502 		} else {
503 			chip->throttle_turbo++;
504 		}
505 		trace_powernv_throttle(chip->id,
506 				      throttle_reason[chip->throttle_reason],
507 				      pmsr_pmax);
508 	} else if (chip->throttled) {
509 		chip->throttled = false;
510 		trace_powernv_throttle(chip->id,
511 				      throttle_reason[chip->throttle_reason],
512 				      pmsr_pmax);
513 	}
514 
515 	/* Check if Psafe_mode_active is set in PMSR. */
516 next:
517 	if (pmsr & PMSR_PSAFE_ENABLE) {
518 		throttled = true;
519 		pr_info("Pstate set to safe frequency\n");
520 	}
521 
522 	/* Check if SPR_EM_DISABLE is set in PMSR */
523 	if (pmsr & PMSR_SPR_EM_DISABLE) {
524 		throttled = true;
525 		pr_info("Frequency Control disabled from OS\n");
526 	}
527 
528 	if (throttled) {
529 		pr_info("PMSR = %16lx\n", pmsr);
530 		pr_warn("CPU Frequency could be throttled\n");
531 	}
532 }
533 
534 /**
535  * calc_global_pstate - Calculate global pstate
536  * @elapsed_time:		Elapsed time in milliseconds
537  * @local_pstate_idx:		New local pstate
 * @highest_lpstate_idx:	pstate from which it is ramping down
 *
 * Finds the appropriate global pstate based on the pstate from which it is
 * ramping down and the time elapsed so far. The ramp-down follows a quadratic
 * curve which ensures that the global pstate reaches pmin within ~5 seconds.
543  */
544 static inline int calc_global_pstate(unsigned int elapsed_time,
545 				     int highest_lpstate_idx,
546 				     int local_pstate_idx)
547 {
548 	int index_diff;
549 
	/*
	 * ramp_down_percent() gives the percentage of the total ramp-down
	 * that should have happened by now. The difference between
	 * highest_lpstate_idx and powernv_pstate_info.min is the absolute
	 * number of pstates that will eventually be dropped by the end of
	 * 5 seconds; scaling it by that percentage gives the number of
	 * pstates to drop at this point.
	 */
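	/*
	 * Illustrative example (assuming a table where index 0 is pmax and
	 * index 7 is pmin): 2048 ms into a ramp-down that started at index 0,
	 * ramp_down_percent(2048) = 16, so index_diff = (16 * (7 - 0)) / 100 = 1.
	 */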
557 	index_diff =  ((int)ramp_down_percent(elapsed_time) *
558 			(powernv_pstate_info.min - highest_lpstate_idx)) / 100;
559 
560 	/* Ensure that global pstate is >= to local pstate */
561 	if (highest_lpstate_idx + index_diff >= local_pstate_idx)
562 		return local_pstate_idx;
563 	else
564 		return highest_lpstate_idx + index_diff;
565 }
566 
567 static inline void  queue_gpstate_timer(struct global_pstate_info *gpstates)
568 {
569 	unsigned int timer_interval;
570 
	/*
	 * Set the timer to fire after GPSTATE_TIMER_INTERVAL ms. But if that
	 * would push the total ramp-down time past MAX_RAMP_DOWN_TIME ms,
	 * set it so that it fires exactly when MAX_RAMP_DOWN_TIME ms of
	 * ramp-down time has elapsed.
	 */
577 	if ((gpstates->elapsed_time + GPSTATE_TIMER_INTERVAL)
578 	     > MAX_RAMP_DOWN_TIME)
579 		timer_interval = MAX_RAMP_DOWN_TIME - gpstates->elapsed_time;
580 	else
581 		timer_interval = GPSTATE_TIMER_INTERVAL;
582 
583 	mod_timer(&gpstates->timer, jiffies + msecs_to_jiffies(timer_interval));
584 }
585 
586 /**
 * gpstate_timer_handler - Timer callback to ramp down the global pstate
 *
 * @data: pointer to the cpufreq_policy on which the timer was queued
 *
 * This handler brings the global pstate closer to the local pstate according
 * to the quadratic ramp-down equation. It queues a new timer if the global
 * pstate is still not equal to the local pstate.
594  */
595 void gpstate_timer_handler(unsigned long data)
596 {
597 	struct cpufreq_policy *policy = (struct cpufreq_policy *)data;
598 	struct global_pstate_info *gpstates = policy->driver_data;
599 	int gpstate_idx, lpstate_idx;
600 	unsigned long val;
601 	unsigned int time_diff = jiffies_to_msecs(jiffies)
602 					- gpstates->last_sampled_time;
603 	struct powernv_smp_call_data freq_data;
604 
605 	if (!spin_trylock(&gpstates->gpstate_lock))
606 		return;
607 
	/*
	 * If the PMCR was last updated via fast_switch(), then
	 * gpstates->last_lpstate_idx may hold a stale value.
	 * Hence, read the PMCR to get the correct data.
	 */
613 	val = get_pmspr(SPRN_PMCR);
614 	freq_data.gpstate_id = (s8)GET_GPSTATE(val);
615 	freq_data.pstate_id = (s8)GET_LPSTATE(val);
616 	if (freq_data.gpstate_id  == freq_data.pstate_id) {
617 		reset_gpstates(policy);
618 		spin_unlock(&gpstates->gpstate_lock);
619 		return;
620 	}
621 
622 	gpstates->last_sampled_time += time_diff;
623 	gpstates->elapsed_time += time_diff;
624 
625 	if (gpstates->elapsed_time > MAX_RAMP_DOWN_TIME) {
626 		gpstate_idx = pstate_to_idx(freq_data.pstate_id);
627 		lpstate_idx = gpstate_idx;
628 		reset_gpstates(policy);
629 		gpstates->highest_lpstate_idx = gpstate_idx;
630 	} else {
631 		lpstate_idx = pstate_to_idx(freq_data.pstate_id);
632 		gpstate_idx = calc_global_pstate(gpstates->elapsed_time,
633 						 gpstates->highest_lpstate_idx,
634 						 lpstate_idx);
635 	}
636 	freq_data.gpstate_id = idx_to_pstate(gpstate_idx);
637 	gpstates->last_gpstate_idx = gpstate_idx;
638 	gpstates->last_lpstate_idx = lpstate_idx;
	/*
	 * If the local pstate is equal to the global pstate, the ramp-down
	 * is over, so the timer does not need to be queued again.
	 */
643 	if (gpstate_idx != gpstates->last_lpstate_idx)
644 		queue_gpstate_timer(gpstates);
645 
646 	spin_unlock(&gpstates->gpstate_lock);
647 
648 	/* Timer may get migrated to a different cpu on cpu hot unplug */
649 	smp_call_function_any(policy->cpus, set_pstate, &freq_data, 1);
650 }
651 
652 /*
653  * powernv_cpufreq_target_index: Sets the frequency corresponding to
654  * the cpufreq table entry indexed by new_index on the cpus in the
655  * mask policy->cpus
656  */
657 static int powernv_cpufreq_target_index(struct cpufreq_policy *policy,
658 					unsigned int new_index)
659 {
660 	struct powernv_smp_call_data freq_data;
661 	unsigned int cur_msec, gpstate_idx;
662 	struct global_pstate_info *gpstates = policy->driver_data;
663 
664 	if (unlikely(rebooting) && new_index != get_nominal_index())
665 		return 0;
666 
667 	if (!throttled) {
		/*
		 * We don't want to be preempted while checking if the
		 * CPU frequency has been throttled.
		 */
671 		preempt_disable();
672 		powernv_cpufreq_throttle_check(NULL);
673 		preempt_enable();
674 	}
675 
676 	cur_msec = jiffies_to_msecs(get_jiffies_64());
677 
678 	spin_lock(&gpstates->gpstate_lock);
679 	freq_data.pstate_id = idx_to_pstate(new_index);
680 
681 	if (!gpstates->last_sampled_time) {
682 		gpstate_idx = new_index;
683 		gpstates->highest_lpstate_idx = new_index;
684 		goto gpstates_done;
685 	}
686 
687 	if (gpstates->last_gpstate_idx < new_index) {
688 		gpstates->elapsed_time += cur_msec -
689 						 gpstates->last_sampled_time;
690 
		/*
		 * If it has been ramping down for more than MAX_RAMP_DOWN_TIME,
		 * reset all global pstate related data and set it equal to the
		 * local pstate to start afresh.
		 */
696 		if (gpstates->elapsed_time > MAX_RAMP_DOWN_TIME) {
697 			reset_gpstates(policy);
698 			gpstates->highest_lpstate_idx = new_index;
699 			gpstate_idx = new_index;
700 		} else {
			/* Elapsed time is less than 5 seconds, continue to ramp down */
702 			gpstate_idx = calc_global_pstate(gpstates->elapsed_time,
703 							 gpstates->highest_lpstate_idx,
704 							 new_index);
705 		}
706 	} else {
707 		reset_gpstates(policy);
708 		gpstates->highest_lpstate_idx = new_index;
709 		gpstate_idx = new_index;
710 	}
711 
	/*
	 * If the local pstate is equal to the global pstate, the ramp-down
	 * is over, so the timer does not need to be queued again.
	 */
716 	if (gpstate_idx != new_index)
717 		queue_gpstate_timer(gpstates);
718 	else
719 		del_timer_sync(&gpstates->timer);
720 
721 gpstates_done:
722 	freq_data.gpstate_id = idx_to_pstate(gpstate_idx);
723 	gpstates->last_sampled_time = cur_msec;
724 	gpstates->last_gpstate_idx = gpstate_idx;
725 	gpstates->last_lpstate_idx = new_index;
726 
727 	spin_unlock(&gpstates->gpstate_lock);
728 
	/*
	 * Use smp_call_function to send an IPI and execute the
	 * mtspr on the target CPU. We could do that without an IPI
	 * if the current CPU is within policy->cpus (i.e. on the same core).
	 */
734 	smp_call_function_any(policy->cpus, set_pstate, &freq_data, 1);
735 	return 0;
736 }
737 
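/*
 * powernv_cpufreq_cpu_init() extends the policy to all hardware threads of
 * the core, exposes the throttle_stats sysfs group, allocates the per-policy
 * global_pstate_info and sets up the pinned, deferrable ramp-down timer.
 */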
738 static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy)
739 {
740 	int base, i, ret;
741 	struct kernfs_node *kn;
742 	struct global_pstate_info *gpstates;
743 
744 	base = cpu_first_thread_sibling(policy->cpu);
745 
746 	for (i = 0; i < threads_per_core; i++)
747 		cpumask_set_cpu(base + i, policy->cpus);
748 
749 	kn = kernfs_find_and_get(policy->kobj.sd, throttle_attr_grp.name);
750 	if (!kn) {
751 		int ret;
752 
753 		ret = sysfs_create_group(&policy->kobj, &throttle_attr_grp);
754 		if (ret) {
755 			pr_info("Failed to create throttle stats directory for cpu %d\n",
756 				policy->cpu);
757 			return ret;
758 		}
759 	} else {
760 		kernfs_put(kn);
761 	}
762 
763 	gpstates =  kzalloc(sizeof(*gpstates), GFP_KERNEL);
764 	if (!gpstates)
765 		return -ENOMEM;
766 
767 	policy->driver_data = gpstates;
768 
769 	/* initialize timer */
770 	init_timer_pinned_deferrable(&gpstates->timer);
771 	gpstates->timer.data = (unsigned long)policy;
772 	gpstates->timer.function = gpstate_timer_handler;
773 	gpstates->timer.expires = jiffies +
774 				msecs_to_jiffies(GPSTATE_TIMER_INTERVAL);
775 	spin_lock_init(&gpstates->gpstate_lock);
776 	ret = cpufreq_table_validate_and_show(policy, powernv_freqs);
777 
778 	if (ret < 0) {
779 		kfree(policy->driver_data);
780 		return ret;
781 	}
782 
783 	policy->fast_switch_possible = true;
784 	return ret;
785 }
786 
787 static int powernv_cpufreq_cpu_exit(struct cpufreq_policy *policy)
788 {
789 	/* timer is deleted in cpufreq_cpu_stop() */
790 	kfree(policy->driver_data);
791 
792 	return 0;
793 }
794 
795 static int powernv_cpufreq_reboot_notifier(struct notifier_block *nb,
796 				unsigned long action, void *unused)
797 {
798 	int cpu;
799 	struct cpufreq_policy cpu_policy;
800 
801 	rebooting = true;
802 	for_each_online_cpu(cpu) {
803 		cpufreq_get_policy(&cpu_policy, cpu);
804 		powernv_cpufreq_target_index(&cpu_policy, get_nominal_index());
805 	}
806 
807 	return NOTIFY_DONE;
808 }
809 
810 static struct notifier_block powernv_cpufreq_reboot_nb = {
811 	.notifier_call = powernv_cpufreq_reboot_notifier,
812 };
813 
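/*
 * Work function scheduled from the OCC message notifier: re-checks the
 * throttle state on one online CPU of the chip and, if a restore was
 * requested, re-applies the current frequency on every policy of the chip.
 */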
814 void powernv_cpufreq_work_fn(struct work_struct *work)
815 {
816 	struct chip *chip = container_of(work, struct chip, throttle);
817 	unsigned int cpu;
818 	cpumask_t mask;
819 
820 	get_online_cpus();
821 	cpumask_and(&mask, &chip->mask, cpu_online_mask);
822 	smp_call_function_any(&mask,
823 			      powernv_cpufreq_throttle_check, NULL, 0);
824 
825 	if (!chip->restore)
826 		goto out;
827 
828 	chip->restore = false;
829 	for_each_cpu(cpu, &mask) {
830 		int index;
831 		struct cpufreq_policy policy;
832 
833 		cpufreq_get_policy(&policy, cpu);
834 		index = cpufreq_table_find_index_c(&policy, policy.cur);
835 		powernv_cpufreq_target_index(&policy, index);
836 		cpumask_andnot(&mask, &mask, policy.cpus);
837 	}
838 out:
839 	put_online_cpus();
840 }
841 
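/*
 * OPAL message notifier: tracks OCC reset/load/throttle messages and
 * schedules the per-chip work item to re-evaluate throttling and restore
 * the frequency when throttling ends.
 */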
842 static int powernv_cpufreq_occ_msg(struct notifier_block *nb,
843 				   unsigned long msg_type, void *_msg)
844 {
845 	struct opal_msg *msg = _msg;
846 	struct opal_occ_msg omsg;
847 	int i;
848 
849 	if (msg_type != OPAL_MSG_OCC)
850 		return 0;
851 
852 	omsg.type = be64_to_cpu(msg->params[0]);
853 
854 	switch (omsg.type) {
855 	case OCC_RESET:
856 		occ_reset = true;
857 		pr_info("OCC (On Chip Controller - enforces hard thermal/power limits) Resetting\n");
		/*
		 * powernv_cpufreq_throttle_check() is called from the
		 * target() callback, which can detect the throttle state
		 * for governors like ondemand.
		 * But static governors will not call target() often, so
		 * report the throttling here.
		 */
865 		if (!throttled) {
866 			throttled = true;
			pr_warn("CPU frequency is throttled for the duration of the OCC reset\n");
868 		}
869 
870 		break;
871 	case OCC_LOAD:
872 		pr_info("OCC Loading, CPU frequency is throttled until OCC is started\n");
873 		break;
874 	case OCC_THROTTLE:
875 		omsg.chip = be64_to_cpu(msg->params[1]);
876 		omsg.throttle_status = be64_to_cpu(msg->params[2]);
877 
878 		if (occ_reset) {
879 			occ_reset = false;
880 			throttled = false;
881 			pr_info("OCC Active, CPU frequency is no longer throttled\n");
882 
883 			for (i = 0; i < nr_chips; i++) {
884 				chips[i].restore = true;
885 				schedule_work(&chips[i].throttle);
886 			}
887 
888 			return 0;
889 		}
890 
891 		for (i = 0; i < nr_chips; i++)
892 			if (chips[i].id == omsg.chip)
893 				break;
894 
895 		if (omsg.throttle_status >= 0 &&
896 		    omsg.throttle_status <= OCC_MAX_THROTTLE_STATUS) {
897 			chips[i].throttle_reason = omsg.throttle_status;
898 			chips[i].reason[omsg.throttle_status]++;
899 		}
900 
901 		if (!omsg.throttle_status)
902 			chips[i].restore = true;
903 
904 		schedule_work(&chips[i].throttle);
905 	}
906 	return 0;
907 }
908 
909 static struct notifier_block powernv_cpufreq_opal_nb = {
910 	.notifier_call	= powernv_cpufreq_occ_msg,
911 	.next		= NULL,
912 	.priority	= 0,
913 };
914 
915 static void powernv_cpufreq_stop_cpu(struct cpufreq_policy *policy)
916 {
917 	struct powernv_smp_call_data freq_data;
918 	struct global_pstate_info *gpstates = policy->driver_data;
919 
920 	freq_data.pstate_id = idx_to_pstate(powernv_pstate_info.min);
921 	freq_data.gpstate_id = idx_to_pstate(powernv_pstate_info.min);
922 	smp_call_function_single(policy->cpu, set_pstate, &freq_data, 1);
923 	del_timer_sync(&gpstates->timer);
924 }
925 
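/*
 * powernv_fast_switch() programs both the local and the global pstate to the
 * requested value directly on the calling CPU, without taking gpstate_lock or
 * updating the gpstate ramp-down history; this is why gpstate_timer_handler()
 * re-reads the PMCR instead of trusting gpstates->last_lpstate_idx.
 */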
926 static unsigned int powernv_fast_switch(struct cpufreq_policy *policy,
927 					unsigned int target_freq)
928 {
929 	int index;
930 	struct powernv_smp_call_data freq_data;
931 
932 	index = cpufreq_table_find_index_dl(policy, target_freq);
933 	freq_data.pstate_id = powernv_freqs[index].driver_data;
934 	freq_data.gpstate_id = powernv_freqs[index].driver_data;
935 	set_pstate(&freq_data);
936 
937 	return powernv_freqs[index].frequency;
938 }
939 
940 static struct cpufreq_driver powernv_cpufreq_driver = {
941 	.name		= "powernv-cpufreq",
942 	.flags		= CPUFREQ_CONST_LOOPS,
943 	.init		= powernv_cpufreq_cpu_init,
944 	.exit		= powernv_cpufreq_cpu_exit,
945 	.verify		= cpufreq_generic_frequency_table_verify,
946 	.target_index	= powernv_cpufreq_target_index,
947 	.fast_switch	= powernv_fast_switch,
948 	.get		= powernv_cpufreq_get,
949 	.stop_cpu	= powernv_cpufreq_stop_cpu,
950 	.attr		= powernv_cpu_freq_attr,
951 };
952 
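/*
 * Build the per-chip bookkeeping used for throttling notifications. Note that
 * the loop below detects a new chip only when the chip id changes between
 * consecutive CPUs, so it relies on CPUs of the same chip being numbered
 * contiguously.
 */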
953 static int init_chip_info(void)
954 {
955 	unsigned int chip[256];
956 	unsigned int cpu, i;
957 	unsigned int prev_chip_id = UINT_MAX;
958 
959 	for_each_possible_cpu(cpu) {
960 		unsigned int id = cpu_to_chip_id(cpu);
961 
962 		if (prev_chip_id != id) {
963 			prev_chip_id = id;
964 			chip[nr_chips++] = id;
965 		}
966 	}
967 
968 	chips = kcalloc(nr_chips, sizeof(struct chip), GFP_KERNEL);
969 	if (!chips)
970 		return -ENOMEM;
971 
972 	for (i = 0; i < nr_chips; i++) {
973 		chips[i].id = chip[i];
974 		cpumask_copy(&chips[i].mask, cpumask_of_node(chip[i]));
975 		INIT_WORK(&chips[i].throttle, powernv_cpufreq_work_fn);
976 		for_each_cpu(cpu, &chips[i].mask)
977 			per_cpu(chip_info, cpu) =  &chips[i];
978 	}
979 
980 	return 0;
981 }
982 
983 static inline void clean_chip_info(void)
984 {
985 	kfree(chips);
986 }
987 
988 static inline void unregister_all_notifiers(void)
989 {
990 	opal_message_notifier_unregister(OPAL_MSG_OCC,
991 					 &powernv_cpufreq_opal_nb);
992 	unregister_reboot_notifier(&powernv_cpufreq_reboot_nb);
993 }
994 
995 static int __init powernv_cpufreq_init(void)
996 {
997 	int rc = 0;
998 
999 	/* Don't probe on pseries (guest) platforms */
1000 	if (!firmware_has_feature(FW_FEATURE_OPAL))
1001 		return -ENODEV;
1002 
1003 	/* Discover pstates from device tree and init */
1004 	rc = init_powernv_pstates();
1005 	if (rc)
1006 		goto out;
1007 
1008 	/* Populate chip info */
1009 	rc = init_chip_info();
1010 	if (rc)
1011 		goto out;
1012 
1013 	register_reboot_notifier(&powernv_cpufreq_reboot_nb);
1014 	opal_message_notifier_register(OPAL_MSG_OCC, &powernv_cpufreq_opal_nb);
1015 
1016 	rc = cpufreq_register_driver(&powernv_cpufreq_driver);
1017 	if (!rc)
1018 		return 0;
1019 
1020 	pr_info("Failed to register the cpufreq driver (%d)\n", rc);
1021 	unregister_all_notifiers();
1022 	clean_chip_info();
1023 out:
1024 	pr_info("Platform driver disabled. System does not support PState control\n");
1025 	return rc;
1026 }
1027 module_init(powernv_cpufreq_init);
1028 
1029 static void __exit powernv_cpufreq_exit(void)
1030 {
1031 	cpufreq_unregister_driver(&powernv_cpufreq_driver);
1032 	unregister_all_notifiers();
1033 	clean_chip_info();
1034 }
1035 module_exit(powernv_cpufreq_exit);
1036 
1037 MODULE_LICENSE("GPL");
1038 MODULE_AUTHOR("Vaidyanathan Srinivasan <svaidy at linux.vnet.ibm.com>");
1039