1 /*
2  * PowerNV cpuidle code
3  *
4  * Copyright 2015 IBM Corp.
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version
9  * 2 of the License, or (at your option) any later version.
10  */
11 
12 #include <linux/types.h>
13 #include <linux/mm.h>
14 #include <linux/slab.h>
15 #include <linux/of.h>
16 #include <linux/device.h>
17 #include <linux/cpu.h>
18 
19 #include <asm/firmware.h>
20 #include <asm/machdep.h>
21 #include <asm/opal.h>
22 #include <asm/cputhreads.h>
23 #include <asm/cpuidle.h>
24 #include <asm/code-patching.h>
25 #include <asm/smp.h>
26 #include <asm/runlatch.h>
27 #include <asm/dbell.h>
28 
29 #include "powernv.h"
30 #include "subcore.h"
31 
/* Power ISA 3.0 allows for stop states 0x0 - 0xF */
#define MAX_STOP_STATE	0xF

/*
 * Register identifiers passed to opal_slw_set_reg() when programming
 * the MSR and PSSCR values on ISA 3.0 CPUs (see
 * pnv_save_sprs_for_deep_states()).
 */
#define P9_STOP_SPR_MSR 2000
#define P9_STOP_SPR_PSSCR      855

/*
 * Bitwise OR of every "ibm,cpu-idle-state-flags" entry read from the
 * device tree. OPAL_PM_LOSE_FULL_CONTEXT is cleared again if the SPRs
 * could not be handed to the stop-api.
 */
static u32 supported_cpuidle_states;

/*
 * The default stop state that will be used by ppc_md.power_save
 * function on platforms that support stop instruction.
 * default_stop_found is set once a state flagged OPAL_PM_STOP_INST_FAST
 * has been recorded in pnv_default_stop_{val,mask}.
 */
static u64 pnv_default_stop_val;
static u64 pnv_default_stop_mask;
static bool default_stop_found;

/*
 * First deep stop state. Used to figure out when to save/restore
 * hypervisor context.
 */
u64 pnv_first_deep_stop_state = MAX_STOP_STATE;

/*
 * psscr value and mask of the deepest stop idle state.
 * Used when a cpu is offlined.
 * pnv_deepest_stop_flag holds the device-tree flags word of that state;
 * deepest_stop_found is false while no valid state has been recorded
 * (or after falling back to busy-waiting for CPU-Hotplug).
 */
static u64 pnv_deepest_stop_psscr_val;
static u64 pnv_deepest_stop_psscr_mask;
static u64 pnv_deepest_stop_flag;
static bool deepest_stop_found;
62 
63 static int pnv_save_sprs_for_deep_states(void)
64 {
65 	int cpu;
66 	int rc;
67 
68 	/*
69 	 * hid0, hid1, hid4, hid5, hmeer and lpcr values are symmetric across
70 	 * all cpus at boot. Get these reg values of current cpu and use the
71 	 * same across all cpus.
72 	 */
73 	uint64_t lpcr_val = mfspr(SPRN_LPCR);
74 	uint64_t hid0_val = mfspr(SPRN_HID0);
75 	uint64_t hid1_val = mfspr(SPRN_HID1);
76 	uint64_t hid4_val = mfspr(SPRN_HID4);
77 	uint64_t hid5_val = mfspr(SPRN_HID5);
78 	uint64_t hmeer_val = mfspr(SPRN_HMEER);
79 	uint64_t msr_val = MSR_IDLE;
80 	uint64_t psscr_val = pnv_deepest_stop_psscr_val;
81 
82 	for_each_present_cpu(cpu) {
83 		uint64_t pir = get_hard_smp_processor_id(cpu);
84 		uint64_t hsprg0_val = (uint64_t)paca_ptrs[cpu];
85 
86 		rc = opal_slw_set_reg(pir, SPRN_HSPRG0, hsprg0_val);
87 		if (rc != 0)
88 			return rc;
89 
90 		rc = opal_slw_set_reg(pir, SPRN_LPCR, lpcr_val);
91 		if (rc != 0)
92 			return rc;
93 
94 		if (cpu_has_feature(CPU_FTR_ARCH_300)) {
95 			rc = opal_slw_set_reg(pir, P9_STOP_SPR_MSR, msr_val);
96 			if (rc)
97 				return rc;
98 
99 			rc = opal_slw_set_reg(pir,
100 					      P9_STOP_SPR_PSSCR, psscr_val);
101 
102 			if (rc)
103 				return rc;
104 		}
105 
106 		/* HIDs are per core registers */
107 		if (cpu_thread_in_core(cpu) == 0) {
108 
109 			rc = opal_slw_set_reg(pir, SPRN_HMEER, hmeer_val);
110 			if (rc != 0)
111 				return rc;
112 
113 			rc = opal_slw_set_reg(pir, SPRN_HID0, hid0_val);
114 			if (rc != 0)
115 				return rc;
116 
117 			/* Only p8 needs to set extra HID regiters */
118 			if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
119 
120 				rc = opal_slw_set_reg(pir, SPRN_HID1, hid1_val);
121 				if (rc != 0)
122 					return rc;
123 
124 				rc = opal_slw_set_reg(pir, SPRN_HID4, hid4_val);
125 				if (rc != 0)
126 					return rc;
127 
128 				rc = opal_slw_set_reg(pir, SPRN_HID5, hid5_val);
129 				if (rc != 0)
130 					return rc;
131 			}
132 		}
133 	}
134 
135 	return 0;
136 }
137 
138 static void pnv_alloc_idle_core_states(void)
139 {
140 	int i, j;
141 	int nr_cores = cpu_nr_cores();
142 	u32 *core_idle_state;
143 
144 	/*
145 	 * core_idle_state - The lower 8 bits track the idle state of
146 	 * each thread of the core.
147 	 *
148 	 * The most significant bit is the lock bit.
149 	 *
150 	 * Initially all the bits corresponding to threads_per_core
151 	 * are set. They are cleared when the thread enters deep idle
152 	 * state like sleep and winkle/stop.
153 	 *
154 	 * Initially the lock bit is cleared.  The lock bit has 2
155 	 * purposes:
156 	 * 	a. While the first thread in the core waking up from
157 	 * 	   idle is restoring core state, it prevents other
158 	 * 	   threads in the core from switching to process
159 	 * 	   context.
160 	 * 	b. While the last thread in the core is saving the
161 	 *	   core state, it prevents a different thread from
162 	 *	   waking up.
163 	 */
164 	for (i = 0; i < nr_cores; i++) {
165 		int first_cpu = i * threads_per_core;
166 		int node = cpu_to_node(first_cpu);
167 		size_t paca_ptr_array_size;
168 
169 		core_idle_state = kmalloc_node(sizeof(u32), GFP_KERNEL, node);
170 		*core_idle_state = (1 << threads_per_core) - 1;
171 		paca_ptr_array_size = (threads_per_core *
172 				       sizeof(struct paca_struct *));
173 
174 		for (j = 0; j < threads_per_core; j++) {
175 			int cpu = first_cpu + j;
176 
177 			paca_ptrs[cpu]->core_idle_state_ptr = core_idle_state;
178 			paca_ptrs[cpu]->thread_idle_state = PNV_THREAD_RUNNING;
179 			paca_ptrs[cpu]->thread_mask = 1 << j;
180 			if (!cpu_has_feature(CPU_FTR_POWER9_DD1))
181 				continue;
182 			paca_ptrs[cpu]->thread_sibling_pacas =
183 				kmalloc_node(paca_ptr_array_size,
184 					     GFP_KERNEL, node);
185 		}
186 	}
187 
188 	update_subcore_sibling_mask();
189 
190 	if (supported_cpuidle_states & OPAL_PM_LOSE_FULL_CONTEXT) {
191 		int rc = pnv_save_sprs_for_deep_states();
192 
193 		if (likely(!rc))
194 			return;
195 
196 		/*
197 		 * The stop-api is unable to restore hypervisor
198 		 * resources on wakeup from platform idle states which
199 		 * lose full context. So disable such states.
200 		 */
201 		supported_cpuidle_states &= ~OPAL_PM_LOSE_FULL_CONTEXT;
202 		pr_warn("cpuidle-powernv: Disabling idle states that lose full context\n");
203 		pr_warn("cpuidle-powernv: Idle power-savings, CPU-Hotplug affected\n");
204 
205 		if (cpu_has_feature(CPU_FTR_ARCH_300) &&
206 		    (pnv_deepest_stop_flag & OPAL_PM_LOSE_FULL_CONTEXT)) {
207 			/*
208 			 * Use the default stop state for CPU-Hotplug
209 			 * if available.
210 			 */
211 			if (default_stop_found) {
212 				pnv_deepest_stop_psscr_val =
213 					pnv_default_stop_val;
214 				pnv_deepest_stop_psscr_mask =
215 					pnv_default_stop_mask;
216 				pr_warn("cpuidle-powernv: Offlined CPUs will stop with psscr = 0x%016llx\n",
217 					pnv_deepest_stop_psscr_val);
218 			} else { /* Fallback to snooze loop for CPU-Hotplug */
219 				deepest_stop_found = false;
220 				pr_warn("cpuidle-powernv: Offlined CPUs will busy wait\n");
221 			}
222 		}
223 	}
224 }
225 
/*
 * Return the current supported_cpuidle_states bitmask, i.e. the OPAL_PM_*
 * flag bits accumulated from the device tree (minus
 * OPAL_PM_LOSE_FULL_CONTEXT if the stop-api could not be programmed).
 * The value is 0 until pnv_init_idle_states() has run.
 */
u32 pnv_get_supported_cpuidle_states(void)
{
	return supported_cpuidle_states;
}
EXPORT_SYMBOL_GPL(pnv_get_supported_cpuidle_states);
231 
/*
 * Cross-CPU callback (run through on_each_cpu_mask() by
 * store_fastsleep_workaround_applyonce()): ask OPAL to apply the
 * fastsleep workaround on the CPU this runs on.
 *
 * @info: points to an int that is set to 1 if the OPAL call returns
 *        non-zero. It is never cleared here, so the caller must
 *        initialise it to 0 beforehand.
 */
static void pnv_fastsleep_workaround_apply(void *info)

{
	int rc;
	int *err = info;

	rc = opal_config_cpu_idle_state(OPAL_CONFIG_IDLE_FASTSLEEP,
					OPAL_CONFIG_IDLE_APPLY);
	if (rc)
		*err = 1;
}
243 
/*
 * Used to store fastsleep workaround state
 * 0 - Workaround applied/undone at fastsleep entry/exit path (Default)
 * 1 - Workaround applied once, never undone.
 */
static u8 fastsleep_workaround_applyonce;

/*
 * sysfs "show" handler: print the current value of
 * fastsleep_workaround_applyonce followed by a newline into @buf and
 * return the number of characters written. @dev and @attr are unused.
 */
static ssize_t show_fastsleep_workaround_applyonce(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", fastsleep_workaround_applyonce);
}
256 
257 static ssize_t store_fastsleep_workaround_applyonce(struct device *dev,
258 		struct device_attribute *attr, const char *buf,
259 		size_t count)
260 {
261 	cpumask_t primary_thread_mask;
262 	int err;
263 	u8 val;
264 
265 	if (kstrtou8(buf, 0, &val) || val != 1)
266 		return -EINVAL;
267 
268 	if (fastsleep_workaround_applyonce == 1)
269 		return count;
270 
271 	/*
272 	 * fastsleep_workaround_applyonce = 1 implies
273 	 * fastsleep workaround needs to be left in 'applied' state on all
274 	 * the cores. Do this by-
275 	 * 1. Patching out the call to 'undo' workaround in fastsleep exit path
276 	 * 2. Sending ipi to all the cores which have at least one online thread
277 	 * 3. Patching out the call to 'apply' workaround in fastsleep entry
278 	 * path
279 	 * There is no need to send ipi to cores which have all threads
280 	 * offlined, as last thread of the core entering fastsleep or deeper
281 	 * state would have applied workaround.
282 	 */
283 	err = patch_instruction(
284 		(unsigned int *)pnv_fastsleep_workaround_at_exit,
285 		PPC_INST_NOP);
286 	if (err) {
287 		pr_err("fastsleep_workaround_applyonce change failed while patching pnv_fastsleep_workaround_at_exit");
288 		goto fail;
289 	}
290 
291 	get_online_cpus();
292 	primary_thread_mask = cpu_online_cores_map();
293 	on_each_cpu_mask(&primary_thread_mask,
294 				pnv_fastsleep_workaround_apply,
295 				&err, 1);
296 	put_online_cpus();
297 	if (err) {
298 		pr_err("fastsleep_workaround_applyonce change failed while running pnv_fastsleep_workaround_apply");
299 		goto fail;
300 	}
301 
302 	err = patch_instruction(
303 		(unsigned int *)pnv_fastsleep_workaround_at_entry,
304 		PPC_INST_NOP);
305 	if (err) {
306 		pr_err("fastsleep_workaround_applyonce change failed while patching pnv_fastsleep_workaround_at_entry");
307 		goto fail;
308 	}
309 
310 	fastsleep_workaround_applyonce = 1;
311 
312 	return count;
313 fail:
314 	return -EIO;
315 }
316 
/*
 * Device attribute "fastsleep_workaround_applyonce" (mode 0600) backed by
 * the show/store handlers above. pnv_init_idle_states() creates the file
 * under cpu_subsys.dev_root only when OPAL_PM_SLEEP_ENABLED_ER1 is set.
 */
static DEVICE_ATTR(fastsleep_workaround_applyonce, 0600,
			show_fastsleep_workaround_applyonce,
			store_fastsleep_workaround_applyonce);
320 
/*
 * Enter the POWER7-style idle state @type (a PNV_THREAD_* value) and
 * return the value handed back by power7_idle_insn().
 *
 * Returns 0 without idling if prep_irq_for_idle_irqsoff() reports that
 * idle must not be entered.
 */
static unsigned long __power7_idle_type(unsigned long type)
{
	unsigned long srr1;

	/* Bail out if the irq state does not allow going idle right now */
	if (!prep_irq_for_idle_irqsoff())
		return 0;

	/* Runlatch is turned off for the duration of the idle period */
	__ppc64_runlatch_off();
	srr1 = power7_idle_insn(type);
	__ppc64_runlatch_on();

	/* Counterpart of the successful prep_irq_for_idle_irqsoff() above */
	fini_irq_for_idle_irqsoff();

	return srr1;
}
336 
/*
 * Enter the POWER7-style idle state @type, then pass the value returned
 * from the idle attempt to irq_set_pending_from_srr1().
 */
void power7_idle_type(unsigned long type)
{
	irq_set_pending_from_srr1(__power7_idle_type(type));
}
344 
345 void power7_idle(void)
346 {
347 	if (!powersave_nap)
348 		return;
349 
350 	power7_idle_type(PNV_THREAD_NAP);
351 }
352 
/*
 * Enter a stop state on ISA 3.0 CPUs.
 *
 * @stop_psscr_val:  PSSCR bits to request for this stop state
 * @stop_psscr_mask: PSSCR bits that @stop_psscr_val is allowed to replace
 *
 * Returns the value handed back by power9_idle_stop(), or 0 without
 * idling if prep_irq_for_idle_irqsoff() reports that idle must not be
 * entered.
 */
static unsigned long __power9_idle_type(unsigned long stop_psscr_val,
				      unsigned long stop_psscr_mask)
{
	unsigned long psscr;
	unsigned long srr1;

	if (!prep_irq_for_idle_irqsoff())
		return 0;

	/*
	 * Keep the current PSSCR bits outside the mask and substitute the
	 * requested value for the bits inside it.
	 */
	psscr = mfspr(SPRN_PSSCR);
	psscr = (psscr & ~stop_psscr_mask) | stop_psscr_val;

	/* Runlatch is turned off for the duration of the idle period */
	__ppc64_runlatch_off();
	srr1 = power9_idle_stop(psscr);
	__ppc64_runlatch_on();

	/* Counterpart of the successful prep_irq_for_idle_irqsoff() above */
	fini_irq_for_idle_irqsoff();

	return srr1;
}
373 
/*
 * Enter the stop state described by @stop_psscr_val / @stop_psscr_mask,
 * then pass the value returned from the idle attempt to
 * irq_set_pending_from_srr1().
 */
void power9_idle_type(unsigned long stop_psscr_val,
				      unsigned long stop_psscr_mask)
{
	irq_set_pending_from_srr1(__power9_idle_type(stop_psscr_val,
						     stop_psscr_mask));
}
382 
/*
 * Used for ppc_md.power_save which needs a function with no parameters.
 * Enters the default stop state recorded in pnv_default_stop_{val,mask}.
 */
void power9_idle(void)
{
	power9_idle_type(pnv_default_stop_val, pnv_default_stop_mask);
}
390 
391 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
/*
 * This is used in working around bugs in thread reconfiguration
 * on POWER9 (at least up to Nimbus DD2.2) relating to transactional
 * memory and the way that XER[SO] is checkpointed.
 * This function forces the core into SMT4 in order by asking
 * all other threads not to stop, and sending a message to any
 * that are in a stop state.
 * Must be called with preemption disabled.
 *
 * Every call must be paired with pnv_power9_force_smt4_release(), which
 * drops the dont_stop counts taken here.
 */
void pnv_power9_force_smt4_catch(void)
{
	int cpu, cpu0, thr;
	int awake_threads = 1;		/* this thread is awake */
	int poke_threads = 0;		/* bitmask of thread indices to wake */
	int need_awake = threads_per_core;

	cpu = smp_processor_id();
	/* first cpu of this core; relies on threads_per_core being a power of 2 */
	cpu0 = cpu & ~(threads_per_core - 1);
	/* Ask every sibling (not ourselves) to refrain from stopping */
	for (thr = 0; thr < threads_per_core; ++thr) {
		if (cpu != cpu0 + thr)
			atomic_inc(&paca_ptrs[cpu0+thr]->dont_stop);
	}
	/* order setting dont_stop vs testing requested_psscr */
	mb();
	/*
	 * Classify each thread: a zero requested_psscr is counted as
	 * awake, anything else is marked for a wake-up poke.
	 *
	 * NOTE(review): this scan covers all thread indices, including
	 * the calling thread, while awake_threads already starts at 1.
	 * If the caller's own requested_psscr is zero it is therefore
	 * counted twice, which would make the comparison against
	 * threads_per_core below equivalent to the "at least 3 threads"
	 * wording of the comments -- confirm this is intended.
	 */
	for (thr = 0; thr < threads_per_core; ++thr) {
		if (!paca_ptrs[cpu0+thr]->requested_psscr)
			++awake_threads;
		else
			poke_threads |= (1 << thr);
	}

	/* If at least 3 threads are awake, the core is in SMT4 already */
	if (awake_threads < need_awake) {
		/* We have to wake some threads; we'll use msgsnd */
		for (thr = 0; thr < threads_per_core; ++thr) {
			if (poke_threads & (1 << thr)) {
				ppc_msgsnd_sync();
				ppc_msgsnd(PPC_DBELL_MSGTYPE, 0,
					   paca_ptrs[cpu0+thr]->hw_cpu_id);
			}
		}
		/* now spin until at least 3 threads are awake */
		do {
			for (thr = 0; thr < threads_per_core; ++thr) {
				/*
				 * A poked thread whose requested_psscr has
				 * gone to zero is counted once and removed
				 * from the mask so it is not counted again.
				 */
				if ((poke_threads & (1 << thr)) &&
				    !paca_ptrs[cpu0+thr]->requested_psscr) {
					++awake_threads;
					poke_threads &= ~(1 << thr);
				}
			}
		} while (awake_threads < need_awake);
	}
}
EXPORT_SYMBOL_GPL(pnv_power9_force_smt4_catch);
446 
447 void pnv_power9_force_smt4_release(void)
448 {
449 	int cpu, cpu0, thr;
450 
451 	cpu = smp_processor_id();
452 	cpu0 = cpu & ~(threads_per_core - 1);
453 
454 	/* clear all the dont_stop flags */
455 	for (thr = 0; thr < threads_per_core; ++thr) {
456 		if (cpu != cpu0 + thr)
457 			atomic_dec(&paca_ptrs[cpu0+thr]->dont_stop);
458 	}
459 }
460 EXPORT_SYMBOL_GPL(pnv_power9_force_smt4_release);
461 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
462 
463 #ifdef CONFIG_HOTPLUG_CPU
/*
 * Write @lpcr_val to the LPCR of the running CPU and, when idle states
 * that lose full context are supported, also register it with the
 * stop-api for @cpu.
 *
 * NOTE(review): mtspr() acts on the executing CPU while the stop-api
 * call is keyed by @cpu's hardware id; the caller visible here
 * (pnv_cpu_offline()) presumably always passes the CPU it runs on --
 * confirm. The opal_slw_set_reg() return value is deliberately or
 * accidentally ignored.
 */
static void pnv_program_cpu_hotplug_lpcr(unsigned int cpu, u64 lpcr_val)
{
	u64 pir = get_hard_smp_processor_id(cpu);

	mtspr(SPRN_LPCR, lpcr_val);

	/*
	 * Program the LPCR via stop-api only if the deepest stop state
	 * can lose hypervisor context.
	 */
	if (supported_cpuidle_states & OPAL_PM_LOSE_FULL_CONTEXT)
		opal_slw_set_reg(pir, SPRN_LPCR, lpcr_val);
}
477 
/*
 * pnv_cpu_offline: A function that puts the CPU into the deepest
 * available platform idle state on a CPU-Offline.
 * Called with interrupts hard disabled and no lazy irq pending.
 *
 * @cpu: logical id of the CPU being offlined
 *
 * Returns the value handed back by the idle instruction that was used,
 * or 0 when the snooze-style fallback loop was used instead.
 */
unsigned long pnv_cpu_offline(unsigned int cpu)
{
	unsigned long srr1;
	u32 idle_states = pnv_get_supported_cpuidle_states();
	u64 lpcr_val;

	/*
	 * We don't want to take decrementer interrupts while we are
	 * offline, so clear LPCR:PECE1. We keep PECE2 (and
	 * LPCR_PECE_HVEE on P9) enabled as to let IPIs in.
	 *
	 * If the CPU gets woken up by a special wakeup, ensure that
	 * the SLW engine sets LPCR with decrementer bit cleared, else
	 * the CPU will come back to the kernel due to a spurious
	 * wakeup.
	 */
	lpcr_val = mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1;
	pnv_program_cpu_hotplug_lpcr(cpu, lpcr_val);

	__ppc64_runlatch_off();

	/*
	 * Pick the idle method in order of preference: deepest stop
	 * state (ISA 3.0), winkle, sleep, nap, and finally busy-waiting.
	 */
	if (cpu_has_feature(CPU_FTR_ARCH_300) && deepest_stop_found) {
		unsigned long psscr;

		/* Merge the deepest stop state's bits into the live PSSCR */
		psscr = mfspr(SPRN_PSSCR);
		psscr = (psscr & ~pnv_deepest_stop_psscr_mask) |
						pnv_deepest_stop_psscr_val;
		srr1 = power9_offline_stop(psscr);

	} else if ((idle_states & OPAL_PM_WINKLE_ENABLED) &&
		   (idle_states & OPAL_PM_LOSE_FULL_CONTEXT)) {
		srr1 = power7_idle_insn(PNV_THREAD_WINKLE);
	} else if ((idle_states & OPAL_PM_SLEEP_ENABLED) ||
		   (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) {
		srr1 = power7_idle_insn(PNV_THREAD_SLEEP);
	} else if (idle_states & OPAL_PM_NAP_ENABLED) {
		srr1 = power7_idle_insn(PNV_THREAD_NAP);
	} else {
		/* This is the fallback method. We emulate snooze */
		while (!generic_check_cpu_restart(cpu)) {
			HMT_low();
			HMT_very_low();
		}
		srr1 = 0;
		HMT_medium();
	}

	__ppc64_runlatch_on();

	/*
	 * Re-enable decrementer interrupts in LPCR.
	 *
	 * Further, we want stop states to be woken up by decrementer
	 * for non-hotplug cases. So program the LPCR via stop api as
	 * well.
	 */
	lpcr_val = mfspr(SPRN_LPCR) | (u64)LPCR_PECE1;
	pnv_program_cpu_hotplug_lpcr(cpu, lpcr_val);

	return srr1;
}
544 #endif
545 
546 /*
547  * Power ISA 3.0 idle initialization.
548  *
549  * POWER ISA 3.0 defines a new SPR Processor stop Status and Control
550  * Register (PSSCR) to control idle behavior.
551  *
552  * PSSCR layout:
553  * ----------------------------------------------------------
554  * | PLS | /// | SD | ESL | EC | PSLL | /// | TR | MTL | RL |
555  * ----------------------------------------------------------
556  * 0      4     41   42    43   44     48    54   56    60
557  *
558  * PSSCR key fields:
559  *	Bits 0:3  - Power-Saving Level Status (PLS). This field indicates the
560  *	lowest power-saving state the thread entered since stop instruction was
561  *	last executed.
562  *
563  *	Bit 41 - Status Disable(SD)
564  *	0 - Shows PLS entries
565  *	1 - PLS entries are all 0
566  *
567  *	Bit 42 - Enable State Loss
568  *	0 - No state is lost irrespective of other fields
569  *	1 - Allows state loss
570  *
571  *	Bit 43 - Exit Criterion
572  *	0 - Exit from power-save mode on any interrupt
573  *	1 - Exit from power-save mode controlled by LPCR's PECE bits
574  *
575  *	Bits 44:47 - Power-Saving Level Limit
576  *	This limits the power-saving level that can be entered into.
577  *
578  *	Bits 60:63 - Requested Level
579  *	Used to specify which power-saving level must be entered on executing
580  *	stop instruction
581  */
582 
583 int validate_psscr_val_mask(u64 *psscr_val, u64 *psscr_mask, u32 flags)
584 {
585 	int err = 0;
586 
587 	/*
588 	 * psscr_mask == 0xf indicates an older firmware.
589 	 * Set remaining fields of psscr to the default values.
590 	 * See NOTE above definition of PSSCR_HV_DEFAULT_VAL
591 	 */
592 	if (*psscr_mask == 0xf) {
593 		*psscr_val = *psscr_val | PSSCR_HV_DEFAULT_VAL;
594 		*psscr_mask = PSSCR_HV_DEFAULT_MASK;
595 		return err;
596 	}
597 
598 	/*
599 	 * New firmware is expected to set the psscr_val bits correctly.
600 	 * Validate that the following invariants are correctly maintained by
601 	 * the new firmware.
602 	 * - ESL bit value matches the EC bit value.
603 	 * - ESL bit is set for all the deep stop states.
604 	 */
605 	if (GET_PSSCR_ESL(*psscr_val) != GET_PSSCR_EC(*psscr_val)) {
606 		err = ERR_EC_ESL_MISMATCH;
607 	} else if ((flags & OPAL_PM_LOSE_FULL_CONTEXT) &&
608 		GET_PSSCR_ESL(*psscr_val) == 0) {
609 		err = ERR_DEEP_STATE_ESL_MISMATCH;
610 	}
611 
612 	return err;
613 }
614 
615 /*
616  * pnv_arch300_idle_init: Initializes the default idle state, first
617  *                        deep idle state and deepest idle state on
618  *                        ISA 3.0 CPUs.
619  *
620  * @np: /ibm,opal/power-mgt device node
621  * @flags: cpu-idle-state-flags array
622  * @dt_idle_states: Number of idle state entries
623  * Returns 0 on success
624  */
625 static int __init pnv_power9_idle_init(struct device_node *np, u32 *flags,
626 					int dt_idle_states)
627 {
628 	u64 *psscr_val = NULL;
629 	u64 *psscr_mask = NULL;
630 	u32 *residency_ns = NULL;
631 	u64 max_residency_ns = 0;
632 	int rc = 0, i;
633 
634 	psscr_val = kcalloc(dt_idle_states, sizeof(*psscr_val), GFP_KERNEL);
635 	psscr_mask = kcalloc(dt_idle_states, sizeof(*psscr_mask), GFP_KERNEL);
636 	residency_ns = kcalloc(dt_idle_states, sizeof(*residency_ns),
637 			       GFP_KERNEL);
638 
639 	if (!psscr_val || !psscr_mask || !residency_ns) {
640 		rc = -1;
641 		goto out;
642 	}
643 
644 	if (of_property_read_u64_array(np,
645 		"ibm,cpu-idle-state-psscr",
646 		psscr_val, dt_idle_states)) {
647 		pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-psscr in DT\n");
648 		rc = -1;
649 		goto out;
650 	}
651 
652 	if (of_property_read_u64_array(np,
653 				       "ibm,cpu-idle-state-psscr-mask",
654 				       psscr_mask, dt_idle_states)) {
655 		pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-psscr-mask in DT\n");
656 		rc = -1;
657 		goto out;
658 	}
659 
660 	if (of_property_read_u32_array(np,
661 				       "ibm,cpu-idle-state-residency-ns",
662 					residency_ns, dt_idle_states)) {
663 		pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-residency-ns in DT\n");
664 		rc = -1;
665 		goto out;
666 	}
667 
668 	/*
669 	 * Set pnv_first_deep_stop_state, pnv_deepest_stop_psscr_{val,mask},
670 	 * and the pnv_default_stop_{val,mask}.
671 	 *
672 	 * pnv_first_deep_stop_state should be set to the first stop
673 	 * level to cause hypervisor state loss.
674 	 *
675 	 * pnv_deepest_stop_{val,mask} should be set to values corresponding to
676 	 * the deepest stop state.
677 	 *
678 	 * pnv_default_stop_{val,mask} should be set to values corresponding to
679 	 * the shallowest (OPAL_PM_STOP_INST_FAST) loss-less stop state.
680 	 */
681 	pnv_first_deep_stop_state = MAX_STOP_STATE;
682 	for (i = 0; i < dt_idle_states; i++) {
683 		int err;
684 		u64 psscr_rl = psscr_val[i] & PSSCR_RL_MASK;
685 
686 		if ((flags[i] & OPAL_PM_LOSE_FULL_CONTEXT) &&
687 		     (pnv_first_deep_stop_state > psscr_rl))
688 			pnv_first_deep_stop_state = psscr_rl;
689 
690 		err = validate_psscr_val_mask(&psscr_val[i], &psscr_mask[i],
691 					      flags[i]);
692 		if (err) {
693 			report_invalid_psscr_val(psscr_val[i], err);
694 			continue;
695 		}
696 
697 		if (max_residency_ns < residency_ns[i]) {
698 			max_residency_ns = residency_ns[i];
699 			pnv_deepest_stop_psscr_val = psscr_val[i];
700 			pnv_deepest_stop_psscr_mask = psscr_mask[i];
701 			pnv_deepest_stop_flag = flags[i];
702 			deepest_stop_found = true;
703 		}
704 
705 		if (!default_stop_found &&
706 		    (flags[i] & OPAL_PM_STOP_INST_FAST)) {
707 			pnv_default_stop_val = psscr_val[i];
708 			pnv_default_stop_mask = psscr_mask[i];
709 			default_stop_found = true;
710 		}
711 	}
712 
713 	if (unlikely(!default_stop_found)) {
714 		pr_warn("cpuidle-powernv: No suitable default stop state found. Disabling platform idle.\n");
715 	} else {
716 		ppc_md.power_save = power9_idle;
717 		pr_info("cpuidle-powernv: Default stop: psscr = 0x%016llx,mask=0x%016llx\n",
718 			pnv_default_stop_val, pnv_default_stop_mask);
719 	}
720 
721 	if (unlikely(!deepest_stop_found)) {
722 		pr_warn("cpuidle-powernv: No suitable stop state for CPU-Hotplug. Offlined CPUs will busy wait");
723 	} else {
724 		pr_info("cpuidle-powernv: Deepest stop: psscr = 0x%016llx,mask=0x%016llx\n",
725 			pnv_deepest_stop_psscr_val,
726 			pnv_deepest_stop_psscr_mask);
727 	}
728 
729 	pr_info("cpuidle-powernv: Requested Level (RL) value of first deep stop = 0x%llx\n",
730 		pnv_first_deep_stop_state);
731 out:
732 	kfree(psscr_val);
733 	kfree(psscr_mask);
734 	kfree(residency_ns);
735 	return rc;
736 }
737 
738 /*
739  * Probe device tree for supported idle states
740  */
741 static void __init pnv_probe_idle_states(void)
742 {
743 	struct device_node *np;
744 	int dt_idle_states;
745 	u32 *flags = NULL;
746 	int i;
747 
748 	np = of_find_node_by_path("/ibm,opal/power-mgt");
749 	if (!np) {
750 		pr_warn("opal: PowerMgmt Node not found\n");
751 		goto out;
752 	}
753 	dt_idle_states = of_property_count_u32_elems(np,
754 			"ibm,cpu-idle-state-flags");
755 	if (dt_idle_states < 0) {
756 		pr_warn("cpuidle-powernv: no idle states found in the DT\n");
757 		goto out;
758 	}
759 
760 	flags = kcalloc(dt_idle_states, sizeof(*flags),  GFP_KERNEL);
761 
762 	if (of_property_read_u32_array(np,
763 			"ibm,cpu-idle-state-flags", flags, dt_idle_states)) {
764 		pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-flags in DT\n");
765 		goto out;
766 	}
767 
768 	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
769 		if (pnv_power9_idle_init(np, flags, dt_idle_states))
770 			goto out;
771 	}
772 
773 	for (i = 0; i < dt_idle_states; i++)
774 		supported_cpuidle_states |= flags[i];
775 
776 out:
777 	kfree(flags);
778 }
/*
 * Subsys initcall for the powernv machine: probe the idle states offered
 * by firmware and set up everything the idle entry/exit paths need.
 * Does nothing beyond resetting supported_cpuidle_states when cpuidle
 * has been overridden (cpuidle_disable != IDLE_NO_OVERRIDE).
 * Always returns 0.
 */
static int __init pnv_init_idle_states(void)
{

	supported_cpuidle_states = 0;

	if (cpuidle_disable != IDLE_NO_OVERRIDE)
		goto out;

	pnv_probe_idle_states();

	if (!(supported_cpuidle_states & OPAL_PM_SLEEP_ENABLED_ER1)) {
		/*
		 * No fastsleep workaround required: replace both
		 * workaround call sites with NOPs.
		 * NOTE(review): patch_instruction() results are not checked.
		 */
		patch_instruction(
			(unsigned int *)pnv_fastsleep_workaround_at_entry,
			PPC_INST_NOP);
		patch_instruction(
			(unsigned int *)pnv_fastsleep_workaround_at_exit,
			PPC_INST_NOP);
	} else {
		/*
		 * OPAL_PM_SLEEP_ENABLED_ER1 is set. It indicates that
		 * workaround is needed to use fastsleep. Provide sysfs
		 * control to choose how this workaround has to be applied.
		 * NOTE(review): device_create_file() result is not checked.
		 */
		device_create_file(cpu_subsys.dev_root,
				&dev_attr_fastsleep_workaround_applyonce);
	}

	/* Must run before the DD1 block below, which fills the arrays it allocates */
	pnv_alloc_idle_core_states();

	/*
	 * For each CPU, record its PACA address in each of its
	 * sibling thread's PACA at the slot corresponding to this
	 * CPU's index in the core.
	 */
	if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
		int cpu;

		pr_info("powernv: idle: Saving PACA pointers of all CPUs in their thread sibling PACA\n");
		for_each_present_cpu(cpu) {
			int base_cpu = cpu_first_thread_sibling(cpu);
			int idx = cpu_thread_in_core(cpu);
			int i;

			for (i = 0; i < threads_per_core; i++) {
				int j = base_cpu + i;

				paca_ptrs[j]->thread_sibling_pacas[idx] =
					paca_ptrs[cpu];
			}
		}
	}

	/*
	 * When nap is supported this replaces whatever power_save hook
	 * was installed earlier in this function's call chain.
	 */
	if (supported_cpuidle_states & OPAL_PM_NAP_ENABLED)
		ppc_md.power_save = power7_idle;

out:
	return 0;
}
machine_subsys_initcall(powernv, pnv_init_idle_states);
838