xref: /openbmc/linux/drivers/cpuidle/cpuidle-pseries.c (revision c900529f3d9161bfde5cca0754f83b4d3c3e0220)
1  // SPDX-License-Identifier: GPL-2.0
2  /*
3   *  cpuidle-pseries - idle state cpuidle driver.
4   *  Adapted from drivers/idle/intel_idle.c and
5   *  drivers/acpi/processor_idle.c
6   *
7   */
8  
9  #include <linux/kernel.h>
10  #include <linux/module.h>
11  #include <linux/init.h>
12  #include <linux/moduleparam.h>
13  #include <linux/cpuidle.h>
14  #include <linux/cpu.h>
15  #include <linux/notifier.h>
16  
17  #include <asm/paca.h>
18  #include <asm/reg.h>
19  #include <asm/machdep.h>
20  #include <asm/firmware.h>
21  #include <asm/runlatch.h>
22  #include <asm/idle.h>
23  #include <asm/plpar_wrappers.h>
24  #include <asm/rtas.h>
25  
26  static struct cpuidle_driver pseries_idle_driver = {
27  	.name             = "pseries_idle",
28  	.owner            = THIS_MODULE,
29  };
30  
31  static int max_idle_state __read_mostly;
32  static struct cpuidle_state *cpuidle_state_table __read_mostly;
33  static u64 snooze_timeout __read_mostly;
34  static bool snooze_timeout_en __read_mostly;
35  
36  static __cpuidle
snooze_loop(struct cpuidle_device * dev,struct cpuidle_driver * drv,int index)37  int snooze_loop(struct cpuidle_device *dev, struct cpuidle_driver *drv,
38  		int index)
39  {
40  	u64 snooze_exit_time;
41  
42  	set_thread_flag(TIF_POLLING_NRFLAG);
43  
44  	pseries_idle_prolog();
45  	raw_local_irq_enable();
46  	snooze_exit_time = get_tb() + snooze_timeout;
47  	dev->poll_time_limit = false;
48  
49  	while (!need_resched()) {
50  		HMT_low();
51  		HMT_very_low();
52  		if (likely(snooze_timeout_en) && get_tb() > snooze_exit_time) {
53  			/*
54  			 * Task has not woken up but we are exiting the polling
55  			 * loop anyway. Require a barrier after polling is
56  			 * cleared to order subsequent test of need_resched().
57  			 */
58  			dev->poll_time_limit = true;
59  			clear_thread_flag(TIF_POLLING_NRFLAG);
60  			smp_mb();
61  			break;
62  		}
63  	}
64  
65  	HMT_medium();
66  	clear_thread_flag(TIF_POLLING_NRFLAG);
67  
68  	raw_local_irq_disable();
69  
70  	pseries_idle_epilog();
71  
72  	return index;
73  }
74  
check_and_cede_processor(void)75  static __cpuidle void check_and_cede_processor(void)
76  {
77  	/*
78  	 * Ensure our interrupt state is properly tracked,
79  	 * also checks if no interrupt has occurred while we
80  	 * were soft-disabled
81  	 */
82  	if (prep_irq_for_idle()) {
83  		cede_processor();
84  #ifdef CONFIG_TRACE_IRQFLAGS
85  		/* Ensure that H_CEDE returns with IRQs on */
86  		if (WARN_ON(!(mfmsr() & MSR_EE)))
87  			__hard_irq_enable();
88  #endif
89  	}
90  }
91  
/*
 * XCEDE: Extended CEDE states discovered through the
 *        "ibm,get-system-parameter" RTAS call with the token
 *        CEDE_LATENCY_TOKEN
 */
97  
/*
 * Token value for the Cede Latency Settings Information system parameter,
 * as passed to the "ibm,get-system-parameter" RTAS call. The full table of
 * system parameters is in Section 7.3.16 (System Parameters Option) of
 * PAPR version 2.8.1.
 */
#define CEDE_LATENCY_TOKEN	45
105  
/*
 * If the platform supports the cede latency settings information system
 * parameter it must provide the following information in the NULL terminated
 * parameter string:
 *
 * a. The first byte is the length "N" of each cede latency setting record
 *    minus one (zero indicates a length of 1 byte).
 *
 * b. For each supported cede latency setting, a cede latency setting record
 *    consisting of the first "N" bytes as per the following table.
 *
 *    -----------------------------
 *    | Field           | Field   |
 *    | Name            | Length  |
 *    -----------------------------
 *    | Cede Latency    | 1 Byte  |
 *    | Specifier Value |         |
 *    -----------------------------
 *    | Maximum wakeup  |         |
 *    | latency in      | 8 Bytes |
 *    | tb-ticks        |         |
 *    -----------------------------
 *    | Responsive to   |         |
 *    | external        | 1 Byte  |
 *    | interrupts      |         |
 *    -----------------------------
 *
 * This version has cede latency record size = 10.
 *
 * The structure xcede_latency_payload represents a) and b) with
 * xcede_latency_record representing the table in b).
 *
 * xcede_latency_parameter is what gets returned by the
 * "ibm,get-system-parameter" RTAS call when made with
 * CEDE_LATENCY_TOKEN.
 *
 * These structures are only used to represent the data obtained by the RTAS
 * call. The data is in big-endian.
 */
145  struct xcede_latency_record {
146  	u8	hint;
147  	__be64	latency_ticks;
148  	u8	wake_on_irqs;
149  } __packed;
150  
151  // Make space for 16 records, which "should be enough".
152  struct xcede_latency_payload {
153  	u8     record_size;
154  	struct xcede_latency_record records[16];
155  } __packed;
156  
157  struct xcede_latency_parameter {
158  	__be16  payload_size;
159  	struct xcede_latency_payload payload;
160  	u8 null_char;
161  } __packed;
162  
163  static unsigned int nr_xcede_records;
164  static struct xcede_latency_parameter xcede_latency_parameter __initdata;
165  
parse_cede_parameters(void)166  static int __init parse_cede_parameters(void)
167  {
168  	struct xcede_latency_payload *payload;
169  	u32 total_xcede_records_size;
170  	u8 xcede_record_size;
171  	u16 payload_size;
172  	int ret, i;
173  
174  	ret = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
175  			NULL, CEDE_LATENCY_TOKEN, __pa(&xcede_latency_parameter),
176  			sizeof(xcede_latency_parameter));
177  	if (ret) {
178  		pr_err("xcede: Error parsing CEDE_LATENCY_TOKEN\n");
179  		return ret;
180  	}
181  
182  	payload_size = be16_to_cpu(xcede_latency_parameter.payload_size);
183  	payload = &xcede_latency_parameter.payload;
184  
185  	xcede_record_size = payload->record_size + 1;
186  
187  	if (xcede_record_size != sizeof(struct xcede_latency_record)) {
188  		pr_err("xcede: Expected record-size %lu. Observed size %u.\n",
189  		       sizeof(struct xcede_latency_record), xcede_record_size);
190  		return -EINVAL;
191  	}
192  
193  	pr_info("xcede: xcede_record_size = %d\n", xcede_record_size);
194  
195  	/*
196  	 * Since the payload_size includes the last NULL byte and the
197  	 * xcede_record_size, the remaining bytes correspond to array of all
198  	 * cede_latency settings.
199  	 */
200  	total_xcede_records_size = payload_size - 2;
201  	nr_xcede_records = total_xcede_records_size / xcede_record_size;
202  
203  	for (i = 0; i < nr_xcede_records; i++) {
204  		struct xcede_latency_record *record = &payload->records[i];
205  		u64 latency_ticks = be64_to_cpu(record->latency_ticks);
206  		u8 wake_on_irqs = record->wake_on_irqs;
207  		u8 hint = record->hint;
208  
209  		pr_info("xcede: Record %d : hint = %u, latency = 0x%llx tb ticks, Wake-on-irq = %u\n",
210  			i, hint, latency_ticks, wake_on_irqs);
211  	}
212  
213  	return 0;
214  }
215  
216  #define NR_DEDICATED_STATES	2 /* snooze, CEDE */
217  static u8 cede_latency_hint[NR_DEDICATED_STATES];
218  
219  static __cpuidle
dedicated_cede_loop(struct cpuidle_device * dev,struct cpuidle_driver * drv,int index)220  int dedicated_cede_loop(struct cpuidle_device *dev, struct cpuidle_driver *drv,
221  			int index)
222  {
223  	u8 old_latency_hint;
224  
225  	pseries_idle_prolog();
226  	get_lppaca()->donate_dedicated_cpu = 1;
227  	old_latency_hint = get_lppaca()->cede_latency_hint;
228  	get_lppaca()->cede_latency_hint = cede_latency_hint[index];
229  
230  	HMT_medium();
231  	check_and_cede_processor();
232  
233  	raw_local_irq_disable();
234  	get_lppaca()->donate_dedicated_cpu = 0;
235  	get_lppaca()->cede_latency_hint = old_latency_hint;
236  
237  	pseries_idle_epilog();
238  
239  	return index;
240  }
241  
242  static __cpuidle
shared_cede_loop(struct cpuidle_device * dev,struct cpuidle_driver * drv,int index)243  int shared_cede_loop(struct cpuidle_device *dev, struct cpuidle_driver *drv,
244  		     int index)
245  {
246  
247  	pseries_idle_prolog();
248  
249  	/*
250  	 * Yield the processor to the hypervisor.  We return if
251  	 * an external interrupt occurs (which are driven prior
252  	 * to returning here) or if a prod occurs from another
253  	 * processor. When returning here, external interrupts
254  	 * are enabled.
255  	 */
256  	check_and_cede_processor();
257  
258  	raw_local_irq_disable();
259  	pseries_idle_epilog();
260  
261  	return index;
262  }
263  
264  /*
265   * States for dedicated partition case.
266   */
267  static struct cpuidle_state dedicated_states[NR_DEDICATED_STATES] = {
268  	{ /* Snooze */
269  		.name = "snooze",
270  		.desc = "snooze",
271  		.exit_latency = 0,
272  		.target_residency = 0,
273  		.enter = &snooze_loop,
274  		.flags = CPUIDLE_FLAG_POLLING },
275  	{ /* CEDE */
276  		.name = "CEDE",
277  		.desc = "CEDE",
278  		.exit_latency = 10,
279  		.target_residency = 100,
280  		.enter = &dedicated_cede_loop },
281  };
282  
283  /*
284   * States for shared partition case.
285   */
286  static struct cpuidle_state shared_states[] = {
287  	{ /* Snooze */
288  		.name = "snooze",
289  		.desc = "snooze",
290  		.exit_latency = 0,
291  		.target_residency = 0,
292  		.enter = &snooze_loop,
293  		.flags = CPUIDLE_FLAG_POLLING },
294  	{ /* Shared Cede */
295  		.name = "Shared Cede",
296  		.desc = "Shared Cede",
297  		.exit_latency = 10,
298  		.target_residency = 100,
299  		.enter = &shared_cede_loop },
300  };
301  
pseries_cpuidle_cpu_online(unsigned int cpu)302  static int pseries_cpuidle_cpu_online(unsigned int cpu)
303  {
304  	struct cpuidle_device *dev = per_cpu(cpuidle_devices, cpu);
305  
306  	if (dev && cpuidle_get_driver()) {
307  		cpuidle_pause_and_lock();
308  		cpuidle_enable_device(dev);
309  		cpuidle_resume_and_unlock();
310  	}
311  	return 0;
312  }
313  
pseries_cpuidle_cpu_dead(unsigned int cpu)314  static int pseries_cpuidle_cpu_dead(unsigned int cpu)
315  {
316  	struct cpuidle_device *dev = per_cpu(cpuidle_devices, cpu);
317  
318  	if (dev && cpuidle_get_driver()) {
319  		cpuidle_pause_and_lock();
320  		cpuidle_disable_device(dev);
321  		cpuidle_resume_and_unlock();
322  	}
323  	return 0;
324  }
325  
326  /*
327   * pseries_cpuidle_driver_init()
328   */
pseries_cpuidle_driver_init(void)329  static int pseries_cpuidle_driver_init(void)
330  {
331  	int idle_state;
332  	struct cpuidle_driver *drv = &pseries_idle_driver;
333  
334  	drv->state_count = 0;
335  
336  	for (idle_state = 0; idle_state < max_idle_state; ++idle_state) {
337  		/* Is the state not enabled? */
338  		if (cpuidle_state_table[idle_state].enter == NULL)
339  			continue;
340  
341  		drv->states[drv->state_count] =	/* structure copy */
342  			cpuidle_state_table[idle_state];
343  
344  		drv->state_count += 1;
345  	}
346  
347  	return 0;
348  }
349  
fixup_cede0_latency(void)350  static void __init fixup_cede0_latency(void)
351  {
352  	struct xcede_latency_payload *payload;
353  	u64 min_xcede_latency_us = UINT_MAX;
354  	int i;
355  
356  	if (parse_cede_parameters())
357  		return;
358  
359  	pr_info("cpuidle: Skipping the %d Extended CEDE idle states\n",
360  		nr_xcede_records);
361  
362  	payload = &xcede_latency_parameter.payload;
363  
364  	/*
365  	 * The CEDE idle state maps to CEDE(0). While the hypervisor
366  	 * does not advertise CEDE(0) exit latency values, it does
367  	 * advertise the latency values of the extended CEDE states.
368  	 * We use the lowest advertised exit latency value as a proxy
369  	 * for the exit latency of CEDE(0).
370  	 */
371  	for (i = 0; i < nr_xcede_records; i++) {
372  		struct xcede_latency_record *record = &payload->records[i];
373  		u8 hint = record->hint;
374  		u64 latency_tb = be64_to_cpu(record->latency_ticks);
375  		u64 latency_us = DIV_ROUND_UP_ULL(tb_to_ns(latency_tb), NSEC_PER_USEC);
376  
377  		/*
378  		 * We expect the exit latency of an extended CEDE
379  		 * state to be non-zero, it to since it takes at least
380  		 * a few nanoseconds to wakeup the idle CPU and
381  		 * dispatch the virtual processor into the Linux
382  		 * Guest.
383  		 *
384  		 * So we consider only non-zero value for performing
385  		 * the fixup of CEDE(0) latency.
386  		 */
387  		if (latency_us == 0) {
388  			pr_warn("cpuidle: Skipping xcede record %d [hint=%d]. Exit latency = 0us\n",
389  				i, hint);
390  			continue;
391  		}
392  
393  		if (latency_us < min_xcede_latency_us)
394  			min_xcede_latency_us = latency_us;
395  	}
396  
397  	if (min_xcede_latency_us != UINT_MAX) {
398  		dedicated_states[1].exit_latency = min_xcede_latency_us;
399  		dedicated_states[1].target_residency = 10 * (min_xcede_latency_us);
400  		pr_info("cpuidle: Fixed up CEDE exit latency to %llu us\n",
401  			min_xcede_latency_us);
402  	}
403  
404  }
405  
406  /*
407   * pseries_idle_probe()
408   * Choose state table for shared versus dedicated partition
409   */
pseries_idle_probe(void)410  static int __init pseries_idle_probe(void)
411  {
412  
413  	if (cpuidle_disable != IDLE_NO_OVERRIDE)
414  		return -ENODEV;
415  
416  	if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
417  		if (lppaca_shared_proc()) {
418  			cpuidle_state_table = shared_states;
419  			max_idle_state = ARRAY_SIZE(shared_states);
420  		} else {
421  			/*
422  			 * Use firmware provided latency values
423  			 * starting with POWER10 platforms. In the
424  			 * case that we are running on a POWER10
425  			 * platform but in an earlier compat mode, we
426  			 * can still use the firmware provided values.
427  			 *
428  			 * However, on platforms prior to POWER10, we
429  			 * cannot rely on the accuracy of the firmware
430  			 * provided latency values. On such platforms,
431  			 * go with the conservative default estimate
432  			 * of 10us.
433  			 */
434  			if (cpu_has_feature(CPU_FTR_ARCH_31) || pvr_version_is(PVR_POWER10))
435  				fixup_cede0_latency();
436  			cpuidle_state_table = dedicated_states;
437  			max_idle_state = NR_DEDICATED_STATES;
438  		}
439  	} else
440  		return -ENODEV;
441  
442  	if (max_idle_state > 1) {
443  		snooze_timeout_en = true;
444  		snooze_timeout = cpuidle_state_table[1].target_residency *
445  				 tb_ticks_per_usec;
446  	}
447  	return 0;
448  }
449  
pseries_processor_idle_init(void)450  static int __init pseries_processor_idle_init(void)
451  {
452  	int retval;
453  
454  	retval = pseries_idle_probe();
455  	if (retval)
456  		return retval;
457  
458  	pseries_cpuidle_driver_init();
459  	retval = cpuidle_register(&pseries_idle_driver, NULL);
460  	if (retval) {
461  		printk(KERN_DEBUG "Registration of pseries driver failed.\n");
462  		return retval;
463  	}
464  
465  	retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
466  					   "cpuidle/pseries:online",
467  					   pseries_cpuidle_cpu_online, NULL);
468  	WARN_ON(retval < 0);
469  	retval = cpuhp_setup_state_nocalls(CPUHP_CPUIDLE_DEAD,
470  					   "cpuidle/pseries:DEAD", NULL,
471  					   pseries_cpuidle_cpu_dead);
472  	WARN_ON(retval < 0);
473  	printk(KERN_DEBUG "pseries_idle_driver registered\n");
474  	return 0;
475  }
476  
477  device_initcall(pseries_processor_idle_init);
478