xref: /openbmc/linux/drivers/cpuidle/governors/teo.c (revision c900529f3d9161bfde5cca0754f83b4d3c3e0220)
1  // SPDX-License-Identifier: GPL-2.0
2  /*
3   * Timer events oriented CPU idle governor
4   *
5   * TEO governor:
6   * Copyright (C) 2018 - 2021 Intel Corporation
7   * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
8   *
9   * Util-awareness mechanism:
10   * Copyright (C) 2022 Arm Ltd.
11   * Author: Kajetan Puchalski <kajetan.puchalski@arm.com>
12   */
13  
14  /**
15   * DOC: teo-description
16   *
17   * The idea of this governor is based on the observation that on many systems
18   * timer events are two or more orders of magnitude more frequent than any
19   * other interrupts, so they are likely to be the most significant cause of CPU
20   * wakeups from idle states.  Moreover, information about what happened in the
21   * (relatively recent) past can be used to estimate whether or not the deepest
22   * idle state with target residency within the (known) time till the closest
23   * timer event, referred to as the sleep length, is likely to be suitable for
24   * the upcoming CPU idle period and, if not, then which of the shallower idle
25   * states to choose instead of it.
26   *
27   * Of course, non-timer wakeup sources are more important in some use cases
28   * which can be covered by taking a few most recent idle time intervals of the
29   * CPU into account.  However, even in that context it is not necessary to
30   * consider idle duration values greater than the sleep length, because the
31   * closest timer will ultimately wake up the CPU anyway unless it is woken up
32   * earlier.
33   *
34   * Thus this governor estimates whether or not the prospective idle duration of
35   * a CPU is likely to be significantly shorter than the sleep length and selects
36   * an idle state for it accordingly.
37   *
38   * The computations carried out by this governor are based on using bins whose
39   * boundaries are aligned with the target residency parameter values of the CPU
40   * idle states provided by the %CPUIdle driver in the ascending order.  That is,
41   * the first bin spans from 0 up to, but not including, the target residency of
42   * the second idle state (idle state 1), the second bin spans from the target
43   * residency of idle state 1 up to, but not including, the target residency of
44   * idle state 2, the third bin spans from the target residency of idle state 2
45   * up to, but not including, the target residency of idle state 3 and so on.
46   * The last bin spans from the target residency of the deepest idle state
47   * supplied by the driver to infinity.
48   *
49   * Two metrics called "hits" and "intercepts" are associated with each bin.
50   * They are updated every time before selecting an idle state for the given CPU
51   * in accordance with what happened last time.
52   *
53   * The "hits" metric reflects the relative frequency of situations in which the
54   * sleep length and the idle duration measured after CPU wakeup fall into the
55   * same bin (that is, the CPU appears to wake up "on time" relative to the sleep
56   * length).  In turn, the "intercepts" metric reflects the relative frequency of
57   * situations in which the measured idle duration is so much shorter than the
58   * sleep length that the bin it falls into corresponds to an idle state
59   * shallower than the one whose bin is fallen into by the sleep length (these
60   * situations are referred to as "intercepts" below).
61   *
62   * In addition to the metrics described above, the governor counts recent
63   * intercepts (that is, intercepts that have occurred during the last
64   * %NR_RECENT invocations of it for the given CPU) for each bin.
65   *
66   * In order to select an idle state for a CPU, the governor takes the following
67   * steps (modulo the possible latency constraint that must be taken into account
68   * too):
69   *
70   * 1. Find the deepest CPU idle state whose target residency does not exceed
71   *    the current sleep length (the candidate idle state) and compute 3 sums as
72   *    follows:
73   *
74   *    - The sum of the "hits" and "intercepts" metrics for the candidate state
75   *      and all of the deeper idle states (it represents the cases in which the
76   *      CPU was idle long enough to avoid being intercepted if the sleep length
77   *      had been equal to the current one).
78   *
79   *    - The sum of the "intercepts" metrics for all of the idle states shallower
80   *      than the candidate one (it represents the cases in which the CPU was not
81   *      idle long enough to avoid being intercepted if the sleep length had been
82   *      equal to the current one).
83   *
84   *    - The sum of the numbers of recent intercepts for all of the idle states
85   *      shallower than the candidate one.
86   *
87   * 2. If the second sum is greater than the first one or the third sum is
88   *    greater than %NR_RECENT / 2, the CPU is likely to wake up early, so look
89   *    for an alternative idle state to select.
90   *
91   *    - Traverse the idle states shallower than the candidate one in the
92   *      descending order.
93   *
94   *    - For each of them compute the sum of the "intercepts" metrics and the sum
95   *      of the numbers of recent intercepts over all of the idle states between
96   *      it and the candidate one (including the former and excluding the
97   *      latter).
98   *
99   *    - If each of these sums that needs to be taken into account (because the
100   *      check related to it has indicated that the CPU is likely to wake up
101   *      early) is greater than a half of the corresponding sum computed in step
102   *      1 (which means that the target residency of the state in question had
103   *      not exceeded the idle duration in over a half of the relevant cases),
104   *      select the given idle state instead of the candidate one.
105   *
106   * 3. By default, select the candidate state.
107   *
108   * Util-awareness mechanism:
109   *
110   * The idea behind the util-awareness extension is that there are two distinct
111   * scenarios for the CPU which should result in two different approaches to idle
112   * state selection - utilized and not utilized.
113   *
114   * In this case, 'utilized' means that the average runqueue util of the CPU is
115   * above a certain threshold.
116   *
117   * When the CPU is utilized while going into idle, more likely than not it will
118   * be woken up to do more work soon and so a shallower idle state should be
119   * selected to minimise latency and maximise performance. When the CPU is not
120   * being utilized, the usual metrics-based approach to selecting the deepest
121   * available idle state should be preferred to take advantage of the power
122   * saving.
123   *
124   * In order to achieve this, the governor uses a utilization threshold.
125   * The threshold is computed per-CPU as a percentage of the CPU's capacity
126   * by bit shifting the capacity value. Based on testing, the shift of 6 (~1.56%)
127   * seems to be getting the best results.
128   *
129   * Before selecting the next idle state, the governor compares the current CPU
130   * util to the precomputed util threshold. If it's below, it defaults to the
131   * TEO metrics mechanism. If it's above, the closest shallower idle state will
132   * be selected instead, as long as is not a polling state.
133   */
134  
135  #include <linux/cpuidle.h>
136  #include <linux/jiffies.h>
137  #include <linux/kernel.h>
138  #include <linux/sched.h>
139  #include <linux/sched/clock.h>
140  #include <linux/sched/topology.h>
141  #include <linux/tick.h>
142  
143  #include "gov.h"
144  
145  /*
146   * The number of bits to shift the CPU's capacity by in order to determine
147   * the utilized threshold.
148   *
149   * 6 was chosen based on testing as the number that achieved the best balance
150   * of power and performance on average.
151   *
152   * The resulting threshold is high enough to not be triggered by background
153   * noise and low enough to react quickly when activity starts to ramp up.
154   */
155  #define UTIL_THRESHOLD_SHIFT 6
156  
157  /*
158   * The PULSE value is added to metrics when they grow and the DECAY_SHIFT value
159   * is used for decreasing metrics on a regular basis.
160   */
161  #define PULSE		1024
162  #define DECAY_SHIFT	3
163  
164  /*
165   * Number of the most recent idle duration values to take into consideration for
166   * the detection of recent early wakeup patterns.
167   */
168  #define NR_RECENT	9
169  
170  /**
171   * struct teo_bin - Metrics used by the TEO cpuidle governor.
172   * @intercepts: The "intercepts" metric.
173   * @hits: The "hits" metric.
174   * @recent: The number of recent "intercepts".
175   */
176  struct teo_bin {
177  	unsigned int intercepts;
178  	unsigned int hits;
179  	unsigned int recent;
180  };
181  
182  /**
183   * struct teo_cpu - CPU data used by the TEO cpuidle governor.
184   * @time_span_ns: Time between idle state selection and post-wakeup update.
185   * @sleep_length_ns: Time till the closest timer event (at the selection time).
186   * @state_bins: Idle state data bins for this CPU.
187   * @total: Grand total of the "intercepts" and "hits" metrics for all bins.
188   * @next_recent_idx: Index of the next @recent_idx entry to update.
189   * @recent_idx: Indices of bins corresponding to recent "intercepts".
190   * @tick_hits: Number of "hits" after TICK_NSEC.
191   * @util_threshold: Threshold above which the CPU is considered utilized
192   */
193  struct teo_cpu {
194  	s64 time_span_ns;
195  	s64 sleep_length_ns;
196  	struct teo_bin state_bins[CPUIDLE_STATE_MAX];
197  	unsigned int total;
198  	int next_recent_idx;
199  	int recent_idx[NR_RECENT];
200  	unsigned int tick_hits;
201  	unsigned long util_threshold;
202  };
203  
204  static DEFINE_PER_CPU(struct teo_cpu, teo_cpus);
205  
206  /**
207   * teo_cpu_is_utilized - Check if the CPU's util is above the threshold
208   * @cpu: Target CPU
209   * @cpu_data: Governor CPU data for the target CPU
210   */
211  #ifdef CONFIG_SMP
teo_cpu_is_utilized(int cpu,struct teo_cpu * cpu_data)212  static bool teo_cpu_is_utilized(int cpu, struct teo_cpu *cpu_data)
213  {
214  	return sched_cpu_util(cpu) > cpu_data->util_threshold;
215  }
216  #else
teo_cpu_is_utilized(int cpu,struct teo_cpu * cpu_data)217  static bool teo_cpu_is_utilized(int cpu, struct teo_cpu *cpu_data)
218  {
219  	return false;
220  }
221  #endif
222  
223  /**
224   * teo_update - Update CPU metrics after wakeup.
225   * @drv: cpuidle driver containing state data.
226   * @dev: Target CPU.
227   */
teo_update(struct cpuidle_driver * drv,struct cpuidle_device * dev)228  static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
229  {
230  	struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
231  	int i, idx_timer = 0, idx_duration = 0;
232  	s64 target_residency_ns;
233  	u64 measured_ns;
234  
235  	if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) {
236  		/*
237  		 * One of the safety nets has triggered or the wakeup was close
238  		 * enough to the closest timer event expected at the idle state
239  		 * selection time to be discarded.
240  		 */
241  		measured_ns = U64_MAX;
242  	} else {
243  		u64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns;
244  
245  		/*
246  		 * The computations below are to determine whether or not the
247  		 * (saved) time till the next timer event and the measured idle
248  		 * duration fall into the same "bin", so use last_residency_ns
249  		 * for that instead of time_span_ns which includes the cpuidle
250  		 * overhead.
251  		 */
252  		measured_ns = dev->last_residency_ns;
253  		/*
254  		 * The delay between the wakeup and the first instruction
255  		 * executed by the CPU is not likely to be worst-case every
256  		 * time, so take 1/2 of the exit latency as a very rough
257  		 * approximation of the average of it.
258  		 */
259  		if (measured_ns >= lat_ns)
260  			measured_ns -= lat_ns / 2;
261  		else
262  			measured_ns /= 2;
263  	}
264  
265  	cpu_data->total = 0;
266  
267  	/*
268  	 * Decay the "hits" and "intercepts" metrics for all of the bins and
269  	 * find the bins that the sleep length and the measured idle duration
270  	 * fall into.
271  	 */
272  	for (i = 0; i < drv->state_count; i++) {
273  		struct teo_bin *bin = &cpu_data->state_bins[i];
274  
275  		bin->hits -= bin->hits >> DECAY_SHIFT;
276  		bin->intercepts -= bin->intercepts >> DECAY_SHIFT;
277  
278  		cpu_data->total += bin->hits + bin->intercepts;
279  
280  		target_residency_ns = drv->states[i].target_residency_ns;
281  
282  		if (target_residency_ns <= cpu_data->sleep_length_ns) {
283  			idx_timer = i;
284  			if (target_residency_ns <= measured_ns)
285  				idx_duration = i;
286  		}
287  	}
288  
289  	i = cpu_data->next_recent_idx++;
290  	if (cpu_data->next_recent_idx >= NR_RECENT)
291  		cpu_data->next_recent_idx = 0;
292  
293  	if (cpu_data->recent_idx[i] >= 0)
294  		cpu_data->state_bins[cpu_data->recent_idx[i]].recent--;
295  
296  	/*
297  	 * If the deepest state's target residency is below the tick length,
298  	 * make a record of it to help teo_select() decide whether or not
299  	 * to stop the tick.  This effectively adds an extra hits-only bin
300  	 * beyond the last state-related one.
301  	 */
302  	if (target_residency_ns < TICK_NSEC) {
303  		cpu_data->tick_hits -= cpu_data->tick_hits >> DECAY_SHIFT;
304  
305  		cpu_data->total += cpu_data->tick_hits;
306  
307  		if (TICK_NSEC <= cpu_data->sleep_length_ns) {
308  			idx_timer = drv->state_count;
309  			if (TICK_NSEC <= measured_ns) {
310  				cpu_data->tick_hits += PULSE;
311  				goto end;
312  			}
313  		}
314  	}
315  
316  	/*
317  	 * If the measured idle duration falls into the same bin as the sleep
318  	 * length, this is a "hit", so update the "hits" metric for that bin.
319  	 * Otherwise, update the "intercepts" metric for the bin fallen into by
320  	 * the measured idle duration.
321  	 */
322  	if (idx_timer == idx_duration) {
323  		cpu_data->state_bins[idx_timer].hits += PULSE;
324  		cpu_data->recent_idx[i] = -1;
325  	} else {
326  		cpu_data->state_bins[idx_duration].intercepts += PULSE;
327  		cpu_data->state_bins[idx_duration].recent++;
328  		cpu_data->recent_idx[i] = idx_duration;
329  	}
330  
331  end:
332  	cpu_data->total += PULSE;
333  }
334  
teo_state_ok(int i,struct cpuidle_driver * drv)335  static bool teo_state_ok(int i, struct cpuidle_driver *drv)
336  {
337  	return !tick_nohz_tick_stopped() ||
338  		drv->states[i].target_residency_ns >= TICK_NSEC;
339  }
340  
341  /**
342   * teo_find_shallower_state - Find shallower idle state matching given duration.
343   * @drv: cpuidle driver containing state data.
344   * @dev: Target CPU.
345   * @state_idx: Index of the capping idle state.
346   * @duration_ns: Idle duration value to match.
347   * @no_poll: Don't consider polling states.
348   */
teo_find_shallower_state(struct cpuidle_driver * drv,struct cpuidle_device * dev,int state_idx,s64 duration_ns,bool no_poll)349  static int teo_find_shallower_state(struct cpuidle_driver *drv,
350  				    struct cpuidle_device *dev, int state_idx,
351  				    s64 duration_ns, bool no_poll)
352  {
353  	int i;
354  
355  	for (i = state_idx - 1; i >= 0; i--) {
356  		if (dev->states_usage[i].disable ||
357  				(no_poll && drv->states[i].flags & CPUIDLE_FLAG_POLLING))
358  			continue;
359  
360  		state_idx = i;
361  		if (drv->states[i].target_residency_ns <= duration_ns)
362  			break;
363  	}
364  	return state_idx;
365  }
366  
367  /**
368   * teo_select - Selects the next idle state to enter.
369   * @drv: cpuidle driver containing state data.
370   * @dev: Target CPU.
371   * @stop_tick: Indication on whether or not to stop the scheduler tick.
372   */
teo_select(struct cpuidle_driver * drv,struct cpuidle_device * dev,bool * stop_tick)373  static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
374  		      bool *stop_tick)
375  {
376  	struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
377  	s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
378  	ktime_t delta_tick = TICK_NSEC / 2;
379  	unsigned int tick_intercept_sum = 0;
380  	unsigned int idx_intercept_sum = 0;
381  	unsigned int intercept_sum = 0;
382  	unsigned int idx_recent_sum = 0;
383  	unsigned int recent_sum = 0;
384  	unsigned int idx_hit_sum = 0;
385  	unsigned int hit_sum = 0;
386  	int constraint_idx = 0;
387  	int idx0 = 0, idx = -1;
388  	bool alt_intercepts, alt_recent;
389  	bool cpu_utilized;
390  	s64 duration_ns;
391  	int i;
392  
393  	if (dev->last_state_idx >= 0) {
394  		teo_update(drv, dev);
395  		dev->last_state_idx = -1;
396  	}
397  
398  	cpu_data->time_span_ns = local_clock();
399  	/*
400  	 * Set the expected sleep length to infinity in case of an early
401  	 * return.
402  	 */
403  	cpu_data->sleep_length_ns = KTIME_MAX;
404  
405  	/* Check if there is any choice in the first place. */
406  	if (drv->state_count < 2) {
407  		idx = 0;
408  		goto out_tick;
409  	}
410  
411  	if (!dev->states_usage[0].disable)
412  		idx = 0;
413  
414  	cpu_utilized = teo_cpu_is_utilized(dev->cpu, cpu_data);
415  	/*
416  	 * If the CPU is being utilized over the threshold and there are only 2
417  	 * states to choose from, the metrics need not be considered, so choose
418  	 * the shallowest non-polling state and exit.
419  	 */
420  	if (drv->state_count < 3 && cpu_utilized) {
421  		/*
422  		 * If state 0 is enabled and it is not a polling one, select it
423  		 * right away unless the scheduler tick has been stopped, in
424  		 * which case care needs to be taken to leave the CPU in a deep
425  		 * enough state in case it is not woken up any time soon after
426  		 * all.  If state 1 is disabled, though, state 0 must be used
427  		 * anyway.
428  		 */
429  		if ((!idx && !(drv->states[0].flags & CPUIDLE_FLAG_POLLING) &&
430  		    teo_state_ok(0, drv)) || dev->states_usage[1].disable) {
431  			idx = 0;
432  			goto out_tick;
433  		}
434  		/* Assume that state 1 is not a polling one and use it. */
435  		idx = 1;
436  		duration_ns = drv->states[1].target_residency_ns;
437  		goto end;
438  	}
439  
440  	/* Compute the sums of metrics for early wakeup pattern detection. */
441  	for (i = 1; i < drv->state_count; i++) {
442  		struct teo_bin *prev_bin = &cpu_data->state_bins[i-1];
443  		struct cpuidle_state *s = &drv->states[i];
444  
445  		/*
446  		 * Update the sums of idle state mertics for all of the states
447  		 * shallower than the current one.
448  		 */
449  		intercept_sum += prev_bin->intercepts;
450  		hit_sum += prev_bin->hits;
451  		recent_sum += prev_bin->recent;
452  
453  		if (dev->states_usage[i].disable)
454  			continue;
455  
456  		if (idx < 0)
457  			idx0 = i; /* first enabled state */
458  
459  		idx = i;
460  
461  		if (s->exit_latency_ns <= latency_req)
462  			constraint_idx = i;
463  
464  		/* Save the sums for the current state. */
465  		idx_intercept_sum = intercept_sum;
466  		idx_hit_sum = hit_sum;
467  		idx_recent_sum = recent_sum;
468  	}
469  
470  	/* Avoid unnecessary overhead. */
471  	if (idx < 0) {
472  		idx = 0; /* No states enabled, must use 0. */
473  		goto out_tick;
474  	}
475  
476  	if (idx == idx0) {
477  		/*
478  		 * Only one idle state is enabled, so use it, but do not
479  		 * allow the tick to be stopped it is shallow enough.
480  		 */
481  		duration_ns = drv->states[idx].target_residency_ns;
482  		goto end;
483  	}
484  
485  	tick_intercept_sum = intercept_sum +
486  			cpu_data->state_bins[drv->state_count-1].intercepts;
487  
488  	/*
489  	 * If the sum of the intercepts metric for all of the idle states
490  	 * shallower than the current candidate one (idx) is greater than the
491  	 * sum of the intercepts and hits metrics for the candidate state and
492  	 * all of the deeper states, or the sum of the numbers of recent
493  	 * intercepts over all of the states shallower than the candidate one
494  	 * is greater than a half of the number of recent events taken into
495  	 * account, a shallower idle state is likely to be a better choice.
496  	 */
497  	alt_intercepts = 2 * idx_intercept_sum > cpu_data->total - idx_hit_sum;
498  	alt_recent = idx_recent_sum > NR_RECENT / 2;
499  	if (alt_recent || alt_intercepts) {
500  		int first_suitable_idx = idx;
501  
502  		/*
503  		 * Look for the deepest idle state whose target residency had
504  		 * not exceeded the idle duration in over a half of the relevant
505  		 * cases (both with respect to intercepts overall and with
506  		 * respect to the recent intercepts only) in the past.
507  		 *
508  		 * Take the possible duration limitation present if the tick
509  		 * has been stopped already into account.
510  		 */
511  		intercept_sum = 0;
512  		recent_sum = 0;
513  
514  		for (i = idx - 1; i >= 0; i--) {
515  			struct teo_bin *bin = &cpu_data->state_bins[i];
516  
517  			intercept_sum += bin->intercepts;
518  			recent_sum += bin->recent;
519  
520  			if ((!alt_recent || 2 * recent_sum > idx_recent_sum) &&
521  			    (!alt_intercepts ||
522  			     2 * intercept_sum > idx_intercept_sum)) {
523  				/*
524  				 * Use the current state unless it is too
525  				 * shallow or disabled, in which case take the
526  				 * first enabled state that is deep enough.
527  				 */
528  				if (teo_state_ok(i, drv) &&
529  				    !dev->states_usage[i].disable)
530  					idx = i;
531  				else
532  					idx = first_suitable_idx;
533  
534  				break;
535  			}
536  
537  			if (dev->states_usage[i].disable)
538  				continue;
539  
540  			if (!teo_state_ok(i, drv)) {
541  				/*
542  				 * The current state is too shallow, but if an
543  				 * alternative candidate state has been found,
544  				 * it may still turn out to be a better choice.
545  				 */
546  				if (first_suitable_idx != idx)
547  					continue;
548  
549  				break;
550  			}
551  
552  			first_suitable_idx = i;
553  		}
554  	}
555  
556  	/*
557  	 * If there is a latency constraint, it may be necessary to select an
558  	 * idle state shallower than the current candidate one.
559  	 */
560  	if (idx > constraint_idx)
561  		idx = constraint_idx;
562  
563  	/*
564  	 * If the CPU is being utilized over the threshold, choose a shallower
565  	 * non-polling state to improve latency, unless the scheduler tick has
566  	 * been stopped already and the shallower state's target residency is
567  	 * not sufficiently large.
568  	 */
569  	if (cpu_utilized) {
570  		i = teo_find_shallower_state(drv, dev, idx, KTIME_MAX, true);
571  		if (teo_state_ok(i, drv))
572  			idx = i;
573  	}
574  
575  	/*
576  	 * Skip the timers check if state 0 is the current candidate one,
577  	 * because an immediate non-timer wakeup is expected in that case.
578  	 */
579  	if (!idx)
580  		goto out_tick;
581  
582  	/*
583  	 * If state 0 is a polling one, check if the target residency of
584  	 * the current candidate state is low enough and skip the timers
585  	 * check in that case too.
586  	 */
587  	if ((drv->states[0].flags & CPUIDLE_FLAG_POLLING) &&
588  	    drv->states[idx].target_residency_ns < RESIDENCY_THRESHOLD_NS)
589  		goto out_tick;
590  
591  	duration_ns = tick_nohz_get_sleep_length(&delta_tick);
592  	cpu_data->sleep_length_ns = duration_ns;
593  
594  	/*
595  	 * If the closest expected timer is before the terget residency of the
596  	 * candidate state, a shallower one needs to be found.
597  	 */
598  	if (drv->states[idx].target_residency_ns > duration_ns) {
599  		i = teo_find_shallower_state(drv, dev, idx, duration_ns, false);
600  		if (teo_state_ok(i, drv))
601  			idx = i;
602  	}
603  
604  	/*
605  	 * If the selected state's target residency is below the tick length
606  	 * and intercepts occurring before the tick length are the majority of
607  	 * total wakeup events, do not stop the tick.
608  	 */
609  	if (drv->states[idx].target_residency_ns < TICK_NSEC &&
610  	    tick_intercept_sum > cpu_data->total / 2 + cpu_data->total / 8)
611  		duration_ns = TICK_NSEC / 2;
612  
613  end:
614  	/*
615  	 * Allow the tick to be stopped unless the selected state is a polling
616  	 * one or the expected idle duration is shorter than the tick period
617  	 * length.
618  	 */
619  	if ((!(drv->states[idx].flags & CPUIDLE_FLAG_POLLING) &&
620  	    duration_ns >= TICK_NSEC) || tick_nohz_tick_stopped())
621  		return idx;
622  
623  	/*
624  	 * The tick is not going to be stopped, so if the target residency of
625  	 * the state to be returned is not within the time till the closest
626  	 * timer including the tick, try to correct that.
627  	 */
628  	if (idx > idx0 &&
629  	    drv->states[idx].target_residency_ns > delta_tick)
630  		idx = teo_find_shallower_state(drv, dev, idx, delta_tick, false);
631  
632  out_tick:
633  	*stop_tick = false;
634  	return idx;
635  }
636  
637  /**
638   * teo_reflect - Note that governor data for the CPU need to be updated.
639   * @dev: Target CPU.
640   * @state: Entered state.
641   */
teo_reflect(struct cpuidle_device * dev,int state)642  static void teo_reflect(struct cpuidle_device *dev, int state)
643  {
644  	struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
645  
646  	dev->last_state_idx = state;
647  	/*
648  	 * If the wakeup was not "natural", but triggered by one of the safety
649  	 * nets, assume that the CPU might have been idle for the entire sleep
650  	 * length time.
651  	 */
652  	if (dev->poll_time_limit ||
653  	    (tick_nohz_idle_got_tick() && cpu_data->sleep_length_ns > TICK_NSEC)) {
654  		dev->poll_time_limit = false;
655  		cpu_data->time_span_ns = cpu_data->sleep_length_ns;
656  	} else {
657  		cpu_data->time_span_ns = local_clock() - cpu_data->time_span_ns;
658  	}
659  }
660  
661  /**
662   * teo_enable_device - Initialize the governor's data for the target CPU.
663   * @drv: cpuidle driver (not used).
664   * @dev: Target CPU.
665   */
teo_enable_device(struct cpuidle_driver * drv,struct cpuidle_device * dev)666  static int teo_enable_device(struct cpuidle_driver *drv,
667  			     struct cpuidle_device *dev)
668  {
669  	struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
670  	unsigned long max_capacity = arch_scale_cpu_capacity(dev->cpu);
671  	int i;
672  
673  	memset(cpu_data, 0, sizeof(*cpu_data));
674  	cpu_data->util_threshold = max_capacity >> UTIL_THRESHOLD_SHIFT;
675  
676  	for (i = 0; i < NR_RECENT; i++)
677  		cpu_data->recent_idx[i] = -1;
678  
679  	return 0;
680  }
681  
682  static struct cpuidle_governor teo_governor = {
683  	.name =		"teo",
684  	.rating =	19,
685  	.enable =	teo_enable_device,
686  	.select =	teo_select,
687  	.reflect =	teo_reflect,
688  };
689  
teo_governor_init(void)690  static int __init teo_governor_init(void)
691  {
692  	return cpuidle_register_governor(&teo_governor);
693  }
694  
695  postcore_initcall(teo_governor_init);
696