// SPDX-License-Identifier: GPL-2.0
/*
 * Timer events oriented CPU idle governor
 *
 * Copyright (C) 2018 - 2021 Intel Corporation
 * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
 */

/**
 * DOC: teo-description
 *
 * The idea of this governor is based on the observation that on many systems
 * timer events are two or more orders of magnitude more frequent than any
 * other interrupts, so they are likely to be the most significant cause of CPU
 * wakeups from idle states. Moreover, information about what happened in the
 * (relatively recent) past can be used to estimate whether or not the deepest
 * idle state with target residency within the (known) time till the closest
 * timer event, referred to as the sleep length, is likely to be suitable for
 * the upcoming CPU idle period and, if not, then which of the shallower idle
 * states to choose instead of it.
 *
 * Of course, non-timer wakeup sources are more important in some use cases
 * which can be covered by taking a few most recent idle time intervals of the
 * CPU into account. However, even in that context it is not necessary to
 * consider idle duration values greater than the sleep length, because the
 * closest timer will ultimately wake up the CPU anyway unless it is woken up
 * earlier.
 *
 * Thus this governor estimates whether or not the prospective idle duration
 * of a CPU is likely to be significantly shorter than the sleep length and
 * selects an idle state for it accordingly.
 *
 * The computations carried out by this governor are based on using bins whose
 * boundaries are aligned with the target residency parameter values of the
 * CPU idle states provided by the %CPUIdle driver in the ascending order.
 * That is, the first bin spans from 0 up to, but not including, the target
 * residency of the second idle state (idle state 1), the second bin spans
 * from the target residency of idle state 1 up to, but not including, the
 * target residency of idle state 2, the third bin spans from the target
 * residency of idle state 2 up to, but not including, the target residency of
 * idle state 3 and so on. The last bin spans from the target residency of the
 * deepest idle state supplied by the driver to infinity.
 *
 * Two metrics called "hits" and "intercepts" are associated with each bin.
 * They are updated every time before selecting an idle state for the given
 * CPU in accordance with what happened last time.
 *
 * The "hits" metric reflects the relative frequency of situations in which
 * the sleep length and the idle duration measured after CPU wakeup fall into
 * the same bin (that is, the CPU appears to wake up "on time" relative to the
 * sleep length). In turn, the "intercepts" metric reflects the relative
 * frequency of situations in which the measured idle duration is so much
 * shorter than the sleep length that the bin it falls into corresponds to an
 * idle state shallower than the one whose bin is fallen into by the sleep
 * length (these situations are referred to as "intercepts" below).
 *
 * In addition to the metrics described above, the governor counts recent
 * intercepts (that is, intercepts that have occurred during the last
 * %NR_RECENT invocations of it for the given CPU) for each bin.
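 *
 * For example, if the sleep length falls into the bin of idle state 2, a
 * wakeup whose measured idle duration also falls into that bin adds to the
 * "hits" metric of that bin, whereas a wakeup whose measured idle duration
 * only falls into the bin of idle state 0 adds to the "intercepts" metric of
 * the idle state 0 bin (and to that bin's count of recent intercepts).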
 *
 * In order to select an idle state for a CPU, the governor takes the
 * following steps (modulo the possible latency constraint that must be taken
 * into account too):
 *
 * 1. Find the deepest CPU idle state whose target residency does not exceed
 *    the current sleep length (the candidate idle state) and compute 3 sums
 *    as follows:
 *
 *    - The sum of the "hits" and "intercepts" metrics for the candidate state
 *      and all of the deeper idle states (it represents the cases in which
 *      the CPU was idle long enough to avoid being intercepted if the sleep
 *      length had been equal to the current one).
 *
 *    - The sum of the "intercepts" metrics for all of the idle states
 *      shallower than the candidate one (it represents the cases in which the
 *      CPU was not idle long enough to avoid being intercepted if the sleep
 *      length had been equal to the current one).
 *
 *    - The sum of the numbers of recent intercepts for all of the idle states
 *      shallower than the candidate one.
 *
 * 2. If the second sum is greater than the first one or the third sum is
 *    greater than %NR_RECENT / 2, the CPU is likely to wake up early, so look
 *    for an alternative idle state to select.
 *
 *    - Traverse the idle states shallower than the candidate one in the
 *      descending order.
 *
 *    - For each of them compute the sum of the "intercepts" metrics and the
 *      sum of the numbers of recent intercepts over all of the idle states
 *      between it and the candidate one (including the former and excluding
 *      the latter).
 *
 *    - If each of these sums that needs to be taken into account (because the
 *      check related to it has indicated that the CPU is likely to wake up
 *      early) is greater than a half of the corresponding sum computed in
 *      step 1 (which means that the target residency of the state in question
 *      had not exceeded the idle duration in over a half of the relevant
 *      cases), select the given idle state instead of the candidate one.
 *
 * 3. By default, select the candidate state.
 */

#include <linux/cpuidle.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/sched/clock.h>
#include <linux/tick.h>

/*
 * The PULSE value is added to metrics when they grow and the DECAY_SHIFT
 * value is used for decreasing metrics on a regular basis.
 */
#define PULSE		1024
#define DECAY_SHIFT	3
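
/*
 * With the values above, a metric that receives a PULSE on every update
 * converges to PULSE << DECAY_SHIFT (8192), because the decay then removes
 * roughly PULSE per update. The metrics are therefore relative-frequency
 * counters rather than absolute event counts.
 */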

/*
 * Number of the most recent idle duration values to take into consideration
 * for the detection of recent early wakeup patterns.
 */
#define NR_RECENT	9

/**
 * struct teo_bin - Metrics used by the TEO cpuidle governor.
 * @intercepts: The "intercepts" metric.
 * @hits: The "hits" metric.
 * @recent: The number of recent "intercepts".
 */
struct teo_bin {
	unsigned int intercepts;
	unsigned int hits;
	unsigned int recent;
};

/**
 * struct teo_cpu - CPU data used by the TEO cpuidle governor.
 * @time_span_ns: Time between idle state selection and post-wakeup update.
 * @sleep_length_ns: Time till the closest timer event (at the selection time).
 * @state_bins: Idle state data bins for this CPU.
 * @total: Grand total of the "intercepts" and "hits" metrics for all bins.
 * @next_recent_idx: Index of the next @recent_idx entry to update.
 * @recent_idx: Indices of bins corresponding to recent "intercepts".
 */
struct teo_cpu {
	s64 time_span_ns;
	s64 sleep_length_ns;
	struct teo_bin state_bins[CPUIDLE_STATE_MAX];
	unsigned int total;
	int next_recent_idx;
	int recent_idx[NR_RECENT];
};

static DEFINE_PER_CPU(struct teo_cpu, teo_cpus);
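
/*
 * The @recent_idx array in struct teo_cpu is used as a ring buffer: each of
 * its NR_RECENT entries holds the index of the bin charged with one of the
 * most recent "intercepts", or -1 if the corresponding wakeup was a "hit".
 * teo_update() below overwrites the oldest entry on every invocation and
 * adjusts the per-bin @recent counters accordingly, so those counters always
 * cover the last NR_RECENT wakeups only.
 */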

/**
 * teo_update - Update CPU metrics after wakeup.
 * @drv: cpuidle driver containing state data.
 * @dev: Target CPU.
 */
static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
{
	struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
	int i, idx_timer = 0, idx_duration = 0;
	u64 measured_ns;

	if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) {
		/*
		 * One of the safety nets has triggered or the wakeup was close
		 * enough to the closest timer event expected at the idle state
		 * selection time to be discarded.
		 */
		measured_ns = U64_MAX;
	} else {
		u64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns;

		/*
		 * The computations below are to determine whether or not the
		 * (saved) time till the next timer event and the measured idle
		 * duration fall into the same "bin", so use last_residency_ns
		 * for that instead of time_span_ns which includes the cpuidle
		 * overhead.
		 */
		measured_ns = dev->last_residency_ns;
		/*
		 * The delay between the wakeup and the first instruction
		 * executed by the CPU is not likely to be worst-case every
		 * time, so take 1/2 of the exit latency as a very rough
		 * approximation of the average of it.
		 */
		if (measured_ns >= lat_ns)
			measured_ns -= lat_ns / 2;
		else
			measured_ns /= 2;
	}

	cpu_data->total = 0;

	/*
	 * Decay the "hits" and "intercepts" metrics for all of the bins and
	 * find the bins that the sleep length and the measured idle duration
	 * fall into.
	 */
	for (i = 0; i < drv->state_count; i++) {
		s64 target_residency_ns = drv->states[i].target_residency_ns;
		struct teo_bin *bin = &cpu_data->state_bins[i];

		bin->hits -= bin->hits >> DECAY_SHIFT;
		bin->intercepts -= bin->intercepts >> DECAY_SHIFT;

		cpu_data->total += bin->hits + bin->intercepts;

		if (target_residency_ns <= cpu_data->sleep_length_ns) {
			idx_timer = i;
			if (target_residency_ns <= measured_ns)
				idx_duration = i;
		}
	}

	i = cpu_data->next_recent_idx++;
	if (cpu_data->next_recent_idx >= NR_RECENT)
		cpu_data->next_recent_idx = 0;

	if (cpu_data->recent_idx[i] >= 0)
		cpu_data->state_bins[cpu_data->recent_idx[i]].recent--;

	/*
	 * If the measured idle duration falls into the same bin as the sleep
	 * length, this is a "hit", so update the "hits" metric for that bin.
	 * Otherwise, update the "intercepts" metric for the bin fallen into by
	 * the measured idle duration.
	 */
	if (idx_timer == idx_duration) {
		cpu_data->state_bins[idx_timer].hits += PULSE;
		cpu_data->recent_idx[i] = -1;
	} else {
		cpu_data->state_bins[idx_duration].intercepts += PULSE;
		cpu_data->state_bins[idx_duration].recent++;
		cpu_data->recent_idx[i] = idx_duration;
	}

	cpu_data->total += PULSE;
}

/*
 * Return true unless the scheduler tick has been stopped and @interval_ns is
 * shorter than the tick period length.
 */
static bool teo_time_ok(u64 interval_ns)
{
	return !tick_nohz_tick_stopped() || interval_ns >= TICK_NSEC;
}

/*
 * Return the middle of the idle duration bin associated with idle state @idx.
 */
static s64 teo_middle_of_bin(int idx, struct cpuidle_driver *drv)
{
	return (drv->states[idx].target_residency_ns +
		drv->states[idx+1].target_residency_ns) / 2;
}

/**
 * teo_find_shallower_state - Find shallower idle state matching given duration.
 * @drv: cpuidle driver containing state data.
 * @dev: Target CPU.
 * @state_idx: Index of the capping idle state.
 * @duration_ns: Idle duration value to match.
 */
static int teo_find_shallower_state(struct cpuidle_driver *drv,
				    struct cpuidle_device *dev, int state_idx,
				    s64 duration_ns)
{
	int i;

	for (i = state_idx - 1; i >= 0; i--) {
		if (dev->states_usage[i].disable)
			continue;

		state_idx = i;
		if (drv->states[i].target_residency_ns <= duration_ns)
			break;
	}
	return state_idx;
}

/**
 * teo_select - Selects the next idle state to enter.
 * @drv: cpuidle driver containing state data.
 * @dev: Target CPU.
 * @stop_tick: Indication on whether or not to stop the scheduler tick.
 */
static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
		      bool *stop_tick)
{
	struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
	s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
	unsigned int idx_intercept_sum = 0;
	unsigned int intercept_sum = 0;
	unsigned int idx_recent_sum = 0;
	unsigned int recent_sum = 0;
	unsigned int idx_hit_sum = 0;
	unsigned int hit_sum = 0;
	int constraint_idx = 0;
	int idx0 = 0, idx = -1;
	bool alt_intercepts, alt_recent;
	ktime_t delta_tick;
	s64 duration_ns;
	int i;

	if (dev->last_state_idx >= 0) {
		teo_update(drv, dev);
		dev->last_state_idx = -1;
	}

	cpu_data->time_span_ns = local_clock();

	duration_ns = tick_nohz_get_sleep_length(&delta_tick);
	cpu_data->sleep_length_ns = duration_ns;

	/* Check if there is any choice in the first place. */
	if (drv->state_count < 2) {
		idx = 0;
		goto end;
	}
	if (!dev->states_usage[0].disable) {
		idx = 0;
		if (drv->states[1].target_residency_ns > duration_ns)
			goto end;
	}

	/*
	 * Find the deepest idle state whose target residency does not exceed
	 * the current sleep length and the deepest idle state not deeper than
	 * the former whose exit latency does not exceed the current latency
	 * constraint. Compute the sums of metrics for early wakeup pattern
	 * detection.
	 */
	for (i = 1; i < drv->state_count; i++) {
		struct teo_bin *prev_bin = &cpu_data->state_bins[i-1];
		struct cpuidle_state *s = &drv->states[i];

		/*
		 * Update the sums of idle state metrics for all of the states
		 * shallower than the current one.
		 */
		intercept_sum += prev_bin->intercepts;
		hit_sum += prev_bin->hits;
		recent_sum += prev_bin->recent;

		if (dev->states_usage[i].disable)
			continue;

		if (idx < 0) {
			idx = i; /* first enabled state */
			idx0 = i;
		}

		if (s->target_residency_ns > duration_ns)
			break;

		idx = i;

		if (s->exit_latency_ns <= latency_req)
			constraint_idx = i;

		idx_intercept_sum = intercept_sum;
		idx_hit_sum = hit_sum;
		idx_recent_sum = recent_sum;
	}

	/* Avoid unnecessary overhead. */
	if (idx < 0) {
		idx = 0; /* No states enabled, must use 0. */
		goto end;
	} else if (idx == idx0) {
		goto end;
	}

	/*
	 * If the sum of the intercepts metric for all of the idle states
	 * shallower than the current candidate one (idx) is greater than the
	 * sum of the intercepts and hits metrics for the candidate state and
	 * all of the deeper states, or the sum of the numbers of recent
	 * intercepts over all of the states shallower than the candidate one
	 * is greater than a half of the number of recent events taken into
	 * account, the CPU is likely to wake up early, so find an alternative
	 * idle state to select.
	 */
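	/*
	 * Because cpu_data->total is the sum of both metrics over all bins,
	 * the first comparison below amounts to checking whether the
	 * "intercepts" sum for the states shallower than the candidate one
	 * exceeds the combined "hits" and "intercepts" sum for the candidate
	 * state and all of the deeper states.
	 */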
	alt_intercepts = 2 * idx_intercept_sum > cpu_data->total - idx_hit_sum;
	alt_recent = idx_recent_sum > NR_RECENT / 2;
	if (alt_recent || alt_intercepts) {
		s64 last_enabled_span_ns = duration_ns;
		int last_enabled_idx = idx;

		/*
		 * Look for the deepest idle state whose target residency had
		 * not exceeded the idle duration in over a half of the
		 * relevant cases (both with respect to intercepts overall and
		 * with respect to the recent intercepts only) in the past.
		 *
		 * Take the possible latency constraint and duration limitation
		 * present if the tick has been stopped already into account.
		 */
		intercept_sum = 0;
		recent_sum = 0;

		for (i = idx - 1; i >= idx0; i--) {
			struct teo_bin *bin = &cpu_data->state_bins[i];
			s64 span_ns;

			intercept_sum += bin->intercepts;
			recent_sum += bin->recent;

			if (dev->states_usage[i].disable)
				continue;

			span_ns = teo_middle_of_bin(i, drv);
			if (!teo_time_ok(span_ns)) {
				/*
				 * The current state is too shallow, so select
				 * the first enabled deeper state.
				 */
				duration_ns = last_enabled_span_ns;
				idx = last_enabled_idx;
				break;
			}

			if ((!alt_recent || 2 * recent_sum > idx_recent_sum) &&
			    (!alt_intercepts ||
			     2 * intercept_sum > idx_intercept_sum)) {
				idx = i;
				duration_ns = span_ns;
				break;
			}

			last_enabled_span_ns = span_ns;
			last_enabled_idx = i;
		}
	}

	/*
	 * If there is a latency constraint, it may be necessary to select an
	 * idle state shallower than the current candidate one.
	 */
	if (idx > constraint_idx)
		idx = constraint_idx;

end:
	/*
	 * Don't stop the tick if the selected state is a polling one or if the
	 * expected idle duration is shorter than the tick period length.
	 */
	if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) ||
	    duration_ns < TICK_NSEC) && !tick_nohz_tick_stopped()) {
		*stop_tick = false;

		/*
		 * The tick is not going to be stopped, so if the target
		 * residency of the state to be returned is not within the time
		 * till the closest timer including the tick, try to correct
		 * that.
		 */
		if (idx > idx0 &&
		    drv->states[idx].target_residency_ns > delta_tick)
			idx = teo_find_shallower_state(drv, dev, idx, delta_tick);
	}

	return idx;
}

/**
 * teo_reflect - Note that governor data for the CPU need to be updated.
 * @dev: Target CPU.
 * @state: Entered state.
 */
static void teo_reflect(struct cpuidle_device *dev, int state)
{
	struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);

	dev->last_state_idx = state;
	/*
	 * If the wakeup was not "natural", but triggered by one of the safety
	 * nets, assume that the CPU might have been idle for the entire sleep
	 * length time.
	 */
	if (dev->poll_time_limit ||
	    (tick_nohz_idle_got_tick() && cpu_data->sleep_length_ns > TICK_NSEC)) {
		dev->poll_time_limit = false;
		cpu_data->time_span_ns = cpu_data->sleep_length_ns;
	} else {
		cpu_data->time_span_ns = local_clock() - cpu_data->time_span_ns;
	}
}

/**
 * teo_enable_device - Initialize the governor's data for the target CPU.
 * @drv: cpuidle driver (not used).
 * @dev: Target CPU.
 */
static int teo_enable_device(struct cpuidle_driver *drv,
			     struct cpuidle_device *dev)
{
	struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
	int i;

	memset(cpu_data, 0, sizeof(*cpu_data));

	for (i = 0; i < NR_RECENT; i++)
		cpu_data->recent_idx[i] = -1;

	return 0;
}

static struct cpuidle_governor teo_governor = {
	.name =		"teo",
	.rating =	19,
	.enable =	teo_enable_device,
	.select =	teo_select,
	.reflect =	teo_reflect,
};

static int __init teo_governor_init(void)
{
	return cpuidle_register_governor(&teo_governor);
}

postcore_initcall(teo_governor_init);
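
/*
 * Note: the rating of 19 is one below that of the menu governor (20), so the
 * menu governor remains the default when both are built in. This governor can
 * be selected with the cpuidle.governor=teo kernel command line option or,
 * where the cpuidle sysfs interface exposes it, by writing "teo" to
 * /sys/devices/system/cpu/cpuidle/current_governor.
 */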