1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Timer events oriented CPU idle governor 4 * 5 * TEO governor: 6 * Copyright (C) 2018 - 2021 Intel Corporation 7 * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> 8 * 9 * Util-awareness mechanism: 10 * Copyright (C) 2022 Arm Ltd. 11 * Author: Kajetan Puchalski <kajetan.puchalski@arm.com> 12 */ 13 14 /** 15 * DOC: teo-description 16 * 17 * The idea of this governor is based on the observation that on many systems 18 * timer events are two or more orders of magnitude more frequent than any 19 * other interrupts, so they are likely to be the most significant cause of CPU 20 * wakeups from idle states. Moreover, information about what happened in the 21 * (relatively recent) past can be used to estimate whether or not the deepest 22 * idle state with target residency within the (known) time till the closest 23 * timer event, referred to as the sleep length, is likely to be suitable for 24 * the upcoming CPU idle period and, if not, then which of the shallower idle 25 * states to choose instead of it. 26 * 27 * Of course, non-timer wakeup sources are more important in some use cases 28 * which can be covered by taking a few most recent idle time intervals of the 29 * CPU into account. However, even in that context it is not necessary to 30 * consider idle duration values greater than the sleep length, because the 31 * closest timer will ultimately wake up the CPU anyway unless it is woken up 32 * earlier. 33 * 34 * Thus this governor estimates whether or not the prospective idle duration of 35 * a CPU is likely to be significantly shorter than the sleep length and selects 36 * an idle state for it accordingly. 37 * 38 * The computations carried out by this governor are based on using bins whose 39 * boundaries are aligned with the target residency parameter values of the CPU 40 * idle states provided by the %CPUIdle driver in the ascending order. That is, 41 * the first bin spans from 0 up to, but not including, the target residency of 42 * the second idle state (idle state 1), the second bin spans from the target 43 * residency of idle state 1 up to, but not including, the target residency of 44 * idle state 2, the third bin spans from the target residency of idle state 2 45 * up to, but not including, the target residency of idle state 3 and so on. 46 * The last bin spans from the target residency of the deepest idle state 47 * supplied by the driver to infinity. 48 * 49 * Two metrics called "hits" and "intercepts" are associated with each bin. 50 * They are updated every time before selecting an idle state for the given CPU 51 * in accordance with what happened last time. 52 * 53 * The "hits" metric reflects the relative frequency of situations in which the 54 * sleep length and the idle duration measured after CPU wakeup fall into the 55 * same bin (that is, the CPU appears to wake up "on time" relative to the sleep 56 * length). In turn, the "intercepts" metric reflects the relative frequency of 57 * situations in which the measured idle duration is so much shorter than the 58 * sleep length that the bin it falls into corresponds to an idle state 59 * shallower than the one whose bin is fallen into by the sleep length (these 60 * situations are referred to as "intercepts" below). 61 * 62 * In addition to the metrics described above, the governor counts recent 63 * intercepts (that is, intercepts that have occurred during the last 64 * %NR_RECENT invocations of it for the given CPU) for each bin. 65 * 66 * In order to select an idle state for a CPU, the governor takes the following 67 * steps (modulo the possible latency constraint that must be taken into account 68 * too): 69 * 70 * 1. Find the deepest CPU idle state whose target residency does not exceed 71 * the current sleep length (the candidate idle state) and compute 3 sums as 72 * follows: 73 * 74 * - The sum of the "hits" and "intercepts" metrics for the candidate state 75 * and all of the deeper idle states (it represents the cases in which the 76 * CPU was idle long enough to avoid being intercepted if the sleep length 77 * had been equal to the current one). 78 * 79 * - The sum of the "intercepts" metrics for all of the idle states shallower 80 * than the candidate one (it represents the cases in which the CPU was not 81 * idle long enough to avoid being intercepted if the sleep length had been 82 * equal to the current one). 83 * 84 * - The sum of the numbers of recent intercepts for all of the idle states 85 * shallower than the candidate one. 86 * 87 * 2. If the second sum is greater than the first one or the third sum is 88 * greater than %NR_RECENT / 2, the CPU is likely to wake up early, so look 89 * for an alternative idle state to select. 90 * 91 * - Traverse the idle states shallower than the candidate one in the 92 * descending order. 93 * 94 * - For each of them compute the sum of the "intercepts" metrics and the sum 95 * of the numbers of recent intercepts over all of the idle states between 96 * it and the candidate one (including the former and excluding the 97 * latter). 98 * 99 * - If each of these sums that needs to be taken into account (because the 100 * check related to it has indicated that the CPU is likely to wake up 101 * early) is greater than a half of the corresponding sum computed in step 102 * 1 (which means that the target residency of the state in question had 103 * not exceeded the idle duration in over a half of the relevant cases), 104 * select the given idle state instead of the candidate one. 105 * 106 * 3. By default, select the candidate state. 107 * 108 * Util-awareness mechanism: 109 * 110 * The idea behind the util-awareness extension is that there are two distinct 111 * scenarios for the CPU which should result in two different approaches to idle 112 * state selection - utilized and not utilized. 113 * 114 * In this case, 'utilized' means that the average runqueue util of the CPU is 115 * above a certain threshold. 116 * 117 * When the CPU is utilized while going into idle, more likely than not it will 118 * be woken up to do more work soon and so a shallower idle state should be 119 * selected to minimise latency and maximise performance. When the CPU is not 120 * being utilized, the usual metrics-based approach to selecting the deepest 121 * available idle state should be preferred to take advantage of the power 122 * saving. 123 * 124 * In order to achieve this, the governor uses a utilization threshold. 125 * The threshold is computed per-CPU as a percentage of the CPU's capacity 126 * by bit shifting the capacity value. Based on testing, the shift of 6 (~1.56%) 127 * seems to be getting the best results. 128 * 129 * Before selecting the next idle state, the governor compares the current CPU 130 * util to the precomputed util threshold. If it's below, it defaults to the 131 * TEO metrics mechanism. If it's above, the closest shallower idle state will 132 * be selected instead, as long as is not a polling state. 133 */ 134 135 #include <linux/cpuidle.h> 136 #include <linux/jiffies.h> 137 #include <linux/kernel.h> 138 #include <linux/sched.h> 139 #include <linux/sched/clock.h> 140 #include <linux/sched/topology.h> 141 #include <linux/tick.h> 142 143 /* 144 * The number of bits to shift the CPU's capacity by in order to determine 145 * the utilized threshold. 146 * 147 * 6 was chosen based on testing as the number that achieved the best balance 148 * of power and performance on average. 149 * 150 * The resulting threshold is high enough to not be triggered by background 151 * noise and low enough to react quickly when activity starts to ramp up. 152 */ 153 #define UTIL_THRESHOLD_SHIFT 6 154 155 156 /* 157 * The PULSE value is added to metrics when they grow and the DECAY_SHIFT value 158 * is used for decreasing metrics on a regular basis. 159 */ 160 #define PULSE 1024 161 #define DECAY_SHIFT 3 162 163 /* 164 * Number of the most recent idle duration values to take into consideration for 165 * the detection of recent early wakeup patterns. 166 */ 167 #define NR_RECENT 9 168 169 /** 170 * struct teo_bin - Metrics used by the TEO cpuidle governor. 171 * @intercepts: The "intercepts" metric. 172 * @hits: The "hits" metric. 173 * @recent: The number of recent "intercepts". 174 */ 175 struct teo_bin { 176 unsigned int intercepts; 177 unsigned int hits; 178 unsigned int recent; 179 }; 180 181 /** 182 * struct teo_cpu - CPU data used by the TEO cpuidle governor. 183 * @time_span_ns: Time between idle state selection and post-wakeup update. 184 * @sleep_length_ns: Time till the closest timer event (at the selection time). 185 * @state_bins: Idle state data bins for this CPU. 186 * @total: Grand total of the "intercepts" and "hits" metrics for all bins. 187 * @next_recent_idx: Index of the next @recent_idx entry to update. 188 * @recent_idx: Indices of bins corresponding to recent "intercepts". 189 * @util_threshold: Threshold above which the CPU is considered utilized 190 * @utilized: Whether the last sleep on the CPU happened while utilized 191 */ 192 struct teo_cpu { 193 s64 time_span_ns; 194 s64 sleep_length_ns; 195 struct teo_bin state_bins[CPUIDLE_STATE_MAX]; 196 unsigned int total; 197 int next_recent_idx; 198 int recent_idx[NR_RECENT]; 199 unsigned long util_threshold; 200 bool utilized; 201 }; 202 203 static DEFINE_PER_CPU(struct teo_cpu, teo_cpus); 204 205 /** 206 * teo_cpu_is_utilized - Check if the CPU's util is above the threshold 207 * @cpu: Target CPU 208 * @cpu_data: Governor CPU data for the target CPU 209 */ 210 #ifdef CONFIG_SMP 211 static bool teo_cpu_is_utilized(int cpu, struct teo_cpu *cpu_data) 212 { 213 return sched_cpu_util(cpu) > cpu_data->util_threshold; 214 } 215 #else 216 static bool teo_cpu_is_utilized(int cpu, struct teo_cpu *cpu_data) 217 { 218 return false; 219 } 220 #endif 221 222 /** 223 * teo_update - Update CPU metrics after wakeup. 224 * @drv: cpuidle driver containing state data. 225 * @dev: Target CPU. 226 */ 227 static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) 228 { 229 struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); 230 int i, idx_timer = 0, idx_duration = 0; 231 u64 measured_ns; 232 233 if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) { 234 /* 235 * One of the safety nets has triggered or the wakeup was close 236 * enough to the closest timer event expected at the idle state 237 * selection time to be discarded. 238 */ 239 measured_ns = U64_MAX; 240 } else { 241 u64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns; 242 243 /* 244 * The computations below are to determine whether or not the 245 * (saved) time till the next timer event and the measured idle 246 * duration fall into the same "bin", so use last_residency_ns 247 * for that instead of time_span_ns which includes the cpuidle 248 * overhead. 249 */ 250 measured_ns = dev->last_residency_ns; 251 /* 252 * The delay between the wakeup and the first instruction 253 * executed by the CPU is not likely to be worst-case every 254 * time, so take 1/2 of the exit latency as a very rough 255 * approximation of the average of it. 256 */ 257 if (measured_ns >= lat_ns) 258 measured_ns -= lat_ns / 2; 259 else 260 measured_ns /= 2; 261 } 262 263 cpu_data->total = 0; 264 265 /* 266 * Decay the "hits" and "intercepts" metrics for all of the bins and 267 * find the bins that the sleep length and the measured idle duration 268 * fall into. 269 */ 270 for (i = 0; i < drv->state_count; i++) { 271 s64 target_residency_ns = drv->states[i].target_residency_ns; 272 struct teo_bin *bin = &cpu_data->state_bins[i]; 273 274 bin->hits -= bin->hits >> DECAY_SHIFT; 275 bin->intercepts -= bin->intercepts >> DECAY_SHIFT; 276 277 cpu_data->total += bin->hits + bin->intercepts; 278 279 if (target_residency_ns <= cpu_data->sleep_length_ns) { 280 idx_timer = i; 281 if (target_residency_ns <= measured_ns) 282 idx_duration = i; 283 } 284 } 285 286 i = cpu_data->next_recent_idx++; 287 if (cpu_data->next_recent_idx >= NR_RECENT) 288 cpu_data->next_recent_idx = 0; 289 290 if (cpu_data->recent_idx[i] >= 0) 291 cpu_data->state_bins[cpu_data->recent_idx[i]].recent--; 292 293 /* 294 * If the measured idle duration falls into the same bin as the sleep 295 * length, this is a "hit", so update the "hits" metric for that bin. 296 * Otherwise, update the "intercepts" metric for the bin fallen into by 297 * the measured idle duration. 298 */ 299 if (idx_timer == idx_duration) { 300 cpu_data->state_bins[idx_timer].hits += PULSE; 301 cpu_data->recent_idx[i] = -1; 302 } else { 303 cpu_data->state_bins[idx_duration].intercepts += PULSE; 304 cpu_data->state_bins[idx_duration].recent++; 305 cpu_data->recent_idx[i] = idx_duration; 306 } 307 308 cpu_data->total += PULSE; 309 } 310 311 static bool teo_time_ok(u64 interval_ns) 312 { 313 return !tick_nohz_tick_stopped() || interval_ns >= TICK_NSEC; 314 } 315 316 static s64 teo_middle_of_bin(int idx, struct cpuidle_driver *drv) 317 { 318 return (drv->states[idx].target_residency_ns + 319 drv->states[idx+1].target_residency_ns) / 2; 320 } 321 322 /** 323 * teo_find_shallower_state - Find shallower idle state matching given duration. 324 * @drv: cpuidle driver containing state data. 325 * @dev: Target CPU. 326 * @state_idx: Index of the capping idle state. 327 * @duration_ns: Idle duration value to match. 328 * @no_poll: Don't consider polling states. 329 */ 330 static int teo_find_shallower_state(struct cpuidle_driver *drv, 331 struct cpuidle_device *dev, int state_idx, 332 s64 duration_ns, bool no_poll) 333 { 334 int i; 335 336 for (i = state_idx - 1; i >= 0; i--) { 337 if (dev->states_usage[i].disable || 338 (no_poll && drv->states[i].flags & CPUIDLE_FLAG_POLLING)) 339 continue; 340 341 state_idx = i; 342 if (drv->states[i].target_residency_ns <= duration_ns) 343 break; 344 } 345 return state_idx; 346 } 347 348 /** 349 * teo_select - Selects the next idle state to enter. 350 * @drv: cpuidle driver containing state data. 351 * @dev: Target CPU. 352 * @stop_tick: Indication on whether or not to stop the scheduler tick. 353 */ 354 static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, 355 bool *stop_tick) 356 { 357 struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); 358 s64 latency_req = cpuidle_governor_latency_req(dev->cpu); 359 unsigned int idx_intercept_sum = 0; 360 unsigned int intercept_sum = 0; 361 unsigned int idx_recent_sum = 0; 362 unsigned int recent_sum = 0; 363 unsigned int idx_hit_sum = 0; 364 unsigned int hit_sum = 0; 365 int constraint_idx = 0; 366 int idx0 = 0, idx = -1; 367 bool alt_intercepts, alt_recent; 368 ktime_t delta_tick; 369 s64 duration_ns; 370 int i; 371 372 if (dev->last_state_idx >= 0) { 373 teo_update(drv, dev); 374 dev->last_state_idx = -1; 375 } 376 377 cpu_data->time_span_ns = local_clock(); 378 379 duration_ns = tick_nohz_get_sleep_length(&delta_tick); 380 cpu_data->sleep_length_ns = duration_ns; 381 382 /* Check if there is any choice in the first place. */ 383 if (drv->state_count < 2) { 384 idx = 0; 385 goto end; 386 } 387 if (!dev->states_usage[0].disable) { 388 idx = 0; 389 if (drv->states[1].target_residency_ns > duration_ns) 390 goto end; 391 } 392 393 cpu_data->utilized = teo_cpu_is_utilized(dev->cpu, cpu_data); 394 /* 395 * If the CPU is being utilized over the threshold and there are only 2 396 * states to choose from, the metrics need not be considered, so choose 397 * the shallowest non-polling state and exit. 398 */ 399 if (drv->state_count < 3 && cpu_data->utilized) { 400 for (i = 0; i < drv->state_count; ++i) { 401 if (!dev->states_usage[i].disable && 402 !(drv->states[i].flags & CPUIDLE_FLAG_POLLING)) { 403 idx = i; 404 goto end; 405 } 406 } 407 } 408 409 /* 410 * Find the deepest idle state whose target residency does not exceed 411 * the current sleep length and the deepest idle state not deeper than 412 * the former whose exit latency does not exceed the current latency 413 * constraint. Compute the sums of metrics for early wakeup pattern 414 * detection. 415 */ 416 for (i = 1; i < drv->state_count; i++) { 417 struct teo_bin *prev_bin = &cpu_data->state_bins[i-1]; 418 struct cpuidle_state *s = &drv->states[i]; 419 420 /* 421 * Update the sums of idle state mertics for all of the states 422 * shallower than the current one. 423 */ 424 intercept_sum += prev_bin->intercepts; 425 hit_sum += prev_bin->hits; 426 recent_sum += prev_bin->recent; 427 428 if (dev->states_usage[i].disable) 429 continue; 430 431 if (idx < 0) { 432 idx = i; /* first enabled state */ 433 idx0 = i; 434 } 435 436 if (s->target_residency_ns > duration_ns) 437 break; 438 439 idx = i; 440 441 if (s->exit_latency_ns <= latency_req) 442 constraint_idx = i; 443 444 idx_intercept_sum = intercept_sum; 445 idx_hit_sum = hit_sum; 446 idx_recent_sum = recent_sum; 447 } 448 449 /* Avoid unnecessary overhead. */ 450 if (idx < 0) { 451 idx = 0; /* No states enabled, must use 0. */ 452 goto end; 453 } else if (idx == idx0) { 454 goto end; 455 } 456 457 /* 458 * If the sum of the intercepts metric for all of the idle states 459 * shallower than the current candidate one (idx) is greater than the 460 * sum of the intercepts and hits metrics for the candidate state and 461 * all of the deeper states, or the sum of the numbers of recent 462 * intercepts over all of the states shallower than the candidate one 463 * is greater than a half of the number of recent events taken into 464 * account, the CPU is likely to wake up early, so find an alternative 465 * idle state to select. 466 */ 467 alt_intercepts = 2 * idx_intercept_sum > cpu_data->total - idx_hit_sum; 468 alt_recent = idx_recent_sum > NR_RECENT / 2; 469 if (alt_recent || alt_intercepts) { 470 s64 first_suitable_span_ns = duration_ns; 471 int first_suitable_idx = idx; 472 473 /* 474 * Look for the deepest idle state whose target residency had 475 * not exceeded the idle duration in over a half of the relevant 476 * cases (both with respect to intercepts overall and with 477 * respect to the recent intercepts only) in the past. 478 * 479 * Take the possible latency constraint and duration limitation 480 * present if the tick has been stopped already into account. 481 */ 482 intercept_sum = 0; 483 recent_sum = 0; 484 485 for (i = idx - 1; i >= 0; i--) { 486 struct teo_bin *bin = &cpu_data->state_bins[i]; 487 s64 span_ns; 488 489 intercept_sum += bin->intercepts; 490 recent_sum += bin->recent; 491 492 span_ns = teo_middle_of_bin(i, drv); 493 494 if ((!alt_recent || 2 * recent_sum > idx_recent_sum) && 495 (!alt_intercepts || 496 2 * intercept_sum > idx_intercept_sum)) { 497 if (teo_time_ok(span_ns) && 498 !dev->states_usage[i].disable) { 499 idx = i; 500 duration_ns = span_ns; 501 } else { 502 /* 503 * The current state is too shallow or 504 * disabled, so take the first enabled 505 * deeper state with suitable time span. 506 */ 507 idx = first_suitable_idx; 508 duration_ns = first_suitable_span_ns; 509 } 510 break; 511 } 512 513 if (dev->states_usage[i].disable) 514 continue; 515 516 if (!teo_time_ok(span_ns)) { 517 /* 518 * The current state is too shallow, but if an 519 * alternative candidate state has been found, 520 * it may still turn out to be a better choice. 521 */ 522 if (first_suitable_idx != idx) 523 continue; 524 525 break; 526 } 527 528 first_suitable_span_ns = span_ns; 529 first_suitable_idx = i; 530 } 531 } 532 533 /* 534 * If there is a latency constraint, it may be necessary to select an 535 * idle state shallower than the current candidate one. 536 */ 537 if (idx > constraint_idx) 538 idx = constraint_idx; 539 540 /* 541 * If the CPU is being utilized over the threshold, choose a shallower 542 * non-polling state to improve latency 543 */ 544 if (cpu_data->utilized) 545 idx = teo_find_shallower_state(drv, dev, idx, duration_ns, true); 546 547 end: 548 /* 549 * Don't stop the tick if the selected state is a polling one or if the 550 * expected idle duration is shorter than the tick period length. 551 */ 552 if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) || 553 duration_ns < TICK_NSEC) && !tick_nohz_tick_stopped()) { 554 *stop_tick = false; 555 556 /* 557 * The tick is not going to be stopped, so if the target 558 * residency of the state to be returned is not within the time 559 * till the closest timer including the tick, try to correct 560 * that. 561 */ 562 if (idx > idx0 && 563 drv->states[idx].target_residency_ns > delta_tick) 564 idx = teo_find_shallower_state(drv, dev, idx, delta_tick, false); 565 } 566 567 return idx; 568 } 569 570 /** 571 * teo_reflect - Note that governor data for the CPU need to be updated. 572 * @dev: Target CPU. 573 * @state: Entered state. 574 */ 575 static void teo_reflect(struct cpuidle_device *dev, int state) 576 { 577 struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); 578 579 dev->last_state_idx = state; 580 /* 581 * If the wakeup was not "natural", but triggered by one of the safety 582 * nets, assume that the CPU might have been idle for the entire sleep 583 * length time. 584 */ 585 if (dev->poll_time_limit || 586 (tick_nohz_idle_got_tick() && cpu_data->sleep_length_ns > TICK_NSEC)) { 587 dev->poll_time_limit = false; 588 cpu_data->time_span_ns = cpu_data->sleep_length_ns; 589 } else { 590 cpu_data->time_span_ns = local_clock() - cpu_data->time_span_ns; 591 } 592 } 593 594 /** 595 * teo_enable_device - Initialize the governor's data for the target CPU. 596 * @drv: cpuidle driver (not used). 597 * @dev: Target CPU. 598 */ 599 static int teo_enable_device(struct cpuidle_driver *drv, 600 struct cpuidle_device *dev) 601 { 602 struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); 603 unsigned long max_capacity = arch_scale_cpu_capacity(dev->cpu); 604 int i; 605 606 memset(cpu_data, 0, sizeof(*cpu_data)); 607 cpu_data->util_threshold = max_capacity >> UTIL_THRESHOLD_SHIFT; 608 609 for (i = 0; i < NR_RECENT; i++) 610 cpu_data->recent_idx[i] = -1; 611 612 return 0; 613 } 614 615 static struct cpuidle_governor teo_governor = { 616 .name = "teo", 617 .rating = 19, 618 .enable = teo_enable_device, 619 .select = teo_select, 620 .reflect = teo_reflect, 621 }; 622 623 static int __init teo_governor_init(void) 624 { 625 return cpuidle_register_governor(&teo_governor); 626 } 627 628 postcore_initcall(teo_governor_init); 629