// SPDX-License-Identifier: GPL-2.0
/*
 * CPUFreq governor based on scheduler-provided CPU utilization data.
 *
 * Copyright (C) 2016, Intel Corporation
 * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include "sched.h"

#include <linux/sched/cpufreq.h>
#include <trace/events/power.h>

#define IOWAIT_BOOST_MIN	(SCHED_CAPACITY_SCALE / 8)

struct sugov_tunables {
	struct gov_attr_set	attr_set;
	unsigned int		rate_limit_us;
};

struct sugov_policy {
	struct cpufreq_policy	*policy;

	struct sugov_tunables	*tunables;
	struct list_head	tunables_hook;

	raw_spinlock_t		update_lock;	/* For shared policies */
	u64			last_freq_update_time;
	s64			freq_update_delay_ns;
	unsigned int		next_freq;
	unsigned int		cached_raw_freq;

	/* The next fields are only needed if fast switch cannot be used: */
	struct irq_work		irq_work;
	struct kthread_work	work;
	struct mutex		work_lock;
	struct kthread_worker	worker;
	struct task_struct	*thread;
	bool			work_in_progress;

	bool			limits_changed;
	bool			need_freq_update;
};

struct sugov_cpu {
	struct update_util_data	update_util;
	struct sugov_policy	*sg_policy;
	unsigned int		cpu;

	bool			iowait_boost_pending;
	unsigned int		iowait_boost;
	u64			last_update;

	unsigned long		bw_dl;
	unsigned long		max;

	/* The field below is for single-CPU policies only: */
#ifdef CONFIG_NO_HZ_COMMON
	unsigned long		saved_idle_calls;
#endif
};

static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);

/************************ Governor internals ***********************/

static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
{
	s64 delta_ns;

	/*
	 * Since cpufreq_update_util() is called with rq->lock held for
	 * the @target_cpu, our per-CPU data is fully serialized.
	 *
	 * However, drivers cannot in general deal with cross-CPU
	 * requests, so while get_next_freq() will work, our
	 * sugov_fast_switch() call may not for the fast switching platforms.
	 *
	 * Hence stop here for remote requests if they aren't supported
	 * by the hardware, as calculating the frequency is pointless if
	 * we cannot in fact act on it.
	 *
	 * For the slow switching platforms, the kthread is always scheduled on
	 * the right set of CPUs and any CPU can find the next frequency and
	 * schedule the kthread.
	 */
	if (sg_policy->policy->fast_switch_enabled &&
	    !cpufreq_this_cpu_can_update(sg_policy->policy))
		return false;

	if (unlikely(sg_policy->limits_changed)) {
		sg_policy->limits_changed = false;
		sg_policy->need_freq_update = true;
		return true;
	}

	delta_ns = time - sg_policy->last_freq_update_time;

	return delta_ns >= sg_policy->freq_update_delay_ns;
}
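
/*
 * Rate-limit illustration (example numbers only, not from any particular
 * platform): the default rate_limit_us comes from
 * cpufreq_policy_transition_delay_us(); assuming it yields 2000 us,
 * freq_update_delay_ns is 2,000,000 ns, so any update arriving less than
 * 2 ms after last_freq_update_time is dropped above, unless limits_changed
 * was set, which forces an immediate update via need_freq_update.
 */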

static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
				   unsigned int next_freq)
{
	if (sg_policy->next_freq == next_freq)
		return false;

	sg_policy->next_freq = next_freq;
	sg_policy->last_freq_update_time = time;

	return true;
}

static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time,
			      unsigned int next_freq)
{
	struct cpufreq_policy *policy = sg_policy->policy;
	int cpu;

	if (!sugov_update_next_freq(sg_policy, time, next_freq))
		return;

	next_freq = cpufreq_driver_fast_switch(policy, next_freq);
	if (!next_freq)
		return;

	policy->cur = next_freq;

	if (trace_cpu_frequency_enabled()) {
		for_each_cpu(cpu, policy->cpus)
			trace_cpu_frequency(next_freq, cpu);
	}
}

static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time,
				  unsigned int next_freq)
{
	if (!sugov_update_next_freq(sg_policy, time, next_freq))
		return;

	if (!sg_policy->work_in_progress) {
		sg_policy->work_in_progress = true;
		irq_work_queue(&sg_policy->irq_work);
	}
}

/**
 * get_next_freq - Compute a new frequency for a given cpufreq policy.
 * @sg_policy: schedutil policy object to compute the new frequency for.
 * @util: Current CPU utilization.
 * @max: CPU capacity.
 *
 * If the utilization is frequency-invariant, choose the new frequency to be
 * proportional to it, that is
 *
 * next_freq = C * max_freq * util / max
 *
 * Otherwise, approximate the would-be frequency-invariant utilization by
 * util_raw * (curr_freq / max_freq) which leads to
 *
 * next_freq = C * curr_freq * util_raw / max
 *
 * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8.
 *
 * The lowest driver-supported frequency which is equal or greater than the raw
 * next_freq (as calculated above) is returned, subject to policy min/max and
 * cpufreq driver limitations.
 */
static unsigned int get_next_freq(struct sugov_policy *sg_policy,
				  unsigned long util, unsigned long max)
{
	struct cpufreq_policy *policy = sg_policy->policy;
	unsigned int freq = arch_scale_freq_invariant() ?
				policy->cpuinfo.max_freq : policy->cur;

	freq = map_util_freq(util, freq, max);

	if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)
		return sg_policy->next_freq;

	sg_policy->need_freq_update = false;
	sg_policy->cached_raw_freq = freq;
	return cpufreq_driver_resolve_freq(policy, freq);
}
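
/*
 * Worked example of the mapping above (illustrative numbers only, not taken
 * from any particular platform): on a frequency-invariant system with
 * cpuinfo.max_freq = 2000000 kHz, util = 512 and max = 1024 give
 *
 *	next_freq = 1.25 * 2000000 * 512 / 1024 = 1250000 kHz
 *
 * which cpufreq_driver_resolve_freq() then maps to the lowest supported
 * OPP at or above 1250000 kHz, subject to the policy min/max limits.
 */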

/*
 * This function computes an effective utilization for the given CPU, to be
 * used for frequency selection given the linear relation: f = u * f_max.
 *
 * The scheduler tracks the following metrics:
 *
 *   cpu_util_{cfs,rt,dl,irq}()
 *   cpu_bw_dl()
 *
 * Where the cfs,rt and dl util numbers are tracked with the same metric and
 * synchronized windows and are thus directly comparable.
 *
 * The cfs,rt,dl utilization are the running times measured with rq->clock_task
 * which excludes things like IRQ and steal-time. These latter are then accrued
 * in the irq utilization.
 *
 * The DL bandwidth number OTOH is not a measured metric but a value computed
 * based on the task model parameters and gives the minimal utilization
 * required to meet deadlines.
 */
unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
				 unsigned long max, enum schedutil_type type,
				 struct task_struct *p)
{
	unsigned long dl_util, util, irq;
	struct rq *rq = cpu_rq(cpu);

	if (!IS_BUILTIN(CONFIG_UCLAMP_TASK) &&
	    type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
		return max;
	}

	/*
	 * Early check to see if IRQ/steal time saturates the CPU; this can
	 * happen because of inaccuracies in how we track these -- see
	 * update_irq_load_avg().
	 */
	irq = cpu_util_irq(rq);
	if (unlikely(irq >= max))
		return max;

	/*
	 * Because the time spent on RT/DL tasks is visible as 'lost' time to
	 * CFS tasks and we use the same metric to track the effective
	 * utilization (PELT windows are synchronized) we can directly add them
	 * to obtain the CPU's actual utilization.
	 *
	 * CFS and RT utilization can be boosted or capped, depending on
	 * utilization clamp constraints requested by currently RUNNABLE
	 * tasks.
	 * When there are no CFS RUNNABLE tasks, clamps are released and
	 * frequency will be gracefully reduced with the utilization decay.
	 */
	util = util_cfs + cpu_util_rt(rq);
	if (type == FREQUENCY_UTIL)
		util = uclamp_util_with(rq, util, p);

	dl_util = cpu_util_dl(rq);

	/*
	 * For frequency selection we do not make cpu_util_dl() a permanent part
	 * of this sum because we want to use cpu_bw_dl() later on, but we need
	 * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such
	 * that we select f_max when there is no idle time.
	 *
	 * NOTE: numerical errors or stop class might cause us to not quite hit
	 * saturation when we should -- something for later.
	 */
	if (util + dl_util >= max)
		return max;

	/*
	 * OTOH, for energy computation we need the estimated running time, so
	 * include util_dl and ignore dl_bw.
	 */
	if (type == ENERGY_UTIL)
		util += dl_util;

	/*
	 * There is still idle time; further improve the number by using the
	 * irq metric. Because IRQ/steal time is hidden from the task clock we
	 * need to scale the task numbers:
	 *
	 *              max - irq
	 *   U' = irq + --------- * U
	 *                 max
	 */
	util = scale_irq_capacity(util, irq, max);
	util += irq;

	/*
	 * Bandwidth required by DEADLINE must always be granted while, for
	 * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
	 * to gracefully reduce the frequency when no tasks show up for longer
	 * periods of time.
	 *
	 * Ideally we would like to set bw_dl as min/guaranteed freq and util +
	 * bw_dl as requested freq. However, cpufreq is not yet ready for such
	 * an interface. So, we only do the latter for now.
	 */
	if (type == FREQUENCY_UTIL)
		util += cpu_bw_dl(rq);

	return min(max, util);
}
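
/*
 * Illustration of the IRQ scaling step above (example values only): with
 * max = 1024, irq = 256 and a CFS+RT utilization U = 512, the task-clock
 * numbers only cover the (max - irq) = 768 share of the CPU, so
 *
 *	U' = 256 + (768 * 512) / 1024 = 256 + 384 = 640
 *
 * which is what scale_irq_capacity() followed by the "util += irq" above
 * computes before the DL bandwidth is added for frequency selection.
 */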

static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
{
	struct rq *rq = cpu_rq(sg_cpu->cpu);
	unsigned long util = cpu_util_cfs(rq);
	unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu);

	sg_cpu->max = max;
	sg_cpu->bw_dl = cpu_bw_dl(rq);

	return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL);
}

/**
 * sugov_iowait_reset() - Reset the IO boost status of a CPU.
 * @sg_cpu: the sugov data for the CPU to boost
 * @time: the update time from the caller
 * @set_iowait_boost: true if an IO boost has been requested
 *
 * The IO wait boost of a task is disabled after a tick since the last update
 * of a CPU. If a new IO wait boost is requested after more than a tick, then
 * we enable the boost starting from IOWAIT_BOOST_MIN, which improves energy
 * efficiency by ignoring sporadic wakeups from IO.
 */
static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time,
			       bool set_iowait_boost)
{
	s64 delta_ns = time - sg_cpu->last_update;

	/* Reset boost only if a tick has elapsed since last request */
	if (delta_ns <= TICK_NSEC)
		return false;

	sg_cpu->iowait_boost = set_iowait_boost ? IOWAIT_BOOST_MIN : 0;
	sg_cpu->iowait_boost_pending = set_iowait_boost;

	return true;
}

/**
 * sugov_iowait_boost() - Updates the IO boost status of a CPU.
 * @sg_cpu: the sugov data for the CPU to boost
 * @time: the update time from the caller
 * @flags: SCHED_CPUFREQ_IOWAIT if the task is waking up after an IO wait
 *
 * Each time a task wakes up after an IO operation, the CPU utilization can be
 * boosted to a certain utilization which doubles at each "frequent and
 * successive" wakeup from IO, ranging from IOWAIT_BOOST_MIN to the utilization
 * of the maximum OPP.
 *
 * To keep doubling, an IO boost has to be requested at least once per tick,
 * otherwise we restart from the utilization of the minimum OPP.
 */
static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
			       unsigned int flags)
{
	bool set_iowait_boost = flags & SCHED_CPUFREQ_IOWAIT;

	/* Reset boost if the CPU appears to have been idle enough */
	if (sg_cpu->iowait_boost &&
	    sugov_iowait_reset(sg_cpu, time, set_iowait_boost))
		return;

	/* Boost only tasks waking up after IO */
	if (!set_iowait_boost)
		return;

	/* Ensure boost doubles only one time at each request */
	if (sg_cpu->iowait_boost_pending)
		return;
	sg_cpu->iowait_boost_pending = true;

	/* Double the boost at each request */
	if (sg_cpu->iowait_boost) {
		sg_cpu->iowait_boost =
			min_t(unsigned int, sg_cpu->iowait_boost << 1, SCHED_CAPACITY_SCALE);
		return;
	}

	/* First wakeup after IO: start with minimum boost */
	sg_cpu->iowait_boost = IOWAIT_BOOST_MIN;
}

/**
 * sugov_iowait_apply() - Apply the IO boost to a CPU.
 * @sg_cpu: the sugov data for the cpu to boost
 * @time: the update time from the caller
 * @util: the utilization to (eventually) boost
 * @max: the maximum value the utilization can be boosted to
 *
 * A CPU running a task which has woken up after an IO operation can have its
 * utilization boosted to speed up the completion of those IO operations.
 * The IO boost value is increased each time a task wakes up from IO, in
 * sugov_iowait_boost(), and it is instead decreased by this function,
 * each time an increase has not been requested (!iowait_boost_pending).
 *
 * A CPU which appears to have been idle for at least one tick also has its
 * IO boost utilization reset.
 *
 * This mechanism is designed to boost tasks that wait on IO frequently, while
 * being more conservative about tasks which do only sporadic IO operations.
 */
static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
					unsigned long util, unsigned long max)
{
	unsigned long boost;

	/* No boost currently required */
	if (!sg_cpu->iowait_boost)
		return util;

	/* Reset boost if the CPU appears to have been idle enough */
	if (sugov_iowait_reset(sg_cpu, time, false))
		return util;

	if (!sg_cpu->iowait_boost_pending) {
		/*
		 * No boost pending; reduce the boost value.
		 */
		sg_cpu->iowait_boost >>= 1;
		if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) {
			sg_cpu->iowait_boost = 0;
			return util;
		}
	}

	sg_cpu->iowait_boost_pending = false;

	/*
	 * @util is already in capacity scale; convert iowait_boost
	 * into the same scale so we can compare.
	 */
	boost = (sg_cpu->iowait_boost * max) >> SCHED_CAPACITY_SHIFT;
	return max(boost, util);
}
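
/*
 * Boost progression, assuming the default SCHED_CAPACITY_SCALE of 1024 (so
 * IOWAIT_BOOST_MIN = 128): successive IO wakeups arriving at least once per
 * tick walk the boost through 128 -> 256 -> 512 -> 1024, while every update
 * without a pending request halves it again until it drops below 128 and is
 * cleared. The boost is then scaled by max/1024 before being compared with
 * the regular utilization in sugov_iowait_apply() above.
 */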

#ifdef CONFIG_NO_HZ_COMMON
static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
{
	unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu);
	bool ret = idle_calls == sg_cpu->saved_idle_calls;

	sg_cpu->saved_idle_calls = idle_calls;
	return ret;
}
#else
static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
#endif /* CONFIG_NO_HZ_COMMON */

/*
 * Make sugov_should_update_freq() ignore the rate limit when DL
 * has increased the utilization.
 */
static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy)
{
	if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl)
		sg_policy->limits_changed = true;
}

static void sugov_update_single(struct update_util_data *hook, u64 time,
				unsigned int flags)
{
	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
	unsigned long util, max;
	unsigned int next_f;
	bool busy;

	sugov_iowait_boost(sg_cpu, time, flags);
	sg_cpu->last_update = time;

	ignore_dl_rate_limit(sg_cpu, sg_policy);

	if (!sugov_should_update_freq(sg_policy, time))
		return;

	/* Limits may have changed, don't skip frequency update */
	busy = !sg_policy->need_freq_update && sugov_cpu_is_busy(sg_cpu);

	util = sugov_get_util(sg_cpu);
	max = sg_cpu->max;
	util = sugov_iowait_apply(sg_cpu, time, util, max);
	next_f = get_next_freq(sg_policy, util, max);
	/*
	 * Do not reduce the frequency if the CPU has not been idle
	 * recently, as the reduction is likely to be premature then.
	 */
	if (busy && next_f < sg_policy->next_freq) {
		next_f = sg_policy->next_freq;

		/* Reset cached freq as next_freq has changed */
		sg_policy->cached_raw_freq = 0;
	}

	/*
	 * This code runs under rq->lock for the target CPU, so it won't run
	 * concurrently on two different CPUs for the same target and it is not
	 * necessary to acquire the lock in the fast switch case.
	 */
	if (sg_policy->policy->fast_switch_enabled) {
		sugov_fast_switch(sg_policy, time, next_f);
	} else {
		raw_spin_lock(&sg_policy->update_lock);
		sugov_deferred_update(sg_policy, time, next_f);
		raw_spin_unlock(&sg_policy->update_lock);
	}
}

static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
{
	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
	struct cpufreq_policy *policy = sg_policy->policy;
	unsigned long util = 0, max = 1;
	unsigned int j;

	for_each_cpu(j, policy->cpus) {
		struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
		unsigned long j_util, j_max;

		j_util = sugov_get_util(j_sg_cpu);
		j_max = j_sg_cpu->max;
		j_util = sugov_iowait_apply(j_sg_cpu, time, j_util, j_max);

		if (j_util * max > j_max * util) {
			util = j_util;
			max = j_max;
		}
	}

	return get_next_freq(sg_policy, util, max);
}
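
/*
 * The loop above keeps the per-CPU (util, max) pair with the largest
 * util/max ratio; comparing j_util * max > j_max * util is that ratio
 * comparison rewritten to avoid a division. For example (illustrative
 * values only), util = 300 on a max = 512 CPU (about 59%) wins over
 * util = 400 on a max = 1024 CPU (about 39%), so the shared frequency
 * follows the relatively busiest CPU in the policy.
 */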

static void
sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
{
	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
	unsigned int next_f;

	raw_spin_lock(&sg_policy->update_lock);

	sugov_iowait_boost(sg_cpu, time, flags);
	sg_cpu->last_update = time;

	ignore_dl_rate_limit(sg_cpu, sg_policy);

	if (sugov_should_update_freq(sg_policy, time)) {
		next_f = sugov_next_freq_shared(sg_cpu, time);

		if (sg_policy->policy->fast_switch_enabled)
			sugov_fast_switch(sg_policy, time, next_f);
		else
			sugov_deferred_update(sg_policy, time, next_f);
	}

	raw_spin_unlock(&sg_policy->update_lock);
}

static void sugov_work(struct kthread_work *work)
{
	struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);
	unsigned int freq;
	unsigned long flags;

	/*
	 * Hold sg_policy->update_lock briefly to handle the case where:
	 * if sg_policy->next_freq is read here and then updated by
	 * sugov_deferred_update() just before work_in_progress is set to false
	 * here, we may miss queueing the new update.
	 *
	 * Note: If a work was queued after the update_lock is released,
	 * sugov_work() will just be called again by kthread_work code; and the
	 * request will be processed before the sugov thread sleeps.
	 */
	raw_spin_lock_irqsave(&sg_policy->update_lock, flags);
	freq = sg_policy->next_freq;
	sg_policy->work_in_progress = false;
	raw_spin_unlock_irqrestore(&sg_policy->update_lock, flags);

	mutex_lock(&sg_policy->work_lock);
	__cpufreq_driver_target(sg_policy->policy, freq, CPUFREQ_RELATION_L);
	mutex_unlock(&sg_policy->work_lock);
}

static void sugov_irq_work(struct irq_work *irq_work)
{
	struct sugov_policy *sg_policy;

	sg_policy = container_of(irq_work, struct sugov_policy, irq_work);

	kthread_queue_work(&sg_policy->worker, &sg_policy->work);
}

/************************** sysfs interface ************************/

static struct sugov_tunables *global_tunables;
static DEFINE_MUTEX(global_tunables_lock);

static inline struct sugov_tunables *to_sugov_tunables(struct gov_attr_set *attr_set)
{
	return container_of(attr_set, struct sugov_tunables, attr_set);
}

static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
{
	struct sugov_tunables *tunables = to_sugov_tunables(attr_set);

	return sprintf(buf, "%u\n", tunables->rate_limit_us);
}

static ssize_t
rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count)
{
	struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
	struct sugov_policy *sg_policy;
	unsigned int rate_limit_us;

	if (kstrtouint(buf, 10, &rate_limit_us))
		return -EINVAL;

	tunables->rate_limit_us = rate_limit_us;

	list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook)
		sg_policy->freq_update_delay_ns = rate_limit_us * NSEC_PER_USEC;

	return count;
}

static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us);

static struct attribute *sugov_attrs[] = {
	&rate_limit_us.attr,
	NULL
};
ATTRIBUTE_GROUPS(sugov);

static struct kobj_type sugov_tunables_ktype = {
	.default_groups = sugov_groups,
	.sysfs_ops = &governor_sysfs_ops,
};
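
/*
 * Usage sketch for the tunable above (paths follow the usual cpufreq sysfs
 * layout and may vary by configuration): with per-policy tunables the
 * attribute typically appears as
 *
 *	/sys/devices/system/cpu/cpufreq/policy0/schedutil/rate_limit_us
 *
 * and otherwise once globally under .../cpu/cpufreq/schedutil/. Writing e.g.
 * "5000" to it updates freq_update_delay_ns to 5,000,000 ns for every policy
 * attached to that tunables set, via the list walk in rate_limit_us_store().
 */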

/********************** cpufreq governor interface *********************/

struct cpufreq_governor schedutil_gov;

static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy;

	sg_policy = kzalloc(sizeof(*sg_policy), GFP_KERNEL);
	if (!sg_policy)
		return NULL;

	sg_policy->policy = policy;
	raw_spin_lock_init(&sg_policy->update_lock);
	return sg_policy;
}

static void sugov_policy_free(struct sugov_policy *sg_policy)
{
	kfree(sg_policy);
}

static int sugov_kthread_create(struct sugov_policy *sg_policy)
{
	struct task_struct *thread;
	struct sched_attr attr = {
		.size		= sizeof(struct sched_attr),
		.sched_policy	= SCHED_DEADLINE,
		.sched_flags	= SCHED_FLAG_SUGOV,
		.sched_nice	= 0,
		.sched_priority	= 0,
		/*
		 * Fake (unused) bandwidth; workaround to "fix"
		 * priority inheritance.
		 */
		.sched_runtime	= 1000000,
		.sched_deadline	= 10000000,
		.sched_period	= 10000000,
	};
	struct cpufreq_policy *policy = sg_policy->policy;
	int ret;

	/* kthread only required for slow path */
	if (policy->fast_switch_enabled)
		return 0;

	kthread_init_work(&sg_policy->work, sugov_work);
	kthread_init_worker(&sg_policy->worker);
	thread = kthread_create(kthread_worker_fn, &sg_policy->worker,
				"sugov:%d",
				cpumask_first(policy->related_cpus));
	if (IS_ERR(thread)) {
		pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread));
		return PTR_ERR(thread);
	}

	ret = sched_setattr_nocheck(thread, &attr);
	if (ret) {
		kthread_stop(thread);
		pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__);
		return ret;
	}

	sg_policy->thread = thread;
	kthread_bind_mask(thread, policy->related_cpus);
	init_irq_work(&sg_policy->irq_work, sugov_irq_work);
	mutex_init(&sg_policy->work_lock);

	wake_up_process(thread);

	return 0;
}
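
/*
 * The sched_attr above nominally reserves 1 ms of runtime every 10 ms period
 * (a 10% bandwidth), but as the comment in it notes the numbers are fake:
 * SCHED_FLAG_SUGOV marks the kthread as a special DEADLINE task used by the
 * governor itself, and running it under SCHED_DEADLINE is what lets frequency
 * changes get CPU time ahead of the RT and CFS tasks that requested them.
 */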

static void sugov_kthread_stop(struct sugov_policy *sg_policy)
{
	/* kthread only required for slow path */
	if (sg_policy->policy->fast_switch_enabled)
		return;

	kthread_flush_worker(&sg_policy->worker);
	kthread_stop(sg_policy->thread);
	mutex_destroy(&sg_policy->work_lock);
}

static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy)
{
	struct sugov_tunables *tunables;

	tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);
	if (tunables) {
		gov_attr_set_init(&tunables->attr_set, &sg_policy->tunables_hook);
		if (!have_governor_per_policy())
			global_tunables = tunables;
	}
	return tunables;
}

static void sugov_tunables_free(struct sugov_tunables *tunables)
{
	if (!have_governor_per_policy())
		global_tunables = NULL;

	kfree(tunables);
}

static int sugov_init(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy;
	struct sugov_tunables *tunables;
	int ret = 0;

	/* State should be equivalent to EXIT */
	if (policy->governor_data)
		return -EBUSY;

	cpufreq_enable_fast_switch(policy);

	sg_policy = sugov_policy_alloc(policy);
	if (!sg_policy) {
		ret = -ENOMEM;
		goto disable_fast_switch;
	}

	ret = sugov_kthread_create(sg_policy);
	if (ret)
		goto free_sg_policy;

	mutex_lock(&global_tunables_lock);

	if (global_tunables) {
		if (WARN_ON(have_governor_per_policy())) {
			ret = -EINVAL;
			goto stop_kthread;
		}
		policy->governor_data = sg_policy;
		sg_policy->tunables = global_tunables;

		gov_attr_set_get(&global_tunables->attr_set, &sg_policy->tunables_hook);
		goto out;
	}

	tunables = sugov_tunables_alloc(sg_policy);
	if (!tunables) {
		ret = -ENOMEM;
		goto stop_kthread;
	}

	tunables->rate_limit_us = cpufreq_policy_transition_delay_us(policy);

	policy->governor_data = sg_policy;
	sg_policy->tunables = tunables;

	ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype,
				   get_governor_parent_kobj(policy), "%s",
				   schedutil_gov.name);
	if (ret)
		goto fail;

out:
	mutex_unlock(&global_tunables_lock);
	return 0;

fail:
	kobject_put(&tunables->attr_set.kobj);
	policy->governor_data = NULL;
	sugov_tunables_free(tunables);

stop_kthread:
	sugov_kthread_stop(sg_policy);
	mutex_unlock(&global_tunables_lock);

free_sg_policy:
	sugov_policy_free(sg_policy);

disable_fast_switch:
	cpufreq_disable_fast_switch(policy);

	pr_err("initialization failed (error %d)\n", ret);
	return ret;
}

static void sugov_exit(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;
	struct sugov_tunables *tunables = sg_policy->tunables;
	unsigned int count;

	mutex_lock(&global_tunables_lock);

	count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
	policy->governor_data = NULL;
	if (!count)
		sugov_tunables_free(tunables);

	mutex_unlock(&global_tunables_lock);

	sugov_kthread_stop(sg_policy);
	sugov_policy_free(sg_policy);
	cpufreq_disable_fast_switch(policy);
}

static int sugov_start(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;
	unsigned int cpu;

	sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
	sg_policy->last_freq_update_time = 0;
	sg_policy->next_freq = 0;
	sg_policy->work_in_progress = false;
	sg_policy->limits_changed = false;
	sg_policy->need_freq_update = false;
	sg_policy->cached_raw_freq = 0;

	for_each_cpu(cpu, policy->cpus) {
		struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);

		memset(sg_cpu, 0, sizeof(*sg_cpu));
		sg_cpu->cpu = cpu;
		sg_cpu->sg_policy = sg_policy;
	}

	for_each_cpu(cpu, policy->cpus) {
		struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);

		cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
					     policy_is_shared(policy) ?
							sugov_update_shared :
							sugov_update_single);
	}
	return 0;
}

static void sugov_stop(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;
	unsigned int cpu;

	for_each_cpu(cpu, policy->cpus)
		cpufreq_remove_update_util_hook(cpu);

	synchronize_rcu();

	if (!policy->fast_switch_enabled) {
		irq_work_sync(&sg_policy->irq_work);
		kthread_cancel_work_sync(&sg_policy->work);
	}
}

static void sugov_limits(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;

	if (!policy->fast_switch_enabled) {
		mutex_lock(&sg_policy->work_lock);
		cpufreq_policy_apply_limits(policy);
		mutex_unlock(&sg_policy->work_lock);
	}

	sg_policy->limits_changed = true;
}

struct cpufreq_governor schedutil_gov = {
	.name			= "schedutil",
	.owner			= THIS_MODULE,
	.dynamic_switching	= true,
	.init			= sugov_init,
	.exit			= sugov_exit,
	.start			= sugov_start,
	.stop			= sugov_stop,
	.limits			= sugov_limits,
};

#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
struct cpufreq_governor *cpufreq_default_governor(void)
{
	return &schedutil_gov;
}
#endif

static int __init sugov_register(void)
{
	return cpufreq_register_governor(&schedutil_gov);
}
fs_initcall(sugov_register);

#ifdef CONFIG_ENERGY_MODEL
extern bool sched_energy_update;
extern struct mutex sched_energy_mutex;

static void rebuild_sd_workfn(struct work_struct *work)
{
	mutex_lock(&sched_energy_mutex);
	sched_energy_update = true;
	rebuild_sched_domains();
	sched_energy_update = false;
	mutex_unlock(&sched_energy_mutex);
}
static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn);

/*
 * EAS shouldn't be attempted without sugov, so rebuild the sched_domains
 * on governor changes to make sure the scheduler knows about it.
 */
void sched_cpufreq_governor_change(struct cpufreq_policy *policy,
				   struct cpufreq_governor *old_gov)
{
	if (old_gov == &schedutil_gov || policy->governor == &schedutil_gov) {
		/*
		 * When called from the cpufreq_register_driver() path, the
		 * cpu_hotplug_lock is already held, so use a work item to
		 * avoid nested locking in rebuild_sched_domains().
		 */
		schedule_work(&rebuild_sd_work);
	}
}
#endif