// SPDX-License-Identifier: GPL-2.0
/*
 * CPUFreq governor based on scheduler-provided CPU utilization data.
 *
 * Copyright (C) 2016, Intel Corporation
 * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include "sched.h"

#include <linux/sched/cpufreq.h>
#include <trace/events/power.h>

#define IOWAIT_BOOST_MIN	(SCHED_CAPACITY_SCALE / 8)

struct sugov_tunables {
	struct gov_attr_set attr_set;
	unsigned int rate_limit_us;
};

struct sugov_policy {
	struct cpufreq_policy *policy;

	struct sugov_tunables *tunables;
	struct list_head tunables_hook;

	raw_spinlock_t update_lock;	/* For shared policies */
	u64 last_freq_update_time;
	s64 freq_update_delay_ns;
	unsigned int next_freq;
	unsigned int cached_raw_freq;

	/* The next fields are only needed if fast switch cannot be used: */
	struct irq_work irq_work;
	struct kthread_work work;
	struct mutex work_lock;
	struct kthread_worker worker;
	struct task_struct *thread;
	bool work_in_progress;

	bool limits_changed;
	bool need_freq_update;
};

struct sugov_cpu {
	struct update_util_data update_util;
	struct sugov_policy *sg_policy;
	unsigned int cpu;

	bool iowait_boost_pending;
	unsigned int iowait_boost;
	u64 last_update;

	unsigned long bw_dl;
	unsigned long max;

	/* The field below is for single-CPU policies only: */
#ifdef CONFIG_NO_HZ_COMMON
	unsigned long saved_idle_calls;
#endif
};

static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);

/************************ Governor internals ***********************/

static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
{
	s64 delta_ns;

	/*
	 * Since cpufreq_update_util() is called with rq->lock held for
	 * the @target_cpu, our per-CPU data is fully serialized.
	 *
	 * However, drivers cannot in general deal with cross-CPU
	 * requests, so while get_next_freq() will work, actually committing
	 * the new frequency may not work on the fast switching platforms.
	 *
	 * Hence stop here for remote requests if they aren't supported
	 * by the hardware, as calculating the frequency is pointless if
	 * we cannot in fact act on it.
	 *
	 * This is needed on the slow switching platforms too to prevent CPUs
	 * going offline from leaving stale IRQ work items behind.
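	 *
	 * As an illustration (hypothetical numbers): with rate_limit_us set
	 * to 2000, freq_update_delay_ns below is 2,000,000 ns, so an update
	 * arriving less than 2 ms after the last committed one is ignored
	 * unless limits_changed forces an immediate re-evaluation.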
	 */
	if (!cpufreq_this_cpu_can_update(sg_policy->policy))
		return false;

	if (unlikely(sg_policy->limits_changed)) {
		sg_policy->limits_changed = false;
		sg_policy->need_freq_update = true;
		return true;
	}

	delta_ns = time - sg_policy->last_freq_update_time;

	return delta_ns >= sg_policy->freq_update_delay_ns;
}

static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
				   unsigned int next_freq)
{
	if (sg_policy->next_freq == next_freq &&
	    !cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS))
		return false;

	sg_policy->next_freq = next_freq;
	sg_policy->last_freq_update_time = time;

	return true;
}

static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time,
			      unsigned int next_freq)
{
	if (sugov_update_next_freq(sg_policy, time, next_freq))
		cpufreq_driver_fast_switch(sg_policy->policy, next_freq);
}

static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time,
				  unsigned int next_freq)
{
	if (!sugov_update_next_freq(sg_policy, time, next_freq))
		return;

	if (!sg_policy->work_in_progress) {
		sg_policy->work_in_progress = true;
		irq_work_queue(&sg_policy->irq_work);
	}
}

/**
 * get_next_freq - Compute a new frequency for a given cpufreq policy.
 * @sg_policy: schedutil policy object to compute the new frequency for.
 * @util: Current CPU utilization.
 * @max: CPU capacity.
 *
 * If the utilization is frequency-invariant, choose the new frequency to be
 * proportional to it, that is
 *
 *   next_freq = C * max_freq * util / max
 *
 * Otherwise, approximate the would-be frequency-invariant utilization by
 * util_raw * (curr_freq / max_freq) which leads to
 *
 *   next_freq = C * curr_freq * util_raw / max
 *
 * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8.
 *
 * The lowest driver-supported frequency which is equal to or greater than
 * the raw next_freq (as calculated above) is returned, subject to policy
 * min/max and cpufreq driver limitations.
 */
static unsigned int get_next_freq(struct sugov_policy *sg_policy,
				  unsigned long util, unsigned long max)
{
	struct cpufreq_policy *policy = sg_policy->policy;
	unsigned int freq = arch_scale_freq_invariant() ?
				policy->cpuinfo.max_freq : policy->cur;

	freq = map_util_freq(util, freq, max);

	if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update &&
	    !cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS))
		return sg_policy->next_freq;

	sg_policy->need_freq_update = false;
	sg_policy->cached_raw_freq = freq;
	return cpufreq_driver_resolve_freq(policy, freq);
}

/*
 * This function computes an effective utilization for the given CPU, to be
 * used for frequency selection given the linear relation: f = u * f_max.
 *
 * The scheduler tracks the following metrics:
 *
 *   cpu_util_{cfs,rt,dl,irq}()
 *   cpu_bw_dl()
 *
 * Where the cfs,rt and dl util numbers are tracked with the same metric and
 * synchronized windows and are thus directly comparable.
 *
 * The cfs,rt,dl utilization are the running times measured with rq->clock_task
 * which excludes things like IRQ and steal-time. These latter are then accrued
 * in the irq utilization.
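 *
 * As an illustration (hypothetical numbers): on a CPU at full capacity
 * (SCHED_CAPACITY_SCALE = 1024) that spends about a quarter of its time in
 * IRQ context, cpu_util_irq() approaches roughly 256, while the cfs/rt/dl
 * numbers only measure the remaining three quarters of the clock.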
 *
 * The DL bandwidth number otoh is not a measured metric but a value computed
 * based on the task model parameters and gives the minimal utilization
 * required to meet deadlines.
 */
unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
				 unsigned long max, enum schedutil_type type,
				 struct task_struct *p)
{
	unsigned long dl_util, util, irq;
	struct rq *rq = cpu_rq(cpu);

	if (!uclamp_is_used() &&
	    type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
		return max;
	}

	/*
	 * Early check to see if IRQ/steal time saturates the CPU; this can
	 * happen because of inaccuracies in how we track these -- see
	 * update_irq_load_avg().
	 */
	irq = cpu_util_irq(rq);
	if (unlikely(irq >= max))
		return max;

	/*
	 * Because the time spent on RT/DL tasks is visible as 'lost' time to
	 * CFS tasks and we use the same metric to track the effective
	 * utilization (PELT windows are synchronized) we can directly add them
	 * to obtain the CPU's actual utilization.
	 *
	 * CFS and RT utilization can be boosted or capped, depending on
	 * utilization clamp constraints requested by currently RUNNABLE
	 * tasks.
	 * When there are no CFS RUNNABLE tasks, clamps are released and
	 * frequency will be gracefully reduced with the utilization decay.
	 */
	util = util_cfs + cpu_util_rt(rq);
	if (type == FREQUENCY_UTIL)
		util = uclamp_rq_util_with(rq, util, p);

	dl_util = cpu_util_dl(rq);

	/*
	 * For frequency selection we do not make cpu_util_dl() a permanent part
	 * of this sum because we want to use cpu_bw_dl() later on, but we need
	 * to check if the CFS+RT+DL sum is saturated (i.e. no idle time) such
	 * that we select f_max when there is no idle time.
	 *
	 * NOTE: numerical errors or stop class might cause us to not quite hit
	 * saturation when we should -- something for later.
	 */
	if (util + dl_util >= max)
		return max;

	/*
	 * OTOH, for energy computation we need the estimated running time, so
	 * include util_dl and ignore dl_bw.
	 */
	if (type == ENERGY_UTIL)
		util += dl_util;

	/*
	 * There is still idle time; further improve the number by using the
	 * irq metric. Because IRQ/steal time is hidden from the task clock we
	 * need to scale the task numbers:
	 *
	 *              max - irq
	 *   U' = irq + --------- * U
	 *                 max
	 */
	util = scale_irq_capacity(util, irq, max);
	util += irq;

	/*
	 * Bandwidth required by DEADLINE must always be granted while, for
	 * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
	 * to gracefully reduce the frequency when no tasks show up for longer
	 * periods of time.
	 *
	 * Ideally we would like to set bw_dl as min/guaranteed freq and util +
	 * bw_dl as requested freq. However, cpufreq is not yet ready for such
	 * an interface. So, we only do the latter for now.
	 */
	if (type == FREQUENCY_UTIL)
		util += cpu_bw_dl(rq);

	return min(max, util);
}

static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
{
	struct rq *rq = cpu_rq(sg_cpu->cpu);
	unsigned long util = cpu_util_cfs(rq);
	unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu);

	sg_cpu->max = max;
	sg_cpu->bw_dl = cpu_bw_dl(rq);

	return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL);
}

/**
 * sugov_iowait_reset() - Reset the IO boost status of a CPU.
 * @sg_cpu: the sugov data for the CPU to boost
 * @time: the update time from the caller
 * @set_iowait_boost: true if an IO boost has been requested
 *
 * The IO wait boost of a task is disabled after a tick since the last update
 * of a CPU. If a new IO wait boost is requested after more than a tick, then
 * we enable the boost starting from IOWAIT_BOOST_MIN, which improves energy
 * efficiency by ignoring sporadic wakeups from IO.
 */
static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time,
			       bool set_iowait_boost)
{
	s64 delta_ns = time - sg_cpu->last_update;

	/* Reset boost only if a tick has elapsed since last request */
	if (delta_ns <= TICK_NSEC)
		return false;

	sg_cpu->iowait_boost = set_iowait_boost ? IOWAIT_BOOST_MIN : 0;
	sg_cpu->iowait_boost_pending = set_iowait_boost;

	return true;
}

/**
 * sugov_iowait_boost() - Updates the IO boost status of a CPU.
 * @sg_cpu: the sugov data for the CPU to boost
 * @time: the update time from the caller
 * @flags: SCHED_CPUFREQ_IOWAIT if the task is waking up after an IO wait
 *
 * Each time a task wakes up after an IO operation, the CPU utilization can be
 * boosted to a certain utilization which doubles at each "frequent and
 * successive" wakeup from IO, ranging from IOWAIT_BOOST_MIN to the utilization
 * of the maximum OPP.
 *
 * To keep doubling, an IO boost has to be requested at least once per tick,
 * otherwise we restart from the utilization of the minimum OPP.
 */
static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
			       unsigned int flags)
{
	bool set_iowait_boost = flags & SCHED_CPUFREQ_IOWAIT;

	/* Reset boost if the CPU appears to have been idle enough */
	if (sg_cpu->iowait_boost &&
	    sugov_iowait_reset(sg_cpu, time, set_iowait_boost))
		return;

	/* Boost only tasks waking up after IO */
	if (!set_iowait_boost)
		return;

	/* Ensure boost doubles only one time at each request */
	if (sg_cpu->iowait_boost_pending)
		return;
	sg_cpu->iowait_boost_pending = true;

	/* Double the boost at each request */
	if (sg_cpu->iowait_boost) {
		sg_cpu->iowait_boost =
			min_t(unsigned int, sg_cpu->iowait_boost << 1, SCHED_CAPACITY_SCALE);
		return;
	}

	/* First wakeup after IO: start with minimum boost */
	sg_cpu->iowait_boost = IOWAIT_BOOST_MIN;
}

/**
 * sugov_iowait_apply() - Apply the IO boost to a CPU.
 * @sg_cpu: the sugov data for the cpu to boost
 * @time: the update time from the caller
 * @util: the utilization to (eventually) boost
 * @max: the maximum value the utilization can be boosted to
 *
 * A CPU running a task which has woken up after an IO operation can have its
 * utilization boosted to speed up the completion of those IO operations.
 * The IO boost value is increased each time a task wakes up from IO, in
 * sugov_iowait_boost(), and it is decreased by this function each time an
 * increase has not been requested (!iowait_boost_pending).
 *
 * A CPU which appears to have been idle for at least one tick also has its
 * IO boost utilization reset.
 *
 * This mechanism is designed to boost tasks which wait on IO frequently,
 * while being more conservative about tasks which do only sporadic IO
 * operations.
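 *
 * As an illustration (hypothetical numbers): successive IO wakeups double the
 * boost through 128, 256, 512, up to SCHED_CAPACITY_SCALE (1024); with a
 * boost of 512 on a CPU of capacity @max = 800, the boosted value is
 * (512 * 800) >> SCHED_CAPACITY_SHIFT = 400, and the larger of that and
 * @util is returned.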
 */
static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
					unsigned long util, unsigned long max)
{
	unsigned long boost;

	/* No boost currently required */
	if (!sg_cpu->iowait_boost)
		return util;

	/* Reset boost if the CPU appears to have been idle enough */
	if (sugov_iowait_reset(sg_cpu, time, false))
		return util;

	if (!sg_cpu->iowait_boost_pending) {
		/*
		 * No boost pending; reduce the boost value.
		 */
		sg_cpu->iowait_boost >>= 1;
		if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) {
			sg_cpu->iowait_boost = 0;
			return util;
		}
	}

	sg_cpu->iowait_boost_pending = false;

	/*
	 * @util is already in capacity scale; convert iowait_boost
	 * into the same scale so we can compare.
	 */
	boost = (sg_cpu->iowait_boost * max) >> SCHED_CAPACITY_SHIFT;
	return max(boost, util);
}

#ifdef CONFIG_NO_HZ_COMMON
static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
{
	unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu);
	bool ret = idle_calls == sg_cpu->saved_idle_calls;

	sg_cpu->saved_idle_calls = idle_calls;
	return ret;
}
#else
static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
#endif /* CONFIG_NO_HZ_COMMON */

/*
 * Make sugov_should_update_freq() ignore the rate limit when DL
 * has increased the utilization.
 */
static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy)
{
	if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl)
		sg_policy->limits_changed = true;
}

static void sugov_update_single(struct update_util_data *hook, u64 time,
				unsigned int flags)
{
	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
	unsigned long util, max;
	unsigned int next_f;
	bool busy;
	unsigned int cached_freq = sg_policy->cached_raw_freq;

	sugov_iowait_boost(sg_cpu, time, flags);
	sg_cpu->last_update = time;

	ignore_dl_rate_limit(sg_cpu, sg_policy);

	if (!sugov_should_update_freq(sg_policy, time))
		return;

	/* Limits may have changed, don't skip frequency update */
	busy = !sg_policy->need_freq_update && sugov_cpu_is_busy(sg_cpu);

	util = sugov_get_util(sg_cpu);
	max = sg_cpu->max;
	util = sugov_iowait_apply(sg_cpu, time, util, max);
	next_f = get_next_freq(sg_policy, util, max);
	/*
	 * Do not reduce the frequency if the CPU has not been idle
	 * recently, as the reduction is likely to be premature then.
	 */
	if (busy && next_f < sg_policy->next_freq) {
		next_f = sg_policy->next_freq;

		/* Restore cached freq as next_freq has changed */
		sg_policy->cached_raw_freq = cached_freq;
	}

	/*
	 * This code runs under rq->lock for the target CPU, so it won't run
	 * concurrently on two different CPUs for the same target and it is not
	 * necessary to acquire the lock in the fast switch case.
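	 *
	 * The slow (deferred) path below still takes update_lock, because
	 * sugov_work() reads next_freq and clears work_in_progress from the
	 * kthread context under the same lock.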
	 */
	if (sg_policy->policy->fast_switch_enabled) {
		sugov_fast_switch(sg_policy, time, next_f);
	} else {
		raw_spin_lock(&sg_policy->update_lock);
		sugov_deferred_update(sg_policy, time, next_f);
		raw_spin_unlock(&sg_policy->update_lock);
	}
}

static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
{
	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
	struct cpufreq_policy *policy = sg_policy->policy;
	unsigned long util = 0, max = 1;
	unsigned int j;

	for_each_cpu(j, policy->cpus) {
		struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
		unsigned long j_util, j_max;

		j_util = sugov_get_util(j_sg_cpu);
		j_max = j_sg_cpu->max;
		j_util = sugov_iowait_apply(j_sg_cpu, time, j_util, j_max);

		if (j_util * max > j_max * util) {
			util = j_util;
			max = j_max;
		}
	}

	return get_next_freq(sg_policy, util, max);
}

static void
sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
{
	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
	unsigned int next_f;

	raw_spin_lock(&sg_policy->update_lock);

	sugov_iowait_boost(sg_cpu, time, flags);
	sg_cpu->last_update = time;

	ignore_dl_rate_limit(sg_cpu, sg_policy);

	if (sugov_should_update_freq(sg_policy, time)) {
		next_f = sugov_next_freq_shared(sg_cpu, time);

		if (sg_policy->policy->fast_switch_enabled)
			sugov_fast_switch(sg_policy, time, next_f);
		else
			sugov_deferred_update(sg_policy, time, next_f);
	}

	raw_spin_unlock(&sg_policy->update_lock);
}

static void sugov_work(struct kthread_work *work)
{
	struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);
	unsigned int freq;
	unsigned long flags;

	/*
	 * Hold sg_policy->update_lock briefly to handle the case where, if
	 * sg_policy->next_freq were read here and then updated by
	 * sugov_deferred_update() just before work_in_progress is set to
	 * false here, we might miss queueing the new update.
	 *
	 * Note: If a work item was queued after the update_lock is released,
	 * sugov_work() will just be called again by the kthread_work code;
	 * the request will be processed before the sugov thread sleeps.
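	 *
	 * In other words: because work_in_progress is cleared under the lock,
	 * a concurrent sugov_deferred_update() either still sees it set, in
	 * which case the next_freq it wrote is read right below, or sees it
	 * cleared and queues fresh work for its update.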
	 */
	raw_spin_lock_irqsave(&sg_policy->update_lock, flags);
	freq = sg_policy->next_freq;
	sg_policy->work_in_progress = false;
	raw_spin_unlock_irqrestore(&sg_policy->update_lock, flags);

	mutex_lock(&sg_policy->work_lock);
	__cpufreq_driver_target(sg_policy->policy, freq, CPUFREQ_RELATION_L);
	mutex_unlock(&sg_policy->work_lock);
}

static void sugov_irq_work(struct irq_work *irq_work)
{
	struct sugov_policy *sg_policy;

	sg_policy = container_of(irq_work, struct sugov_policy, irq_work);

	kthread_queue_work(&sg_policy->worker, &sg_policy->work);
}

/************************** sysfs interface ************************/

static struct sugov_tunables *global_tunables;
static DEFINE_MUTEX(global_tunables_lock);

static inline struct sugov_tunables *to_sugov_tunables(struct gov_attr_set *attr_set)
{
	return container_of(attr_set, struct sugov_tunables, attr_set);
}

static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
{
	struct sugov_tunables *tunables = to_sugov_tunables(attr_set);

	return sprintf(buf, "%u\n", tunables->rate_limit_us);
}

static ssize_t
rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count)
{
	struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
	struct sugov_policy *sg_policy;
	unsigned int rate_limit_us;

	if (kstrtouint(buf, 10, &rate_limit_us))
		return -EINVAL;

	tunables->rate_limit_us = rate_limit_us;

	list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook)
		sg_policy->freq_update_delay_ns = rate_limit_us * NSEC_PER_USEC;

	return count;
}

static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us);

static struct attribute *sugov_attrs[] = {
	&rate_limit_us.attr,
	NULL
};
ATTRIBUTE_GROUPS(sugov);

static struct kobj_type sugov_tunables_ktype = {
	.default_groups = sugov_groups,
	.sysfs_ops = &governor_sysfs_ops,
};

/********************** cpufreq governor interface *********************/

struct cpufreq_governor schedutil_gov;

static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy;

	sg_policy = kzalloc(sizeof(*sg_policy), GFP_KERNEL);
	if (!sg_policy)
		return NULL;

	sg_policy->policy = policy;
	raw_spin_lock_init(&sg_policy->update_lock);
	return sg_policy;
}

static void sugov_policy_free(struct sugov_policy *sg_policy)
{
	kfree(sg_policy);
}

static int sugov_kthread_create(struct sugov_policy *sg_policy)
{
	struct task_struct *thread;
	struct sched_attr attr = {
		.size = sizeof(struct sched_attr),
		.sched_policy = SCHED_DEADLINE,
		.sched_flags = SCHED_FLAG_SUGOV,
		.sched_nice = 0,
		.sched_priority = 0,
		/*
		 * Fake (unused) bandwidth; workaround to "fix"
		 * priority inheritance.
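		 *
		 * The values below amount to a nominal 1 ms runtime every
		 * 10 ms period; with SCHED_FLAG_SUGOV this kthread is treated
		 * as a special DL task, so the bandwidth is neither accounted
		 * nor enforced.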
		 */
		.sched_runtime = 1000000,
		.sched_deadline = 10000000,
		.sched_period = 10000000,
	};
	struct cpufreq_policy *policy = sg_policy->policy;
	int ret;

	/* kthread only required for slow path */
	if (policy->fast_switch_enabled)
		return 0;

	kthread_init_work(&sg_policy->work, sugov_work);
	kthread_init_worker(&sg_policy->worker);
	thread = kthread_create(kthread_worker_fn, &sg_policy->worker,
				"sugov:%d",
				cpumask_first(policy->related_cpus));
	if (IS_ERR(thread)) {
		pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread));
		return PTR_ERR(thread);
	}

	ret = sched_setattr_nocheck(thread, &attr);
	if (ret) {
		kthread_stop(thread);
		pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__);
		return ret;
	}

	sg_policy->thread = thread;
	kthread_bind_mask(thread, policy->related_cpus);
	init_irq_work(&sg_policy->irq_work, sugov_irq_work);
	mutex_init(&sg_policy->work_lock);

	wake_up_process(thread);

	return 0;
}

static void sugov_kthread_stop(struct sugov_policy *sg_policy)
{
	/* kthread only required for slow path */
	if (sg_policy->policy->fast_switch_enabled)
		return;

	kthread_flush_worker(&sg_policy->worker);
	kthread_stop(sg_policy->thread);
	mutex_destroy(&sg_policy->work_lock);
}

static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy)
{
	struct sugov_tunables *tunables;

	tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);
	if (tunables) {
		gov_attr_set_init(&tunables->attr_set, &sg_policy->tunables_hook);
		if (!have_governor_per_policy())
			global_tunables = tunables;
	}
	return tunables;
}

static void sugov_tunables_free(struct sugov_tunables *tunables)
{
	if (!have_governor_per_policy())
		global_tunables = NULL;

	kfree(tunables);
}

static int sugov_init(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy;
	struct sugov_tunables *tunables;
	int ret = 0;

	/* State should be equivalent to EXIT */
	if (policy->governor_data)
		return -EBUSY;

	cpufreq_enable_fast_switch(policy);

	sg_policy = sugov_policy_alloc(policy);
	if (!sg_policy) {
		ret = -ENOMEM;
		goto disable_fast_switch;
	}

	ret = sugov_kthread_create(sg_policy);
	if (ret)
		goto free_sg_policy;

	mutex_lock(&global_tunables_lock);

	if (global_tunables) {
		if (WARN_ON(have_governor_per_policy())) {
			ret = -EINVAL;
			goto stop_kthread;
		}
		policy->governor_data = sg_policy;
		sg_policy->tunables = global_tunables;

		gov_attr_set_get(&global_tunables->attr_set, &sg_policy->tunables_hook);
		goto out;
	}

	tunables = sugov_tunables_alloc(sg_policy);
	if (!tunables) {
		ret = -ENOMEM;
		goto stop_kthread;
	}

	tunables->rate_limit_us = cpufreq_policy_transition_delay_us(policy);

	policy->governor_data = sg_policy;
	sg_policy->tunables = tunables;

	ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype,
				   get_governor_parent_kobj(policy), "%s",
				   schedutil_gov.name);
	if (ret)
		goto fail;

out:
	mutex_unlock(&global_tunables_lock);
	return 0;

fail:
	kobject_put(&tunables->attr_set.kobj);
	policy->governor_data = NULL;
	sugov_tunables_free(tunables);

stop_kthread:
	sugov_kthread_stop(sg_policy);
	mutex_unlock(&global_tunables_lock);

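	/*
	 * The error labels fall through to one another, unwinding the setup
	 * steps in reverse order.
	 */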
free_sg_policy:
	sugov_policy_free(sg_policy);

disable_fast_switch:
	cpufreq_disable_fast_switch(policy);

	pr_err("initialization failed (error %d)\n", ret);
	return ret;
}

static void sugov_exit(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;
	struct sugov_tunables *tunables = sg_policy->tunables;
	unsigned int count;

	mutex_lock(&global_tunables_lock);

	count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
	policy->governor_data = NULL;
	if (!count)
		sugov_tunables_free(tunables);

	mutex_unlock(&global_tunables_lock);

	sugov_kthread_stop(sg_policy);
	sugov_policy_free(sg_policy);
	cpufreq_disable_fast_switch(policy);
}

static int sugov_start(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;
	unsigned int cpu;

	sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
	sg_policy->last_freq_update_time = 0;
	sg_policy->next_freq = 0;
	sg_policy->work_in_progress = false;
	sg_policy->limits_changed = false;
	sg_policy->need_freq_update = false;
	sg_policy->cached_raw_freq = 0;

	for_each_cpu(cpu, policy->cpus) {
		struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);

		memset(sg_cpu, 0, sizeof(*sg_cpu));
		sg_cpu->cpu = cpu;
		sg_cpu->sg_policy = sg_policy;
	}

	for_each_cpu(cpu, policy->cpus) {
		struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);

		cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
					     policy_is_shared(policy) ?
							sugov_update_shared :
							sugov_update_single);
	}
	return 0;
}

static void sugov_stop(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;
	unsigned int cpu;

	for_each_cpu(cpu, policy->cpus)
		cpufreq_remove_update_util_hook(cpu);

	synchronize_rcu();

	if (!policy->fast_switch_enabled) {
		irq_work_sync(&sg_policy->irq_work);
		kthread_cancel_work_sync(&sg_policy->work);
	}
}

static void sugov_limits(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;

	if (!policy->fast_switch_enabled) {
		mutex_lock(&sg_policy->work_lock);
		cpufreq_policy_apply_limits(policy);
		mutex_unlock(&sg_policy->work_lock);
	}

	sg_policy->limits_changed = true;
}

struct cpufreq_governor schedutil_gov = {
	.name = "schedutil",
	.owner = THIS_MODULE,
	.dynamic_switching = true,
	.init = sugov_init,
	.exit = sugov_exit,
	.start = sugov_start,
	.stop = sugov_stop,
	.limits = sugov_limits,
};

#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
struct cpufreq_governor *cpufreq_default_governor(void)
{
	return &schedutil_gov;
}
#endif

cpufreq_governor_init(schedutil_gov);

#ifdef CONFIG_ENERGY_MODEL
extern bool sched_energy_update;
extern struct mutex sched_energy_mutex;

static void rebuild_sd_workfn(struct work_struct *work)
{
	mutex_lock(&sched_energy_mutex);
	sched_energy_update = true;
	rebuild_sched_domains();
	sched_energy_update = false;
	mutex_unlock(&sched_energy_mutex);
}
static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn);

/*
 * EAS shouldn't be attempted without sugov, so rebuild the sched_domains
 * on governor changes to make sure the scheduler knows about it.
 */
void sched_cpufreq_governor_change(struct cpufreq_policy *policy,
				   struct cpufreq_governor *old_gov)
{
	if (old_gov == &schedutil_gov || policy->governor == &schedutil_gov) {
		/*
		 * When called from the cpufreq_register_driver() path, the
		 * cpu_hotplug_lock is already held, so use a work item to
		 * avoid nested locking in rebuild_sched_domains().
		 */
		schedule_work(&rebuild_sd_work);
	}
}
#endif