// SPDX-License-Identifier: GPL-2.0
/*
 * CPUFreq governor based on scheduler-provided CPU utilization data.
 *
 * Copyright (C) 2016, Intel Corporation
 * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include "sched.h"

#include <linux/sched/cpufreq.h>
#include <trace/events/power.h>

#define IOWAIT_BOOST_MIN	(SCHED_CAPACITY_SCALE / 8)

struct sugov_tunables {
	struct gov_attr_set	attr_set;
	unsigned int		rate_limit_us;
};

struct sugov_policy {
	struct cpufreq_policy	*policy;

	struct sugov_tunables	*tunables;
	struct list_head	tunables_hook;

	raw_spinlock_t		update_lock;	/* For shared policies */
	u64			last_freq_update_time;
	s64			freq_update_delay_ns;
	unsigned int		next_freq;
	unsigned int		cached_raw_freq;

	/* The next fields are only needed if fast switch cannot be used: */
	struct irq_work		irq_work;
	struct kthread_work	work;
	struct mutex		work_lock;
	struct kthread_worker	worker;
	struct task_struct	*thread;
	bool			work_in_progress;

	bool			limits_changed;
	bool			need_freq_update;
};

struct sugov_cpu {
	struct update_util_data	update_util;
	struct sugov_policy	*sg_policy;
	unsigned int		cpu;

	bool			iowait_boost_pending;
	unsigned int		iowait_boost;
	u64			last_update;

	unsigned long		bw_dl;
	unsigned long		max;

	/* The field below is for single-CPU policies only: */
#ifdef CONFIG_NO_HZ_COMMON
	unsigned long		saved_idle_calls;
#endif
};

static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);

/************************ Governor internals ***********************/

static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
{
	s64 delta_ns;

	/*
	 * Since cpufreq_update_util() is called with rq->lock held for
	 * the @target_cpu, our per-CPU data is fully serialized.
	 *
	 * However, drivers cannot in general deal with cross-CPU
	 * requests, so while get_next_freq() will work, our
	 * sugov_update_commit() call may not for the fast switching platforms.
	 *
	 * Hence stop here for remote requests if they aren't supported
	 * by the hardware, as calculating the frequency is pointless if
	 * we cannot in fact act on it.
	 *
	 * For the slow switching platforms, the kthread is always scheduled on
	 * the right set of CPUs and any CPU can find the next frequency and
	 * schedule the kthread.
	 */
	if (sg_policy->policy->fast_switch_enabled &&
	    !cpufreq_this_cpu_can_update(sg_policy->policy))
		return false;

	if (unlikely(sg_policy->limits_changed)) {
		sg_policy->limits_changed = false;
		sg_policy->need_freq_update = true;
		return true;
	}

	delta_ns = time - sg_policy->last_freq_update_time;

	return delta_ns >= sg_policy->freq_update_delay_ns;
}

static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
				   unsigned int next_freq)
{
	if (sg_policy->next_freq == next_freq)
		return false;

	sg_policy->next_freq = next_freq;
	sg_policy->last_freq_update_time = time;

	return true;
}

static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time,
			      unsigned int next_freq)
{
	struct cpufreq_policy *policy = sg_policy->policy;

	if (!sugov_update_next_freq(sg_policy, time, next_freq))
		return;

	next_freq = cpufreq_driver_fast_switch(policy, next_freq);
	if (!next_freq)
		return;

	policy->cur = next_freq;
	trace_cpu_frequency(next_freq, smp_processor_id());
}

static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time,
				  unsigned int next_freq)
{
	if (!sugov_update_next_freq(sg_policy, time, next_freq))
		return;

	if (!sg_policy->work_in_progress) {
		sg_policy->work_in_progress = true;
		irq_work_queue(&sg_policy->irq_work);
	}
}

/**
 * get_next_freq - Compute a new frequency for a given cpufreq policy.
 * @sg_policy: schedutil policy object to compute the new frequency for.
 * @util: Current CPU utilization.
 * @max: CPU capacity.
 *
 * If the utilization is frequency-invariant, choose the new frequency to be
 * proportional to it, that is
 *
 * next_freq = C * max_freq * util / max
 *
 * Otherwise, approximate the would-be frequency-invariant utilization by
 * util_raw * (curr_freq / max_freq) which leads to
 *
 * next_freq = C * curr_freq * util_raw / max
 *
 * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8.
 *
 * The lowest driver-supported frequency which is equal to or greater than the
 * raw next_freq (as calculated above) is returned, subject to policy min/max
 * and cpufreq driver limitations.
 */
static unsigned int get_next_freq(struct sugov_policy *sg_policy,
				  unsigned long util, unsigned long max)
{
	struct cpufreq_policy *policy = sg_policy->policy;
	unsigned int freq = arch_scale_freq_invariant() ?
				policy->cpuinfo.max_freq : policy->cur;

	freq = map_util_freq(util, freq, max);

	if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)
		return sg_policy->next_freq;

	sg_policy->need_freq_update = false;
	sg_policy->cached_raw_freq = freq;
	return cpufreq_driver_resolve_freq(policy, freq);
}
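
/*
 * Worked example for get_next_freq() (illustrative numbers only, not taken
 * from any particular platform): with frequency-invariant utilization,
 * util = 512, max = 1024 and cpuinfo.max_freq = 2000000 kHz, the raw value
 * is 1.25 * 2000000 * 512 / 1024 = 1250000 kHz, which
 * cpufreq_driver_resolve_freq() then resolves to the lowest driver-supported
 * frequency at or above that value, subject to the policy limits.
 */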

/*
 * This function computes an effective utilization for the given CPU, to be
 * used for frequency selection given the linear relation: f = u * f_max.
 *
 * The scheduler tracks the following metrics:
 *
 *   cpu_util_{cfs,rt,dl,irq}()
 *   cpu_bw_dl()
 *
 * Where the cfs, rt and dl util numbers are tracked with the same metric and
 * synchronized windows and are thus directly comparable.
 *
 * The cfs, rt and dl utilization are the running times measured with
 * rq->clock_task which excludes things like IRQ and steal-time. These latter
 * are then accrued in the irq utilization.
 *
 * The DL bandwidth number otoh is not a measured metric but a value computed
 * based on the task model parameters and gives the minimal utilization
 * required to meet deadlines.
 */
unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
				 unsigned long max, enum schedutil_type type,
				 struct task_struct *p)
{
	unsigned long dl_util, util, irq;
	struct rq *rq = cpu_rq(cpu);

	if (!IS_BUILTIN(CONFIG_UCLAMP_TASK) &&
	    type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
		return max;
	}

	/*
	 * Early check to see if IRQ/steal time saturates the CPU; this can
	 * happen because of inaccuracies in how we track these -- see
	 * update_irq_load_avg().
	 */
	irq = cpu_util_irq(rq);
	if (unlikely(irq >= max))
		return max;

	/*
	 * Because the time spent on RT/DL tasks is visible as 'lost' time to
	 * CFS tasks and we use the same metric to track the effective
	 * utilization (PELT windows are synchronized) we can directly add them
	 * to obtain the CPU's actual utilization.
	 *
	 * CFS and RT utilization can be boosted or capped, depending on
	 * utilization clamp constraints requested by currently RUNNABLE
	 * tasks.
	 * When there are no CFS RUNNABLE tasks, clamps are released and
	 * frequency will be gracefully reduced with the utilization decay.
	 */
	util = util_cfs + cpu_util_rt(rq);
	if (type == FREQUENCY_UTIL)
		util = uclamp_util_with(rq, util, p);

	dl_util = cpu_util_dl(rq);

	/*
	 * For frequency selection we do not make cpu_util_dl() a permanent part
	 * of this sum because we want to use cpu_bw_dl() later on, but we need
	 * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such
	 * that we select f_max when there is no idle time.
	 *
	 * NOTE: numerical errors or stop class might cause us to not quite hit
	 * saturation when we should -- something for later.
	 */
	if (util + dl_util >= max)
		return max;

	/*
	 * OTOH, for energy computation we need the estimated running time, so
	 * include util_dl and ignore dl_bw.
	 */
	if (type == ENERGY_UTIL)
		util += dl_util;

	/*
	 * There is still idle time; further improve the number by using the
	 * irq metric. Because IRQ/steal time is hidden from the task clock we
	 * need to scale the task numbers:
	 *
	 *              max - irq
	 *   U' = irq + --------- * U
	 *                 max
	 */
	util = scale_irq_capacity(util, irq, max);
	util += irq;

	/*
	 * Bandwidth required by DEADLINE must always be granted while, for
	 * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
	 * to gracefully reduce the frequency when no tasks show up for longer
	 * periods of time.
	 *
	 * Ideally we would like to set bw_dl as min/guaranteed freq and util +
	 * bw_dl as requested freq. However, cpufreq is not yet ready for such
	 * an interface. So, we only do the latter for now.
	 */
	if (type == FREQUENCY_UTIL)
		util += cpu_bw_dl(rq);

	return min(max, util);
}
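
/*
 * Worked example for the IRQ scaling step above (illustrative numbers only):
 * with max = 1024, irq = 256 and a CFS+RT (+DL for ENERGY_UTIL) sum of
 * U = 512, scale_irq_capacity() yields 512 * (1024 - 256) / 1024 = 384, and
 * adding the irq utilization itself gives U' = 384 + 256 = 640. For
 * FREQUENCY_UTIL, cpu_bw_dl() is then added on top before clamping to max.
 */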

static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
{
	struct rq *rq = cpu_rq(sg_cpu->cpu);
	unsigned long util = cpu_util_cfs(rq);
	unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu);

	sg_cpu->max = max;
	sg_cpu->bw_dl = cpu_bw_dl(rq);

	return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL);
}

/**
 * sugov_iowait_reset() - Reset the IO boost status of a CPU.
 * @sg_cpu: the sugov data for the CPU to boost
 * @time: the update time from the caller
 * @set_iowait_boost: true if an IO boost has been requested
 *
 * The IO wait boost of a task is disabled after a tick since the last update
 * of a CPU. If a new IO wait boost is requested after more than a tick, then
 * we enable the boost starting from IOWAIT_BOOST_MIN, which improves energy
 * efficiency by ignoring sporadic wakeups from IO.
 */
static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time,
			       bool set_iowait_boost)
{
	s64 delta_ns = time - sg_cpu->last_update;

	/* Reset boost only if a tick has elapsed since last request */
	if (delta_ns <= TICK_NSEC)
		return false;

	sg_cpu->iowait_boost = set_iowait_boost ? IOWAIT_BOOST_MIN : 0;
	sg_cpu->iowait_boost_pending = set_iowait_boost;

	return true;
}

/**
 * sugov_iowait_boost() - Updates the IO boost status of a CPU.
 * @sg_cpu: the sugov data for the CPU to boost
 * @time: the update time from the caller
 * @flags: SCHED_CPUFREQ_IOWAIT if the task is waking up after an IO wait
 *
 * Each time a task wakes up after an IO operation, the CPU utilization can be
 * boosted to a certain utilization which doubles at each "frequent and
 * successive" wakeup from IO, ranging from IOWAIT_BOOST_MIN to the utilization
 * of the maximum OPP.
 *
 * To keep doubling, an IO boost has to be requested at least once per tick,
 * otherwise we restart from the utilization of the minimum OPP.
 */
static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
			       unsigned int flags)
{
	bool set_iowait_boost = flags & SCHED_CPUFREQ_IOWAIT;

	/* Reset boost if the CPU appears to have been idle enough */
	if (sg_cpu->iowait_boost &&
	    sugov_iowait_reset(sg_cpu, time, set_iowait_boost))
		return;

	/* Boost only tasks waking up after IO */
	if (!set_iowait_boost)
		return;

	/* Ensure boost doubles only one time at each request */
	if (sg_cpu->iowait_boost_pending)
		return;
	sg_cpu->iowait_boost_pending = true;

	/* Double the boost at each request */
	if (sg_cpu->iowait_boost) {
		sg_cpu->iowait_boost =
			min_t(unsigned int, sg_cpu->iowait_boost << 1, SCHED_CAPACITY_SCALE);
		return;
	}

	/* First wakeup after IO: start with minimum boost */
	sg_cpu->iowait_boost = IOWAIT_BOOST_MIN;
}
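
/*
 * Illustrative progression (assuming SCHED_CAPACITY_SCALE == 1024, so
 * IOWAIT_BOOST_MIN == 128): successive IO wakeups, each arriving within a
 * tick of the previous one and after the previous boost has been consumed
 * by sugov_iowait_apply(), take iowait_boost through 128, 256, 512 and
 * finally 1024, where it is capped. A gap of more than one tick resets the
 * sequence via sugov_iowait_reset().
 */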

/**
 * sugov_iowait_apply() - Apply the IO boost to a CPU.
 * @sg_cpu: the sugov data for the cpu to boost
 * @time: the update time from the caller
 * @util: the utilization to (eventually) boost
 * @max: the maximum value the utilization can be boosted to
 *
 * A CPU running a task which woke up after an IO operation can have its
 * utilization boosted to speed up the completion of those IO operations.
 * The IO boost value is increased each time a task wakes up from IO, in
 * sugov_iowait_boost(), and it is decreased by this function each time an
 * increase has not been requested (!iowait_boost_pending).
 *
 * A CPU which also appears to have been idle for at least one tick has its
 * IO boost utilization reset as well.
 *
 * This mechanism is designed to boost tasks that frequently wait on IO, while
 * being more conservative on tasks that do only sporadic IO operations.
 */
static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
					unsigned long util, unsigned long max)
{
	unsigned long boost;

	/* No boost currently required */
	if (!sg_cpu->iowait_boost)
		return util;

	/* Reset boost if the CPU appears to have been idle enough */
	if (sugov_iowait_reset(sg_cpu, time, false))
		return util;

	if (!sg_cpu->iowait_boost_pending) {
		/*
		 * No boost pending; reduce the boost value.
		 */
		sg_cpu->iowait_boost >>= 1;
		if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) {
			sg_cpu->iowait_boost = 0;
			return util;
		}
	}

	sg_cpu->iowait_boost_pending = false;

	/*
	 * @util is already in capacity scale; convert iowait_boost
	 * into the same scale so we can compare.
	 */
	boost = (sg_cpu->iowait_boost * max) >> SCHED_CAPACITY_SHIFT;
	return max(boost, util);
}
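
/*
 * Illustrative decay (again assuming SCHED_CAPACITY_SCALE == 1024, i.e.
 * SCHED_CAPACITY_SHIFT == 10): if no new boost is requested, consecutive
 * calls halve iowait_boost, e.g. 1024 -> 512 -> 256 -> 128; the next halving
 * would drop below IOWAIT_BOOST_MIN, so the boost is cleared entirely. When
 * applied, a boost of 512 on a CPU with max = 1024 contributes
 * (512 * 1024) >> 10 = 512, and the returned value is max(512, util).
 */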

#ifdef CONFIG_NO_HZ_COMMON
static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
{
	unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu);
	bool ret = idle_calls == sg_cpu->saved_idle_calls;

	sg_cpu->saved_idle_calls = idle_calls;
	return ret;
}
#else
static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
#endif /* CONFIG_NO_HZ_COMMON */

/*
 * Make sugov_should_update_freq() ignore the rate limit when DL
 * has increased the utilization.
 */
static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy)
{
	if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl)
		sg_policy->limits_changed = true;
}

static void sugov_update_single(struct update_util_data *hook, u64 time,
				unsigned int flags)
{
	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
	unsigned long util, max;
	unsigned int next_f;
	bool busy;

	sugov_iowait_boost(sg_cpu, time, flags);
	sg_cpu->last_update = time;

	ignore_dl_rate_limit(sg_cpu, sg_policy);

	if (!sugov_should_update_freq(sg_policy, time))
		return;

	/* Limits may have changed, don't skip frequency update */
	busy = !sg_policy->need_freq_update && sugov_cpu_is_busy(sg_cpu);

	util = sugov_get_util(sg_cpu);
	max = sg_cpu->max;
	util = sugov_iowait_apply(sg_cpu, time, util, max);
	next_f = get_next_freq(sg_policy, util, max);
	/*
	 * Do not reduce the frequency if the CPU has not been idle
	 * recently, as the reduction is likely to be premature then.
	 */
	if (busy && next_f < sg_policy->next_freq) {
		next_f = sg_policy->next_freq;

		/* Reset cached freq as next_freq has changed */
		sg_policy->cached_raw_freq = 0;
	}

	/*
	 * This code runs under rq->lock for the target CPU, so it won't run
	 * concurrently on two different CPUs for the same target and it is not
	 * necessary to acquire the lock in the fast switch case.
	 */
	if (sg_policy->policy->fast_switch_enabled) {
		sugov_fast_switch(sg_policy, time, next_f);
	} else {
		raw_spin_lock(&sg_policy->update_lock);
		sugov_deferred_update(sg_policy, time, next_f);
		raw_spin_unlock(&sg_policy->update_lock);
	}
}

static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
{
	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
	struct cpufreq_policy *policy = sg_policy->policy;
	unsigned long util = 0, max = 1;
	unsigned int j;

	for_each_cpu(j, policy->cpus) {
		struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
		unsigned long j_util, j_max;

		j_util = sugov_get_util(j_sg_cpu);
		j_max = j_sg_cpu->max;
		j_util = sugov_iowait_apply(j_sg_cpu, time, j_util, j_max);

		if (j_util * max > j_max * util) {
			util = j_util;
			max = j_max;
		}
	}

	return get_next_freq(sg_policy, util, max);
}
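
/*
 * The comparison in the loop above picks the CPU with the highest util/max
 * ratio without dividing: j_util * max > j_max * util is equivalent to
 * j_util / j_max > util / max. For example (illustrative values), a big CPU
 * with j_util = 600, j_max = 1024 (ratio ~0.59) does not displace a current
 * best of util = 300, max = 430 (ratio ~0.70), since 600 * 430 = 258000 is
 * not greater than 1024 * 300 = 307200.
 */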

static void
sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
{
	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
	unsigned int next_f;

	raw_spin_lock(&sg_policy->update_lock);

	sugov_iowait_boost(sg_cpu, time, flags);
	sg_cpu->last_update = time;

	ignore_dl_rate_limit(sg_cpu, sg_policy);

	if (sugov_should_update_freq(sg_policy, time)) {
		next_f = sugov_next_freq_shared(sg_cpu, time);

		if (sg_policy->policy->fast_switch_enabled)
			sugov_fast_switch(sg_policy, time, next_f);
		else
			sugov_deferred_update(sg_policy, time, next_f);
	}

	raw_spin_unlock(&sg_policy->update_lock);
}

static void sugov_work(struct kthread_work *work)
{
	struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);
	unsigned int freq;
	unsigned long flags;

	/*
	 * Hold sg_policy->update_lock shortly to handle the case where
	 * sg_policy->next_freq is read here and then updated by
	 * sugov_deferred_update() just before work_in_progress is set to false
	 * here; without the lock we may miss queueing the new update.
	 *
	 * Note: If a work was queued after the update_lock is released,
	 * sugov_work() will just be called again by kthread_work code; and the
	 * request will be processed before the sugov thread sleeps.
	 */
	raw_spin_lock_irqsave(&sg_policy->update_lock, flags);
	freq = sg_policy->next_freq;
	sg_policy->work_in_progress = false;
	raw_spin_unlock_irqrestore(&sg_policy->update_lock, flags);

	mutex_lock(&sg_policy->work_lock);
	__cpufreq_driver_target(sg_policy->policy, freq, CPUFREQ_RELATION_L);
	mutex_unlock(&sg_policy->work_lock);
}

static void sugov_irq_work(struct irq_work *irq_work)
{
	struct sugov_policy *sg_policy;

	sg_policy = container_of(irq_work, struct sugov_policy, irq_work);

	kthread_queue_work(&sg_policy->worker, &sg_policy->work);
}

/************************** sysfs interface ************************/

static struct sugov_tunables *global_tunables;
static DEFINE_MUTEX(global_tunables_lock);

static inline struct sugov_tunables *to_sugov_tunables(struct gov_attr_set *attr_set)
{
	return container_of(attr_set, struct sugov_tunables, attr_set);
}

static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
{
	struct sugov_tunables *tunables = to_sugov_tunables(attr_set);

	return sprintf(buf, "%u\n", tunables->rate_limit_us);
}

static ssize_t
rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count)
{
	struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
	struct sugov_policy *sg_policy;
	unsigned int rate_limit_us;

	if (kstrtouint(buf, 10, &rate_limit_us))
		return -EINVAL;

	tunables->rate_limit_us = rate_limit_us;

	list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook)
		sg_policy->freq_update_delay_ns = rate_limit_us * NSEC_PER_USEC;

	return count;
}

static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us);

static struct attribute *sugov_attrs[] = {
	&rate_limit_us.attr,
	NULL
};
ATTRIBUTE_GROUPS(sugov);

static struct kobj_type sugov_tunables_ktype = {
	.default_groups = sugov_groups,
	.sysfs_ops = &governor_sysfs_ops,
};

/********************** cpufreq governor interface *********************/

struct cpufreq_governor schedutil_gov;

static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy;

	sg_policy = kzalloc(sizeof(*sg_policy), GFP_KERNEL);
	if (!sg_policy)
		return NULL;

	sg_policy->policy = policy;
	raw_spin_lock_init(&sg_policy->update_lock);
	return sg_policy;
}

static void sugov_policy_free(struct sugov_policy *sg_policy)
{
	kfree(sg_policy);
}

static int sugov_kthread_create(struct sugov_policy *sg_policy)
{
	struct task_struct *thread;
	struct sched_attr attr = {
		.size		= sizeof(struct sched_attr),
		.sched_policy	= SCHED_DEADLINE,
		.sched_flags	= SCHED_FLAG_SUGOV,
		.sched_nice	= 0,
		.sched_priority	= 0,
		/*
		 * Fake (unused) bandwidth; workaround to "fix"
		 * priority inheritance.
		 */
		.sched_runtime	= 1000000,
		.sched_deadline	= 10000000,
		.sched_period	= 10000000,
	};
	struct cpufreq_policy *policy = sg_policy->policy;
	int ret;

	/* kthread only required for slow path */
	if (policy->fast_switch_enabled)
		return 0;

	kthread_init_work(&sg_policy->work, sugov_work);
	kthread_init_worker(&sg_policy->worker);
	thread = kthread_create(kthread_worker_fn, &sg_policy->worker,
				"sugov:%d",
				cpumask_first(policy->related_cpus));
	if (IS_ERR(thread)) {
		pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread));
		return PTR_ERR(thread);
	}

	ret = sched_setattr_nocheck(thread, &attr);
	if (ret) {
		kthread_stop(thread);
		pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__);
		return ret;
	}

	sg_policy->thread = thread;
	kthread_bind_mask(thread, policy->related_cpus);
	init_irq_work(&sg_policy->irq_work, sugov_irq_work);
	mutex_init(&sg_policy->work_lock);

	wake_up_process(thread);

	return 0;
}

static void sugov_kthread_stop(struct sugov_policy *sg_policy)
{
	/* kthread only required for slow path */
	if (sg_policy->policy->fast_switch_enabled)
		return;

	kthread_flush_worker(&sg_policy->worker);
	kthread_stop(sg_policy->thread);
	mutex_destroy(&sg_policy->work_lock);
}

static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy)
{
	struct sugov_tunables *tunables;

	tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);
	if (tunables) {
		gov_attr_set_init(&tunables->attr_set, &sg_policy->tunables_hook);
		if (!have_governor_per_policy())
			global_tunables = tunables;
	}
	return tunables;
}

static void sugov_tunables_free(struct sugov_tunables *tunables)
{
	if (!have_governor_per_policy())
		global_tunables = NULL;

	kfree(tunables);
}

static int sugov_init(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy;
	struct sugov_tunables *tunables;
	int ret = 0;

	/* State should be equivalent to EXIT */
	if (policy->governor_data)
		return -EBUSY;

	cpufreq_enable_fast_switch(policy);

	sg_policy = sugov_policy_alloc(policy);
	if (!sg_policy) {
		ret = -ENOMEM;
		goto disable_fast_switch;
	}

	ret = sugov_kthread_create(sg_policy);
	if (ret)
		goto free_sg_policy;

	mutex_lock(&global_tunables_lock);

	if (global_tunables) {
		if (WARN_ON(have_governor_per_policy())) {
			ret = -EINVAL;
			goto stop_kthread;
		}
		policy->governor_data = sg_policy;
		sg_policy->tunables = global_tunables;

		gov_attr_set_get(&global_tunables->attr_set, &sg_policy->tunables_hook);
		goto out;
	}

	tunables = sugov_tunables_alloc(sg_policy);
	if (!tunables) {
		ret = -ENOMEM;
		goto stop_kthread;
	}

	tunables->rate_limit_us = cpufreq_policy_transition_delay_us(policy);

	policy->governor_data = sg_policy;
	sg_policy->tunables = tunables;

	ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype,
				   get_governor_parent_kobj(policy), "%s",
				   schedutil_gov.name);
	if (ret)
		goto fail;

out:
	mutex_unlock(&global_tunables_lock);
	return 0;

fail:
	kobject_put(&tunables->attr_set.kobj);
	policy->governor_data = NULL;
	sugov_tunables_free(tunables);

stop_kthread:
	sugov_kthread_stop(sg_policy);
	mutex_unlock(&global_tunables_lock);

free_sg_policy:
	sugov_policy_free(sg_policy);

disable_fast_switch:
	cpufreq_disable_fast_switch(policy);

	pr_err("initialization failed (error %d)\n", ret);
	return ret;
}

static void sugov_exit(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;
	struct sugov_tunables *tunables = sg_policy->tunables;
	unsigned int count;

	mutex_lock(&global_tunables_lock);

	count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
	policy->governor_data = NULL;
	if (!count)
		sugov_tunables_free(tunables);

	mutex_unlock(&global_tunables_lock);

	sugov_kthread_stop(sg_policy);
	sugov_policy_free(sg_policy);
	cpufreq_disable_fast_switch(policy);
}

static int sugov_start(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;
	unsigned int cpu;

	sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
	sg_policy->last_freq_update_time = 0;
	sg_policy->next_freq = 0;
	sg_policy->work_in_progress = false;
	sg_policy->limits_changed = false;
	sg_policy->need_freq_update = false;
	sg_policy->cached_raw_freq = 0;

	for_each_cpu(cpu, policy->cpus) {
		struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);

		memset(sg_cpu, 0, sizeof(*sg_cpu));
		sg_cpu->cpu = cpu;
		sg_cpu->sg_policy = sg_policy;
	}

	for_each_cpu(cpu, policy->cpus) {
		struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);

		cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
					     policy_is_shared(policy) ?
							sugov_update_shared :
							sugov_update_single);
	}
	return 0;
}

static void sugov_stop(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;
	unsigned int cpu;

	for_each_cpu(cpu, policy->cpus)
		cpufreq_remove_update_util_hook(cpu);

	synchronize_rcu();

	if (!policy->fast_switch_enabled) {
		irq_work_sync(&sg_policy->irq_work);
		kthread_cancel_work_sync(&sg_policy->work);
	}
}

static void sugov_limits(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;

	if (!policy->fast_switch_enabled) {
		mutex_lock(&sg_policy->work_lock);
		cpufreq_policy_apply_limits(policy);
		mutex_unlock(&sg_policy->work_lock);
	}

	sg_policy->limits_changed = true;
}

struct cpufreq_governor schedutil_gov = {
	.name			= "schedutil",
	.owner			= THIS_MODULE,
	.dynamic_switching	= true,
	.init			= sugov_init,
	.exit			= sugov_exit,
	.start			= sugov_start,
	.stop			= sugov_stop,
	.limits			= sugov_limits,
};

#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
struct cpufreq_governor *cpufreq_default_governor(void)
{
	return &schedutil_gov;
}
#endif

static int __init sugov_register(void)
{
	return cpufreq_register_governor(&schedutil_gov);
}
fs_initcall(sugov_register);

#ifdef CONFIG_ENERGY_MODEL
extern bool sched_energy_update;
extern struct mutex sched_energy_mutex;

static void rebuild_sd_workfn(struct work_struct *work)
{
	mutex_lock(&sched_energy_mutex);
	sched_energy_update = true;
	rebuild_sched_domains();
	sched_energy_update = false;
	mutex_unlock(&sched_energy_mutex);
}
static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn);

/*
 * EAS shouldn't be attempted without sugov, so rebuild the sched_domains
 * on governor changes to make sure the scheduler knows about it.
 */
void sched_cpufreq_governor_change(struct cpufreq_policy *policy,
				   struct cpufreq_governor *old_gov)
{
	if (old_gov == &schedutil_gov || policy->governor == &schedutil_gov) {
		/*
		 * When called from the cpufreq_register_driver() path, the
		 * cpu_hotplug_lock is already held, so use a work item to
		 * avoid nested locking in rebuild_sched_domains().
		 */
		schedule_work(&rebuild_sd_work);
	}
}
#endif