/*
 * CPUFreq governor based on scheduler-provided CPU utilization data.
 *
 * Copyright (C) 2016, Intel Corporation
 * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/cpufreq.h>
#include <linux/kthread.h>
#include <uapi/linux/sched/types.h>
#include <linux/slab.h>
#include <trace/events/power.h>

#include "sched.h"

#define SUGOV_KTHREAD_PRIORITY	50

struct sugov_tunables {
	struct gov_attr_set attr_set;
	unsigned int rate_limit_us;
};

struct sugov_policy {
	struct cpufreq_policy *policy;

	struct sugov_tunables *tunables;
	struct list_head tunables_hook;

	raw_spinlock_t update_lock;  /* For shared policies */
	u64 last_freq_update_time;
	s64 freq_update_delay_ns;
	unsigned int next_freq;
	unsigned int cached_raw_freq;

	/* The next fields are only needed if fast switch cannot be used. */
	struct irq_work irq_work;
	struct kthread_work work;
	struct mutex work_lock;
	struct kthread_worker worker;
	struct task_struct *thread;
	bool work_in_progress;

	bool need_freq_update;
};

struct sugov_cpu {
	struct update_util_data update_util;
	struct sugov_policy *sg_policy;

	unsigned long iowait_boost;
	unsigned long iowait_boost_max;
	u64 last_update;

	/* The fields below are only needed when sharing a policy. */
	unsigned long util;
	unsigned long max;
	unsigned int flags;

	/* The field below is for single-CPU policies only. */
#ifdef CONFIG_NO_HZ_COMMON
	unsigned long saved_idle_calls;
#endif
};

static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);

/************************ Governor internals ***********************/

static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
{
	s64 delta_ns;

	if (sg_policy->work_in_progress)
		return false;

	if (unlikely(sg_policy->need_freq_update)) {
		sg_policy->need_freq_update = false;
		/*
		 * This happens when limits change, so forget the previous
		 * next_freq value and force an update.
		 */
		sg_policy->next_freq = UINT_MAX;
		return true;
	}

	delta_ns = time - sg_policy->last_freq_update_time;
	return delta_ns >= sg_policy->freq_update_delay_ns;
}

static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
				unsigned int next_freq)
{
	struct cpufreq_policy *policy = sg_policy->policy;

	if (sg_policy->next_freq == next_freq)
		return;

	sg_policy->next_freq = next_freq;
	sg_policy->last_freq_update_time = time;

	if (policy->fast_switch_enabled) {
		next_freq = cpufreq_driver_fast_switch(policy, next_freq);
		if (next_freq == CPUFREQ_ENTRY_INVALID)
			return;

		policy->cur = next_freq;
		trace_cpu_frequency(next_freq, smp_processor_id());
	} else {
		sg_policy->work_in_progress = true;
		irq_work_queue(&sg_policy->irq_work);
	}
}

/**
 * get_next_freq - Compute a new frequency for a given cpufreq policy.
 * @sg_policy: schedutil policy object to compute the new frequency for.
 * @util: Current CPU utilization.
 * @max: CPU capacity.
 *
 * If the utilization is frequency-invariant, choose the new frequency to be
 * proportional to it, that is
 *
 * next_freq = C * max_freq * util / max
 *
 * Otherwise, approximate the would-be frequency-invariant utilization by
 * util_raw * (curr_freq / max_freq) which leads to
 *
 * next_freq = C * curr_freq * util_raw / max
 *
 * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8.
 *
 * The lowest driver-supported frequency which is equal or greater than the raw
 * next_freq (as calculated above) is returned, subject to policy min/max and
 * cpufreq driver limitations.
 */
static unsigned int get_next_freq(struct sugov_policy *sg_policy,
				  unsigned long util, unsigned long max)
{
	struct cpufreq_policy *policy = sg_policy->policy;
	unsigned int freq = arch_scale_freq_invariant() ?
				policy->cpuinfo.max_freq : policy->cur;

	freq = (freq + (freq >> 2)) * util / max;

	if (freq == sg_policy->cached_raw_freq && sg_policy->next_freq != UINT_MAX)
		return sg_policy->next_freq;
	sg_policy->cached_raw_freq = freq;
	return cpufreq_driver_resolve_freq(policy, freq);
}

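/*
 * Worked example for get_next_freq() (illustrative numbers, not from any
 * particular platform): on a frequency-invariant system with
 * policy->cpuinfo.max_freq = 2000000 kHz and util / max = 1/2,
 *
 *	freq = (2000000 + 2000000 / 4) * 1 / 2 = 1250000 kHz
 *
 * which is C * max_freq * util / max with C = 1.25, as described above.
 * cpufreq_driver_resolve_freq() then returns the lowest driver-supported
 * frequency at or above that raw value.  With the same C, a CPU whose
 * utilization reaches 80% of its capacity is driven to max_freq, which is
 * the "tipping point" mentioned in the kernel-doc comment.
 */
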
static void sugov_get_util(unsigned long *util, unsigned long *max)
{
	struct rq *rq = this_rq();
	unsigned long cfs_max;

	cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id());

	*util = min(rq->cfs.avg.util_avg, cfs_max);
	*max = cfs_max;
}

static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
				   unsigned int flags)
{
	if (flags & SCHED_CPUFREQ_IOWAIT) {
		sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
	} else if (sg_cpu->iowait_boost) {
		s64 delta_ns = time - sg_cpu->last_update;

		/* Clear iowait_boost if the CPU appears to have been idle. */
		if (delta_ns > TICK_NSEC)
			sg_cpu->iowait_boost = 0;
	}
}

static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
			       unsigned long *max)
{
	unsigned long boost_util = sg_cpu->iowait_boost;
	unsigned long boost_max = sg_cpu->iowait_boost_max;

	if (!boost_util)
		return;

	if (*util * boost_max < *max * boost_util) {
		*util = boost_util;
		*max = boost_max;
	}
	sg_cpu->iowait_boost >>= 1;
}

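/*
 * Illustration of the iowait boost (hypothetical values): after an I/O
 * wakeup, sugov_set_iowait_boost() arms iowait_boost at iowait_boost_max
 * (the policy's maximum frequency).  In sugov_iowait_boost() the test
 *
 *	util * boost_max < max * boost_util
 *
 * is a division-free form of util / max < boost_util / boost_max, so the
 * boosted ratio replaces the measured one only while it is larger.  The
 * boost is halved each time it is consulted: a boost armed at 2000000
 * decays through 1000000, 500000, ... until it stops dominating the real
 * utilization, is re-armed by another SCHED_CPUFREQ_IOWAIT update, or is
 * cleared because the CPU looks idle (delta_ns > TICK_NSEC above).
 */
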
#ifdef CONFIG_NO_HZ_COMMON
static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
{
	unsigned long idle_calls = tick_nohz_get_idle_calls();
	bool ret = idle_calls == sg_cpu->saved_idle_calls;

	sg_cpu->saved_idle_calls = idle_calls;
	return ret;
}
#else
static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
#endif /* CONFIG_NO_HZ_COMMON */

static void sugov_update_single(struct update_util_data *hook, u64 time,
				unsigned int flags)
{
	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
	struct cpufreq_policy *policy = sg_policy->policy;
	unsigned long util, max;
	unsigned int next_f;
	bool busy;

	sugov_set_iowait_boost(sg_cpu, time, flags);
	sg_cpu->last_update = time;

	if (!sugov_should_update_freq(sg_policy, time))
		return;

	busy = sugov_cpu_is_busy(sg_cpu);

	if (flags & SCHED_CPUFREQ_RT_DL) {
		next_f = policy->cpuinfo.max_freq;
	} else {
		sugov_get_util(&util, &max);
		sugov_iowait_boost(sg_cpu, &util, &max);
		next_f = get_next_freq(sg_policy, util, max);
		/*
		 * Do not reduce the frequency if the CPU has not been idle
		 * recently, as the reduction is likely to be premature then.
		 */
		if (busy && next_f < sg_policy->next_freq)
			next_f = sg_policy->next_freq;
	}
	sugov_update_commit(sg_policy, time, next_f);
}

static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu)
{
	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
	struct cpufreq_policy *policy = sg_policy->policy;
	u64 last_freq_update_time = sg_policy->last_freq_update_time;
	unsigned long util = 0, max = 1;
	unsigned int j;

	for_each_cpu(j, policy->cpus) {
		struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
		unsigned long j_util, j_max;
		s64 delta_ns;

		/*
		 * If the CPU utilization was last updated before the previous
		 * frequency update and the time elapsed between the last update
		 * of the CPU utilization and the last frequency update is long
		 * enough, don't take the CPU into account as it probably is
		 * idle now (and clear iowait_boost for it).
		 */
		delta_ns = last_freq_update_time - j_sg_cpu->last_update;
		if (delta_ns > TICK_NSEC) {
			j_sg_cpu->iowait_boost = 0;
			continue;
		}
		if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL)
			return policy->cpuinfo.max_freq;

		j_util = j_sg_cpu->util;
		j_max = j_sg_cpu->max;
		if (j_util * max > j_max * util) {
			util = j_util;
			max = j_max;
		}

		sugov_iowait_boost(j_sg_cpu, &util, &max);
	}

	return get_next_freq(sg_policy, util, max);
}

static void sugov_update_shared(struct update_util_data *hook, u64 time,
				unsigned int flags)
{
	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
	unsigned long util, max;
	unsigned int next_f;

	sugov_get_util(&util, &max);

	raw_spin_lock(&sg_policy->update_lock);

	sg_cpu->util = util;
	sg_cpu->max = max;
	sg_cpu->flags = flags;

	sugov_set_iowait_boost(sg_cpu, time, flags);
	sg_cpu->last_update = time;

	if (sugov_should_update_freq(sg_policy, time)) {
		if (flags & SCHED_CPUFREQ_RT_DL)
			next_f = sg_policy->policy->cpuinfo.max_freq;
		else
			next_f = sugov_next_freq_shared(sg_cpu);

		sugov_update_commit(sg_policy, time, next_f);
	}

	raw_spin_unlock(&sg_policy->update_lock);
}

static void sugov_work(struct kthread_work *work)
{
	struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);

	mutex_lock(&sg_policy->work_lock);
	__cpufreq_driver_target(sg_policy->policy, sg_policy->next_freq,
				CPUFREQ_RELATION_L);
	mutex_unlock(&sg_policy->work_lock);

	sg_policy->work_in_progress = false;
}

static void sugov_irq_work(struct irq_work *irq_work)
{
	struct sugov_policy *sg_policy;

	sg_policy = container_of(irq_work, struct sugov_policy, irq_work);

	/*
	 * For RT and deadline tasks, the schedutil governor shoots the
	 * frequency to maximum. Special care must be taken to ensure that this
	 * kthread doesn't result in the same behavior.
	 *
	 * This is (mostly) guaranteed by the work_in_progress flag. The flag is
	 * updated only at the end of the sugov_work() function and before that
	 * the schedutil governor rejects all other frequency scaling requests.
	 *
	 * There is a very rare case though, where the RT thread yields right
	 * after the work_in_progress flag is cleared. The effects of that are
	 * neglected for now.
	 */
	kthread_queue_work(&sg_policy->worker, &sg_policy->work);
}

/************************** sysfs interface ************************/

static struct sugov_tunables *global_tunables;
static DEFINE_MUTEX(global_tunables_lock);

static inline struct sugov_tunables *to_sugov_tunables(struct gov_attr_set *attr_set)
{
	return container_of(attr_set, struct sugov_tunables, attr_set);
}

static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
{
	struct sugov_tunables *tunables = to_sugov_tunables(attr_set);

	return sprintf(buf, "%u\n", tunables->rate_limit_us);
}

static ssize_t rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf,
				   size_t count)
{
	struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
	struct sugov_policy *sg_policy;
	unsigned int rate_limit_us;

	if (kstrtouint(buf, 10, &rate_limit_us))
		return -EINVAL;

	tunables->rate_limit_us = rate_limit_us;

	list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook)
		sg_policy->freq_update_delay_ns = rate_limit_us * NSEC_PER_USEC;

	return count;
}

static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us);

static struct attribute *sugov_attributes[] = {
	&rate_limit_us.attr,
	NULL
};

static struct kobj_type sugov_tunables_ktype = {
	.default_attrs = sugov_attributes,
	.sysfs_ops = &governor_sysfs_ops,
};

/********************** cpufreq governor interface *********************/

static struct cpufreq_governor schedutil_gov;

static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy;

	sg_policy = kzalloc(sizeof(*sg_policy), GFP_KERNEL);
	if (!sg_policy)
		return NULL;

	sg_policy->policy = policy;
	raw_spin_lock_init(&sg_policy->update_lock);
	return sg_policy;
}

static void sugov_policy_free(struct sugov_policy *sg_policy)
{
	kfree(sg_policy);
}

static int sugov_kthread_create(struct sugov_policy *sg_policy)
{
	struct task_struct *thread;
	struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 };
	struct cpufreq_policy *policy = sg_policy->policy;
	int ret;

	/* kthread only required for slow path */
	if (policy->fast_switch_enabled)
		return 0;

	kthread_init_work(&sg_policy->work, sugov_work);
	kthread_init_worker(&sg_policy->worker);
	thread = kthread_create(kthread_worker_fn, &sg_policy->worker,
				"sugov:%d",
				cpumask_first(policy->related_cpus));
	if (IS_ERR(thread)) {
		pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread));
		return PTR_ERR(thread);
	}

	ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, &param);
	if (ret) {
		kthread_stop(thread);
		pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
		return ret;
	}

	sg_policy->thread = thread;
	kthread_bind_mask(thread, policy->related_cpus);
	init_irq_work(&sg_policy->irq_work, sugov_irq_work);
	mutex_init(&sg_policy->work_lock);

	wake_up_process(thread);

	return 0;
}

static void sugov_kthread_stop(struct sugov_policy *sg_policy)
{
	/* kthread only required for slow path */
	if (sg_policy->policy->fast_switch_enabled)
		return;

	kthread_flush_worker(&sg_policy->worker);
	kthread_stop(sg_policy->thread);
	mutex_destroy(&sg_policy->work_lock);
}

static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy)
{
	struct sugov_tunables *tunables;

	tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);
	if (tunables) {
		gov_attr_set_init(&tunables->attr_set, &sg_policy->tunables_hook);
		if (!have_governor_per_policy())
			global_tunables = tunables;
	}
	return tunables;
}

static void sugov_tunables_free(struct sugov_tunables *tunables)
{
	if (!have_governor_per_policy())
		global_tunables = NULL;

	kfree(tunables);
}

static int sugov_init(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy;
	struct sugov_tunables *tunables;
	int ret = 0;

	/* State should be equivalent to EXIT */
	if (policy->governor_data)
		return -EBUSY;

	cpufreq_enable_fast_switch(policy);

	sg_policy = sugov_policy_alloc(policy);
	if (!sg_policy) {
		ret = -ENOMEM;
		goto disable_fast_switch;
	}

	ret = sugov_kthread_create(sg_policy);
	if (ret)
		goto free_sg_policy;

	mutex_lock(&global_tunables_lock);

	if (global_tunables) {
		if (WARN_ON(have_governor_per_policy())) {
			ret = -EINVAL;
			goto stop_kthread;
		}
		policy->governor_data = sg_policy;
		sg_policy->tunables = global_tunables;

		gov_attr_set_get(&global_tunables->attr_set, &sg_policy->tunables_hook);
		goto out;
	}

	tunables = sugov_tunables_alloc(sg_policy);
	if (!tunables) {
		ret = -ENOMEM;
		goto stop_kthread;
	}

	if (policy->transition_delay_us) {
		tunables->rate_limit_us = policy->transition_delay_us;
	} else {
		unsigned int lat;

		tunables->rate_limit_us = LATENCY_MULTIPLIER;
		lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
		if (lat)
			tunables->rate_limit_us *= lat;
	}

	policy->governor_data = sg_policy;
	sg_policy->tunables = tunables;

	ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype,
				   get_governor_parent_kobj(policy), "%s",
				   schedutil_gov.name);
	if (ret)
		goto fail;

out:
	mutex_unlock(&global_tunables_lock);
	return 0;

fail:
	policy->governor_data = NULL;
	sugov_tunables_free(tunables);

stop_kthread:
	sugov_kthread_stop(sg_policy);

free_sg_policy:
	mutex_unlock(&global_tunables_lock);

	sugov_policy_free(sg_policy);

disable_fast_switch:
	cpufreq_disable_fast_switch(policy);

	pr_err("initialization failed (error %d)\n", ret);
	return ret;
}

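/*
 * Example of the default rate limit computed in sugov_init() (illustrative
 * driver values): if the driver leaves transition_delay_us at zero and
 * reports cpuinfo.transition_latency = 10000 ns, then
 * lat = 10000 / NSEC_PER_USEC = 10 and, with LATENCY_MULTIPLIER being 1000
 * in <linux/cpufreq.h>, rate_limit_us becomes 1000 * 10 = 10000 us.
 * A driver that does set transition_delay_us has that value used directly.
 */
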
static void sugov_exit(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;
	struct sugov_tunables *tunables = sg_policy->tunables;
	unsigned int count;

	mutex_lock(&global_tunables_lock);

	count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
	policy->governor_data = NULL;
	if (!count)
		sugov_tunables_free(tunables);

	mutex_unlock(&global_tunables_lock);

	sugov_kthread_stop(sg_policy);
	sugov_policy_free(sg_policy);
	cpufreq_disable_fast_switch(policy);
}

static int sugov_start(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;
	unsigned int cpu;

	sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
	sg_policy->last_freq_update_time = 0;
	sg_policy->next_freq = UINT_MAX;
	sg_policy->work_in_progress = false;
	sg_policy->need_freq_update = false;
	sg_policy->cached_raw_freq = 0;

	for_each_cpu(cpu, policy->cpus) {
		struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);

		memset(sg_cpu, 0, sizeof(*sg_cpu));
		sg_cpu->sg_policy = sg_policy;
		sg_cpu->flags = SCHED_CPUFREQ_RT;
		sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
		cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
					     policy_is_shared(policy) ?
							sugov_update_shared :
							sugov_update_single);
	}
	return 0;
}

static void sugov_stop(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;
	unsigned int cpu;

	for_each_cpu(cpu, policy->cpus)
		cpufreq_remove_update_util_hook(cpu);

	synchronize_sched();

	if (!policy->fast_switch_enabled) {
		irq_work_sync(&sg_policy->irq_work);
		kthread_cancel_work_sync(&sg_policy->work);
	}
}

static void sugov_limits(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;

	if (!policy->fast_switch_enabled) {
		mutex_lock(&sg_policy->work_lock);
		cpufreq_policy_apply_limits(policy);
		mutex_unlock(&sg_policy->work_lock);
	}

	sg_policy->need_freq_update = true;
}

static struct cpufreq_governor schedutil_gov = {
	.name = "schedutil",
	.owner = THIS_MODULE,
	.init = sugov_init,
	.exit = sugov_exit,
	.start = sugov_start,
	.stop = sugov_stop,
	.limits = sugov_limits,
};

#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
struct cpufreq_governor *cpufreq_default_governor(void)
{
	return &schedutil_gov;
}
#endif

static int __init sugov_register(void)
{
	return cpufreq_register_governor(&schedutil_gov);
}
fs_initcall(sugov_register);

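/*
 * Usage note: once sugov_register() has run (fs_initcall, so during boot),
 * the governor can be selected per policy from user space through the
 * standard cpufreq sysfs interface, e.g.
 *
 *	echo schedutil > /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
 *
 * or made the system default at build time via
 * CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL, which is what the
 * cpufreq_default_governor() hook above is for.
 */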