/*
 * CPUFreq governor based on scheduler-provided CPU utilization data.
 *
 * Copyright (C) 2016, Intel Corporation
 * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/cpufreq.h>
#include <linux/kthread.h>
#include <uapi/linux/sched/types.h>
#include <linux/slab.h>
#include <trace/events/power.h>

#include "sched.h"

#define SUGOV_KTHREAD_PRIORITY	50

struct sugov_tunables {
	struct gov_attr_set attr_set;
	unsigned int rate_limit_us;
};

struct sugov_policy {
	struct cpufreq_policy *policy;

	struct sugov_tunables *tunables;
	struct list_head tunables_hook;

	raw_spinlock_t update_lock;  /* For shared policies */
	u64 last_freq_update_time;
	s64 freq_update_delay_ns;
	unsigned int next_freq;
	unsigned int cached_raw_freq;

	/* The next fields are only needed if fast switch cannot be used. */
	struct irq_work irq_work;
	struct kthread_work work;
	struct mutex work_lock;
	struct kthread_worker worker;
	struct task_struct *thread;
	bool work_in_progress;

	bool need_freq_update;
};

struct sugov_cpu {
	struct update_util_data update_util;
	struct sugov_policy *sg_policy;

	unsigned long iowait_boost;
	unsigned long iowait_boost_max;
	u64 last_update;

	/* The fields below are only needed when sharing a policy. */
	unsigned long util;
	unsigned long max;
	unsigned int flags;

	/* The field below is for single-CPU policies only. */
#ifdef CONFIG_NO_HZ_COMMON
	unsigned long saved_idle_calls;
#endif
};

static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);

/************************ Governor internals ***********************/

static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
{
	s64 delta_ns;

	if (sg_policy->work_in_progress)
		return false;

	if (unlikely(sg_policy->need_freq_update)) {
		sg_policy->need_freq_update = false;
		/*
		 * This happens when limits change, so forget the previous
		 * next_freq value and force an update.
		 */
		sg_policy->next_freq = UINT_MAX;
		return true;
	}

	delta_ns = time - sg_policy->last_freq_update_time;
	return delta_ns >= sg_policy->freq_update_delay_ns;
}

static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
				unsigned int next_freq)
{
	struct cpufreq_policy *policy = sg_policy->policy;

	if (sg_policy->next_freq == next_freq)
		return;

	if (sg_policy->next_freq > next_freq)
		next_freq = (sg_policy->next_freq + next_freq) >> 1;

	sg_policy->next_freq = next_freq;
	sg_policy->last_freq_update_time = time;

	if (policy->fast_switch_enabled) {
		next_freq = cpufreq_driver_fast_switch(policy, next_freq);
		if (next_freq == CPUFREQ_ENTRY_INVALID)
			return;

		policy->cur = next_freq;
		trace_cpu_frequency(next_freq, smp_processor_id());
	} else {
		sg_policy->work_in_progress = true;
		irq_work_queue(&sg_policy->irq_work);
	}
}
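
/*
 * Worked example for the averaging in sugov_update_commit() above, with
 * assumed (illustrative) values: the halving only applies when the new
 * request is lower than the previously committed one.  If the previous
 * next_freq was 1,200,000 kHz and the new evaluation asks for 800,000 kHz,
 * the committed value is (1200000 + 800000) >> 1 = 1,000,000 kHz, so the
 * policy steps down gradually instead of dropping straight to the target.
 */
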
/**
 * get_next_freq - Compute a new frequency for a given cpufreq policy.
 * @sg_policy: schedutil policy object to compute the new frequency for.
 * @util: Current CPU utilization.
 * @max: CPU capacity.
 *
 * If the utilization is frequency-invariant, choose the new frequency to be
 * proportional to it, that is
 *
 * next_freq = C * max_freq * util / max
 *
 * Otherwise, approximate the would-be frequency-invariant utilization by
 * util_raw * (curr_freq / max_freq) which leads to
 *
 * next_freq = C * curr_freq * util_raw / max
 *
 * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8.
 *
 * The lowest driver-supported frequency which is equal to or greater than
 * the raw next_freq (as calculated above) is returned, subject to policy
 * min/max and cpufreq driver limitations.
 */
static unsigned int get_next_freq(struct sugov_policy *sg_policy,
				  unsigned long util, unsigned long max)
{
	struct cpufreq_policy *policy = sg_policy->policy;
	unsigned int freq = arch_scale_freq_invariant() ?
				policy->cpuinfo.max_freq : policy->cur;

	freq = (freq + (freq >> 2)) * util / max;

	if (freq == sg_policy->cached_raw_freq && sg_policy->next_freq != UINT_MAX)
		return sg_policy->next_freq;
	sg_policy->cached_raw_freq = freq;
	return cpufreq_driver_resolve_freq(policy, freq);
}
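
/*
 * Worked example for get_next_freq() above, with assumed values: C = 1.25
 * is applied as freq + (freq >> 2).  With a frequency-invariant max_freq of
 * 2,000,000 kHz, util = 512 and max = 1024, the raw request is
 * (2000000 + 500000) * 512 / 1024 = 1,250,000 kHz, which
 * cpufreq_driver_resolve_freq() then maps to the lowest driver-supported
 * frequency at or above that value, within the policy min/max.
 */
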
static void sugov_get_util(unsigned long *util, unsigned long *max)
{
	struct rq *rq = this_rq();
	unsigned long cfs_max;

	cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id());

	*util = min(rq->cfs.avg.util_avg, cfs_max);
	*max = cfs_max;
}

static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
				   unsigned int flags)
{
	if (flags & SCHED_CPUFREQ_IOWAIT) {
		sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
	} else if (sg_cpu->iowait_boost) {
		s64 delta_ns = time - sg_cpu->last_update;

		/* Clear iowait_boost if the CPU appears to have been idle. */
		if (delta_ns > TICK_NSEC)
			sg_cpu->iowait_boost = 0;
	}
}

static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
			       unsigned long *max)
{
	unsigned long boost_util = sg_cpu->iowait_boost;
	unsigned long boost_max = sg_cpu->iowait_boost_max;

	if (!boost_util)
		return;

	if (*util * boost_max < *max * boost_util) {
		*util = boost_util;
		*max = boost_max;
	}
	sg_cpu->iowait_boost >>= 1;
}

#ifdef CONFIG_NO_HZ_COMMON
static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
{
	unsigned long idle_calls = tick_nohz_get_idle_calls();
	bool ret = idle_calls == sg_cpu->saved_idle_calls;

	sg_cpu->saved_idle_calls = idle_calls;
	return ret;
}
#else
static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
#endif /* CONFIG_NO_HZ_COMMON */

static void sugov_update_single(struct update_util_data *hook, u64 time,
				unsigned int flags)
{
	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
	struct cpufreq_policy *policy = sg_policy->policy;
	unsigned long util, max;
	unsigned int next_f;
	bool busy;

	sugov_set_iowait_boost(sg_cpu, time, flags);
	sg_cpu->last_update = time;

	if (!sugov_should_update_freq(sg_policy, time))
		return;

	busy = sugov_cpu_is_busy(sg_cpu);

	if (flags & SCHED_CPUFREQ_RT_DL) {
		next_f = policy->cpuinfo.max_freq;
	} else {
		sugov_get_util(&util, &max);
		sugov_iowait_boost(sg_cpu, &util, &max);
		next_f = get_next_freq(sg_policy, util, max);
		/*
		 * Do not reduce the frequency if the CPU has not been idle
		 * recently, as the reduction is likely to be premature then.
		 */
		if (busy && next_f < sg_policy->next_freq)
			next_f = sg_policy->next_freq;
	}
	sugov_update_commit(sg_policy, time, next_f);
}
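
/*
 * Illustrative note on the iowait boost decay used above: after a wakeup
 * flagged with SCHED_CPUFREQ_IOWAIT the boost starts at iowait_boost_max
 * (the policy's max frequency) and is halved each time it is consumed by
 * sugov_iowait_boost(), e.g. max, max/2, max/4, ..., until it either decays
 * to zero or is cleared because the CPU stayed idle for more than TICK_NSEC.
 */
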
266 */ 267 delta_ns = time - j_sg_cpu->last_update; 268 if (delta_ns > TICK_NSEC) { 269 j_sg_cpu->iowait_boost = 0; 270 continue; 271 } 272 if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL) 273 return policy->cpuinfo.max_freq; 274 275 j_util = j_sg_cpu->util; 276 j_max = j_sg_cpu->max; 277 if (j_util * max > j_max * util) { 278 util = j_util; 279 max = j_max; 280 } 281 282 sugov_iowait_boost(j_sg_cpu, &util, &max); 283 } 284 285 return get_next_freq(sg_policy, util, max); 286 } 287 288 static void sugov_update_shared(struct update_util_data *hook, u64 time, 289 unsigned int flags) 290 { 291 struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); 292 struct sugov_policy *sg_policy = sg_cpu->sg_policy; 293 unsigned long util, max; 294 unsigned int next_f; 295 296 sugov_get_util(&util, &max); 297 298 raw_spin_lock(&sg_policy->update_lock); 299 300 sg_cpu->util = util; 301 sg_cpu->max = max; 302 sg_cpu->flags = flags; 303 304 sugov_set_iowait_boost(sg_cpu, time, flags); 305 sg_cpu->last_update = time; 306 307 if (sugov_should_update_freq(sg_policy, time)) { 308 if (flags & SCHED_CPUFREQ_RT_DL) 309 next_f = sg_policy->policy->cpuinfo.max_freq; 310 else 311 next_f = sugov_next_freq_shared(sg_cpu, time); 312 313 sugov_update_commit(sg_policy, time, next_f); 314 } 315 316 raw_spin_unlock(&sg_policy->update_lock); 317 } 318 319 static void sugov_work(struct kthread_work *work) 320 { 321 struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work); 322 323 mutex_lock(&sg_policy->work_lock); 324 __cpufreq_driver_target(sg_policy->policy, sg_policy->next_freq, 325 CPUFREQ_RELATION_L); 326 mutex_unlock(&sg_policy->work_lock); 327 328 sg_policy->work_in_progress = false; 329 } 330 331 static void sugov_irq_work(struct irq_work *irq_work) 332 { 333 struct sugov_policy *sg_policy; 334 335 sg_policy = container_of(irq_work, struct sugov_policy, irq_work); 336 337 /* 338 * For RT and deadline tasks, the schedutil governor shoots the 339 * frequency to maximum. Special care must be taken to ensure that this 340 * kthread doesn't result in the same behavior. 341 * 342 * This is (mostly) guaranteed by the work_in_progress flag. The flag is 343 * updated only at the end of the sugov_work() function and before that 344 * the schedutil governor rejects all other frequency scaling requests. 345 * 346 * There is a very rare case though, where the RT thread yields right 347 * after the work_in_progress flag is cleared. The effects of that are 348 * neglected for now. 
349 */ 350 kthread_queue_work(&sg_policy->worker, &sg_policy->work); 351 } 352 353 /************************** sysfs interface ************************/ 354 355 static struct sugov_tunables *global_tunables; 356 static DEFINE_MUTEX(global_tunables_lock); 357 358 static inline struct sugov_tunables *to_sugov_tunables(struct gov_attr_set *attr_set) 359 { 360 return container_of(attr_set, struct sugov_tunables, attr_set); 361 } 362 363 static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf) 364 { 365 struct sugov_tunables *tunables = to_sugov_tunables(attr_set); 366 367 return sprintf(buf, "%u\n", tunables->rate_limit_us); 368 } 369 370 static ssize_t rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, 371 size_t count) 372 { 373 struct sugov_tunables *tunables = to_sugov_tunables(attr_set); 374 struct sugov_policy *sg_policy; 375 unsigned int rate_limit_us; 376 377 if (kstrtouint(buf, 10, &rate_limit_us)) 378 return -EINVAL; 379 380 tunables->rate_limit_us = rate_limit_us; 381 382 list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) 383 sg_policy->freq_update_delay_ns = rate_limit_us * NSEC_PER_USEC; 384 385 return count; 386 } 387 388 static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us); 389 390 static struct attribute *sugov_attributes[] = { 391 &rate_limit_us.attr, 392 NULL 393 }; 394 395 static struct kobj_type sugov_tunables_ktype = { 396 .default_attrs = sugov_attributes, 397 .sysfs_ops = &governor_sysfs_ops, 398 }; 399 400 /********************** cpufreq governor interface *********************/ 401 402 static struct cpufreq_governor schedutil_gov; 403 404 static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy) 405 { 406 struct sugov_policy *sg_policy; 407 408 sg_policy = kzalloc(sizeof(*sg_policy), GFP_KERNEL); 409 if (!sg_policy) 410 return NULL; 411 412 sg_policy->policy = policy; 413 raw_spin_lock_init(&sg_policy->update_lock); 414 return sg_policy; 415 } 416 417 static void sugov_policy_free(struct sugov_policy *sg_policy) 418 { 419 kfree(sg_policy); 420 } 421 422 static int sugov_kthread_create(struct sugov_policy *sg_policy) 423 { 424 struct task_struct *thread; 425 struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 }; 426 struct cpufreq_policy *policy = sg_policy->policy; 427 int ret; 428 429 /* kthread only required for slow path */ 430 if (policy->fast_switch_enabled) 431 return 0; 432 433 kthread_init_work(&sg_policy->work, sugov_work); 434 kthread_init_worker(&sg_policy->worker); 435 thread = kthread_create(kthread_worker_fn, &sg_policy->worker, 436 "sugov:%d", 437 cpumask_first(policy->related_cpus)); 438 if (IS_ERR(thread)) { 439 pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread)); 440 return PTR_ERR(thread); 441 } 442 443 ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, ¶m); 444 if (ret) { 445 kthread_stop(thread); 446 pr_warn("%s: failed to set SCHED_FIFO\n", __func__); 447 return ret; 448 } 449 450 sg_policy->thread = thread; 451 kthread_bind_mask(thread, policy->related_cpus); 452 init_irq_work(&sg_policy->irq_work, sugov_irq_work); 453 mutex_init(&sg_policy->work_lock); 454 455 wake_up_process(thread); 456 457 return 0; 458 } 459 460 static void sugov_kthread_stop(struct sugov_policy *sg_policy) 461 { 462 /* kthread only required for slow path */ 463 if (sg_policy->policy->fast_switch_enabled) 464 return; 465 466 kthread_flush_worker(&sg_policy->worker); 467 kthread_stop(sg_policy->thread); 468 mutex_destroy(&sg_policy->work_lock); 469 } 470 
static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy)
{
	struct sugov_tunables *tunables;

	tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);
	if (tunables) {
		gov_attr_set_init(&tunables->attr_set, &sg_policy->tunables_hook);
		if (!have_governor_per_policy())
			global_tunables = tunables;
	}
	return tunables;
}

static void sugov_tunables_free(struct sugov_tunables *tunables)
{
	if (!have_governor_per_policy())
		global_tunables = NULL;

	kfree(tunables);
}

static int sugov_init(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy;
	struct sugov_tunables *tunables;
	int ret = 0;

	/* State should be equivalent to EXIT */
	if (policy->governor_data)
		return -EBUSY;

	cpufreq_enable_fast_switch(policy);

	sg_policy = sugov_policy_alloc(policy);
	if (!sg_policy) {
		ret = -ENOMEM;
		goto disable_fast_switch;
	}

	ret = sugov_kthread_create(sg_policy);
	if (ret)
		goto free_sg_policy;

	mutex_lock(&global_tunables_lock);

	if (global_tunables) {
		if (WARN_ON(have_governor_per_policy())) {
			ret = -EINVAL;
			goto stop_kthread;
		}
		policy->governor_data = sg_policy;
		sg_policy->tunables = global_tunables;

		gov_attr_set_get(&global_tunables->attr_set, &sg_policy->tunables_hook);
		goto out;
	}

	tunables = sugov_tunables_alloc(sg_policy);
	if (!tunables) {
		ret = -ENOMEM;
		goto stop_kthread;
	}

	if (policy->transition_delay_us) {
		tunables->rate_limit_us = policy->transition_delay_us;
	} else {
		unsigned int lat;

		tunables->rate_limit_us = LATENCY_MULTIPLIER;
		lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
		if (lat)
			tunables->rate_limit_us *= lat;
	}

	policy->governor_data = sg_policy;
	sg_policy->tunables = tunables;

	ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype,
				   get_governor_parent_kobj(policy), "%s",
				   schedutil_gov.name);
	if (ret)
		goto fail;

out:
	mutex_unlock(&global_tunables_lock);
	return 0;

fail:
	policy->governor_data = NULL;
	sugov_tunables_free(tunables);

stop_kthread:
	sugov_kthread_stop(sg_policy);

free_sg_policy:
	mutex_unlock(&global_tunables_lock);

	sugov_policy_free(sg_policy);

disable_fast_switch:
	cpufreq_disable_fast_switch(policy);

	pr_err("initialization failed (error %d)\n", ret);
	return ret;
}

static void sugov_exit(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;
	struct sugov_tunables *tunables = sg_policy->tunables;
	unsigned int count;

	mutex_lock(&global_tunables_lock);

	count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
	policy->governor_data = NULL;
	if (!count)
		sugov_tunables_free(tunables);

	mutex_unlock(&global_tunables_lock);

	sugov_kthread_stop(sg_policy);
	sugov_policy_free(sg_policy);
	cpufreq_disable_fast_switch(policy);
}
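
/*
 * Worked example for the default rate limit chosen in sugov_init() above
 * (hardware numbers are assumed): if the driver sets no transition_delay_us
 * and reports a transition latency of 20,000 ns, then
 * lat = 20000 / NSEC_PER_USEC = 20 and, with LATENCY_MULTIPLIER at its
 * usual value of 1000, rate_limit_us = 1000 * 20 = 20,000 us.  Frequency
 * re-evaluation is then throttled to at most once every 20 ms until the
 * value is overridden through the rate_limit_us sysfs attribute.
 */
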
static int sugov_start(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;
	unsigned int cpu;

	sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
	sg_policy->last_freq_update_time = 0;
	sg_policy->next_freq = UINT_MAX;
	sg_policy->work_in_progress = false;
	sg_policy->need_freq_update = false;
	sg_policy->cached_raw_freq = 0;

	for_each_cpu(cpu, policy->cpus) {
		struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);

		memset(sg_cpu, 0, sizeof(*sg_cpu));
		sg_cpu->sg_policy = sg_policy;
		sg_cpu->flags = SCHED_CPUFREQ_RT;
		sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
		cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
					     policy_is_shared(policy) ?
							sugov_update_shared :
							sugov_update_single);
	}
	return 0;
}

static void sugov_stop(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;
	unsigned int cpu;

	for_each_cpu(cpu, policy->cpus)
		cpufreq_remove_update_util_hook(cpu);

	synchronize_sched();

	if (!policy->fast_switch_enabled) {
		irq_work_sync(&sg_policy->irq_work);
		kthread_cancel_work_sync(&sg_policy->work);
	}
}

static void sugov_limits(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;

	if (!policy->fast_switch_enabled) {
		mutex_lock(&sg_policy->work_lock);
		cpufreq_policy_apply_limits(policy);
		mutex_unlock(&sg_policy->work_lock);
	}

	sg_policy->need_freq_update = true;
}

static struct cpufreq_governor schedutil_gov = {
	.name = "schedutil",
	.owner = THIS_MODULE,
	.init = sugov_init,
	.exit = sugov_exit,
	.start = sugov_start,
	.stop = sugov_stop,
	.limits = sugov_limits,
};

#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
struct cpufreq_governor *cpufreq_default_governor(void)
{
	return &schedutil_gov;
}
#endif

static int __init sugov_register(void)
{
	return cpufreq_register_governor(&schedutil_gov);
}
fs_initcall(sugov_register);
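
/*
 * Usage sketch (not part of the governor itself; the sysfs path assumes the
 * typical per-policy cpufreq layout): once the kernel is built with
 * CONFIG_CPU_FREQ_GOV_SCHEDUTIL, the governor registered above can be
 * selected per policy at run time, for example:
 *
 *   echo schedutil > /sys/devices/system/cpu/cpufreq/policy0/scaling_governor
 *
 * Selecting CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL makes it the boot-time
 * default via cpufreq_default_governor().
 */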