/*
 * drivers/cpufreq/cpufreq_ondemand.c
 *
 * Copyright (C)  2001 Russell King
 *            (C)  2003 Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>.
 *                      Jun Nakajima <jun.nakajima@intel.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/cpufreq.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kernel_stat.h>
#include <linux/kobject.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/percpu-defs.h>
#include <linux/sysfs.h>
#include <linux/tick.h>
#include <linux/types.h>

#include "cpufreq_governor.h"

/* On-demand governor macros */
#define DEF_FREQUENCY_DOWN_DIFFERENTIAL		(10)
#define DEF_FREQUENCY_UP_THRESHOLD		(80)
#define DEF_SAMPLING_DOWN_FACTOR		(1)
#define MAX_SAMPLING_DOWN_FACTOR		(100000)
#define MICRO_FREQUENCY_DOWN_DIFFERENTIAL	(3)
#define MICRO_FREQUENCY_UP_THRESHOLD		(95)
#define MICRO_FREQUENCY_MIN_SAMPLE_RATE		(10000)
#define MIN_FREQUENCY_UP_THRESHOLD		(11)
#define MAX_FREQUENCY_UP_THRESHOLD		(100)

static struct dbs_data od_dbs_data;
static DEFINE_PER_CPU(struct od_cpu_dbs_info_s, od_cpu_dbs_info);

#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND
static struct cpufreq_governor cpufreq_gov_ondemand;
#endif

static struct od_dbs_tuners od_tuners = {
	.up_threshold = DEF_FREQUENCY_UP_THRESHOLD,
	.sampling_down_factor = DEF_SAMPLING_DOWN_FACTOR,
	.down_differential = DEF_FREQUENCY_DOWN_DIFFERENTIAL,
	.ignore_nice = 0,
	.powersave_bias = 0,
};

static void ondemand_powersave_bias_init_cpu(int cpu)
{
	struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu);

	dbs_info->freq_table = cpufreq_frequency_get_table(cpu);
	dbs_info->freq_lo = 0;
}

/*
 * Not all CPUs want IO time to be accounted as busy; this depends on how
 * efficient idling at a higher frequency/voltage is.
 * Pavel Machek says this is not so for various generations of AMD and old
 * Intel systems.
 * Mike Chan (android.com) claims this is also not true for ARM.
 * Because of this, whitelist specific known (series) of CPUs by default, and
 * leave all others up to the user.
 */
static int should_io_be_busy(void)
{
#if defined(CONFIG_X86)
	/*
	 * For Intel, Core 2 (model 15) and later have an efficient idle.
	 */
	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
			boot_cpu_data.x86 == 6 &&
			boot_cpu_data.x86_model >= 15)
		return 1;
#endif
	return 0;
}

/*
 * Find the right freq to be set now with powersave_bias on.
 * Returns the freq_hi to be used right now and will set freq_hi_jiffies,
 * freq_lo, and freq_lo_jiffies in percpu area for averaging freqs.
 */
static unsigned int powersave_bias_target(struct cpufreq_policy *policy,
		unsigned int freq_next, unsigned int relation)
{
	unsigned int freq_req, freq_reduc, freq_avg;
	unsigned int freq_hi, freq_lo;
	unsigned int index = 0;
	unsigned int jiffies_total, jiffies_hi, jiffies_lo;
	struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info,
						   policy->cpu);

	if (!dbs_info->freq_table) {
		dbs_info->freq_lo = 0;
		dbs_info->freq_lo_jiffies = 0;
		return freq_next;
	}

	cpufreq_frequency_table_target(policy, dbs_info->freq_table, freq_next,
			relation, &index);
	freq_req = dbs_info->freq_table[index].frequency;
	freq_reduc = freq_req * od_tuners.powersave_bias / 1000;
	freq_avg = freq_req - freq_reduc;

	/* Find freq bounds for freq_avg in freq_table */
	index = 0;
	cpufreq_frequency_table_target(policy, dbs_info->freq_table, freq_avg,
			CPUFREQ_RELATION_H, &index);
	freq_lo = dbs_info->freq_table[index].frequency;
	index = 0;
	cpufreq_frequency_table_target(policy, dbs_info->freq_table, freq_avg,
			CPUFREQ_RELATION_L, &index);
	freq_hi = dbs_info->freq_table[index].frequency;

	/* Find out how long we have to be in hi and lo freqs */
	if (freq_hi == freq_lo) {
		dbs_info->freq_lo = 0;
		dbs_info->freq_lo_jiffies = 0;
		return freq_lo;
	}
	jiffies_total = usecs_to_jiffies(od_tuners.sampling_rate);
	jiffies_hi = (freq_avg - freq_lo) * jiffies_total;
	jiffies_hi += ((freq_hi - freq_lo) / 2);
	jiffies_hi /= (freq_hi - freq_lo);
	jiffies_lo = jiffies_total - jiffies_hi;
	dbs_info->freq_lo = freq_lo;
	dbs_info->freq_lo_jiffies = jiffies_lo;
	dbs_info->freq_hi_jiffies = jiffies_hi;
	return freq_hi;
}
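
/*
 * Illustrative example of the hi/lo split computed above, with hypothetical
 * numbers (not taken from any real frequency table): powersave_bias is
 * expressed in units of 0.1%, so powersave_bias = 100 means a 10% reduction.
 * If freq_req is 2000000 kHz, then freq_reduc is 200000 and freq_avg is
 * 1800000.  Assuming the table has adjacent entries at 1600000 and 2000000,
 * freq_lo = 1600000 and freq_hi = 2000000, and
 *	jiffies_hi ~= (1800000 - 1600000) * jiffies_total / (2000000 - 1600000)
 * i.e. about half the sampling period is spent at freq_hi and the rest at
 * freq_lo, so the average frequency approximates the requested 10% reduction.
 */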

static void ondemand_powersave_bias_init(void)
{
	int i;
	for_each_online_cpu(i) {
		ondemand_powersave_bias_init_cpu(i);
	}
}

static void dbs_freq_increase(struct cpufreq_policy *p, unsigned int freq)
{
	if (od_tuners.powersave_bias)
		freq = powersave_bias_target(p, freq, CPUFREQ_RELATION_H);
	else if (p->cur == p->max)
		return;

	__cpufreq_driver_target(p, freq, od_tuners.powersave_bias ?
			CPUFREQ_RELATION_L : CPUFREQ_RELATION_H);
}

/*
 * Every sampling_rate, we check, if current idle time is less than 20%
 * (default), then we try to increase frequency. Every sampling_rate, we look
 * for the lowest frequency which can sustain the load while keeping idle time
 * over 30%. If such a frequency exists, we try to decrease to this frequency.
 *
 * Any frequency increase takes it to the maximum frequency. Frequency
 * reduction happens at minimum steps of 5% (default) of current frequency.
 */
static void od_check_cpu(int cpu, unsigned int load_freq)
{
	struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
	struct cpufreq_policy *policy = dbs_info->cdbs.cur_policy;

	dbs_info->freq_lo = 0;

	/* Check for frequency increase */
	if (load_freq > od_tuners.up_threshold * policy->cur) {
		/* If switching to max speed, apply sampling_down_factor */
		if (policy->cur < policy->max)
			dbs_info->rate_mult =
				od_tuners.sampling_down_factor;
		dbs_freq_increase(policy, policy->max);
		return;
	}

	/* Check for frequency decrease */
	/* if we cannot reduce the frequency anymore, break out early */
	if (policy->cur == policy->min)
		return;

	/*
	 * The optimal frequency is the lowest frequency that can support the
	 * current CPU usage without triggering the up policy. To be safe, we
	 * focus 10 points under the threshold.
	 */
	if (load_freq < (od_tuners.up_threshold - od_tuners.down_differential) *
			policy->cur) {
		unsigned int freq_next;
		freq_next = load_freq / (od_tuners.up_threshold -
				od_tuners.down_differential);

		/* No longer fully busy, reset rate_mult */
		dbs_info->rate_mult = 1;

		if (freq_next < policy->min)
			freq_next = policy->min;

		if (!od_tuners.powersave_bias) {
			__cpufreq_driver_target(policy, freq_next,
					CPUFREQ_RELATION_L);
		} else {
			int freq = powersave_bias_target(policy, freq_next,
					CPUFREQ_RELATION_L);
			__cpufreq_driver_target(policy, freq,
					CPUFREQ_RELATION_L);
		}
	}
}
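
/*
 * Illustrative example of the scale-down path above, with hypothetical
 * numbers: load_freq is, roughly, the load percentage multiplied by the
 * running frequency (computed by dbs_check_cpu() in the common governor
 * code). With the defaults up_threshold = 80 and down_differential = 10,
 * and policy->cur = 2000000 kHz at 30% load, load_freq is about 60000000,
 * which is below (80 - 10) * 2000000 = 140000000, so we scale down to
 * freq_next = 60000000 / 70, roughly 857000 kHz, which
 * __cpufreq_driver_target() then rounds to a supported frequency using
 * CPUFREQ_RELATION_L.
 */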

static void od_dbs_timer(struct work_struct *work)
{
	struct od_cpu_dbs_info_s *dbs_info =
		container_of(work, struct od_cpu_dbs_info_s, cdbs.work.work);
	unsigned int cpu = dbs_info->cdbs.cpu;
	int delay, sample_type = dbs_info->sample_type;

	mutex_lock(&dbs_info->cdbs.timer_mutex);

	/* Common NORMAL_SAMPLE setup */
	dbs_info->sample_type = OD_NORMAL_SAMPLE;
	if (sample_type == OD_SUB_SAMPLE) {
		delay = dbs_info->freq_lo_jiffies;
		__cpufreq_driver_target(dbs_info->cdbs.cur_policy,
				dbs_info->freq_lo, CPUFREQ_RELATION_H);
	} else {
		dbs_check_cpu(&od_dbs_data, cpu);
		if (dbs_info->freq_lo) {
			/* Setup timer for SUB_SAMPLE */
			dbs_info->sample_type = OD_SUB_SAMPLE;
			delay = dbs_info->freq_hi_jiffies;
		} else {
			delay = delay_for_sampling_rate(od_tuners.sampling_rate
						* dbs_info->rate_mult);
		}
	}

	schedule_delayed_work_on(cpu, &dbs_info->cdbs.work, delay);
	mutex_unlock(&dbs_info->cdbs.timer_mutex);
}

/************************** sysfs interface ************************/

static ssize_t show_sampling_rate_min(struct kobject *kobj,
				      struct attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", od_dbs_data.min_sampling_rate);
}

/**
 * update_sampling_rate - update sampling rate effective immediately if needed.
 * @new_rate: new sampling rate
 *
 * If the new rate is smaller than the old, simply updating
 * od_tuners.sampling_rate might not be appropriate. For example, if the
 * original sampling_rate was 1 second and the requested new sampling rate is
 * 10 ms because the user needs an immediate reaction from the ondemand
 * governor, but is not sure whether a higher frequency will be required, then
 * the governor may change the sampling rate too late, up to 1 second later.
 * Thus, if we are reducing the sampling rate, we need to make the new value
 * effective immediately.
 */
static void update_sampling_rate(unsigned int new_rate)
{
	int cpu;

	od_tuners.sampling_rate = new_rate = max(new_rate,
			od_dbs_data.min_sampling_rate);

	for_each_online_cpu(cpu) {
		struct cpufreq_policy *policy;
		struct od_cpu_dbs_info_s *dbs_info;
		unsigned long next_sampling, appointed_at;

		policy = cpufreq_cpu_get(cpu);
		if (!policy)
			continue;
		if (policy->governor != &cpufreq_gov_ondemand) {
			cpufreq_cpu_put(policy);
			continue;
		}
		dbs_info = &per_cpu(od_cpu_dbs_info, policy->cpu);
		cpufreq_cpu_put(policy);

		mutex_lock(&dbs_info->cdbs.timer_mutex);

		if (!delayed_work_pending(&dbs_info->cdbs.work)) {
			mutex_unlock(&dbs_info->cdbs.timer_mutex);
			continue;
		}

		next_sampling = jiffies + usecs_to_jiffies(new_rate);
		appointed_at = dbs_info->cdbs.work.timer.expires;

		if (time_before(next_sampling, appointed_at)) {

			mutex_unlock(&dbs_info->cdbs.timer_mutex);
			cancel_delayed_work_sync(&dbs_info->cdbs.work);
			mutex_lock(&dbs_info->cdbs.timer_mutex);

			schedule_delayed_work_on(dbs_info->cdbs.cpu,
					&dbs_info->cdbs.work,
					usecs_to_jiffies(new_rate));

		}
		mutex_unlock(&dbs_info->cdbs.timer_mutex);
	}
}

static ssize_t store_sampling_rate(struct kobject *a, struct attribute *b,
				   const char *buf, size_t count)
{
	unsigned int input;
	int ret;
	ret = sscanf(buf, "%u", &input);
	if (ret != 1)
		return -EINVAL;
	update_sampling_rate(input);
	return count;
}

static ssize_t store_io_is_busy(struct kobject *a, struct attribute *b,
				const char *buf, size_t count)
{
	unsigned int input;
	int ret;

	ret = sscanf(buf, "%u", &input);
	if (ret != 1)
		return -EINVAL;
	od_tuners.io_is_busy = !!input;
	return count;
}

static ssize_t store_up_threshold(struct kobject *a, struct attribute *b,
				  const char *buf, size_t count)
{
	unsigned int input;
	int ret;
	ret = sscanf(buf, "%u", &input);

	if (ret != 1 || input > MAX_FREQUENCY_UP_THRESHOLD ||
			input < MIN_FREQUENCY_UP_THRESHOLD) {
		return -EINVAL;
	}
	od_tuners.up_threshold = input;
	return count;
}

static ssize_t store_sampling_down_factor(struct kobject *a,
			struct attribute *b, const char *buf, size_t count)
{
	unsigned int input, j;
	int ret;
	ret = sscanf(buf, "%u", &input);

	if (ret != 1 || input > MAX_SAMPLING_DOWN_FACTOR || input < 1)
		return -EINVAL;
	od_tuners.sampling_down_factor = input;

	/* Reset down sampling multiplier in case it was active */
	for_each_online_cpu(j) {
		struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info,
				j);
		dbs_info->rate_mult = 1;
	}
	return count;
}

static ssize_t store_ignore_nice_load(struct kobject *a, struct attribute *b,
				      const char *buf, size_t count)
{
	unsigned int input;
	int ret;

	unsigned int j;

	ret = sscanf(buf, "%u", &input);
	if (ret != 1)
		return -EINVAL;

	if (input > 1)
		input = 1;

	if (input == od_tuners.ignore_nice) { /* nothing to do */
		return count;
	}
	od_tuners.ignore_nice = input;

	/* we need to re-evaluate prev_cpu_idle */
	for_each_online_cpu(j) {
		struct od_cpu_dbs_info_s *dbs_info;
		dbs_info = &per_cpu(od_cpu_dbs_info, j);
		dbs_info->cdbs.prev_cpu_idle = get_cpu_idle_time(j,
						&dbs_info->cdbs.prev_cpu_wall);
		if (od_tuners.ignore_nice)
			dbs_info->cdbs.prev_cpu_nice =
				kcpustat_cpu(j).cpustat[CPUTIME_NICE];

	}
	return count;
}

static ssize_t store_powersave_bias(struct kobject *a, struct attribute *b,
				    const char *buf, size_t count)
{
	unsigned int input;
	int ret;
	ret = sscanf(buf, "%u", &input);

	if (ret != 1)
		return -EINVAL;

	if (input > 1000)
		input = 1000;

	od_tuners.powersave_bias = input;
	ondemand_powersave_bias_init();
	return count;
}

show_one(od, sampling_rate, sampling_rate);
show_one(od, io_is_busy, io_is_busy);
show_one(od, up_threshold, up_threshold);
show_one(od, sampling_down_factor, sampling_down_factor);
show_one(od, ignore_nice_load, ignore_nice);
show_one(od, powersave_bias, powersave_bias);

define_one_global_rw(sampling_rate);
define_one_global_rw(io_is_busy);
define_one_global_rw(up_threshold);
define_one_global_rw(sampling_down_factor);
define_one_global_rw(ignore_nice_load);
define_one_global_rw(powersave_bias);
define_one_global_ro(sampling_rate_min);

static struct attribute *dbs_attributes[] = {
	&sampling_rate_min.attr,
	&sampling_rate.attr,
	&up_threshold.attr,
	&sampling_down_factor.attr,
	&ignore_nice_load.attr,
	&powersave_bias.attr,
	&io_is_busy.attr,
	NULL
};

static struct attribute_group od_attr_group = {
	.attrs = dbs_attributes,
	.name = "ondemand",
};

/************************** sysfs end ************************/

define_get_cpu_dbs_routines(od_cpu_dbs_info);

static struct od_ops od_ops = {
	.io_busy = should_io_be_busy,
	.powersave_bias_init_cpu = ondemand_powersave_bias_init_cpu,
	.powersave_bias_target = powersave_bias_target,
	.freq_increase = dbs_freq_increase,
};

static struct dbs_data od_dbs_data = {
	.governor = GOV_ONDEMAND,
	.attr_group = &od_attr_group,
	.tuners = &od_tuners,
	.get_cpu_cdbs = get_cpu_cdbs,
	.get_cpu_dbs_info_s = get_cpu_dbs_info_s,
	.gov_dbs_timer = od_dbs_timer,
	.gov_check_cpu = od_check_cpu,
	.gov_ops = &od_ops,
};

static int od_cpufreq_governor_dbs(struct cpufreq_policy *policy,
		unsigned int event)
{
	return cpufreq_governor_dbs(&od_dbs_data, policy, event);
}

#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND
static
#endif
struct cpufreq_governor cpufreq_gov_ondemand = {
	.name			= "ondemand",
	.governor		= od_cpufreq_governor_dbs,
	.max_transition_latency	= TRANSITION_LATENCY_LIMIT,
	.owner			= THIS_MODULE,
};

static int __init cpufreq_gov_dbs_init(void)
{
	u64 idle_time;
	int cpu = get_cpu();

	mutex_init(&od_dbs_data.mutex);
	idle_time = get_cpu_idle_time_us(cpu, NULL);
	put_cpu();
	if (idle_time != -1ULL) {
		/* Idle micro accounting is supported. Use finer thresholds */
		od_tuners.up_threshold = MICRO_FREQUENCY_UP_THRESHOLD;
		od_tuners.down_differential = MICRO_FREQUENCY_DOWN_DIFFERENTIAL;
		/*
		 * In nohz/micro accounting case we set the minimum frequency
		 * not depending on HZ, but fixed (very low). The deferred
		 * timer might skip some samples if idle/sleeping as needed.
		 */
		od_dbs_data.min_sampling_rate = MICRO_FREQUENCY_MIN_SAMPLE_RATE;
	} else {
		/* For correct statistics, we need 10 ticks for each measure */
		od_dbs_data.min_sampling_rate = MIN_SAMPLING_RATE_RATIO *
			jiffies_to_usecs(10);
	}

	return cpufreq_register_governor(&cpufreq_gov_ondemand);
}

static void __exit cpufreq_gov_dbs_exit(void)
{
	cpufreq_unregister_governor(&cpufreq_gov_ondemand);
}

MODULE_AUTHOR("Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>");
MODULE_AUTHOR("Alexey Starikovskiy <alexey.y.starikovskiy@intel.com>");
MODULE_DESCRIPTION("'cpufreq_ondemand' - A dynamic cpufreq governor for "
	"Low Latency Frequency Transition capable processors");
MODULE_LICENSE("GPL");

#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND
fs_initcall(cpufreq_gov_dbs_init);
#else
module_init(cpufreq_gov_dbs_init);
#endif
module_exit(cpufreq_gov_dbs_exit);
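
/*
 * Typical use from userspace, assuming the usual sysfs layout for global
 * governor tunables (exact paths may differ by kernel configuration):
 *
 *	# echo ondemand > /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
 *	# echo 95 > /sys/devices/system/cpu/cpufreq/ondemand/up_threshold
 *	# echo 20000 > /sys/devices/system/cpu/cpufreq/ondemand/sampling_rate
 *
 * sampling_rate is in microseconds and is clamped by update_sampling_rate()
 * to the min_sampling_rate computed in cpufreq_gov_dbs_init() above.
 */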