/*
 * drivers/cpufreq/cpufreq_ondemand.c
 *
 * Copyright (C) 2001 Russell King
 * (C) 2003 Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>.
 * Jun Nakajima <jun.nakajima@intel.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/cpufreq.h>
#include <linux/cpu.h>
#include <linux/jiffies.h>
#include <linux/kernel_stat.h>
#include <linux/mutex.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/ktime.h>
#include <linux/sched.h>

/*
 * dbs is used in this file as a short form for demand-based switching.
 * It helps to keep variable names smaller and simpler.
 */

#define DEF_FREQUENCY_DOWN_DIFFERENTIAL		(10)
#define DEF_FREQUENCY_UP_THRESHOLD		(80)
#define DEF_SAMPLING_DOWN_FACTOR		(1)
#define MAX_SAMPLING_DOWN_FACTOR		(100000)
#define MICRO_FREQUENCY_DOWN_DIFFERENTIAL	(3)
#define MICRO_FREQUENCY_UP_THRESHOLD		(95)
#define MICRO_FREQUENCY_MIN_SAMPLE_RATE		(10000)
#define MIN_FREQUENCY_UP_THRESHOLD		(11)
#define MAX_FREQUENCY_UP_THRESHOLD		(100)

/*
 * The polling frequency of this governor depends on the capability of
 * the processor. Default polling frequency is 1000 times the transition
 * latency of the processor. The governor will work on any processor with
 * transition latency <= 10 ms, using an appropriate sampling rate.
 * For CPUs with transition latency > 10 ms (mostly drivers with
 * CPUFREQ_ETERNAL) this governor will not work.
 * All times here are in us.
 */
#define MIN_SAMPLING_RATE_RATIO			(2)

static unsigned int min_sampling_rate;

#define LATENCY_MULTIPLIER			(1000)
#define MIN_LATENCY_MULTIPLIER			(100)
#define TRANSITION_LATENCY_LIMIT		(10 * 1000 * 1000)

static void do_dbs_timer(struct work_struct *work);
static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
				unsigned int event);

#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND
static
#endif
struct cpufreq_governor cpufreq_gov_ondemand = {
	.name			= "ondemand",
	.governor		= cpufreq_governor_dbs,
	.max_transition_latency	= TRANSITION_LATENCY_LIMIT,
	.owner			= THIS_MODULE,
};

/* Sampling types */
enum {DBS_NORMAL_SAMPLE, DBS_SUB_SAMPLE};

struct cpu_dbs_info_s {
	cputime64_t prev_cpu_idle;
	cputime64_t prev_cpu_iowait;
	cputime64_t prev_cpu_wall;
	cputime64_t prev_cpu_nice;
	struct cpufreq_policy *cur_policy;
	struct delayed_work work;
	struct cpufreq_frequency_table *freq_table;
	unsigned int freq_lo;
	unsigned int freq_lo_jiffies;
	unsigned int freq_hi_jiffies;
	unsigned int rate_mult;
	int cpu;
	unsigned int sample_type:1;
	/*
	 * percpu mutex that serializes governor limit change with
	 * do_dbs_timer invocation. We do not want do_dbs_timer to run
	 * when user is changing the governor or limits.
	 */
	struct mutex timer_mutex;
};
static DEFINE_PER_CPU(struct cpu_dbs_info_s, od_cpu_dbs_info);

static unsigned int dbs_enable;	/* number of CPUs using this policy */

/*
 * dbs_mutex protects dbs_enable in governor start/stop.
 */
static DEFINE_MUTEX(dbs_mutex);
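/*
 * The tunables in dbs_tuners below are exported through the sysfs code
 * further down.  Assuming sysfs is mounted at /sys, they appear under
 * /sys/devices/system/cpu/cpufreq/ondemand/, so for example
 *	echo 95 > /sys/devices/system/cpu/cpufreq/ondemand/up_threshold
 * raises the busy percentage at which the governor jumps to the maximum
 * frequency.
 */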
static struct dbs_tuners {
	unsigned int sampling_rate;
	unsigned int up_threshold;
	unsigned int down_differential;
	unsigned int ignore_nice;
	unsigned int sampling_down_factor;
	unsigned int powersave_bias;
	unsigned int io_is_busy;
} dbs_tuners_ins = {
	.up_threshold = DEF_FREQUENCY_UP_THRESHOLD,
	.sampling_down_factor = DEF_SAMPLING_DOWN_FACTOR,
	.down_differential = DEF_FREQUENCY_DOWN_DIFFERENTIAL,
	.ignore_nice = 0,
	.powersave_bias = 0,
};

static inline u64 get_cpu_idle_time_jiffy(unsigned int cpu, u64 *wall)
{
	u64 idle_time;
	u64 cur_wall_time;
	u64 busy_time;

	cur_wall_time = jiffies64_to_cputime64(get_jiffies_64());

	busy_time = kcpustat_cpu(cpu).cpustat[CPUTIME_USER];
	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SYSTEM];
	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_IRQ];
	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SOFTIRQ];
	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL];
	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_NICE];

	idle_time = cur_wall_time - busy_time;
	if (wall)
		*wall = jiffies_to_usecs(cur_wall_time);

	return jiffies_to_usecs(idle_time);
}

static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall)
{
	u64 idle_time = get_cpu_idle_time_us(cpu, NULL);

	if (idle_time == -1ULL)
		return get_cpu_idle_time_jiffy(cpu, wall);
	else
		idle_time += get_cpu_iowait_time_us(cpu, wall);

	return idle_time;
}

static inline cputime64_t get_cpu_iowait_time(unsigned int cpu, cputime64_t *wall)
{
	u64 iowait_time = get_cpu_iowait_time_us(cpu, wall);

	if (iowait_time == -1ULL)
		return 0;

	return iowait_time;
}

/*
 * Find right freq to be set now with powersave_bias on.
 * Returns the freq_hi to be used right now and will set freq_hi_jiffies,
 * freq_lo, and freq_lo_jiffies in percpu area for averaging freqs.
 */
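/*
 * Worked example (illustrative numbers only, not from any particular CPU):
 * with powersave_bias = 100 (i.e. 10%) and a requested target of 2000 MHz,
 * freq_reduc = 200 MHz and freq_avg = 1800 MHz.  If the frequency table
 * only offers 1600 MHz and 2000 MHz, the CPU alternates between the two,
 * spending jiffies_hi = (1800 - 1600) * jiffies_total / (2000 - 1600),
 * i.e. half of each sampling interval, at 2000 MHz so that the average
 * frequency comes out near 1800 MHz.
 */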
static unsigned int powersave_bias_target(struct cpufreq_policy *policy,
					  unsigned int freq_next,
					  unsigned int relation)
{
	unsigned int freq_req, freq_reduc, freq_avg;
	unsigned int freq_hi, freq_lo;
	unsigned int index = 0;
	unsigned int jiffies_total, jiffies_hi, jiffies_lo;
	struct cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info,
						   policy->cpu);

	if (!dbs_info->freq_table) {
		dbs_info->freq_lo = 0;
		dbs_info->freq_lo_jiffies = 0;
		return freq_next;
	}

	cpufreq_frequency_table_target(policy, dbs_info->freq_table, freq_next,
			relation, &index);
	freq_req = dbs_info->freq_table[index].frequency;
	freq_reduc = freq_req * dbs_tuners_ins.powersave_bias / 1000;
	freq_avg = freq_req - freq_reduc;

	/* Find freq bounds for freq_avg in freq_table */
	index = 0;
	cpufreq_frequency_table_target(policy, dbs_info->freq_table, freq_avg,
			CPUFREQ_RELATION_H, &index);
	freq_lo = dbs_info->freq_table[index].frequency;
	index = 0;
	cpufreq_frequency_table_target(policy, dbs_info->freq_table, freq_avg,
			CPUFREQ_RELATION_L, &index);
	freq_hi = dbs_info->freq_table[index].frequency;

	/* Find out how long we have to be in hi and lo freqs */
	if (freq_hi == freq_lo) {
		dbs_info->freq_lo = 0;
		dbs_info->freq_lo_jiffies = 0;
		return freq_lo;
	}
	jiffies_total = usecs_to_jiffies(dbs_tuners_ins.sampling_rate);
	jiffies_hi = (freq_avg - freq_lo) * jiffies_total;
	jiffies_hi += ((freq_hi - freq_lo) / 2);
	jiffies_hi /= (freq_hi - freq_lo);
	jiffies_lo = jiffies_total - jiffies_hi;
	dbs_info->freq_lo = freq_lo;
	dbs_info->freq_lo_jiffies = jiffies_lo;
	dbs_info->freq_hi_jiffies = jiffies_hi;
	return freq_hi;
}

static void ondemand_powersave_bias_init_cpu(int cpu)
{
	struct cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
	dbs_info->freq_table = cpufreq_frequency_get_table(cpu);
	dbs_info->freq_lo = 0;
}

static void ondemand_powersave_bias_init(void)
{
	int i;
	for_each_online_cpu(i) {
		ondemand_powersave_bias_init_cpu(i);
	}
}

/************************** sysfs interface ************************/

static ssize_t show_sampling_rate_min(struct kobject *kobj,
				      struct attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", min_sampling_rate);
}

define_one_global_ro(sampling_rate_min);

/* cpufreq_ondemand Governor Tunables */
#define show_one(file_name, object)					\
static ssize_t show_##file_name						\
(struct kobject *kobj, struct attribute *attr, char *buf)		\
{									\
	return sprintf(buf, "%u\n", dbs_tuners_ins.object);		\
}
show_one(sampling_rate, sampling_rate);
show_one(io_is_busy, io_is_busy);
show_one(up_threshold, up_threshold);
show_one(sampling_down_factor, sampling_down_factor);
show_one(ignore_nice_load, ignore_nice);
show_one(powersave_bias, powersave_bias);
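/*
 * For reference, show_one(up_threshold, up_threshold) above expands to a
 * show_up_threshold() sysfs callback that prints dbs_tuners_ins.up_threshold;
 * this is what a read of the up_threshold attribute returns.  The matching
 * store_*() handlers below parse and validate the written values.
 */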
/**
 * update_sampling_rate - update sampling rate effective immediately if needed.
 * @new_rate: new sampling rate
 *
 * If the new rate is smaller than the old, simply updating
 * dbs_tuners_ins.sampling_rate might not be appropriate. For example, if the
 * original sampling_rate was 1 second and the requested new sampling rate is
 * 10 ms because the user wants an immediate reaction from the ondemand
 * governor, the governor could otherwise change the sampling rate too late,
 * up to 1 second later. Thus, if we are reducing the sampling rate, we need
 * to make the new value effective immediately.
 */
static void update_sampling_rate(unsigned int new_rate)
{
	int cpu;

	dbs_tuners_ins.sampling_rate = new_rate
				     = max(new_rate, min_sampling_rate);

	for_each_online_cpu(cpu) {
		struct cpufreq_policy *policy;
		struct cpu_dbs_info_s *dbs_info;
		unsigned long next_sampling, appointed_at;

		policy = cpufreq_cpu_get(cpu);
		if (!policy)
			continue;
		dbs_info = &per_cpu(od_cpu_dbs_info, policy->cpu);
		cpufreq_cpu_put(policy);

		mutex_lock(&dbs_info->timer_mutex);

		if (!delayed_work_pending(&dbs_info->work)) {
			mutex_unlock(&dbs_info->timer_mutex);
			continue;
		}

		next_sampling = jiffies + usecs_to_jiffies(new_rate);
		appointed_at = dbs_info->work.timer.expires;

		if (time_before(next_sampling, appointed_at)) {

			mutex_unlock(&dbs_info->timer_mutex);
			cancel_delayed_work_sync(&dbs_info->work);
			mutex_lock(&dbs_info->timer_mutex);

			schedule_delayed_work_on(dbs_info->cpu, &dbs_info->work,
						 usecs_to_jiffies(new_rate));

		}
		mutex_unlock(&dbs_info->timer_mutex);
	}
}

static ssize_t store_sampling_rate(struct kobject *a, struct attribute *b,
				   const char *buf, size_t count)
{
	unsigned int input;
	int ret;
	ret = sscanf(buf, "%u", &input);
	if (ret != 1)
		return -EINVAL;
	update_sampling_rate(input);
	return count;
}

static ssize_t store_io_is_busy(struct kobject *a, struct attribute *b,
				const char *buf, size_t count)
{
	unsigned int input;
	int ret;

	ret = sscanf(buf, "%u", &input);
	if (ret != 1)
		return -EINVAL;
	dbs_tuners_ins.io_is_busy = !!input;
	return count;
}

static ssize_t store_up_threshold(struct kobject *a, struct attribute *b,
				  const char *buf, size_t count)
{
	unsigned int input;
	int ret;
	ret = sscanf(buf, "%u", &input);

	if (ret != 1 || input > MAX_FREQUENCY_UP_THRESHOLD ||
			input < MIN_FREQUENCY_UP_THRESHOLD) {
		return -EINVAL;
	}
	dbs_tuners_ins.up_threshold = input;
	return count;
}

static ssize_t store_sampling_down_factor(struct kobject *a,
			struct attribute *b, const char *buf, size_t count)
{
	unsigned int input, j;
	int ret;
	ret = sscanf(buf, "%u", &input);

	if (ret != 1 || input > MAX_SAMPLING_DOWN_FACTOR || input < 1)
		return -EINVAL;
	dbs_tuners_ins.sampling_down_factor = input;

	/* Reset down sampling multiplier in case it was active */
	for_each_online_cpu(j) {
		struct cpu_dbs_info_s *dbs_info;
		dbs_info = &per_cpu(od_cpu_dbs_info, j);
		dbs_info->rate_mult = 1;
	}
	return count;
}
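/*
 * Effect of sampling_down_factor, with hypothetical numbers: if
 * sampling_rate is 10000 us and sampling_down_factor is written as 10,
 * then once the governor has switched to policy->max it re-evaluates the
 * load only every 100 ms (do_dbs_timer() multiplies the delay by
 * rate_mult), rather than every 10 ms, so a busy CPU is not re-checked
 * for a downward step on every sample.
 */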
static ssize_t store_ignore_nice_load(struct kobject *a, struct attribute *b,
				      const char *buf, size_t count)
{
	unsigned int input;
	int ret;
	unsigned int j;

	ret = sscanf(buf, "%u", &input);
	if (ret != 1)
		return -EINVAL;

	if (input > 1)
		input = 1;

	if (input == dbs_tuners_ins.ignore_nice) { /* nothing to do */
		return count;
	}
	dbs_tuners_ins.ignore_nice = input;

	/* we need to re-evaluate prev_cpu_idle */
	for_each_online_cpu(j) {
		struct cpu_dbs_info_s *dbs_info;
		dbs_info = &per_cpu(od_cpu_dbs_info, j);
		dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
						&dbs_info->prev_cpu_wall);
		if (dbs_tuners_ins.ignore_nice)
			dbs_info->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
	}
	return count;
}

static ssize_t store_powersave_bias(struct kobject *a, struct attribute *b,
				    const char *buf, size_t count)
{
	unsigned int input;
	int ret;
	ret = sscanf(buf, "%u", &input);

	if (ret != 1)
		return -EINVAL;

	if (input > 1000)
		input = 1000;

	dbs_tuners_ins.powersave_bias = input;
	ondemand_powersave_bias_init();
	return count;
}

define_one_global_rw(sampling_rate);
define_one_global_rw(io_is_busy);
define_one_global_rw(up_threshold);
define_one_global_rw(sampling_down_factor);
define_one_global_rw(ignore_nice_load);
define_one_global_rw(powersave_bias);

static struct attribute *dbs_attributes[] = {
	&sampling_rate_min.attr,
	&sampling_rate.attr,
	&up_threshold.attr,
	&sampling_down_factor.attr,
	&ignore_nice_load.attr,
	&powersave_bias.attr,
	&io_is_busy.attr,
	NULL
};

static struct attribute_group dbs_attr_group = {
	.attrs = dbs_attributes,
	.name = "ondemand",
};

/************************** sysfs end ************************/

static void dbs_freq_increase(struct cpufreq_policy *p, unsigned int freq)
{
	if (dbs_tuners_ins.powersave_bias)
		freq = powersave_bias_target(p, freq, CPUFREQ_RELATION_H);
	else if (p->cur == p->max)
		return;

	__cpufreq_driver_target(p, freq, dbs_tuners_ins.powersave_bias ?
			CPUFREQ_RELATION_L : CPUFREQ_RELATION_H);
}
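/*
 * Note on dbs_freq_increase() above: without powersave_bias it simply
 * requests the given frequency (and returns early if the policy is already
 * at policy->max); with powersave_bias it asks powersave_bias_target() for
 * a freq_hi/freq_lo pair whose time-weighted average is reduced by the
 * bias, and lets do_dbs_timer() alternate between the two.
 */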
static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
{
	unsigned int max_load_freq;

	struct cpufreq_policy *policy;
	unsigned int j;

	this_dbs_info->freq_lo = 0;
	policy = this_dbs_info->cur_policy;

	/*
	 * Every sampling_rate we check whether the current idle time is less
	 * than 20% (default); if it is, we try to increase the frequency.
	 * Every sampling_rate we also look for the lowest frequency which can
	 * sustain the load while keeping idle time over 30%. If such a
	 * frequency exists, we try to decrease to it.
	 *
	 * Any frequency increase takes it to the maximum frequency.
	 * Frequency reduction happens at minimum steps of 5% (default)
	 * of current frequency.
	 */

	/* Get Absolute Load - in terms of freq */
	max_load_freq = 0;

	for_each_cpu(j, policy->cpus) {
		struct cpu_dbs_info_s *j_dbs_info;
		cputime64_t cur_wall_time, cur_idle_time, cur_iowait_time;
		unsigned int idle_time, wall_time, iowait_time;
		unsigned int load, load_freq;
		int freq_avg;

		j_dbs_info = &per_cpu(od_cpu_dbs_info, j);

		cur_idle_time = get_cpu_idle_time(j, &cur_wall_time);
		cur_iowait_time = get_cpu_iowait_time(j, &cur_wall_time);

		wall_time = (unsigned int)
			(cur_wall_time - j_dbs_info->prev_cpu_wall);
		j_dbs_info->prev_cpu_wall = cur_wall_time;

		idle_time = (unsigned int)
			(cur_idle_time - j_dbs_info->prev_cpu_idle);
		j_dbs_info->prev_cpu_idle = cur_idle_time;

		iowait_time = (unsigned int)
			(cur_iowait_time - j_dbs_info->prev_cpu_iowait);
		j_dbs_info->prev_cpu_iowait = cur_iowait_time;

		if (dbs_tuners_ins.ignore_nice) {
			u64 cur_nice;
			unsigned long cur_nice_jiffies;

			cur_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE] -
					 j_dbs_info->prev_cpu_nice;
			/*
			 * Assumption: nice time between sampling periods will
			 * be less than 2^32 jiffies for 32 bit sys
			 */
			cur_nice_jiffies = (unsigned long)
					cputime64_to_jiffies64(cur_nice);

			j_dbs_info->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
			idle_time += jiffies_to_usecs(cur_nice_jiffies);
		}

		/*
		 * For the purpose of ondemand, waiting for disk IO is an
		 * indication that you're performance critical, and not that
		 * the system is actually idle. So subtract the iowait time
		 * from the cpu idle time.
		 */

		if (dbs_tuners_ins.io_is_busy && idle_time >= iowait_time)
			idle_time -= iowait_time;

		if (unlikely(!wall_time || wall_time < idle_time))
			continue;

		load = 100 * (wall_time - idle_time) / wall_time;

		freq_avg = __cpufreq_driver_getavg(policy, j);
		if (freq_avg <= 0)
			freq_avg = policy->cur;

		load_freq = load * freq_avg;
		if (load_freq > max_load_freq)
			max_load_freq = load_freq;
	}

	/* Check for frequency increase */
	if (max_load_freq > dbs_tuners_ins.up_threshold * policy->cur) {
		/* If switching to max speed, apply sampling_down_factor */
		if (policy->cur < policy->max)
			this_dbs_info->rate_mult =
				dbs_tuners_ins.sampling_down_factor;
		dbs_freq_increase(policy, policy->max);
		return;
	}
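	/*
	 * Numerical illustration (hypothetical values): with up_threshold =
	 * 80, policy->cur = 1000 MHz and a measured load of 90% at freq_avg =
	 * 1000 MHz, max_load_freq = 90 * 1000 = 90000 exceeds 80 * 1000 =
	 * 80000, so the branch above jumps straight to policy->max.
	 */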

	/* Check for frequency decrease */
	/* if we cannot reduce the frequency anymore, break out early */
	if (policy->cur == policy->min)
		return;

	/*
	 * The optimal frequency is the lowest frequency that can support the
	 * current CPU usage without triggering the up policy. To be safe, we
	 * stay 10 points under the threshold.
	 */
	if (max_load_freq <
	    (dbs_tuners_ins.up_threshold - dbs_tuners_ins.down_differential) *
	     policy->cur) {
		unsigned int freq_next;
		freq_next = max_load_freq /
				(dbs_tuners_ins.up_threshold -
				 dbs_tuners_ins.down_differential);

		/* No longer fully busy, reset rate_mult */
		this_dbs_info->rate_mult = 1;

		if (freq_next < policy->min)
			freq_next = policy->min;

		if (!dbs_tuners_ins.powersave_bias) {
			__cpufreq_driver_target(policy, freq_next,
					CPUFREQ_RELATION_L);
		} else {
			int freq = powersave_bias_target(policy, freq_next,
					CPUFREQ_RELATION_L);
			__cpufreq_driver_target(policy, freq,
					CPUFREQ_RELATION_L);
		}
	}
}

static void do_dbs_timer(struct work_struct *work)
{
	struct cpu_dbs_info_s *dbs_info =
		container_of(work, struct cpu_dbs_info_s, work.work);
	unsigned int cpu = dbs_info->cpu;
	int sample_type = dbs_info->sample_type;

	int delay;

	mutex_lock(&dbs_info->timer_mutex);

	/* Common NORMAL_SAMPLE setup */
	dbs_info->sample_type = DBS_NORMAL_SAMPLE;
	if (!dbs_tuners_ins.powersave_bias ||
	    sample_type == DBS_NORMAL_SAMPLE) {
		dbs_check_cpu(dbs_info);
		if (dbs_info->freq_lo) {
			/* Setup timer for SUB_SAMPLE */
			dbs_info->sample_type = DBS_SUB_SAMPLE;
			delay = dbs_info->freq_hi_jiffies;
		} else {
			/* We want all CPUs to do sampling nearly on
			 * same jiffy
			 */
			delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate
				* dbs_info->rate_mult);

			if (num_online_cpus() > 1)
				delay -= jiffies % delay;
		}
	} else {
		__cpufreq_driver_target(dbs_info->cur_policy,
			dbs_info->freq_lo, CPUFREQ_RELATION_H);
		delay = dbs_info->freq_lo_jiffies;
	}
	schedule_delayed_work_on(cpu, &dbs_info->work, delay);
	mutex_unlock(&dbs_info->timer_mutex);
}

static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info)
{
	/* We want all CPUs to do sampling nearly on same jiffy */
	int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate);

	if (num_online_cpus() > 1)
		delay -= jiffies % delay;

	dbs_info->sample_type = DBS_NORMAL_SAMPLE;
	INIT_DELAYED_WORK_DEFERRABLE(&dbs_info->work, do_dbs_timer);
	schedule_delayed_work_on(dbs_info->cpu, &dbs_info->work, delay);
}

static inline void dbs_timer_exit(struct cpu_dbs_info_s *dbs_info)
{
	cancel_delayed_work_sync(&dbs_info->work);
}
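/*
 * Timer overview for the functions above: do_dbs_timer() normally runs a
 * DBS_NORMAL_SAMPLE every sampling_rate * rate_mult microseconds.  When
 * powersave_bias is active and dbs_check_cpu() has picked a freq_lo, the
 * next run is a DBS_SUB_SAMPLE that drops to freq_lo for freq_lo_jiffies,
 * so the time-averaged frequency lands between two table entries.  The
 * work item is deferrable, so an otherwise idle CPU is not woken up just
 * to sample itself.
 */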
/*
 * Not all CPUs want IO time to be accounted as busy; this depends on how
 * efficient idling at a higher frequency/voltage is.
 * Pavel Machek says this is not so for various generations of AMD and old
 * Intel systems.
 * Mike Chan (android.com) claims this is also not true for ARM.
 * Because of this, whitelist specific known CPU series by default, and
 * leave all others up to the user.
 */
static int should_io_be_busy(void)
{
#if defined(CONFIG_X86)
	/*
	 * For Intel, Core 2 (model 15) and later have an efficient idle.
	 */
	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
	    boot_cpu_data.x86 == 6 &&
	    boot_cpu_data.x86_model >= 15)
		return 1;
#endif
	return 0;
}

static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
				unsigned int event)
{
	unsigned int cpu = policy->cpu;
	struct cpu_dbs_info_s *this_dbs_info;
	unsigned int j;
	int rc;

	this_dbs_info = &per_cpu(od_cpu_dbs_info, cpu);

	switch (event) {
	case CPUFREQ_GOV_START:
		if ((!cpu_online(cpu)) || (!policy->cur))
			return -EINVAL;

		mutex_lock(&dbs_mutex);

		dbs_enable++;
		for_each_cpu(j, policy->cpus) {
			struct cpu_dbs_info_s *j_dbs_info;
			j_dbs_info = &per_cpu(od_cpu_dbs_info, j);
			j_dbs_info->cur_policy = policy;

			j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
						&j_dbs_info->prev_cpu_wall);
			if (dbs_tuners_ins.ignore_nice)
				j_dbs_info->prev_cpu_nice =
						kcpustat_cpu(j).cpustat[CPUTIME_NICE];
		}
		this_dbs_info->cpu = cpu;
		this_dbs_info->rate_mult = 1;
		ondemand_powersave_bias_init_cpu(cpu);
		/*
		 * Start the timer schedule work when this governor
		 * is used for the first time.
		 */
		if (dbs_enable == 1) {
			unsigned int latency;

			rc = sysfs_create_group(cpufreq_global_kobject,
						&dbs_attr_group);
			if (rc) {
				mutex_unlock(&dbs_mutex);
				return rc;
			}

			/* policy latency is in ns. Convert it to us first */
			latency = policy->cpuinfo.transition_latency / 1000;
			if (latency == 0)
				latency = 1;
			/* Bring kernel and HW constraints together */
			min_sampling_rate = max(min_sampling_rate,
					MIN_LATENCY_MULTIPLIER * latency);
			dbs_tuners_ins.sampling_rate =
				max(min_sampling_rate,
				    latency * LATENCY_MULTIPLIER);
			dbs_tuners_ins.io_is_busy = should_io_be_busy();
		}
		mutex_unlock(&dbs_mutex);

		mutex_init(&this_dbs_info->timer_mutex);
		dbs_timer_init(this_dbs_info);
		break;

	case CPUFREQ_GOV_STOP:
		dbs_timer_exit(this_dbs_info);

		mutex_lock(&dbs_mutex);
		mutex_destroy(&this_dbs_info->timer_mutex);
		dbs_enable--;
		mutex_unlock(&dbs_mutex);
		if (!dbs_enable)
			sysfs_remove_group(cpufreq_global_kobject,
					   &dbs_attr_group);

		break;

	case CPUFREQ_GOV_LIMITS:
		mutex_lock(&this_dbs_info->timer_mutex);
		if (policy->max < this_dbs_info->cur_policy->cur)
			__cpufreq_driver_target(this_dbs_info->cur_policy,
				policy->max, CPUFREQ_RELATION_H);
		else if (policy->min > this_dbs_info->cur_policy->cur)
			__cpufreq_driver_target(this_dbs_info->cur_policy,
				policy->min, CPUFREQ_RELATION_L);
		mutex_unlock(&this_dbs_info->timer_mutex);
		break;
	}
	return 0;
}
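/*
 * Example of the defaults chosen in CPUFREQ_GOV_START above, for a
 * hypothetical driver reporting a 10000 ns transition latency: latency
 * becomes 10 us, so the default sampling_rate is max(min_sampling_rate,
 * 10 * LATENCY_MULTIPLIER) = 10000 us (10 ms), and min_sampling_rate is
 * raised to at least 10 * MIN_LATENCY_MULTIPLIER = 1000 us.
 */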
static int __init cpufreq_gov_dbs_init(void)
{
	u64 idle_time;
	int cpu = get_cpu();

	idle_time = get_cpu_idle_time_us(cpu, NULL);
	put_cpu();
	if (idle_time != -1ULL) {
		/* Idle micro accounting is supported. Use finer thresholds */
		dbs_tuners_ins.up_threshold = MICRO_FREQUENCY_UP_THRESHOLD;
		dbs_tuners_ins.down_differential =
					MICRO_FREQUENCY_DOWN_DIFFERENTIAL;
		/*
		 * In the nohz/micro-accounting case we set the minimum
		 * sampling rate to a fixed (very low) value, not depending
		 * on HZ. The deferrable timer might skip some samples if
		 * the CPU is idle/sleeping, as intended.
		 */
		min_sampling_rate = MICRO_FREQUENCY_MIN_SAMPLE_RATE;
	} else {
		/* For correct statistics, we need 10 ticks for each measure */
		min_sampling_rate =
			MIN_SAMPLING_RATE_RATIO * jiffies_to_usecs(10);
	}

	return cpufreq_register_governor(&cpufreq_gov_ondemand);
}

static void __exit cpufreq_gov_dbs_exit(void)
{
	cpufreq_unregister_governor(&cpufreq_gov_ondemand);
}


MODULE_AUTHOR("Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>");
MODULE_AUTHOR("Alexey Starikovskiy <alexey.y.starikovskiy@intel.com>");
MODULE_DESCRIPTION("'cpufreq_ondemand' - A dynamic cpufreq governor for "
	"Low Latency Frequency Transition capable processors");
MODULE_LICENSE("GPL");

#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND
fs_initcall(cpufreq_gov_dbs_init);
#else
module_init(cpufreq_gov_dbs_init);
#endif
module_exit(cpufreq_gov_dbs_exit);