/*
 *  drivers/cpufreq/cpufreq_ondemand.c
 *
 *  Copyright (C)  2001 Russell King
 *            (C)  2003 Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>.
 *                      Jun Nakajima <jun.nakajima@intel.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/cpufreq.h>
#include <linux/cpu.h>
#include <linux/jiffies.h>
#include <linux/kernel_stat.h>
#include <linux/mutex.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/ktime.h>
#include <linux/sched.h>

/*
 * dbs is used in this file as a shorthand for "demand based switching".
 * It helps to keep variable names short and simple.
 */

#define DEF_FREQUENCY_DOWN_DIFFERENTIAL         (10)
#define DEF_FREQUENCY_UP_THRESHOLD              (80)
#define DEF_SAMPLING_DOWN_FACTOR                (1)
#define MAX_SAMPLING_DOWN_FACTOR                (100000)
#define MICRO_FREQUENCY_DOWN_DIFFERENTIAL       (3)
#define MICRO_FREQUENCY_UP_THRESHOLD            (95)
#define MICRO_FREQUENCY_MIN_SAMPLE_RATE         (10000)
#define MIN_FREQUENCY_UP_THRESHOLD              (11)
#define MAX_FREQUENCY_UP_THRESHOLD              (100)

/*
 * The polling frequency of this governor depends on the capability of
 * the processor. The default polling interval is 1000 times the transition
 * latency of the processor. The governor will work on any processor with
 * transition latency <= 10 ms, using an appropriate sampling rate.
 * For CPUs with transition latency > 10 ms (mostly drivers with
 * CPUFREQ_ETERNAL) this governor will not work.
 * All times here are in us (microseconds).
 */
#define MIN_SAMPLING_RATE_RATIO                 (2)

static unsigned int min_sampling_rate;

#define LATENCY_MULTIPLIER                      (1000)
#define MIN_LATENCY_MULTIPLIER                  (100)
#define TRANSITION_LATENCY_LIMIT                (10 * 1000 * 1000)

static void do_dbs_timer(struct work_struct *work);
static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
                                unsigned int event);

#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND
static
#endif
struct cpufreq_governor cpufreq_gov_ondemand = {
        .name                   = "ondemand",
        .governor               = cpufreq_governor_dbs,
        .max_transition_latency = TRANSITION_LATENCY_LIMIT,
        .owner                  = THIS_MODULE,
};

/* Sampling types */
enum {DBS_NORMAL_SAMPLE, DBS_SUB_SAMPLE};

struct cpu_dbs_info_s {
        cputime64_t prev_cpu_idle;
        cputime64_t prev_cpu_iowait;
        cputime64_t prev_cpu_wall;
        cputime64_t prev_cpu_nice;
        struct cpufreq_policy *cur_policy;
        struct delayed_work work;
        struct cpufreq_frequency_table *freq_table;
        unsigned int freq_lo;
        unsigned int freq_lo_jiffies;
        unsigned int freq_hi_jiffies;
        unsigned int rate_mult;
        int cpu;
        unsigned int sample_type:1;
        /*
         * percpu mutex that serializes governor limit change with
         * do_dbs_timer invocation. We do not want do_dbs_timer to run
         * when user is changing the governor or limits.
         */
        struct mutex timer_mutex;
};
static DEFINE_PER_CPU(struct cpu_dbs_info_s, od_cpu_dbs_info);

static unsigned int dbs_enable; /* number of CPUs using this policy */

/*
 * dbs_mutex protects data in dbs_tuners_ins from concurrent changes on
 * different CPUs. It protects dbs_enable in governor start/stop.
 */
static DEFINE_MUTEX(dbs_mutex);

static struct workqueue_struct *kondemand_wq;

static struct dbs_tuners {
        unsigned int sampling_rate;
        unsigned int up_threshold;
        unsigned int down_differential;
        unsigned int ignore_nice;
        unsigned int sampling_down_factor;
        unsigned int powersave_bias;
        unsigned int io_is_busy;
} dbs_tuners_ins = {
        .up_threshold = DEF_FREQUENCY_UP_THRESHOLD,
        .sampling_down_factor = DEF_SAMPLING_DOWN_FACTOR,
        .down_differential = DEF_FREQUENCY_DOWN_DIFFERENTIAL,
        .ignore_nice = 0,
        .powersave_bias = 0,
};

static inline cputime64_t get_cpu_idle_time_jiffy(unsigned int cpu,
                                                  cputime64_t *wall)
{
        cputime64_t idle_time;
        cputime64_t cur_wall_time;
        cputime64_t busy_time;

        cur_wall_time = jiffies64_to_cputime64(get_jiffies_64());
        busy_time = cputime64_add(kstat_cpu(cpu).cpustat.user,
                        kstat_cpu(cpu).cpustat.system);

        busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.irq);
        busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.softirq);
        busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.steal);
        busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.nice);

        idle_time = cputime64_sub(cur_wall_time, busy_time);
        if (wall)
                *wall = (cputime64_t)jiffies_to_usecs(cur_wall_time);

        return (cputime64_t)jiffies_to_usecs(idle_time);
}

static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall)
{
        u64 idle_time = get_cpu_idle_time_us(cpu, wall);

        if (idle_time == -1ULL)
                return get_cpu_idle_time_jiffy(cpu, wall);

        return idle_time;
}

static inline cputime64_t get_cpu_iowait_time(unsigned int cpu, cputime64_t *wall)
{
        u64 iowait_time = get_cpu_iowait_time_us(cpu, wall);

        if (iowait_time == -1ULL)
                return 0;

        return iowait_time;
}

/*
 * Find right freq to be set now with powersave_bias on.
 * Returns the freq_hi to be used right now and will set freq_hi_jiffies,
 * freq_lo, and freq_lo_jiffies in percpu area for averaging freqs.
 */
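/*
 * Worked example (illustrative numbers only, not from a real frequency
 * table): with powersave_bias = 100 (i.e. 10%), a request for 2000 MHz
 * gives freq_reduc = 2000 MHz * 100 / 1000 = 200 MHz, so freq_avg is
 * 1800 MHz. If the table only offers 1600 MHz and 2000 MHz around that
 * average, then freq_lo = 1600 MHz, freq_hi = 2000 MHz, and
 * jiffies_hi = (1800 - 1600) / (2000 - 1600) = 1/2 of the sampling
 * period, i.e. the governor alternates between the two table frequencies
 * so that the time-weighted average is roughly the requested 1800 MHz.
 */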
static unsigned int powersave_bias_target(struct cpufreq_policy *policy,
                                          unsigned int freq_next,
                                          unsigned int relation)
{
        unsigned int freq_req, freq_reduc, freq_avg;
        unsigned int freq_hi, freq_lo;
        unsigned int index = 0;
        unsigned int jiffies_total, jiffies_hi, jiffies_lo;
        struct cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info,
                                                   policy->cpu);

        if (!dbs_info->freq_table) {
                dbs_info->freq_lo = 0;
                dbs_info->freq_lo_jiffies = 0;
                return freq_next;
        }

        cpufreq_frequency_table_target(policy, dbs_info->freq_table, freq_next,
                        relation, &index);
        freq_req = dbs_info->freq_table[index].frequency;
        freq_reduc = freq_req * dbs_tuners_ins.powersave_bias / 1000;
        freq_avg = freq_req - freq_reduc;

        /* Find freq bounds for freq_avg in freq_table */
        index = 0;
        cpufreq_frequency_table_target(policy, dbs_info->freq_table, freq_avg,
                        CPUFREQ_RELATION_H, &index);
        freq_lo = dbs_info->freq_table[index].frequency;
        index = 0;
        cpufreq_frequency_table_target(policy, dbs_info->freq_table, freq_avg,
                        CPUFREQ_RELATION_L, &index);
        freq_hi = dbs_info->freq_table[index].frequency;

        /* Find out how long we have to be in hi and lo freqs */
        if (freq_hi == freq_lo) {
                dbs_info->freq_lo = 0;
                dbs_info->freq_lo_jiffies = 0;
                return freq_lo;
        }
        jiffies_total = usecs_to_jiffies(dbs_tuners_ins.sampling_rate);
        jiffies_hi = (freq_avg - freq_lo) * jiffies_total;
        jiffies_hi += ((freq_hi - freq_lo) / 2);
        jiffies_hi /= (freq_hi - freq_lo);
        jiffies_lo = jiffies_total - jiffies_hi;
        dbs_info->freq_lo = freq_lo;
        dbs_info->freq_lo_jiffies = jiffies_lo;
        dbs_info->freq_hi_jiffies = jiffies_hi;
        return freq_hi;
}

static void ondemand_powersave_bias_init_cpu(int cpu)
{
        struct cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
        dbs_info->freq_table = cpufreq_frequency_get_table(cpu);
        dbs_info->freq_lo = 0;
}

static void ondemand_powersave_bias_init(void)
{
        int i;
        for_each_online_cpu(i) {
                ondemand_powersave_bias_init_cpu(i);
        }
}

/************************** sysfs interface ************************/

static ssize_t show_sampling_rate_max(struct kobject *kobj,
                                      struct attribute *attr, char *buf)
{
        printk_once(KERN_INFO "CPUFREQ: ondemand sampling_rate_max "
                    "sysfs file is deprecated - used by: %s\n", current->comm);
        return sprintf(buf, "%u\n", -1U);
}

static ssize_t show_sampling_rate_min(struct kobject *kobj,
                                      struct attribute *attr, char *buf)
{
        return sprintf(buf, "%u\n", min_sampling_rate);
}

define_one_global_ro(sampling_rate_max);
define_one_global_ro(sampling_rate_min);

/* cpufreq_ondemand Governor Tunables */
#define show_one(file_name, object)                                     \
static ssize_t show_##file_name                                         \
(struct kobject *kobj, struct attribute *attr, char *buf)               \
{                                                                       \
        return sprintf(buf, "%u\n", dbs_tuners_ins.object);             \
}
show_one(sampling_rate, sampling_rate);
show_one(io_is_busy, io_is_busy);
show_one(up_threshold, up_threshold);
show_one(sampling_down_factor, sampling_down_factor);
show_one(ignore_nice_load, ignore_nice);
show_one(powersave_bias, powersave_bias);

/*** delete after deprecation time ***/

#define DEPRECATION_MSG(file_name)                                      \
        printk_once(KERN_INFO "CPUFREQ: Per core ondemand sysfs "       \
                    "interface is deprecated - " #file_name "\n");

#define show_one_old(file_name)                                         \
static ssize_t show_##file_name##_old                                   \
(struct cpufreq_policy *unused, char *buf)                              \
{                                                                       \
        printk_once(KERN_INFO "CPUFREQ: Per core ondemand sysfs "       \
                    "interface is deprecated - " #file_name "\n");      \
        return show_##file_name(NULL, NULL, buf);                       \
}
show_one_old(sampling_rate);
show_one_old(up_threshold);
show_one_old(ignore_nice_load);
show_one_old(powersave_bias);
show_one_old(sampling_rate_min);
show_one_old(sampling_rate_max);

cpufreq_freq_attr_ro_old(sampling_rate_min);
cpufreq_freq_attr_ro_old(sampling_rate_max);

/*** delete after deprecation time ***/

static ssize_t store_sampling_rate(struct kobject *a, struct attribute *b,
                                   const char *buf, size_t count)
{
        unsigned int input;
        int ret;
        ret = sscanf(buf, "%u", &input);
        if (ret != 1)
                return -EINVAL;

        mutex_lock(&dbs_mutex);
        dbs_tuners_ins.sampling_rate = max(input, min_sampling_rate);
        mutex_unlock(&dbs_mutex);

        return count;
}

static ssize_t store_io_is_busy(struct kobject *a, struct attribute *b,
                                const char *buf, size_t count)
{
        unsigned int input;
        int ret;

        ret = sscanf(buf, "%u", &input);
        if (ret != 1)
                return -EINVAL;

        mutex_lock(&dbs_mutex);
        dbs_tuners_ins.io_is_busy = !!input;
        mutex_unlock(&dbs_mutex);

        return count;
}

static ssize_t store_up_threshold(struct kobject *a, struct attribute *b,
                                  const char *buf, size_t count)
{
        unsigned int input;
        int ret;
        ret = sscanf(buf, "%u", &input);

        if (ret != 1 || input > MAX_FREQUENCY_UP_THRESHOLD ||
                        input < MIN_FREQUENCY_UP_THRESHOLD) {
                return -EINVAL;
        }

        mutex_lock(&dbs_mutex);
        dbs_tuners_ins.up_threshold = input;
        mutex_unlock(&dbs_mutex);

        return count;
}

static ssize_t store_sampling_down_factor(struct kobject *a,
                        struct attribute *b, const char *buf, size_t count)
{
        unsigned int input, j;
        int ret;
        ret = sscanf(buf, "%u", &input);

        if (ret != 1 || input > MAX_SAMPLING_DOWN_FACTOR || input < 1)
                return -EINVAL;
        mutex_lock(&dbs_mutex);
        dbs_tuners_ins.sampling_down_factor = input;

        /* Reset down sampling multiplier in case it was active */
        for_each_online_cpu(j) {
                struct cpu_dbs_info_s *dbs_info;
                dbs_info = &per_cpu(od_cpu_dbs_info, j);
                dbs_info->rate_mult = 1;
        }
        mutex_unlock(&dbs_mutex);

        return count;
}

static ssize_t store_ignore_nice_load(struct kobject *a, struct attribute *b,
                                      const char *buf, size_t count)
{
        unsigned int input;
        int ret;

        unsigned int j;

        ret = sscanf(buf, "%u", &input);
        if (ret != 1)
                return -EINVAL;

        if (input > 1)
                input = 1;

        mutex_lock(&dbs_mutex);
        if (input == dbs_tuners_ins.ignore_nice) { /* nothing to do */
                mutex_unlock(&dbs_mutex);
                return count;
        }
        dbs_tuners_ins.ignore_nice = input;

        /* we need to re-evaluate prev_cpu_idle */
        for_each_online_cpu(j) {
                struct cpu_dbs_info_s *dbs_info;
                dbs_info = &per_cpu(od_cpu_dbs_info, j);
                dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
                                                &dbs_info->prev_cpu_wall);
                if (dbs_tuners_ins.ignore_nice)
                        dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice;

        }
        mutex_unlock(&dbs_mutex);

        return count;
}

static ssize_t store_powersave_bias(struct kobject *a, struct attribute *b,
                                    const char *buf, size_t count)
{
        unsigned int input;
        int ret;
        ret = sscanf(buf, "%u", &input);

        if (ret != 1)
                return -EINVAL;

        if (input > 1000)
                input = 1000;

        mutex_lock(&dbs_mutex);
        dbs_tuners_ins.powersave_bias = input;
        ondemand_powersave_bias_init();
        mutex_unlock(&dbs_mutex);

        return count;
}

define_one_global_rw(sampling_rate);
define_one_global_rw(io_is_busy);
define_one_global_rw(up_threshold);
define_one_global_rw(sampling_down_factor);
define_one_global_rw(ignore_nice_load);
define_one_global_rw(powersave_bias);

static struct attribute *dbs_attributes[] = {
        &sampling_rate_max.attr,
        &sampling_rate_min.attr,
        &sampling_rate.attr,
        &up_threshold.attr,
        &sampling_down_factor.attr,
        &ignore_nice_load.attr,
        &powersave_bias.attr,
        &io_is_busy.attr,
        NULL
};

static struct attribute_group dbs_attr_group = {
        .attrs = dbs_attributes,
        .name = "ondemand",
};

/*** delete after deprecation time ***/

#define write_one_old(file_name)                                        \
static ssize_t store_##file_name##_old                                  \
(struct cpufreq_policy *unused, const char *buf, size_t count)          \
{                                                                       \
        printk_once(KERN_INFO "CPUFREQ: Per core ondemand sysfs "       \
                    "interface is deprecated - " #file_name "\n");      \
        return store_##file_name(NULL, NULL, buf, count);               \
}
write_one_old(sampling_rate);
write_one_old(up_threshold);
write_one_old(ignore_nice_load);
write_one_old(powersave_bias);

cpufreq_freq_attr_rw_old(sampling_rate);
cpufreq_freq_attr_rw_old(up_threshold);
cpufreq_freq_attr_rw_old(ignore_nice_load);
cpufreq_freq_attr_rw_old(powersave_bias);

static struct attribute *dbs_attributes_old[] = {
        &sampling_rate_max_old.attr,
        &sampling_rate_min_old.attr,
        &sampling_rate_old.attr,
        &up_threshold_old.attr,
        &ignore_nice_load_old.attr,
        &powersave_bias_old.attr,
        NULL
};

static struct attribute_group dbs_attr_group_old = {
        .attrs = dbs_attributes_old,
        .name = "ondemand",
};

/*** delete after deprecation time ***/

/************************** sysfs end ************************/
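
/*
 * Usage note (a sketch, assuming the usual sysfs layout for global cpufreq
 * governor attributes): the tunables in dbs_attr_group above are typically
 * visible under /sys/devices/system/cpu/cpufreq/ondemand/, so e.g.
 * "echo 95 > .../ondemand/up_threshold" raises the load threshold for
 * jumping to the maximum frequency, and "echo 1 > .../ondemand/io_is_busy"
 * makes iowait count as busy time. The exact path depends on where
 * cpufreq_global_kobject is registered on a given kernel.
 */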

static void dbs_freq_increase(struct cpufreq_policy *p, unsigned int freq)
{
        if (dbs_tuners_ins.powersave_bias)
                freq = powersave_bias_target(p, freq, CPUFREQ_RELATION_H);
        else if (p->cur == p->max)
                return;

        __cpufreq_driver_target(p, freq, dbs_tuners_ins.powersave_bias ?
                        CPUFREQ_RELATION_L : CPUFREQ_RELATION_H);
}

static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
{
        unsigned int max_load_freq;

        struct cpufreq_policy *policy;
        unsigned int j;

        this_dbs_info->freq_lo = 0;
        policy = this_dbs_info->cur_policy;

        /*
         * Every sampling_rate, we check whether current idle time is less
         * than 20% (default). If it is, we try to increase the frequency.
         * Every sampling_rate, we also look for the lowest frequency which
         * can sustain the load while keeping idle time over 30%. If such a
         * frequency exists, we try to decrease to this frequency.
         *
         * Any frequency increase takes it to the maximum frequency.
         * Frequency reduction happens at minimum steps of
         * 5% (default) of current frequency.
         */

        /* Get Absolute Load - in terms of freq */
        max_load_freq = 0;

        for_each_cpu(j, policy->cpus) {
                struct cpu_dbs_info_s *j_dbs_info;
                cputime64_t cur_wall_time, cur_idle_time, cur_iowait_time;
                unsigned int idle_time, wall_time, iowait_time;
                unsigned int load, load_freq;
                int freq_avg;

                j_dbs_info = &per_cpu(od_cpu_dbs_info, j);

                cur_idle_time = get_cpu_idle_time(j, &cur_wall_time);
                cur_iowait_time = get_cpu_iowait_time(j, &cur_wall_time);

                wall_time = (unsigned int) cputime64_sub(cur_wall_time,
                                j_dbs_info->prev_cpu_wall);
                j_dbs_info->prev_cpu_wall = cur_wall_time;

                idle_time = (unsigned int) cputime64_sub(cur_idle_time,
                                j_dbs_info->prev_cpu_idle);
                j_dbs_info->prev_cpu_idle = cur_idle_time;

                iowait_time = (unsigned int) cputime64_sub(cur_iowait_time,
                                j_dbs_info->prev_cpu_iowait);
                j_dbs_info->prev_cpu_iowait = cur_iowait_time;

                if (dbs_tuners_ins.ignore_nice) {
                        cputime64_t cur_nice;
                        unsigned long cur_nice_jiffies;

                        cur_nice = cputime64_sub(kstat_cpu(j).cpustat.nice,
                                        j_dbs_info->prev_cpu_nice);
                        /*
                         * Assumption: nice time between sampling periods will
                         * be less than 2^32 jiffies for 32 bit sys
                         */
                        cur_nice_jiffies = (unsigned long)
                                        cputime64_to_jiffies64(cur_nice);

                        j_dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice;
                        idle_time += jiffies_to_usecs(cur_nice_jiffies);
                }

                /*
                 * For the purpose of ondemand, waiting for disk IO is an
                 * indication that you're performance critical, and not that
                 * the system is actually idle. So subtract the iowait time
                 * from the cpu idle time.
                 */

                if (dbs_tuners_ins.io_is_busy && idle_time >= iowait_time)
                        idle_time -= iowait_time;

                if (unlikely(!wall_time || wall_time < idle_time))
                        continue;

                load = 100 * (wall_time - idle_time) / wall_time;

                freq_avg = __cpufreq_driver_getavg(policy, j);
                if (freq_avg <= 0)
                        freq_avg = policy->cur;

                load_freq = load * freq_avg;
                if (load_freq > max_load_freq)
                        max_load_freq = load_freq;
        }

        /* Check for frequency increase */
        if (max_load_freq > dbs_tuners_ins.up_threshold * policy->cur) {
                /* If switching to max speed, apply sampling_down_factor */
                if (policy->cur < policy->max)
                        this_dbs_info->rate_mult =
                                dbs_tuners_ins.sampling_down_factor;
                dbs_freq_increase(policy, policy->max);
                return;
        }

        /* Check for frequency decrease */
        /* if we cannot reduce the frequency anymore, break out early */
        if (policy->cur == policy->min)
                return;

        /*
         * The optimal frequency is the lowest frequency that can support
         * the current CPU usage without triggering the up policy. To be
         * safe, we stay 10 points under the threshold.
         */
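        /*
         * Worked example (illustrative numbers, default tunables): with
         * up_threshold = 80 and down_differential = 10, the check below
         * compares max_load_freq against 70 * policy->cur. If the busiest
         * CPU in the policy ran at 50% load with freq_avg = 2000 (same
         * units as policy->cur), max_load_freq = 50 * 2000 = 100000, which
         * is below the limit of 70 * 2000 = 140000, so we scale down to
         * freq_next = 100000 / 70 ~= 1428. That request is then mapped to
         * a real table frequency by __cpufreq_driver_target() (or biased
         * further by powersave_bias_target() when powersave_bias is set).
         */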
        if (max_load_freq <
            (dbs_tuners_ins.up_threshold - dbs_tuners_ins.down_differential) *
             policy->cur) {
                unsigned int freq_next;
                freq_next = max_load_freq /
                                (dbs_tuners_ins.up_threshold -
                                 dbs_tuners_ins.down_differential);

                /* No longer fully busy, reset rate_mult */
                this_dbs_info->rate_mult = 1;

                if (freq_next < policy->min)
                        freq_next = policy->min;

                if (!dbs_tuners_ins.powersave_bias) {
                        __cpufreq_driver_target(policy, freq_next,
                                        CPUFREQ_RELATION_L);
                } else {
                        int freq = powersave_bias_target(policy, freq_next,
                                        CPUFREQ_RELATION_L);
                        __cpufreq_driver_target(policy, freq,
                                        CPUFREQ_RELATION_L);
                }
        }
}

static void do_dbs_timer(struct work_struct *work)
{
        struct cpu_dbs_info_s *dbs_info =
                container_of(work, struct cpu_dbs_info_s, work.work);
        unsigned int cpu = dbs_info->cpu;
        int sample_type = dbs_info->sample_type;

        /* We want all CPUs to do sampling nearly on same jiffy */
        int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate
                * dbs_info->rate_mult);

        if (num_online_cpus() > 1)
                delay -= jiffies % delay;

        mutex_lock(&dbs_info->timer_mutex);

        /* Common NORMAL_SAMPLE setup */
        dbs_info->sample_type = DBS_NORMAL_SAMPLE;
        if (!dbs_tuners_ins.powersave_bias ||
            sample_type == DBS_NORMAL_SAMPLE) {
                dbs_check_cpu(dbs_info);
                if (dbs_info->freq_lo) {
                        /* Setup timer for SUB_SAMPLE */
                        dbs_info->sample_type = DBS_SUB_SAMPLE;
                        delay = dbs_info->freq_hi_jiffies;
                }
        } else {
                __cpufreq_driver_target(dbs_info->cur_policy,
                        dbs_info->freq_lo, CPUFREQ_RELATION_H);
        }
        queue_delayed_work_on(cpu, kondemand_wq, &dbs_info->work, delay);
        mutex_unlock(&dbs_info->timer_mutex);
}
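
/*
 * A small worked example of the "delay -= jiffies % delay" alignment used
 * above and in dbs_timer_init() below (illustrative numbers): if the
 * sampling period works out to delay = 10 jiffies and the current jiffies
 * value is 1003, then jiffies % delay = 3 and the timer is queued 7 jiffies
 * out, i.e. on a multiple-of-10 jiffy boundary. With more than one online
 * CPU this keeps the per-CPU sampling work roughly phase-aligned, so the
 * CPUs tend to wake up for sampling together rather than at staggered times.
 */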

static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info)
{
        /* We want all CPUs to do sampling nearly on same jiffy */
        int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate);

        if (num_online_cpus() > 1)
                delay -= jiffies % delay;

        dbs_info->sample_type = DBS_NORMAL_SAMPLE;
        INIT_DELAYED_WORK_DEFERRABLE(&dbs_info->work, do_dbs_timer);
        queue_delayed_work_on(dbs_info->cpu, kondemand_wq, &dbs_info->work,
                delay);
}

static inline void dbs_timer_exit(struct cpu_dbs_info_s *dbs_info)
{
        cancel_delayed_work_sync(&dbs_info->work);
}

/*
 * Not all CPUs want IO time to be accounted as busy; this depends on how
 * efficient idling at a higher frequency/voltage is.
 * Pavel Machek says this is not so for various generations of AMD and old
 * Intel systems.
 * Mike Chan (androidlcom) says this is also not true for ARM.
 * Because of this, whitelist specific known (series of) CPUs by default, and
 * leave all others up to the user.
 */
static int should_io_be_busy(void)
{
#if defined(CONFIG_X86)
        /*
         * For Intel, Core 2 (model 15) and later have an efficient idle.
         */
        if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
            boot_cpu_data.x86 == 6 &&
            boot_cpu_data.x86_model >= 15)
                return 1;
#endif
        return 0;
}

static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
                                unsigned int event)
{
        unsigned int cpu = policy->cpu;
        struct cpu_dbs_info_s *this_dbs_info;
        unsigned int j;
        int rc;

        this_dbs_info = &per_cpu(od_cpu_dbs_info, cpu);

        switch (event) {
        case CPUFREQ_GOV_START:
                if ((!cpu_online(cpu)) || (!policy->cur))
                        return -EINVAL;

                mutex_lock(&dbs_mutex);

                rc = sysfs_create_group(&policy->kobj, &dbs_attr_group_old);
                if (rc) {
                        mutex_unlock(&dbs_mutex);
                        return rc;
                }

                dbs_enable++;
                for_each_cpu(j, policy->cpus) {
                        struct cpu_dbs_info_s *j_dbs_info;
                        j_dbs_info = &per_cpu(od_cpu_dbs_info, j);
                        j_dbs_info->cur_policy = policy;

                        j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
                                                &j_dbs_info->prev_cpu_wall);
                        if (dbs_tuners_ins.ignore_nice) {
                                j_dbs_info->prev_cpu_nice =
                                                kstat_cpu(j).cpustat.nice;
                        }
                }
                this_dbs_info->cpu = cpu;
                this_dbs_info->rate_mult = 1;
                ondemand_powersave_bias_init_cpu(cpu);
                /*
                 * Start the timer schedule work when this governor
                 * is used for the first time.
                 */
                if (dbs_enable == 1) {
                        unsigned int latency;

                        rc = sysfs_create_group(cpufreq_global_kobject,
                                                &dbs_attr_group);
                        if (rc) {
                                mutex_unlock(&dbs_mutex);
                                return rc;
                        }

                        /* policy latency is in ns. Convert it to us first */
                        latency = policy->cpuinfo.transition_latency / 1000;
                        if (latency == 0)
                                latency = 1;
                        /* Bring kernel and HW constraints together */
                        min_sampling_rate = max(min_sampling_rate,
                                        MIN_LATENCY_MULTIPLIER * latency);
                        dbs_tuners_ins.sampling_rate =
                                max(min_sampling_rate,
                                    latency * LATENCY_MULTIPLIER);
                        dbs_tuners_ins.io_is_busy = should_io_be_busy();
                }
                mutex_unlock(&dbs_mutex);

                mutex_init(&this_dbs_info->timer_mutex);
                dbs_timer_init(this_dbs_info);
                break;

        case CPUFREQ_GOV_STOP:
                dbs_timer_exit(this_dbs_info);

                mutex_lock(&dbs_mutex);
                sysfs_remove_group(&policy->kobj, &dbs_attr_group_old);
                mutex_destroy(&this_dbs_info->timer_mutex);
                dbs_enable--;
                mutex_unlock(&dbs_mutex);
                if (!dbs_enable)
                        sysfs_remove_group(cpufreq_global_kobject,
                                           &dbs_attr_group);

                break;

        case CPUFREQ_GOV_LIMITS:
                mutex_lock(&this_dbs_info->timer_mutex);
                if (policy->max < this_dbs_info->cur_policy->cur)
                        __cpufreq_driver_target(this_dbs_info->cur_policy,
                                policy->max, CPUFREQ_RELATION_H);
                else if (policy->min > this_dbs_info->cur_policy->cur)
                        __cpufreq_driver_target(this_dbs_info->cur_policy,
                                policy->min, CPUFREQ_RELATION_L);
                mutex_unlock(&this_dbs_info->timer_mutex);
                break;
        }
        return 0;
}
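
/*
 * Worked example for the sampling_rate chosen in the GOV_START path above
 * (illustrative numbers): a driver reporting a transition latency of
 * 10,000 ns gives latency = 10 us, so the default sampling_rate becomes
 * max(min_sampling_rate, 10 * LATENCY_MULTIPLIER) = max(min_sampling_rate,
 * 10,000 us), i.e. the load is re-evaluated roughly every 10 ms. At the
 * same time min_sampling_rate is raised to at least
 * MIN_LATENCY_MULTIPLIER * latency = 1,000 us, so userspace cannot request
 * a sampling period shorter than 100 transitions' worth of latency.
 */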

static int __init cpufreq_gov_dbs_init(void)
{
        int err;
        cputime64_t wall;
        u64 idle_time;
        int cpu = get_cpu();

        idle_time = get_cpu_idle_time_us(cpu, &wall);
        put_cpu();
        if (idle_time != -1ULL) {
                /* Idle micro accounting is supported. Use finer thresholds */
                dbs_tuners_ins.up_threshold = MICRO_FREQUENCY_UP_THRESHOLD;
                dbs_tuners_ins.down_differential =
                                        MICRO_FREQUENCY_DOWN_DIFFERENTIAL;
                /*
                 * In the nohz/micro accounting case we set the minimum
                 * sampling rate not depending on HZ, but to a fixed (very
                 * low) value. The deferrable timer might skip some samples
                 * if the CPU is idle/sleeping, as intended.
                 */
                min_sampling_rate = MICRO_FREQUENCY_MIN_SAMPLE_RATE;
        } else {
                /* For correct statistics, we need 10 ticks for each measure */
                min_sampling_rate =
                        MIN_SAMPLING_RATE_RATIO * jiffies_to_usecs(10);
        }

        kondemand_wq = create_workqueue("kondemand");
        if (!kondemand_wq) {
                printk(KERN_ERR "Creation of kondemand failed\n");
                return -EFAULT;
        }
        err = cpufreq_register_governor(&cpufreq_gov_ondemand);
        if (err)
                destroy_workqueue(kondemand_wq);

        return err;
}

static void __exit cpufreq_gov_dbs_exit(void)
{
        cpufreq_unregister_governor(&cpufreq_gov_ondemand);
        destroy_workqueue(kondemand_wq);
}


MODULE_AUTHOR("Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>");
MODULE_AUTHOR("Alexey Starikovskiy <alexey.y.starikovskiy@intel.com>");
MODULE_DESCRIPTION("'cpufreq_ondemand' - A dynamic cpufreq governor for "
        "Low Latency Frequency Transition capable processors");
MODULE_LICENSE("GPL");

#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND
fs_initcall(cpufreq_gov_dbs_init);
#else
module_init(cpufreq_gov_dbs_init);
#endif
module_exit(cpufreq_gov_dbs_exit);