1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * linux/drivers/thermal/cpufreq_cooling.c 4 * 5 * Copyright (C) 2012 Samsung Electronics Co., Ltd(http://www.samsung.com) 6 * 7 * Copyright (C) 2012-2018 Linaro Limited. 8 * 9 * Authors: Amit Daniel <amit.kachhap@linaro.org> 10 * Viresh Kumar <viresh.kumar@linaro.org> 11 * 12 */ 13 #include <linux/cpu.h> 14 #include <linux/cpufreq.h> 15 #include <linux/cpu_cooling.h> 16 #include <linux/device.h> 17 #include <linux/energy_model.h> 18 #include <linux/err.h> 19 #include <linux/export.h> 20 #include <linux/pm_opp.h> 21 #include <linux/pm_qos.h> 22 #include <linux/slab.h> 23 #include <linux/thermal.h> 24 #include <linux/units.h> 25 26 #include <trace/events/thermal.h> 27 28 /* 29 * Cooling state <-> CPUFreq frequency 30 * 31 * Cooling states are translated to frequencies throughout this driver and this 32 * is the relation between them. 33 * 34 * Highest cooling state corresponds to lowest possible frequency. 35 * 36 * i.e. 37 * level 0 --> 1st Max Freq 38 * level 1 --> 2nd Max Freq 39 * ... 40 */ 41 42 /** 43 * struct time_in_idle - Idle time stats 44 * @time: previous reading of the absolute time that this cpu was idle 45 * @timestamp: wall time of the last invocation of get_cpu_idle_time_us() 46 */ 47 struct time_in_idle { 48 u64 time; 49 u64 timestamp; 50 }; 51 52 /** 53 * struct cpufreq_cooling_device - data for cooling device with cpufreq 54 * @last_load: load measured by the latest call to cpufreq_get_requested_power() 55 * @cpufreq_state: integer value representing the current state of cpufreq 56 * cooling devices. 57 * @max_level: maximum cooling level. One less than total number of valid 58 * cpufreq frequencies. 59 * @em: Reference on the Energy Model of the device 60 * @cdev: thermal_cooling_device pointer to keep track of the 61 * registered cooling device. 62 * @policy: cpufreq policy. 63 * @cooling_ops: cpufreq callbacks to thermal cooling device ops 64 * @idle_time: idle time stats 65 * @qos_req: PM QoS contraint to apply 66 * 67 * This structure is required for keeping information of each registered 68 * cpufreq_cooling_device. 69 */ 70 struct cpufreq_cooling_device { 71 u32 last_load; 72 unsigned int cpufreq_state; 73 unsigned int max_level; 74 struct em_perf_domain *em; 75 struct cpufreq_policy *policy; 76 struct thermal_cooling_device_ops cooling_ops; 77 #ifndef CONFIG_SMP 78 struct time_in_idle *idle_time; 79 #endif 80 struct freq_qos_request qos_req; 81 }; 82 83 #ifdef CONFIG_THERMAL_GOV_POWER_ALLOCATOR 84 /** 85 * get_level: Find the level for a particular frequency 86 * @cpufreq_cdev: cpufreq_cdev for which the property is required 87 * @freq: Frequency 88 * 89 * Return: level corresponding to the frequency. 90 */ 91 static unsigned long get_level(struct cpufreq_cooling_device *cpufreq_cdev, 92 unsigned int freq) 93 { 94 int i; 95 96 for (i = cpufreq_cdev->max_level - 1; i >= 0; i--) { 97 if (freq > cpufreq_cdev->em->table[i].frequency) 98 break; 99 } 100 101 return cpufreq_cdev->max_level - i - 1; 102 } 103 104 static u32 cpu_freq_to_power(struct cpufreq_cooling_device *cpufreq_cdev, 105 u32 freq) 106 { 107 unsigned long power_mw; 108 int i; 109 110 for (i = cpufreq_cdev->max_level - 1; i >= 0; i--) { 111 if (freq > cpufreq_cdev->em->table[i].frequency) 112 break; 113 } 114 115 power_mw = cpufreq_cdev->em->table[i + 1].power; 116 power_mw /= MICROWATT_PER_MILLIWATT; 117 118 return power_mw; 119 } 120 121 static u32 cpu_power_to_freq(struct cpufreq_cooling_device *cpufreq_cdev, 122 u32 power) 123 { 124 unsigned long em_power_mw; 125 int i; 126 127 for (i = cpufreq_cdev->max_level; i > 0; i--) { 128 /* Convert EM power to milli-Watts to make safe comparison */ 129 em_power_mw = cpufreq_cdev->em->table[i].power; 130 em_power_mw /= MICROWATT_PER_MILLIWATT; 131 if (power >= em_power_mw) 132 break; 133 } 134 135 return cpufreq_cdev->em->table[i].frequency; 136 } 137 138 /** 139 * get_load() - get load for a cpu 140 * @cpufreq_cdev: struct cpufreq_cooling_device for the cpu 141 * @cpu: cpu number 142 * @cpu_idx: index of the cpu in time_in_idle array 143 * 144 * Return: The average load of cpu @cpu in percentage since this 145 * function was last called. 146 */ 147 #ifdef CONFIG_SMP 148 static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu, 149 int cpu_idx) 150 { 151 unsigned long util = sched_cpu_util(cpu); 152 153 return (util * 100) / arch_scale_cpu_capacity(cpu); 154 } 155 #else /* !CONFIG_SMP */ 156 static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu, 157 int cpu_idx) 158 { 159 u32 load; 160 u64 now, now_idle, delta_time, delta_idle; 161 struct time_in_idle *idle_time = &cpufreq_cdev->idle_time[cpu_idx]; 162 163 now_idle = get_cpu_idle_time(cpu, &now, 0); 164 delta_idle = now_idle - idle_time->time; 165 delta_time = now - idle_time->timestamp; 166 167 if (delta_time <= delta_idle) 168 load = 0; 169 else 170 load = div64_u64(100 * (delta_time - delta_idle), delta_time); 171 172 idle_time->time = now_idle; 173 idle_time->timestamp = now; 174 175 return load; 176 } 177 #endif /* CONFIG_SMP */ 178 179 /** 180 * get_dynamic_power() - calculate the dynamic power 181 * @cpufreq_cdev: &cpufreq_cooling_device for this cdev 182 * @freq: current frequency 183 * 184 * Return: the dynamic power consumed by the cpus described by 185 * @cpufreq_cdev. 186 */ 187 static u32 get_dynamic_power(struct cpufreq_cooling_device *cpufreq_cdev, 188 unsigned long freq) 189 { 190 u32 raw_cpu_power; 191 192 raw_cpu_power = cpu_freq_to_power(cpufreq_cdev, freq); 193 return (raw_cpu_power * cpufreq_cdev->last_load) / 100; 194 } 195 196 /** 197 * cpufreq_get_requested_power() - get the current power 198 * @cdev: &thermal_cooling_device pointer 199 * @power: pointer in which to store the resulting power 200 * 201 * Calculate the current power consumption of the cpus in milliwatts 202 * and store it in @power. This function should actually calculate 203 * the requested power, but it's hard to get the frequency that 204 * cpufreq would have assigned if there were no thermal limits. 205 * Instead, we calculate the current power on the assumption that the 206 * immediate future will look like the immediate past. 207 * 208 * We use the current frequency and the average load since this 209 * function was last called. In reality, there could have been 210 * multiple opps since this function was last called and that affects 211 * the load calculation. While it's not perfectly accurate, this 212 * simplification is good enough and works. REVISIT this, as more 213 * complex code may be needed if experiments show that it's not 214 * accurate enough. 215 * 216 * Return: 0 on success, this function doesn't fail. 217 */ 218 static int cpufreq_get_requested_power(struct thermal_cooling_device *cdev, 219 u32 *power) 220 { 221 unsigned long freq; 222 int i = 0, cpu; 223 u32 total_load = 0; 224 struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata; 225 struct cpufreq_policy *policy = cpufreq_cdev->policy; 226 227 freq = cpufreq_quick_get(policy->cpu); 228 229 for_each_cpu(cpu, policy->related_cpus) { 230 u32 load; 231 232 if (cpu_online(cpu)) 233 load = get_load(cpufreq_cdev, cpu, i); 234 else 235 load = 0; 236 237 total_load += load; 238 } 239 240 cpufreq_cdev->last_load = total_load; 241 242 *power = get_dynamic_power(cpufreq_cdev, freq); 243 244 trace_thermal_power_cpu_get_power_simple(policy->cpu, *power); 245 246 return 0; 247 } 248 249 /** 250 * cpufreq_state2power() - convert a cpu cdev state to power consumed 251 * @cdev: &thermal_cooling_device pointer 252 * @state: cooling device state to be converted 253 * @power: pointer in which to store the resulting power 254 * 255 * Convert cooling device state @state into power consumption in 256 * milliwatts assuming 100% load. Store the calculated power in 257 * @power. 258 * 259 * Return: 0 on success, -EINVAL if the cooling device state is bigger 260 * than maximum allowed. 261 */ 262 static int cpufreq_state2power(struct thermal_cooling_device *cdev, 263 unsigned long state, u32 *power) 264 { 265 unsigned int freq, num_cpus, idx; 266 struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata; 267 268 /* Request state should be less than max_level */ 269 if (state > cpufreq_cdev->max_level) 270 return -EINVAL; 271 272 num_cpus = cpumask_weight(cpufreq_cdev->policy->cpus); 273 274 idx = cpufreq_cdev->max_level - state; 275 freq = cpufreq_cdev->em->table[idx].frequency; 276 *power = cpu_freq_to_power(cpufreq_cdev, freq) * num_cpus; 277 278 return 0; 279 } 280 281 /** 282 * cpufreq_power2state() - convert power to a cooling device state 283 * @cdev: &thermal_cooling_device pointer 284 * @power: power in milliwatts to be converted 285 * @state: pointer in which to store the resulting state 286 * 287 * Calculate a cooling device state for the cpus described by @cdev 288 * that would allow them to consume at most @power mW and store it in 289 * @state. Note that this calculation depends on external factors 290 * such as the CPUs load. Calling this function with the same power 291 * as input can yield different cooling device states depending on those 292 * external factors. 293 * 294 * Return: 0 on success, this function doesn't fail. 295 */ 296 static int cpufreq_power2state(struct thermal_cooling_device *cdev, 297 u32 power, unsigned long *state) 298 { 299 unsigned int target_freq; 300 u32 last_load, normalised_power; 301 struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata; 302 struct cpufreq_policy *policy = cpufreq_cdev->policy; 303 304 last_load = cpufreq_cdev->last_load ?: 1; 305 normalised_power = (power * 100) / last_load; 306 target_freq = cpu_power_to_freq(cpufreq_cdev, normalised_power); 307 308 *state = get_level(cpufreq_cdev, target_freq); 309 trace_thermal_power_cpu_limit(policy->related_cpus, target_freq, *state, 310 power); 311 return 0; 312 } 313 314 static inline bool em_is_sane(struct cpufreq_cooling_device *cpufreq_cdev, 315 struct em_perf_domain *em) { 316 struct cpufreq_policy *policy; 317 unsigned int nr_levels; 318 319 if (!em || em_is_artificial(em)) 320 return false; 321 322 policy = cpufreq_cdev->policy; 323 if (!cpumask_equal(policy->related_cpus, em_span_cpus(em))) { 324 pr_err("The span of pd %*pbl is misaligned with cpufreq policy %*pbl\n", 325 cpumask_pr_args(em_span_cpus(em)), 326 cpumask_pr_args(policy->related_cpus)); 327 return false; 328 } 329 330 nr_levels = cpufreq_cdev->max_level + 1; 331 if (em_pd_nr_perf_states(em) != nr_levels) { 332 pr_err("The number of performance states in pd %*pbl (%u) doesn't match the number of cooling levels (%u)\n", 333 cpumask_pr_args(em_span_cpus(em)), 334 em_pd_nr_perf_states(em), nr_levels); 335 return false; 336 } 337 338 return true; 339 } 340 #endif /* CONFIG_THERMAL_GOV_POWER_ALLOCATOR */ 341 342 #ifdef CONFIG_SMP 343 static inline int allocate_idle_time(struct cpufreq_cooling_device *cpufreq_cdev) 344 { 345 return 0; 346 } 347 348 static inline void free_idle_time(struct cpufreq_cooling_device *cpufreq_cdev) 349 { 350 } 351 #else 352 static int allocate_idle_time(struct cpufreq_cooling_device *cpufreq_cdev) 353 { 354 unsigned int num_cpus = cpumask_weight(cpufreq_cdev->policy->related_cpus); 355 356 cpufreq_cdev->idle_time = kcalloc(num_cpus, 357 sizeof(*cpufreq_cdev->idle_time), 358 GFP_KERNEL); 359 if (!cpufreq_cdev->idle_time) 360 return -ENOMEM; 361 362 return 0; 363 } 364 365 static void free_idle_time(struct cpufreq_cooling_device *cpufreq_cdev) 366 { 367 kfree(cpufreq_cdev->idle_time); 368 cpufreq_cdev->idle_time = NULL; 369 } 370 #endif /* CONFIG_SMP */ 371 372 static unsigned int get_state_freq(struct cpufreq_cooling_device *cpufreq_cdev, 373 unsigned long state) 374 { 375 struct cpufreq_policy *policy; 376 unsigned long idx; 377 378 #ifdef CONFIG_THERMAL_GOV_POWER_ALLOCATOR 379 /* Use the Energy Model table if available */ 380 if (cpufreq_cdev->em) { 381 idx = cpufreq_cdev->max_level - state; 382 return cpufreq_cdev->em->table[idx].frequency; 383 } 384 #endif 385 386 /* Otherwise, fallback on the CPUFreq table */ 387 policy = cpufreq_cdev->policy; 388 if (policy->freq_table_sorted == CPUFREQ_TABLE_SORTED_ASCENDING) 389 idx = cpufreq_cdev->max_level - state; 390 else 391 idx = state; 392 393 return policy->freq_table[idx].frequency; 394 } 395 396 /* cpufreq cooling device callback functions are defined below */ 397 398 /** 399 * cpufreq_get_max_state - callback function to get the max cooling state. 400 * @cdev: thermal cooling device pointer. 401 * @state: fill this variable with the max cooling state. 402 * 403 * Callback for the thermal cooling device to return the cpufreq 404 * max cooling state. 405 * 406 * Return: 0 on success, this function doesn't fail. 407 */ 408 static int cpufreq_get_max_state(struct thermal_cooling_device *cdev, 409 unsigned long *state) 410 { 411 struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata; 412 413 *state = cpufreq_cdev->max_level; 414 return 0; 415 } 416 417 /** 418 * cpufreq_get_cur_state - callback function to get the current cooling state. 419 * @cdev: thermal cooling device pointer. 420 * @state: fill this variable with the current cooling state. 421 * 422 * Callback for the thermal cooling device to return the cpufreq 423 * current cooling state. 424 * 425 * Return: 0 on success, this function doesn't fail. 426 */ 427 static int cpufreq_get_cur_state(struct thermal_cooling_device *cdev, 428 unsigned long *state) 429 { 430 struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata; 431 432 *state = cpufreq_cdev->cpufreq_state; 433 434 return 0; 435 } 436 437 /** 438 * cpufreq_set_cur_state - callback function to set the current cooling state. 439 * @cdev: thermal cooling device pointer. 440 * @state: set this variable to the current cooling state. 441 * 442 * Callback for the thermal cooling device to change the cpufreq 443 * current cooling state. 444 * 445 * Return: 0 on success, an error code otherwise. 446 */ 447 static int cpufreq_set_cur_state(struct thermal_cooling_device *cdev, 448 unsigned long state) 449 { 450 struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata; 451 struct cpumask *cpus; 452 unsigned int frequency; 453 int ret; 454 455 /* Request state should be less than max_level */ 456 if (state > cpufreq_cdev->max_level) 457 return -EINVAL; 458 459 /* Check if the old cooling action is same as new cooling action */ 460 if (cpufreq_cdev->cpufreq_state == state) 461 return 0; 462 463 frequency = get_state_freq(cpufreq_cdev, state); 464 465 ret = freq_qos_update_request(&cpufreq_cdev->qos_req, frequency); 466 if (ret >= 0) { 467 cpufreq_cdev->cpufreq_state = state; 468 cpus = cpufreq_cdev->policy->related_cpus; 469 arch_update_thermal_pressure(cpus, frequency); 470 ret = 0; 471 } 472 473 return ret; 474 } 475 476 /** 477 * __cpufreq_cooling_register - helper function to create cpufreq cooling device 478 * @np: a valid struct device_node to the cooling device device tree node 479 * @policy: cpufreq policy 480 * Normally this should be same as cpufreq policy->related_cpus. 481 * @em: Energy Model of the cpufreq policy 482 * 483 * This interface function registers the cpufreq cooling device with the name 484 * "cpufreq-%s". This API can support multiple instances of cpufreq 485 * cooling devices. It also gives the opportunity to link the cooling device 486 * with a device tree node, in order to bind it via the thermal DT code. 487 * 488 * Return: a valid struct thermal_cooling_device pointer on success, 489 * on failure, it returns a corresponding ERR_PTR(). 490 */ 491 static struct thermal_cooling_device * 492 __cpufreq_cooling_register(struct device_node *np, 493 struct cpufreq_policy *policy, 494 struct em_perf_domain *em) 495 { 496 struct thermal_cooling_device *cdev; 497 struct cpufreq_cooling_device *cpufreq_cdev; 498 unsigned int i; 499 struct device *dev; 500 int ret; 501 struct thermal_cooling_device_ops *cooling_ops; 502 char *name; 503 504 dev = get_cpu_device(policy->cpu); 505 if (unlikely(!dev)) { 506 pr_warn("No cpu device for cpu %d\n", policy->cpu); 507 return ERR_PTR(-ENODEV); 508 } 509 510 if (IS_ERR_OR_NULL(policy)) { 511 pr_err("%s: cpufreq policy isn't valid: %p\n", __func__, policy); 512 return ERR_PTR(-EINVAL); 513 } 514 515 i = cpufreq_table_count_valid_entries(policy); 516 if (!i) { 517 pr_debug("%s: CPUFreq table not found or has no valid entries\n", 518 __func__); 519 return ERR_PTR(-ENODEV); 520 } 521 522 cpufreq_cdev = kzalloc(sizeof(*cpufreq_cdev), GFP_KERNEL); 523 if (!cpufreq_cdev) 524 return ERR_PTR(-ENOMEM); 525 526 cpufreq_cdev->policy = policy; 527 528 ret = allocate_idle_time(cpufreq_cdev); 529 if (ret) { 530 cdev = ERR_PTR(ret); 531 goto free_cdev; 532 } 533 534 /* max_level is an index, not a counter */ 535 cpufreq_cdev->max_level = i - 1; 536 537 cooling_ops = &cpufreq_cdev->cooling_ops; 538 cooling_ops->get_max_state = cpufreq_get_max_state; 539 cooling_ops->get_cur_state = cpufreq_get_cur_state; 540 cooling_ops->set_cur_state = cpufreq_set_cur_state; 541 542 #ifdef CONFIG_THERMAL_GOV_POWER_ALLOCATOR 543 if (em_is_sane(cpufreq_cdev, em)) { 544 cpufreq_cdev->em = em; 545 cooling_ops->get_requested_power = cpufreq_get_requested_power; 546 cooling_ops->state2power = cpufreq_state2power; 547 cooling_ops->power2state = cpufreq_power2state; 548 } else 549 #endif 550 if (policy->freq_table_sorted == CPUFREQ_TABLE_UNSORTED) { 551 pr_err("%s: unsorted frequency tables are not supported\n", 552 __func__); 553 cdev = ERR_PTR(-EINVAL); 554 goto free_idle_time; 555 } 556 557 ret = freq_qos_add_request(&policy->constraints, 558 &cpufreq_cdev->qos_req, FREQ_QOS_MAX, 559 get_state_freq(cpufreq_cdev, 0)); 560 if (ret < 0) { 561 pr_err("%s: Failed to add freq constraint (%d)\n", __func__, 562 ret); 563 cdev = ERR_PTR(ret); 564 goto free_idle_time; 565 } 566 567 cdev = ERR_PTR(-ENOMEM); 568 name = kasprintf(GFP_KERNEL, "cpufreq-%s", dev_name(dev)); 569 if (!name) 570 goto remove_qos_req; 571 572 cdev = thermal_of_cooling_device_register(np, name, cpufreq_cdev, 573 cooling_ops); 574 kfree(name); 575 576 if (IS_ERR(cdev)) 577 goto remove_qos_req; 578 579 return cdev; 580 581 remove_qos_req: 582 freq_qos_remove_request(&cpufreq_cdev->qos_req); 583 free_idle_time: 584 free_idle_time(cpufreq_cdev); 585 free_cdev: 586 kfree(cpufreq_cdev); 587 return cdev; 588 } 589 590 /** 591 * cpufreq_cooling_register - function to create cpufreq cooling device. 592 * @policy: cpufreq policy 593 * 594 * This interface function registers the cpufreq cooling device with the name 595 * "cpufreq-%s". This API can support multiple instances of cpufreq cooling 596 * devices. 597 * 598 * Return: a valid struct thermal_cooling_device pointer on success, 599 * on failure, it returns a corresponding ERR_PTR(). 600 */ 601 struct thermal_cooling_device * 602 cpufreq_cooling_register(struct cpufreq_policy *policy) 603 { 604 return __cpufreq_cooling_register(NULL, policy, NULL); 605 } 606 EXPORT_SYMBOL_GPL(cpufreq_cooling_register); 607 608 /** 609 * of_cpufreq_cooling_register - function to create cpufreq cooling device. 610 * @policy: cpufreq policy 611 * 612 * This interface function registers the cpufreq cooling device with the name 613 * "cpufreq-%s". This API can support multiple instances of cpufreq cooling 614 * devices. Using this API, the cpufreq cooling device will be linked to the 615 * device tree node provided. 616 * 617 * Using this function, the cooling device will implement the power 618 * extensions by using the Energy Model (if present). The cpus must have 619 * registered their OPPs using the OPP library. 620 * 621 * Return: a valid struct thermal_cooling_device pointer on success, 622 * and NULL on failure. 623 */ 624 struct thermal_cooling_device * 625 of_cpufreq_cooling_register(struct cpufreq_policy *policy) 626 { 627 struct device_node *np = of_get_cpu_node(policy->cpu, NULL); 628 struct thermal_cooling_device *cdev = NULL; 629 630 if (!np) { 631 pr_err("cpufreq_cooling: OF node not available for cpu%d\n", 632 policy->cpu); 633 return NULL; 634 } 635 636 if (of_find_property(np, "#cooling-cells", NULL)) { 637 struct em_perf_domain *em = em_cpu_get(policy->cpu); 638 639 cdev = __cpufreq_cooling_register(np, policy, em); 640 if (IS_ERR(cdev)) { 641 pr_err("cpufreq_cooling: cpu%d failed to register as cooling device: %ld\n", 642 policy->cpu, PTR_ERR(cdev)); 643 cdev = NULL; 644 } 645 } 646 647 of_node_put(np); 648 return cdev; 649 } 650 EXPORT_SYMBOL_GPL(of_cpufreq_cooling_register); 651 652 /** 653 * cpufreq_cooling_unregister - function to remove cpufreq cooling device. 654 * @cdev: thermal cooling device pointer. 655 * 656 * This interface function unregisters the "cpufreq-%x" cooling device. 657 */ 658 void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev) 659 { 660 struct cpufreq_cooling_device *cpufreq_cdev; 661 662 if (!cdev) 663 return; 664 665 cpufreq_cdev = cdev->devdata; 666 667 thermal_cooling_device_unregister(cdev); 668 freq_qos_remove_request(&cpufreq_cdev->qos_req); 669 free_idle_time(cpufreq_cdev); 670 kfree(cpufreq_cdev); 671 } 672 EXPORT_SYMBOL_GPL(cpufreq_cooling_unregister); 673