1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Energy Model of devices 4 * 5 * Copyright (c) 2018-2020, Arm ltd. 6 * Written by: Quentin Perret, Arm ltd. 7 * Improvements provided by: Lukasz Luba, Arm ltd. 8 */ 9 10 #define pr_fmt(fmt) "energy_model: " fmt 11 12 #include <linux/cpu.h> 13 #include <linux/cpumask.h> 14 #include <linux/debugfs.h> 15 #include <linux/energy_model.h> 16 #include <linux/sched/topology.h> 17 #include <linux/slab.h> 18 19 /* 20 * Mutex serializing the registrations of performance domains and letting 21 * callbacks defined by drivers sleep. 22 */ 23 static DEFINE_MUTEX(em_pd_mutex); 24 25 static bool _is_cpu_device(struct device *dev) 26 { 27 return (dev->bus == &cpu_subsys); 28 } 29 30 #ifdef CONFIG_DEBUG_FS 31 static struct dentry *rootdir; 32 33 static void em_debug_create_ps(struct em_perf_state *ps, struct dentry *pd) 34 { 35 struct dentry *d; 36 char name[24]; 37 38 snprintf(name, sizeof(name), "ps:%lu", ps->frequency); 39 40 /* Create per-ps directory */ 41 d = debugfs_create_dir(name, pd); 42 debugfs_create_ulong("frequency", 0444, d, &ps->frequency); 43 debugfs_create_ulong("power", 0444, d, &ps->power); 44 debugfs_create_ulong("cost", 0444, d, &ps->cost); 45 } 46 47 static int em_debug_cpus_show(struct seq_file *s, void *unused) 48 { 49 seq_printf(s, "%*pbl\n", cpumask_pr_args(to_cpumask(s->private))); 50 51 return 0; 52 } 53 DEFINE_SHOW_ATTRIBUTE(em_debug_cpus); 54 55 static int em_debug_units_show(struct seq_file *s, void *unused) 56 { 57 struct em_perf_domain *pd = s->private; 58 char *units = pd->milliwatts ? "milliWatts" : "bogoWatts"; 59 60 seq_printf(s, "%s\n", units); 61 62 return 0; 63 } 64 DEFINE_SHOW_ATTRIBUTE(em_debug_units); 65 66 static void em_debug_create_pd(struct device *dev) 67 { 68 struct dentry *d; 69 int i; 70 71 /* Create the directory of the performance domain */ 72 d = debugfs_create_dir(dev_name(dev), rootdir); 73 74 if (_is_cpu_device(dev)) 75 debugfs_create_file("cpus", 0444, d, dev->em_pd->cpus, 76 &em_debug_cpus_fops); 77 78 debugfs_create_file("units", 0444, d, dev->em_pd, &em_debug_units_fops); 79 80 /* Create a sub-directory for each performance state */ 81 for (i = 0; i < dev->em_pd->nr_perf_states; i++) 82 em_debug_create_ps(&dev->em_pd->table[i], d); 83 84 } 85 86 static void em_debug_remove_pd(struct device *dev) 87 { 88 struct dentry *debug_dir; 89 90 debug_dir = debugfs_lookup(dev_name(dev), rootdir); 91 debugfs_remove_recursive(debug_dir); 92 } 93 94 static int __init em_debug_init(void) 95 { 96 /* Create /sys/kernel/debug/energy_model directory */ 97 rootdir = debugfs_create_dir("energy_model", NULL); 98 99 return 0; 100 } 101 fs_initcall(em_debug_init); 102 #else /* CONFIG_DEBUG_FS */ 103 static void em_debug_create_pd(struct device *dev) {} 104 static void em_debug_remove_pd(struct device *dev) {} 105 #endif 106 107 static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, 108 int nr_states, struct em_data_callback *cb) 109 { 110 unsigned long opp_eff, prev_opp_eff = ULONG_MAX; 111 unsigned long power, freq, prev_freq = 0; 112 struct em_perf_state *table; 113 int i, ret; 114 u64 fmax; 115 116 table = kcalloc(nr_states, sizeof(*table), GFP_KERNEL); 117 if (!table) 118 return -ENOMEM; 119 120 /* Build the list of performance states for this performance domain */ 121 for (i = 0, freq = 0; i < nr_states; i++, freq++) { 122 /* 123 * active_power() is a driver callback which ceils 'freq' to 124 * lowest performance state of 'dev' above 'freq' and updates 125 * 'power' and 'freq' accordingly. 126 */ 127 ret = cb->active_power(&power, &freq, dev); 128 if (ret) { 129 dev_err(dev, "EM: invalid perf. state: %d\n", 130 ret); 131 goto free_ps_table; 132 } 133 134 /* 135 * We expect the driver callback to increase the frequency for 136 * higher performance states. 137 */ 138 if (freq <= prev_freq) { 139 dev_err(dev, "EM: non-increasing freq: %lu\n", 140 freq); 141 goto free_ps_table; 142 } 143 144 /* 145 * The power returned by active_state() is expected to be 146 * positive and to fit into 16 bits. 147 */ 148 if (!power || power > EM_MAX_POWER) { 149 dev_err(dev, "EM: invalid power: %lu\n", 150 power); 151 goto free_ps_table; 152 } 153 154 table[i].power = power; 155 table[i].frequency = prev_freq = freq; 156 157 /* 158 * The hertz/watts efficiency ratio should decrease as the 159 * frequency grows on sane platforms. But this isn't always 160 * true in practice so warn the user if a higher OPP is more 161 * power efficient than a lower one. 162 */ 163 opp_eff = freq / power; 164 if (opp_eff >= prev_opp_eff) 165 dev_dbg(dev, "EM: hertz/watts ratio non-monotonically decreasing: em_perf_state %d >= em_perf_state%d\n", 166 i, i - 1); 167 prev_opp_eff = opp_eff; 168 } 169 170 /* Compute the cost of each performance state. */ 171 fmax = (u64) table[nr_states - 1].frequency; 172 for (i = 0; i < nr_states; i++) { 173 unsigned long power_res = em_scale_power(table[i].power); 174 175 table[i].cost = div64_u64(fmax * power_res, 176 table[i].frequency); 177 } 178 179 pd->table = table; 180 pd->nr_perf_states = nr_states; 181 182 return 0; 183 184 free_ps_table: 185 kfree(table); 186 return -EINVAL; 187 } 188 189 static int em_create_pd(struct device *dev, int nr_states, 190 struct em_data_callback *cb, cpumask_t *cpus) 191 { 192 struct em_perf_domain *pd; 193 struct device *cpu_dev; 194 int cpu, ret; 195 196 if (_is_cpu_device(dev)) { 197 pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL); 198 if (!pd) 199 return -ENOMEM; 200 201 cpumask_copy(em_span_cpus(pd), cpus); 202 } else { 203 pd = kzalloc(sizeof(*pd), GFP_KERNEL); 204 if (!pd) 205 return -ENOMEM; 206 } 207 208 ret = em_create_perf_table(dev, pd, nr_states, cb); 209 if (ret) { 210 kfree(pd); 211 return ret; 212 } 213 214 if (_is_cpu_device(dev)) 215 for_each_cpu(cpu, cpus) { 216 cpu_dev = get_cpu_device(cpu); 217 cpu_dev->em_pd = pd; 218 } 219 220 dev->em_pd = pd; 221 222 return 0; 223 } 224 225 /** 226 * em_pd_get() - Return the performance domain for a device 227 * @dev : Device to find the performance domain for 228 * 229 * Returns the performance domain to which @dev belongs, or NULL if it doesn't 230 * exist. 231 */ 232 struct em_perf_domain *em_pd_get(struct device *dev) 233 { 234 if (IS_ERR_OR_NULL(dev)) 235 return NULL; 236 237 return dev->em_pd; 238 } 239 EXPORT_SYMBOL_GPL(em_pd_get); 240 241 /** 242 * em_cpu_get() - Return the performance domain for a CPU 243 * @cpu : CPU to find the performance domain for 244 * 245 * Returns the performance domain to which @cpu belongs, or NULL if it doesn't 246 * exist. 247 */ 248 struct em_perf_domain *em_cpu_get(int cpu) 249 { 250 struct device *cpu_dev; 251 252 cpu_dev = get_cpu_device(cpu); 253 if (!cpu_dev) 254 return NULL; 255 256 return em_pd_get(cpu_dev); 257 } 258 EXPORT_SYMBOL_GPL(em_cpu_get); 259 260 /** 261 * em_dev_register_perf_domain() - Register the Energy Model (EM) for a device 262 * @dev : Device for which the EM is to register 263 * @nr_states : Number of performance states to register 264 * @cb : Callback functions providing the data of the Energy Model 265 * @cpus : Pointer to cpumask_t, which in case of a CPU device is 266 * obligatory. It can be taken from i.e. 'policy->cpus'. For other 267 * type of devices this should be set to NULL. 268 * @milliwatts : Flag indicating that the power values are in milliWatts or 269 * in some other scale. It must be set properly. 270 * 271 * Create Energy Model tables for a performance domain using the callbacks 272 * defined in cb. 273 * 274 * The @milliwatts is important to set with correct value. Some kernel 275 * sub-systems might rely on this flag and check if all devices in the EM are 276 * using the same scale. 277 * 278 * If multiple clients register the same performance domain, all but the first 279 * registration will be ignored. 280 * 281 * Return 0 on success 282 */ 283 int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, 284 struct em_data_callback *cb, cpumask_t *cpus, 285 bool milliwatts) 286 { 287 unsigned long cap, prev_cap = 0; 288 int cpu, ret; 289 290 if (!dev || !nr_states || !cb) 291 return -EINVAL; 292 293 /* 294 * Use a mutex to serialize the registration of performance domains and 295 * let the driver-defined callback functions sleep. 296 */ 297 mutex_lock(&em_pd_mutex); 298 299 if (dev->em_pd) { 300 ret = -EEXIST; 301 goto unlock; 302 } 303 304 if (_is_cpu_device(dev)) { 305 if (!cpus) { 306 dev_err(dev, "EM: invalid CPU mask\n"); 307 ret = -EINVAL; 308 goto unlock; 309 } 310 311 for_each_cpu(cpu, cpus) { 312 if (em_cpu_get(cpu)) { 313 dev_err(dev, "EM: exists for CPU%d\n", cpu); 314 ret = -EEXIST; 315 goto unlock; 316 } 317 /* 318 * All CPUs of a domain must have the same 319 * micro-architecture since they all share the same 320 * table. 321 */ 322 cap = arch_scale_cpu_capacity(cpu); 323 if (prev_cap && prev_cap != cap) { 324 dev_err(dev, "EM: CPUs of %*pbl must have the same capacity\n", 325 cpumask_pr_args(cpus)); 326 327 ret = -EINVAL; 328 goto unlock; 329 } 330 prev_cap = cap; 331 } 332 } 333 334 ret = em_create_pd(dev, nr_states, cb, cpus); 335 if (ret) 336 goto unlock; 337 338 dev->em_pd->milliwatts = milliwatts; 339 340 em_debug_create_pd(dev); 341 dev_info(dev, "EM: created perf domain\n"); 342 343 unlock: 344 mutex_unlock(&em_pd_mutex); 345 return ret; 346 } 347 EXPORT_SYMBOL_GPL(em_dev_register_perf_domain); 348 349 /** 350 * em_dev_unregister_perf_domain() - Unregister Energy Model (EM) for a device 351 * @dev : Device for which the EM is registered 352 * 353 * Unregister the EM for the specified @dev (but not a CPU device). 354 */ 355 void em_dev_unregister_perf_domain(struct device *dev) 356 { 357 if (IS_ERR_OR_NULL(dev) || !dev->em_pd) 358 return; 359 360 if (_is_cpu_device(dev)) 361 return; 362 363 /* 364 * The mutex separates all register/unregister requests and protects 365 * from potential clean-up/setup issues in the debugfs directories. 366 * The debugfs directory name is the same as device's name. 367 */ 368 mutex_lock(&em_pd_mutex); 369 em_debug_remove_pd(dev); 370 371 kfree(dev->em_pd->table); 372 kfree(dev->em_pd); 373 dev->em_pd = NULL; 374 mutex_unlock(&em_pd_mutex); 375 } 376 EXPORT_SYMBOL_GPL(em_dev_unregister_perf_domain); 377