// SPDX-License-Identifier: GPL-2.0
/*
 * Energy Model of devices
 *
 * Copyright (c) 2018-2020, Arm ltd.
 * Written by: Quentin Perret, Arm ltd.
 * Improvements provided by: Lukasz Luba, Arm ltd.
 */

#define pr_fmt(fmt) "energy_model: " fmt

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/debugfs.h>
#include <linux/energy_model.h>
#include <linux/sched/topology.h>
#include <linux/slab.h>

/*
 * Mutex serializing the registrations of performance domains and letting
 * callbacks defined by drivers sleep.
 */
static DEFINE_MUTEX(em_pd_mutex);

static bool _is_cpu_device(struct device *dev)
{
        return (dev->bus == &cpu_subsys);
}

#ifdef CONFIG_DEBUG_FS
static struct dentry *rootdir;

static void em_debug_create_ps(struct em_perf_state *ps, struct dentry *pd)
{
        struct dentry *d;
        char name[24];

        snprintf(name, sizeof(name), "ps:%lu", ps->frequency);

        /* Create per-ps directory */
        d = debugfs_create_dir(name, pd);
        debugfs_create_ulong("frequency", 0444, d, &ps->frequency);
        debugfs_create_ulong("power", 0444, d, &ps->power);
        debugfs_create_ulong("cost", 0444, d, &ps->cost);
}

static int em_debug_cpus_show(struct seq_file *s, void *unused)
{
        seq_printf(s, "%*pbl\n", cpumask_pr_args(to_cpumask(s->private)));

        return 0;
}
DEFINE_SHOW_ATTRIBUTE(em_debug_cpus);

static int em_debug_units_show(struct seq_file *s, void *unused)
{
        struct em_perf_domain *pd = s->private;
        char *units = pd->milliwatts ? "milliWatts" : "bogoWatts";

        seq_printf(s, "%s\n", units);

        return 0;
}
DEFINE_SHOW_ATTRIBUTE(em_debug_units);

static void em_debug_create_pd(struct device *dev)
{
        struct dentry *d;
        int i;

        /* Create the directory of the performance domain */
        d = debugfs_create_dir(dev_name(dev), rootdir);

        if (_is_cpu_device(dev))
                debugfs_create_file("cpus", 0444, d, dev->em_pd->cpus,
                                    &em_debug_cpus_fops);

        debugfs_create_file("units", 0444, d, dev->em_pd, &em_debug_units_fops);

        /* Create a sub-directory for each performance state */
        for (i = 0; i < dev->em_pd->nr_perf_states; i++)
                em_debug_create_ps(&dev->em_pd->table[i], d);
}

static void em_debug_remove_pd(struct device *dev)
{
        struct dentry *debug_dir;

        debug_dir = debugfs_lookup(dev_name(dev), rootdir);
        debugfs_remove_recursive(debug_dir);
}

static int __init em_debug_init(void)
{
        /* Create /sys/kernel/debug/energy_model directory */
        rootdir = debugfs_create_dir("energy_model", NULL);

        return 0;
}
fs_initcall(em_debug_init);
#else /* CONFIG_DEBUG_FS */
static void em_debug_create_pd(struct device *dev) {}
static void em_debug_remove_pd(struct device *dev) {}
#endif
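
/*
 * Illustrative layout of the debugfs hierarchy produced by the helpers
 * above (the 'cpu0' directory name and the 500000 frequency are examples
 * only; one directory is created per registered performance domain):
 *
 *   /sys/kernel/debug/energy_model/
 *       cpu0/
 *           cpus            - CPUs spanned by the domain (CPU devices only)
 *           units           - "milliWatts" or "bogoWatts"
 *           ps:500000/      - one sub-directory per performance state
 *               frequency
 *               power
 *               cost
 */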

static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
                                int nr_states, struct em_data_callback *cb)
{
        unsigned long opp_eff, prev_opp_eff = ULONG_MAX;
        unsigned long power, freq, prev_freq = 0;
        struct em_perf_state *table;
        int i, ret;
        u64 fmax;

        table = kcalloc(nr_states, sizeof(*table), GFP_KERNEL);
        if (!table)
                return -ENOMEM;

        /* Build the list of performance states for this performance domain */
        for (i = 0, freq = 0; i < nr_states; i++, freq++) {
                /*
                 * active_power() is a driver callback which ceils 'freq' to
                 * the lowest performance state of 'dev' above 'freq' and
                 * updates 'power' and 'freq' accordingly.
                 */
                ret = cb->active_power(&power, &freq, dev);
                if (ret) {
                        dev_err(dev, "EM: invalid perf. state: %d\n",
                                ret);
                        goto free_ps_table;
                }

                /*
                 * We expect the driver callback to increase the frequency for
                 * higher performance states.
                 */
                if (freq <= prev_freq) {
                        dev_err(dev, "EM: non-increasing freq: %lu\n",
                                freq);
                        goto free_ps_table;
                }

                /*
                 * The power returned by active_power() is expected to be
                 * positive and to fit into 16 bits.
                 */
                if (!power || power > EM_MAX_POWER) {
                        dev_err(dev, "EM: invalid power: %lu\n",
                                power);
                        goto free_ps_table;
                }

                table[i].power = power;
                table[i].frequency = prev_freq = freq;

                /*
                 * The hertz/watts efficiency ratio should decrease as the
                 * frequency grows on sane platforms. But this isn't always
                 * true in practice so warn the user if a higher OPP is more
                 * power efficient than a lower one.
                 */
                opp_eff = freq / power;
                if (opp_eff >= prev_opp_eff)
                        dev_dbg(dev, "EM: hertz/watts ratio non-monotonically decreasing: em_perf_state %d >= em_perf_state %d\n",
                                i, i - 1);
                prev_opp_eff = opp_eff;
        }

        /* Compute the cost of each performance state. */
        fmax = (u64) table[nr_states - 1].frequency;
        for (i = 0; i < nr_states; i++) {
                table[i].cost = div64_u64(fmax * table[i].power,
                                          table[i].frequency);
        }

        pd->table = table;
        pd->nr_perf_states = nr_states;

        return 0;

free_ps_table:
        kfree(table);
        return -EINVAL;
}
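
/*
 * Worked example of the cost computation above (hypothetical numbers): with
 * three performance states of (500000 kHz, 100), (1000000 kHz, 300) and
 * (1500000 kHz, 700), fmax = 1500000 and:
 *
 *   cost[0] = 1500000 * 100 / 500000  = 300
 *   cost[1] = 1500000 * 300 / 1000000 = 450
 *   cost[2] = 1500000 * 700 / 1500000 = 700
 *
 * i.e. 'cost' is each state's power scaled by fmax / frequency, which makes
 * the states directly comparable when estimating energy.
 */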

static int em_create_pd(struct device *dev, int nr_states,
                        struct em_data_callback *cb, cpumask_t *cpus)
{
        struct em_perf_domain *pd;
        struct device *cpu_dev;
        int cpu, ret;

        if (_is_cpu_device(dev)) {
                pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL);
                if (!pd)
                        return -ENOMEM;

                cpumask_copy(em_span_cpus(pd), cpus);
        } else {
                pd = kzalloc(sizeof(*pd), GFP_KERNEL);
                if (!pd)
                        return -ENOMEM;
        }

        ret = em_create_perf_table(dev, pd, nr_states, cb);
        if (ret) {
                kfree(pd);
                return ret;
        }

        if (_is_cpu_device(dev))
                for_each_cpu(cpu, cpus) {
                        cpu_dev = get_cpu_device(cpu);
                        cpu_dev->em_pd = pd;
                }

        dev->em_pd = pd;

        return 0;
}

/**
 * em_pd_get() - Return the performance domain for a device
 * @dev : Device to find the performance domain for
 *
 * Returns the performance domain to which @dev belongs, or NULL if it doesn't
 * exist.
 */
struct em_perf_domain *em_pd_get(struct device *dev)
{
        if (IS_ERR_OR_NULL(dev))
                return NULL;

        return dev->em_pd;
}
EXPORT_SYMBOL_GPL(em_pd_get);

/**
 * em_cpu_get() - Return the performance domain for a CPU
 * @cpu : CPU to find the performance domain for
 *
 * Returns the performance domain to which @cpu belongs, or NULL if it doesn't
 * exist.
 */
struct em_perf_domain *em_cpu_get(int cpu)
{
        struct device *cpu_dev;

        cpu_dev = get_cpu_device(cpu);
        if (!cpu_dev)
                return NULL;

        return em_pd_get(cpu_dev);
}
EXPORT_SYMBOL_GPL(em_cpu_get);

/**
 * em_dev_register_perf_domain() - Register the Energy Model (EM) for a device
 * @dev : Device for which the EM is to be registered
 * @nr_states : Number of performance states to register
 * @cb : Callback functions providing the data of the Energy Model
 * @cpus : Pointer to cpumask_t, which in case of a CPU device is
 *         obligatory. It can be taken from e.g. 'policy->cpus'. For other
 *         types of devices it should be set to NULL.
 * @milliwatts : Flag indicating whether the power values are in milliWatts
 *         or in some other scale. It must be set properly.
 *
 * Create Energy Model tables for a performance domain using the callbacks
 * defined in cb.
 *
 * It is important to set @milliwatts to the correct value. Some kernel
 * sub-systems might rely on this flag and check that all devices in the EM
 * use the same scale.
 *
 * If multiple clients register the same performance domain, all but the first
 * registration will be ignored.
 *
 * Return 0 on success
 */
int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
                                struct em_data_callback *cb, cpumask_t *cpus,
                                bool milliwatts)
{
        unsigned long cap, prev_cap = 0;
        int cpu, ret;

        if (!dev || !nr_states || !cb)
                return -EINVAL;

        /*
         * Use a mutex to serialize the registration of performance domains and
         * let the driver-defined callback functions sleep.
         */
        mutex_lock(&em_pd_mutex);

        if (dev->em_pd) {
                ret = -EEXIST;
                goto unlock;
        }

        if (_is_cpu_device(dev)) {
                if (!cpus) {
                        dev_err(dev, "EM: invalid CPU mask\n");
                        ret = -EINVAL;
                        goto unlock;
                }

                for_each_cpu(cpu, cpus) {
                        if (em_cpu_get(cpu)) {
                                dev_err(dev, "EM: exists for CPU%d\n", cpu);
                                ret = -EEXIST;
                                goto unlock;
                        }
                        /*
                         * All CPUs of a domain must have the same
                         * micro-architecture since they all share the same
                         * table.
                         */
                        cap = arch_scale_cpu_capacity(cpu);
                        if (prev_cap && prev_cap != cap) {
                                dev_err(dev, "EM: CPUs of %*pbl must have the same capacity\n",
                                        cpumask_pr_args(cpus));

                                ret = -EINVAL;
                                goto unlock;
                        }
                        prev_cap = cap;
                }
        }

        ret = em_create_pd(dev, nr_states, cb, cpus);
        if (ret)
                goto unlock;

        dev->em_pd->milliwatts = milliwatts;

        em_debug_create_pd(dev);
        dev_info(dev, "EM: created perf domain\n");

unlock:
        mutex_unlock(&em_pd_mutex);
        return ret;
}
EXPORT_SYMBOL_GPL(em_dev_register_perf_domain);
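
/*
 * Illustrative sketch (not part of this file): a hypothetical driver would
 * implement the active_power() callback and register its performance domain
 * roughly as below. The 'foo_*' names and the OPP numbers are made up for
 * the example.
 */
static int foo_active_power(unsigned long *power, unsigned long *freq,
                            struct device *dev)
{
        /* Hypothetical OPP table: frequencies and matching power values */
        static const unsigned long foo_freq[] = {  500000, 1000000, 1500000 };
        static const unsigned long foo_power[] = {    100,     300,     700 };
        int i;

        /* Ceil the requested frequency to the next supported OPP */
        for (i = 0; i < ARRAY_SIZE(foo_freq); i++) {
                if (foo_freq[i] >= *freq) {
                        *freq = foo_freq[i];
                        *power = foo_power[i];
                        return 0;
                }
        }

        return -EINVAL;
}

static struct em_data_callback foo_em_cb = { .active_power = foo_active_power };

static int foo_register_em(struct device *dev, cpumask_t *cpus)
{
        /* 3 perf states; 'true' means the power values are in milliWatts */
        return em_dev_register_perf_domain(dev, 3, &foo_em_cb, cpus, true);
}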

/**
 * em_dev_unregister_perf_domain() - Unregister Energy Model (EM) for a device
 * @dev : Device for which the EM is registered
 *
 * Unregister the EM for the specified @dev (but not a CPU device).
 */
void em_dev_unregister_perf_domain(struct device *dev)
{
        if (IS_ERR_OR_NULL(dev) || !dev->em_pd)
                return;

        if (_is_cpu_device(dev))
                return;

        /*
         * The mutex separates all register/unregister requests and protects
         * from potential clean-up/setup issues in the debugfs directories.
         * The debugfs directory name is the same as the device's name.
         */
        mutex_lock(&em_pd_mutex);
        em_debug_remove_pd(dev);

        kfree(dev->em_pd->table);
        kfree(dev->em_pd);
        dev->em_pd = NULL;
        mutex_unlock(&em_pd_mutex);
}
EXPORT_SYMBOL_GPL(em_dev_unregister_perf_domain);
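
/*
 * Illustrative sketch (hypothetical helper, not part of this file): looking
 * up the performance domain of a CPU and walking its performance states.
 */
static void foo_dump_cpu_em(int cpu)
{
        struct em_perf_domain *pd = em_cpu_get(cpu);
        int i;

        if (!pd)
                return;

        for (i = 0; i < pd->nr_perf_states; i++)
                pr_info("CPU%d: freq=%lu power=%lu cost=%lu\n", cpu,
                        pd->table[i].frequency, pd->table[i].power,
                        pd->table[i].cost);
}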