1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Energy Model of devices 4 * 5 * Copyright (c) 2018-2021, Arm ltd. 6 * Written by: Quentin Perret, Arm ltd. 7 * Improvements provided by: Lukasz Luba, Arm ltd. 8 */ 9 10 #define pr_fmt(fmt) "energy_model: " fmt 11 12 #include <linux/cpu.h> 13 #include <linux/cpumask.h> 14 #include <linux/debugfs.h> 15 #include <linux/energy_model.h> 16 #include <linux/sched/topology.h> 17 #include <linux/slab.h> 18 19 /* 20 * Mutex serializing the registrations of performance domains and letting 21 * callbacks defined by drivers sleep. 22 */ 23 static DEFINE_MUTEX(em_pd_mutex); 24 25 static bool _is_cpu_device(struct device *dev) 26 { 27 return (dev->bus == &cpu_subsys); 28 } 29 30 #ifdef CONFIG_DEBUG_FS 31 static struct dentry *rootdir; 32 33 static void em_debug_create_ps(struct em_perf_state *ps, struct dentry *pd) 34 { 35 struct dentry *d; 36 char name[24]; 37 38 snprintf(name, sizeof(name), "ps:%lu", ps->frequency); 39 40 /* Create per-ps directory */ 41 d = debugfs_create_dir(name, pd); 42 debugfs_create_ulong("frequency", 0444, d, &ps->frequency); 43 debugfs_create_ulong("power", 0444, d, &ps->power); 44 debugfs_create_ulong("cost", 0444, d, &ps->cost); 45 debugfs_create_ulong("inefficient", 0444, d, &ps->flags); 46 } 47 48 static int em_debug_cpus_show(struct seq_file *s, void *unused) 49 { 50 seq_printf(s, "%*pbl\n", cpumask_pr_args(to_cpumask(s->private))); 51 52 return 0; 53 } 54 DEFINE_SHOW_ATTRIBUTE(em_debug_cpus); 55 56 static int em_debug_units_show(struct seq_file *s, void *unused) 57 { 58 struct em_perf_domain *pd = s->private; 59 char *units = (pd->flags & EM_PERF_DOMAIN_MILLIWATTS) ? 60 "milliWatts" : "bogoWatts"; 61 62 seq_printf(s, "%s\n", units); 63 64 return 0; 65 } 66 DEFINE_SHOW_ATTRIBUTE(em_debug_units); 67 68 static int em_debug_skip_inefficiencies_show(struct seq_file *s, void *unused) 69 { 70 struct em_perf_domain *pd = s->private; 71 int enabled = (pd->flags & EM_PERF_DOMAIN_SKIP_INEFFICIENCIES) ? 1 : 0; 72 73 seq_printf(s, "%d\n", enabled); 74 75 return 0; 76 } 77 DEFINE_SHOW_ATTRIBUTE(em_debug_skip_inefficiencies); 78 79 static void em_debug_create_pd(struct device *dev) 80 { 81 struct dentry *d; 82 int i; 83 84 /* Create the directory of the performance domain */ 85 d = debugfs_create_dir(dev_name(dev), rootdir); 86 87 if (_is_cpu_device(dev)) 88 debugfs_create_file("cpus", 0444, d, dev->em_pd->cpus, 89 &em_debug_cpus_fops); 90 91 debugfs_create_file("units", 0444, d, dev->em_pd, &em_debug_units_fops); 92 debugfs_create_file("skip-inefficiencies", 0444, d, dev->em_pd, 93 &em_debug_skip_inefficiencies_fops); 94 95 /* Create a sub-directory for each performance state */ 96 for (i = 0; i < dev->em_pd->nr_perf_states; i++) 97 em_debug_create_ps(&dev->em_pd->table[i], d); 98 99 } 100 101 static void em_debug_remove_pd(struct device *dev) 102 { 103 struct dentry *debug_dir; 104 105 debug_dir = debugfs_lookup(dev_name(dev), rootdir); 106 debugfs_remove_recursive(debug_dir); 107 } 108 109 static int __init em_debug_init(void) 110 { 111 /* Create /sys/kernel/debug/energy_model directory */ 112 rootdir = debugfs_create_dir("energy_model", NULL); 113 114 return 0; 115 } 116 fs_initcall(em_debug_init); 117 #else /* CONFIG_DEBUG_FS */ 118 static void em_debug_create_pd(struct device *dev) {} 119 static void em_debug_remove_pd(struct device *dev) {} 120 #endif 121 122 static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, 123 int nr_states, struct em_data_callback *cb) 124 { 125 unsigned long power, freq, prev_freq = 0, prev_cost = ULONG_MAX; 126 struct em_perf_state *table; 127 int i, ret; 128 u64 fmax; 129 130 table = kcalloc(nr_states, sizeof(*table), GFP_KERNEL); 131 if (!table) 132 return -ENOMEM; 133 134 /* Build the list of performance states for this performance domain */ 135 for (i = 0, freq = 0; i < nr_states; i++, freq++) { 136 /* 137 * active_power() is a driver callback which ceils 'freq' to 138 * lowest performance state of 'dev' above 'freq' and updates 139 * 'power' and 'freq' accordingly. 140 */ 141 ret = cb->active_power(&power, &freq, dev); 142 if (ret) { 143 dev_err(dev, "EM: invalid perf. state: %d\n", 144 ret); 145 goto free_ps_table; 146 } 147 148 /* 149 * We expect the driver callback to increase the frequency for 150 * higher performance states. 151 */ 152 if (freq <= prev_freq) { 153 dev_err(dev, "EM: non-increasing freq: %lu\n", 154 freq); 155 goto free_ps_table; 156 } 157 158 /* 159 * The power returned by active_state() is expected to be 160 * positive and to fit into 16 bits. 161 */ 162 if (!power || power > EM_MAX_POWER) { 163 dev_err(dev, "EM: invalid power: %lu\n", 164 power); 165 goto free_ps_table; 166 } 167 168 table[i].power = power; 169 table[i].frequency = prev_freq = freq; 170 } 171 172 /* Compute the cost of each performance state. */ 173 fmax = (u64) table[nr_states - 1].frequency; 174 for (i = nr_states - 1; i >= 0; i--) { 175 unsigned long power_res = em_scale_power(table[i].power); 176 177 table[i].cost = div64_u64(fmax * power_res, 178 table[i].frequency); 179 if (table[i].cost >= prev_cost) { 180 table[i].flags = EM_PERF_STATE_INEFFICIENT; 181 dev_dbg(dev, "EM: OPP:%lu is inefficient\n", 182 table[i].frequency); 183 } else { 184 prev_cost = table[i].cost; 185 } 186 } 187 188 pd->table = table; 189 pd->nr_perf_states = nr_states; 190 191 return 0; 192 193 free_ps_table: 194 kfree(table); 195 return -EINVAL; 196 } 197 198 static int em_create_pd(struct device *dev, int nr_states, 199 struct em_data_callback *cb, cpumask_t *cpus) 200 { 201 struct em_perf_domain *pd; 202 struct device *cpu_dev; 203 int cpu, ret; 204 205 if (_is_cpu_device(dev)) { 206 pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL); 207 if (!pd) 208 return -ENOMEM; 209 210 cpumask_copy(em_span_cpus(pd), cpus); 211 } else { 212 pd = kzalloc(sizeof(*pd), GFP_KERNEL); 213 if (!pd) 214 return -ENOMEM; 215 } 216 217 ret = em_create_perf_table(dev, pd, nr_states, cb); 218 if (ret) { 219 kfree(pd); 220 return ret; 221 } 222 223 if (_is_cpu_device(dev)) 224 for_each_cpu(cpu, cpus) { 225 cpu_dev = get_cpu_device(cpu); 226 cpu_dev->em_pd = pd; 227 } 228 229 dev->em_pd = pd; 230 231 return 0; 232 } 233 234 /** 235 * em_pd_get() - Return the performance domain for a device 236 * @dev : Device to find the performance domain for 237 * 238 * Returns the performance domain to which @dev belongs, or NULL if it doesn't 239 * exist. 240 */ 241 struct em_perf_domain *em_pd_get(struct device *dev) 242 { 243 if (IS_ERR_OR_NULL(dev)) 244 return NULL; 245 246 return dev->em_pd; 247 } 248 EXPORT_SYMBOL_GPL(em_pd_get); 249 250 /** 251 * em_cpu_get() - Return the performance domain for a CPU 252 * @cpu : CPU to find the performance domain for 253 * 254 * Returns the performance domain to which @cpu belongs, or NULL if it doesn't 255 * exist. 256 */ 257 struct em_perf_domain *em_cpu_get(int cpu) 258 { 259 struct device *cpu_dev; 260 261 cpu_dev = get_cpu_device(cpu); 262 if (!cpu_dev) 263 return NULL; 264 265 return em_pd_get(cpu_dev); 266 } 267 EXPORT_SYMBOL_GPL(em_cpu_get); 268 269 /** 270 * em_dev_register_perf_domain() - Register the Energy Model (EM) for a device 271 * @dev : Device for which the EM is to register 272 * @nr_states : Number of performance states to register 273 * @cb : Callback functions providing the data of the Energy Model 274 * @cpus : Pointer to cpumask_t, which in case of a CPU device is 275 * obligatory. It can be taken from i.e. 'policy->cpus'. For other 276 * type of devices this should be set to NULL. 277 * @milliwatts : Flag indicating that the power values are in milliWatts or 278 * in some other scale. It must be set properly. 279 * 280 * Create Energy Model tables for a performance domain using the callbacks 281 * defined in cb. 282 * 283 * The @milliwatts is important to set with correct value. Some kernel 284 * sub-systems might rely on this flag and check if all devices in the EM are 285 * using the same scale. 286 * 287 * If multiple clients register the same performance domain, all but the first 288 * registration will be ignored. 289 * 290 * Return 0 on success 291 */ 292 int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, 293 struct em_data_callback *cb, cpumask_t *cpus, 294 bool milliwatts) 295 { 296 unsigned long cap, prev_cap = 0; 297 int cpu, ret; 298 299 if (!dev || !nr_states || !cb) 300 return -EINVAL; 301 302 /* 303 * Use a mutex to serialize the registration of performance domains and 304 * let the driver-defined callback functions sleep. 305 */ 306 mutex_lock(&em_pd_mutex); 307 308 if (dev->em_pd) { 309 ret = -EEXIST; 310 goto unlock; 311 } 312 313 if (_is_cpu_device(dev)) { 314 if (!cpus) { 315 dev_err(dev, "EM: invalid CPU mask\n"); 316 ret = -EINVAL; 317 goto unlock; 318 } 319 320 for_each_cpu(cpu, cpus) { 321 if (em_cpu_get(cpu)) { 322 dev_err(dev, "EM: exists for CPU%d\n", cpu); 323 ret = -EEXIST; 324 goto unlock; 325 } 326 /* 327 * All CPUs of a domain must have the same 328 * micro-architecture since they all share the same 329 * table. 330 */ 331 cap = arch_scale_cpu_capacity(cpu); 332 if (prev_cap && prev_cap != cap) { 333 dev_err(dev, "EM: CPUs of %*pbl must have the same capacity\n", 334 cpumask_pr_args(cpus)); 335 336 ret = -EINVAL; 337 goto unlock; 338 } 339 prev_cap = cap; 340 } 341 } 342 343 ret = em_create_pd(dev, nr_states, cb, cpus); 344 if (ret) 345 goto unlock; 346 347 if (milliwatts) 348 dev->em_pd->flags |= EM_PERF_DOMAIN_MILLIWATTS; 349 350 em_debug_create_pd(dev); 351 dev_info(dev, "EM: created perf domain\n"); 352 353 unlock: 354 mutex_unlock(&em_pd_mutex); 355 return ret; 356 } 357 EXPORT_SYMBOL_GPL(em_dev_register_perf_domain); 358 359 /** 360 * em_dev_unregister_perf_domain() - Unregister Energy Model (EM) for a device 361 * @dev : Device for which the EM is registered 362 * 363 * Unregister the EM for the specified @dev (but not a CPU device). 364 */ 365 void em_dev_unregister_perf_domain(struct device *dev) 366 { 367 if (IS_ERR_OR_NULL(dev) || !dev->em_pd) 368 return; 369 370 if (_is_cpu_device(dev)) 371 return; 372 373 /* 374 * The mutex separates all register/unregister requests and protects 375 * from potential clean-up/setup issues in the debugfs directories. 376 * The debugfs directory name is the same as device's name. 377 */ 378 mutex_lock(&em_pd_mutex); 379 em_debug_remove_pd(dev); 380 381 kfree(dev->em_pd->table); 382 kfree(dev->em_pd); 383 dev->em_pd = NULL; 384 mutex_unlock(&em_pd_mutex); 385 } 386 EXPORT_SYMBOL_GPL(em_dev_unregister_perf_domain); 387