xref: /openbmc/linux/drivers/base/arch_topology.c (revision 0760aad038b5a032c31ea124feed63d88627d2f1)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Arch specific cpu topology information
4  *
5  * Copyright (C) 2016, ARM Ltd.
6  * Written by: Juri Lelli, ARM Ltd.
7  */
8 
9 #include <linux/acpi.h>
10 #include <linux/cpu.h>
11 #include <linux/cpufreq.h>
12 #include <linux/device.h>
13 #include <linux/of.h>
14 #include <linux/slab.h>
15 #include <linux/string.h>
16 #include <linux/sched/topology.h>
17 #include <linux/cpuset.h>
18 #include <linux/cpumask.h>
19 #include <linux/init.h>
20 #include <linux/percpu.h>
21 #include <linux/sched.h>
22 #include <linux/smp.h>
23 
/*
 * Default implementation: reports that frequency counters are not
 * available for @cpus. The symbol is weak, so another definition of the
 * same name takes precedence at link time.
 */
__weak bool arch_freq_counters_available(struct cpumask *cpus)
{
	return false;
}
28 DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE;
29 
30 void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq,
31 			 unsigned long max_freq)
32 {
33 	unsigned long scale;
34 	int i;
35 
36 	/*
37 	 * If the use of counters for FIE is enabled, just return as we don't
38 	 * want to update the scale factor with information from CPUFREQ.
39 	 * Instead the scale factor will be updated from arch_scale_freq_tick.
40 	 */
41 	if (arch_freq_counters_available(cpus))
42 		return;
43 
44 	scale = (cur_freq << SCHED_CAPACITY_SHIFT) / max_freq;
45 
46 	for_each_cpu(i, cpus)
47 		per_cpu(freq_scale, i) = scale;
48 }
49 
/* Per-CPU capacity value; starts at SCHED_CAPACITY_SCALE. */
DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE;

/*
 * Store @capacity as the cpu_scale value of @cpu.
 */
void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity)
{
	per_cpu(cpu_scale, cpu) = capacity;
}
56 
57 DEFINE_PER_CPU(unsigned long, thermal_pressure);
58 
59 void topology_set_thermal_pressure(const struct cpumask *cpus,
60 			       unsigned long th_pressure)
61 {
62 	int cpu;
63 
64 	for_each_cpu(cpu, cpus)
65 		WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure);
66 }
67 
68 static ssize_t cpu_capacity_show(struct device *dev,
69 				 struct device_attribute *attr,
70 				 char *buf)
71 {
72 	struct cpu *cpu = container_of(dev, struct cpu, dev);
73 
74 	return sprintf(buf, "%lu\n", topology_get_cpu_scale(cpu->dev.id));
75 }
76 
77 static void update_topology_flags_workfn(struct work_struct *work);
78 static DECLARE_WORK(update_topology_flags_work, update_topology_flags_workfn);
79 
80 static DEVICE_ATTR_RO(cpu_capacity);
81 
82 static int register_cpu_capacity_sysctl(void)
83 {
84 	int i;
85 	struct device *cpu;
86 
87 	for_each_possible_cpu(i) {
88 		cpu = get_cpu_device(i);
89 		if (!cpu) {
90 			pr_err("%s: too early to get CPU%d device!\n",
91 			       __func__, i);
92 			continue;
93 		}
94 		device_create_file(cpu, &dev_attr_cpu_capacity);
95 	}
96 
97 	return 0;
98 }
99 subsys_initcall(register_cpu_capacity_sysctl);
100 
/*
 * Flag that update_topology_flags_workfn() sets to 1 for the duration of
 * its rebuild_sched_domains() call and resets to 0 afterwards.
 */
static int update_topology;

/*
 * Return the current value of the update_topology flag.
 */
int topology_update_cpu_topology(void)
{
	return update_topology;
}
107 
/*
 * Updating the sched_domains can't be done directly from cpufreq callbacks
 * due to locking, so queue the work for later.
 *
 * The update_topology flag is raised only while rebuild_sched_domains()
 * runs, so topology_update_cpu_topology() returns 1 during the rebuild and
 * 0 otherwise.
 */
static void update_topology_flags_workfn(struct work_struct *work)
{
	update_topology = 1;
	rebuild_sched_domains();
	pr_debug("sched_domain hierarchy rebuilt, flags updated\n");
	update_topology = 0;
}
119 
/* Per-CPU multiplier applied to the raw capacity; defaults to 1. */
static DEFINE_PER_CPU(u32, freq_factor) = 1;
/* Raw capacities indexed by CPU number; NULL when no data is held. */
static u32 *raw_capacity;

/*
 * Release the raw capacity array and reset the pointer to NULL so later
 * checks of raw_capacity see that no data is available. Always returns 0;
 * the int return type lets the function be registered as an initcall.
 */
static int free_raw_capacity(void)
{
	kfree(raw_capacity);
	raw_capacity = NULL;

	return 0;
}
130 
131 void topology_normalize_cpu_scale(void)
132 {
133 	u64 capacity;
134 	u64 capacity_scale;
135 	int cpu;
136 
137 	if (!raw_capacity)
138 		return;
139 
140 	capacity_scale = 1;
141 	for_each_possible_cpu(cpu) {
142 		capacity = raw_capacity[cpu] * per_cpu(freq_factor, cpu);
143 		capacity_scale = max(capacity, capacity_scale);
144 	}
145 
146 	pr_debug("cpu_capacity: capacity_scale=%llu\n", capacity_scale);
147 	for_each_possible_cpu(cpu) {
148 		capacity = raw_capacity[cpu] * per_cpu(freq_factor, cpu);
149 		capacity = div64_u64(capacity << SCHED_CAPACITY_SHIFT,
150 			capacity_scale);
151 		topology_set_cpu_scale(cpu, capacity);
152 		pr_debug("cpu_capacity: CPU%d cpu_capacity=%lu\n",
153 			cpu, topology_get_cpu_scale(cpu));
154 	}
155 }
156 
157 bool __init topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu)
158 {
159 	struct clk *cpu_clk;
160 	static bool cap_parsing_failed;
161 	int ret;
162 	u32 cpu_capacity;
163 
164 	if (cap_parsing_failed)
165 		return false;
166 
167 	ret = of_property_read_u32(cpu_node, "capacity-dmips-mhz",
168 				   &cpu_capacity);
169 	if (!ret) {
170 		if (!raw_capacity) {
171 			raw_capacity = kcalloc(num_possible_cpus(),
172 					       sizeof(*raw_capacity),
173 					       GFP_KERNEL);
174 			if (!raw_capacity) {
175 				cap_parsing_failed = true;
176 				return false;
177 			}
178 		}
179 		raw_capacity[cpu] = cpu_capacity;
180 		pr_debug("cpu_capacity: %pOF cpu_capacity=%u (raw)\n",
181 			cpu_node, raw_capacity[cpu]);
182 
183 		/*
184 		 * Update freq_factor for calculating early boot cpu capacities.
185 		 * For non-clk CPU DVFS mechanism, there's no way to get the
186 		 * frequency value now, assuming they are running at the same
187 		 * frequency (by keeping the initial freq_factor value).
188 		 */
189 		cpu_clk = of_clk_get(cpu_node, 0);
190 		if (!PTR_ERR_OR_ZERO(cpu_clk)) {
191 			per_cpu(freq_factor, cpu) =
192 				clk_get_rate(cpu_clk) / 1000;
193 			clk_put(cpu_clk);
194 		}
195 	} else {
196 		if (raw_capacity) {
197 			pr_err("cpu_capacity: missing %pOF raw capacity\n",
198 				cpu_node);
199 			pr_err("cpu_capacity: partial information: fallback to 1024 for all CPUs\n");
200 		}
201 		cap_parsing_failed = true;
202 		free_raw_capacity();
203 	}
204 
205 	return !ret;
206 }
207 
208 #ifdef CONFIG_CPU_FREQ
209 static cpumask_var_t cpus_to_visit;
210 static void parsing_done_workfn(struct work_struct *work);
211 static DECLARE_WORK(parsing_done_work, parsing_done_workfn);
212 
213 static int
214 init_cpu_capacity_callback(struct notifier_block *nb,
215 			   unsigned long val,
216 			   void *data)
217 {
218 	struct cpufreq_policy *policy = data;
219 	int cpu;
220 
221 	if (!raw_capacity)
222 		return 0;
223 
224 	if (val != CPUFREQ_CREATE_POLICY)
225 		return 0;
226 
227 	pr_debug("cpu_capacity: init cpu capacity for CPUs [%*pbl] (to_visit=%*pbl)\n",
228 		 cpumask_pr_args(policy->related_cpus),
229 		 cpumask_pr_args(cpus_to_visit));
230 
231 	cpumask_andnot(cpus_to_visit, cpus_to_visit, policy->related_cpus);
232 
233 	for_each_cpu(cpu, policy->related_cpus)
234 		per_cpu(freq_factor, cpu) = policy->cpuinfo.max_freq / 1000;
235 
236 	if (cpumask_empty(cpus_to_visit)) {
237 		topology_normalize_cpu_scale();
238 		schedule_work(&update_topology_flags_work);
239 		free_raw_capacity();
240 		pr_debug("cpu_capacity: parsing done\n");
241 		schedule_work(&parsing_done_work);
242 	}
243 
244 	return 0;
245 }
246 
/* Notifier block that dispatches to init_cpu_capacity_callback(). */
static struct notifier_block init_cpu_capacity_notifier = {
	.notifier_call = init_cpu_capacity_callback,
};
250 
251 static int __init register_cpufreq_notifier(void)
252 {
253 	int ret;
254 
255 	/*
256 	 * on ACPI-based systems we need to use the default cpu capacity
257 	 * until we have the necessary code to parse the cpu capacity, so
258 	 * skip registering cpufreq notifier.
259 	 */
260 	if (!acpi_disabled || !raw_capacity)
261 		return -EINVAL;
262 
263 	if (!alloc_cpumask_var(&cpus_to_visit, GFP_KERNEL))
264 		return -ENOMEM;
265 
266 	cpumask_copy(cpus_to_visit, cpu_possible_mask);
267 
268 	ret = cpufreq_register_notifier(&init_cpu_capacity_notifier,
269 					CPUFREQ_POLICY_NOTIFIER);
270 
271 	if (ret)
272 		free_cpumask_var(cpus_to_visit);
273 
274 	return ret;
275 }
276 core_initcall(register_cpufreq_notifier);
277 
/*
 * Work function queued by init_cpu_capacity_callback() once every CPU has
 * been visited: unregisters the cpufreq policy notifier and frees the
 * cpus_to_visit mask.
 */
static void parsing_done_workfn(struct work_struct *work)
{
	cpufreq_unregister_notifier(&init_cpu_capacity_notifier,
					 CPUFREQ_POLICY_NOTIFIER);
	free_cpumask_var(cpus_to_visit);
}
284 
285 #else
286 core_initcall(free_raw_capacity);
287 #endif
288 
289 #if defined(CONFIG_ARM64) || defined(CONFIG_RISCV)
290 /*
291  * This function returns the logic cpu number of the node.
292  * There are basically three kinds of return values:
293  * (1) logic cpu number which is > 0.
294  * (2) -ENODEV when the device tree(DT) node is valid and found in the DT but
295  * there is no possible logical CPU in the kernel to match. This happens
296  * when CONFIG_NR_CPUS is configure to be smaller than the number of
297  * CPU nodes in DT. We need to just ignore this case.
298  * (3) -1 if the node does not exist in the device tree
299  */
300 static int __init get_cpu_for_node(struct device_node *node)
301 {
302 	struct device_node *cpu_node;
303 	int cpu;
304 
305 	cpu_node = of_parse_phandle(node, "cpu", 0);
306 	if (!cpu_node)
307 		return -1;
308 
309 	cpu = of_cpu_node_to_id(cpu_node);
310 	if (cpu >= 0)
311 		topology_parse_cpu_capacity(cpu_node, cpu);
312 	else
313 		pr_info("CPU node for %pOF exist but the possible cpu range is :%*pbl\n",
314 			cpu_node, cpumask_pr_args(cpu_possible_mask));
315 
316 	of_node_put(cpu_node);
317 	return cpu;
318 }
319 
320 static int __init parse_core(struct device_node *core, int package_id,
321 			     int core_id)
322 {
323 	char name[20];
324 	bool leaf = true;
325 	int i = 0;
326 	int cpu;
327 	struct device_node *t;
328 
329 	do {
330 		snprintf(name, sizeof(name), "thread%d", i);
331 		t = of_get_child_by_name(core, name);
332 		if (t) {
333 			leaf = false;
334 			cpu = get_cpu_for_node(t);
335 			if (cpu >= 0) {
336 				cpu_topology[cpu].package_id = package_id;
337 				cpu_topology[cpu].core_id = core_id;
338 				cpu_topology[cpu].thread_id = i;
339 			} else if (cpu != -ENODEV) {
340 				pr_err("%pOF: Can't get CPU for thread\n", t);
341 				of_node_put(t);
342 				return -EINVAL;
343 			}
344 			of_node_put(t);
345 		}
346 		i++;
347 	} while (t);
348 
349 	cpu = get_cpu_for_node(core);
350 	if (cpu >= 0) {
351 		if (!leaf) {
352 			pr_err("%pOF: Core has both threads and CPU\n",
353 			       core);
354 			return -EINVAL;
355 		}
356 
357 		cpu_topology[cpu].package_id = package_id;
358 		cpu_topology[cpu].core_id = core_id;
359 	} else if (leaf && cpu != -ENODEV) {
360 		pr_err("%pOF: Can't get CPU for leaf core\n", core);
361 		return -EINVAL;
362 	}
363 
364 	return 0;
365 }
366 
367 static int __init parse_cluster(struct device_node *cluster, int depth)
368 {
369 	char name[20];
370 	bool leaf = true;
371 	bool has_cores = false;
372 	struct device_node *c;
373 	static int package_id __initdata;
374 	int core_id = 0;
375 	int i, ret;
376 
377 	/*
378 	 * First check for child clusters; we currently ignore any
379 	 * information about the nesting of clusters and present the
380 	 * scheduler with a flat list of them.
381 	 */
382 	i = 0;
383 	do {
384 		snprintf(name, sizeof(name), "cluster%d", i);
385 		c = of_get_child_by_name(cluster, name);
386 		if (c) {
387 			leaf = false;
388 			ret = parse_cluster(c, depth + 1);
389 			of_node_put(c);
390 			if (ret != 0)
391 				return ret;
392 		}
393 		i++;
394 	} while (c);
395 
396 	/* Now check for cores */
397 	i = 0;
398 	do {
399 		snprintf(name, sizeof(name), "core%d", i);
400 		c = of_get_child_by_name(cluster, name);
401 		if (c) {
402 			has_cores = true;
403 
404 			if (depth == 0) {
405 				pr_err("%pOF: cpu-map children should be clusters\n",
406 				       c);
407 				of_node_put(c);
408 				return -EINVAL;
409 			}
410 
411 			if (leaf) {
412 				ret = parse_core(c, package_id, core_id++);
413 			} else {
414 				pr_err("%pOF: Non-leaf cluster with core %s\n",
415 				       cluster, name);
416 				ret = -EINVAL;
417 			}
418 
419 			of_node_put(c);
420 			if (ret != 0)
421 				return ret;
422 		}
423 		i++;
424 	} while (c);
425 
426 	if (leaf && !has_cores)
427 		pr_warn("%pOF: empty cluster\n", cluster);
428 
429 	if (leaf)
430 		package_id++;
431 
432 	return 0;
433 }
434 
435 static int __init parse_dt_topology(void)
436 {
437 	struct device_node *cn, *map;
438 	int ret = 0;
439 	int cpu;
440 
441 	cn = of_find_node_by_path("/cpus");
442 	if (!cn) {
443 		pr_err("No CPU information found in DT\n");
444 		return 0;
445 	}
446 
447 	/*
448 	 * When topology is provided cpu-map is essentially a root
449 	 * cluster with restricted subnodes.
450 	 */
451 	map = of_get_child_by_name(cn, "cpu-map");
452 	if (!map)
453 		goto out;
454 
455 	ret = parse_cluster(map, 0);
456 	if (ret != 0)
457 		goto out_map;
458 
459 	topology_normalize_cpu_scale();
460 
461 	/*
462 	 * Check that all cores are in the topology; the SMP code will
463 	 * only mark cores described in the DT as possible.
464 	 */
465 	for_each_possible_cpu(cpu)
466 		if (cpu_topology[cpu].package_id == -1)
467 			ret = -EINVAL;
468 
469 out_map:
470 	of_node_put(map);
471 out:
472 	of_node_put(cn);
473 	return ret;
474 }
475 #endif
476 
/*
 * cpu topology table, indexed by logical CPU number; exported to
 * GPL-compatible modules.
 */
struct cpu_topology cpu_topology[NR_CPUS];
EXPORT_SYMBOL_GPL(cpu_topology);
482 
483 const struct cpumask *cpu_coregroup_mask(int cpu)
484 {
485 	const cpumask_t *core_mask = cpumask_of_node(cpu_to_node(cpu));
486 
487 	/* Find the smaller of NUMA, core or LLC siblings */
488 	if (cpumask_subset(&cpu_topology[cpu].core_sibling, core_mask)) {
489 		/* not numa in package, lets use the package siblings */
490 		core_mask = &cpu_topology[cpu].core_sibling;
491 	}
492 	if (cpu_topology[cpu].llc_id != -1) {
493 		if (cpumask_subset(&cpu_topology[cpu].llc_sibling, core_mask))
494 			core_mask = &cpu_topology[cpu].llc_sibling;
495 	}
496 
497 	return core_mask;
498 }
499 
500 void update_siblings_masks(unsigned int cpuid)
501 {
502 	struct cpu_topology *cpu_topo, *cpuid_topo = &cpu_topology[cpuid];
503 	int cpu;
504 
505 	/* update core and thread sibling masks */
506 	for_each_online_cpu(cpu) {
507 		cpu_topo = &cpu_topology[cpu];
508 
509 		if (cpuid_topo->llc_id == cpu_topo->llc_id) {
510 			cpumask_set_cpu(cpu, &cpuid_topo->llc_sibling);
511 			cpumask_set_cpu(cpuid, &cpu_topo->llc_sibling);
512 		}
513 
514 		if (cpuid_topo->package_id != cpu_topo->package_id)
515 			continue;
516 
517 		cpumask_set_cpu(cpuid, &cpu_topo->core_sibling);
518 		cpumask_set_cpu(cpu, &cpuid_topo->core_sibling);
519 
520 		if (cpuid_topo->core_id != cpu_topo->core_id)
521 			continue;
522 
523 		cpumask_set_cpu(cpuid, &cpu_topo->thread_sibling);
524 		cpumask_set_cpu(cpu, &cpuid_topo->thread_sibling);
525 	}
526 }
527 
528 static void clear_cpu_topology(int cpu)
529 {
530 	struct cpu_topology *cpu_topo = &cpu_topology[cpu];
531 
532 	cpumask_clear(&cpu_topo->llc_sibling);
533 	cpumask_set_cpu(cpu, &cpu_topo->llc_sibling);
534 
535 	cpumask_clear(&cpu_topo->core_sibling);
536 	cpumask_set_cpu(cpu, &cpu_topo->core_sibling);
537 	cpumask_clear(&cpu_topo->thread_sibling);
538 	cpumask_set_cpu(cpu, &cpu_topo->thread_sibling);
539 }
540 
541 void __init reset_cpu_topology(void)
542 {
543 	unsigned int cpu;
544 
545 	for_each_possible_cpu(cpu) {
546 		struct cpu_topology *cpu_topo = &cpu_topology[cpu];
547 
548 		cpu_topo->thread_id = -1;
549 		cpu_topo->core_id = -1;
550 		cpu_topo->package_id = -1;
551 		cpu_topo->llc_id = -1;
552 
553 		clear_cpu_topology(cpu);
554 	}
555 }
556 
557 void remove_cpu_topology(unsigned int cpu)
558 {
559 	int sibling;
560 
561 	for_each_cpu(sibling, topology_core_cpumask(cpu))
562 		cpumask_clear_cpu(cpu, topology_core_cpumask(sibling));
563 	for_each_cpu(sibling, topology_sibling_cpumask(cpu))
564 		cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling));
565 	for_each_cpu(sibling, topology_llc_cpumask(cpu))
566 		cpumask_clear_cpu(cpu, topology_llc_cpumask(sibling));
567 
568 	clear_cpu_topology(cpu);
569 }
570 
/*
 * Default implementation: does nothing and reports success (0). The
 * symbol is weak, so another definition of the same name takes precedence
 * at link time.
 */
__weak int __init parse_acpi_topology(void)
{
	return 0;
}
575 
576 #if defined(CONFIG_ARM64) || defined(CONFIG_RISCV)
577 void __init init_cpu_topology(void)
578 {
579 	reset_cpu_topology();
580 
581 	/*
582 	 * Discard anything that was parsed if we hit an error so we
583 	 * don't use partial information.
584 	 */
585 	if (parse_acpi_topology())
586 		reset_cpu_topology();
587 	else if (of_have_populated_dt() && parse_dt_topology())
588 		reset_cpu_topology();
589 }
590 #endif
591