// SPDX-License-Identifier: GPL-2.0
/*
 * Arch specific cpu topology information
 *
 * Copyright (C) 2016, ARM Ltd.
 * Written by: Juri Lelli, ARM Ltd.
 */

#include <linux/acpi.h>
#include <linux/cpu.h>
#include <linux/cpufreq.h>
#include <linux/device.h>
#include <linux/of.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/sched/topology.h>
#include <linux/cpuset.h>
#include <linux/cpumask.h>
#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/smp.h>

static DEFINE_PER_CPU(struct scale_freq_data *, sft_data);
static struct cpumask scale_freq_counters_mask;
static bool scale_freq_invariant;

static bool supports_scale_freq_counters(const struct cpumask *cpus)
{
	return cpumask_subset(cpus, &scale_freq_counters_mask);
}

bool topology_scale_freq_invariant(void)
{
	return cpufreq_supports_freq_invariance() ||
	       supports_scale_freq_counters(cpu_online_mask);
}

static void update_scale_freq_invariant(bool status)
{
	if (scale_freq_invariant == status)
		return;

	/*
	 * Task scheduler behavior depends on frequency invariance support,
	 * either cpufreq or counter driven. If the support status changes as
	 * a result of counter initialisation and use, retrigger the build of
	 * scheduling domains to ensure the information is propagated properly.
	 */
	if (topology_scale_freq_invariant() == status) {
		scale_freq_invariant = status;
		rebuild_sched_domains_energy();
	}
}

void topology_set_scale_freq_source(struct scale_freq_data *data,
				    const struct cpumask *cpus)
{
	struct scale_freq_data *sfd;
	int cpu;

	/*
	 * Avoid calling rebuild_sched_domains() unnecessarily if FIE is
	 * supported by cpufreq.
	 */
	if (cpumask_empty(&scale_freq_counters_mask))
		scale_freq_invariant = topology_scale_freq_invariant();

	for_each_cpu(cpu, cpus) {
		sfd = per_cpu(sft_data, cpu);

		/* Use ARCH provided counters whenever possible */
		if (!sfd || sfd->source != SCALE_FREQ_SOURCE_ARCH) {
			per_cpu(sft_data, cpu) = data;
			cpumask_set_cpu(cpu, &scale_freq_counters_mask);
		}
	}

	update_scale_freq_invariant(true);
}
EXPORT_SYMBOL_GPL(topology_set_scale_freq_source);

void topology_clear_scale_freq_source(enum scale_freq_source source,
				      const struct cpumask *cpus)
{
	struct scale_freq_data *sfd;
	int cpu;

	for_each_cpu(cpu, cpus) {
		sfd = per_cpu(sft_data, cpu);

		if (sfd && sfd->source == source) {
			per_cpu(sft_data, cpu) = NULL;
			cpumask_clear_cpu(cpu, &scale_freq_counters_mask);
		}
	}

	update_scale_freq_invariant(false);
}
EXPORT_SYMBOL_GPL(topology_clear_scale_freq_source);

void topology_scale_freq_tick(void)
{
	struct scale_freq_data *sfd = *this_cpu_ptr(&sft_data);

	if (sfd)
		sfd->set_freq_scale();
}
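
/*
 * Illustrative sketch (not taken from any particular driver): how an
 * architecture with per-CPU activity counters might hook into the
 * interfaces above. The names example_*() and counter_cpus are made up;
 * only the topology_*() calls and struct scale_freq_data are real.
 *
 *	static void example_set_freq_scale(void)
 *	{
 *		// Hypothetical helper returning the delivered/maximum
 *		// frequency ratio already scaled by SCHED_CAPACITY_SCALE.
 *		this_cpu_write(arch_freq_scale, example_read_freq_ratio());
 *	}
 *
 *	static struct scale_freq_data example_sfd = {
 *		.source		= SCALE_FREQ_SOURCE_ARCH,
 *		.set_freq_scale	= example_set_freq_scale,
 *	};
 *
 *	// Register for the CPUs whose counters are usable ...
 *	topology_set_scale_freq_source(&example_sfd, counter_cpus);
 *	// ... and unregister if they stop being usable.
 *	topology_clear_scale_freq_source(SCALE_FREQ_SOURCE_ARCH, counter_cpus);
 *
 * topology_scale_freq_tick() then calls ->set_freq_scale() from the
 * scheduler tick on every CPU that has a registered source.
 */
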
DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
EXPORT_PER_CPU_SYMBOL_GPL(arch_freq_scale);

void topology_set_freq_scale(const struct cpumask *cpus, unsigned long cur_freq,
			     unsigned long max_freq)
{
	unsigned long scale;
	int i;

	if (WARN_ON_ONCE(!cur_freq || !max_freq))
		return;

	/*
	 * If the use of counters for FIE is enabled, just return as we don't
	 * want to update the scale factor with information from CPUFREQ.
	 * Instead the scale factor will be updated from arch_scale_freq_tick.
	 */
	if (supports_scale_freq_counters(cpus))
		return;

	scale = (cur_freq << SCHED_CAPACITY_SHIFT) / max_freq;

	for_each_cpu(i, cpus)
		per_cpu(arch_freq_scale, i) = scale;
}
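
/*
 * Worked example for the computation above, with illustrative numbers:
 * cur_freq = 1200000 kHz and max_freq = 2000000 kHz give
 *
 *	scale = (1200000 << SCHED_CAPACITY_SHIFT) / 2000000
 *	      = (1200000 * 1024) / 2000000 = 614
 *
 * i.e. roughly 60% of SCHED_CAPACITY_SCALE, which the scheduler then uses
 * to make utilization frequency invariant on those CPUs.
 */
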
DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE;

void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity)
{
	per_cpu(cpu_scale, cpu) = capacity;
}

DEFINE_PER_CPU(unsigned long, thermal_pressure);

void topology_set_thermal_pressure(const struct cpumask *cpus,
				   unsigned long th_pressure)
{
	int cpu;

	for_each_cpu(cpu, cpus)
		WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure);
}

static ssize_t cpu_capacity_show(struct device *dev,
				 struct device_attribute *attr,
				 char *buf)
{
	struct cpu *cpu = container_of(dev, struct cpu, dev);

	return sysfs_emit(buf, "%lu\n", topology_get_cpu_scale(cpu->dev.id));
}

static void update_topology_flags_workfn(struct work_struct *work);
static DECLARE_WORK(update_topology_flags_work, update_topology_flags_workfn);

static DEVICE_ATTR_RO(cpu_capacity);

static int register_cpu_capacity_sysctl(void)
{
	int i;
	struct device *cpu;

	for_each_possible_cpu(i) {
		cpu = get_cpu_device(i);
		if (!cpu) {
			pr_err("%s: too early to get CPU%d device!\n",
			       __func__, i);
			continue;
		}
		device_create_file(cpu, &dev_attr_cpu_capacity);
	}

	return 0;
}
subsys_initcall(register_cpu_capacity_sysctl);

static int update_topology;

int topology_update_cpu_topology(void)
{
	return update_topology;
}

/*
 * Updating the sched_domains can't be done directly from cpufreq callbacks
 * due to locking, so queue the work for later.
 */
static void update_topology_flags_workfn(struct work_struct *work)
{
	update_topology = 1;
	rebuild_sched_domains();
	pr_debug("sched_domain hierarchy rebuilt, flags updated\n");
	update_topology = 0;
}

static DEFINE_PER_CPU(u32, freq_factor) = 1;
static u32 *raw_capacity;

static int free_raw_capacity(void)
{
	kfree(raw_capacity);
	raw_capacity = NULL;

	return 0;
}

void topology_normalize_cpu_scale(void)
{
	u64 capacity;
	u64 capacity_scale;
	int cpu;

	if (!raw_capacity)
		return;

	capacity_scale = 1;
	for_each_possible_cpu(cpu) {
		capacity = raw_capacity[cpu] * per_cpu(freq_factor, cpu);
		capacity_scale = max(capacity, capacity_scale);
	}

	pr_debug("cpu_capacity: capacity_scale=%llu\n", capacity_scale);
	for_each_possible_cpu(cpu) {
		capacity = raw_capacity[cpu] * per_cpu(freq_factor, cpu);
		capacity = div64_u64(capacity << SCHED_CAPACITY_SHIFT,
				     capacity_scale);
		topology_set_cpu_scale(cpu, capacity);
		pr_debug("cpu_capacity: CPU%d cpu_capacity=%lu\n",
			 cpu, topology_get_cpu_scale(cpu));
	}
}
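
/*
 * Worked example for the normalization above, with illustrative values:
 * a big CPU with capacity-dmips-mhz = 1024 and freq_factor = 2000000, and
 * a little CPU with capacity-dmips-mhz = 512 and freq_factor = 1000000:
 *
 *	big:    1024 * 2000000 = 2048000000  (becomes capacity_scale)
 *	little:  512 * 1000000 =  512000000
 *
 *	big:    (2048000000 << SCHED_CAPACITY_SHIFT) / 2048000000 = 1024
 *	little: ( 512000000 << SCHED_CAPACITY_SHIFT) / 2048000000 =  256
 *
 * The fastest CPU ends up at SCHED_CAPACITY_SCALE and every other CPU is
 * scaled relative to it.
 */
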
bool __init topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu)
{
	struct clk *cpu_clk;
	static bool cap_parsing_failed;
	int ret;
	u32 cpu_capacity;

	if (cap_parsing_failed)
		return false;

	ret = of_property_read_u32(cpu_node, "capacity-dmips-mhz",
				   &cpu_capacity);
	if (!ret) {
		if (!raw_capacity) {
			raw_capacity = kcalloc(num_possible_cpus(),
					       sizeof(*raw_capacity),
					       GFP_KERNEL);
			if (!raw_capacity) {
				cap_parsing_failed = true;
				return false;
			}
		}
		raw_capacity[cpu] = cpu_capacity;
		pr_debug("cpu_capacity: %pOF cpu_capacity=%u (raw)\n",
			 cpu_node, raw_capacity[cpu]);

		/*
		 * Update freq_factor for calculating early boot CPU
		 * capacities. For non-clk CPU DVFS mechanisms there is no way
		 * to get the frequency value now, so assume the CPUs run at
		 * the same frequency (by keeping the initial freq_factor
		 * value).
		 */
		cpu_clk = of_clk_get(cpu_node, 0);
		if (!PTR_ERR_OR_ZERO(cpu_clk)) {
			per_cpu(freq_factor, cpu) =
				clk_get_rate(cpu_clk) / 1000;
			clk_put(cpu_clk);
		}
	} else {
		if (raw_capacity) {
			pr_err("cpu_capacity: missing %pOF raw capacity\n",
			       cpu_node);
			pr_err("cpu_capacity: partial information: fallback to 1024 for all CPUs\n");
		}
		cap_parsing_failed = true;
		free_raw_capacity();
	}

	return !ret;
}

#ifdef CONFIG_CPU_FREQ
static cpumask_var_t cpus_to_visit;
static void parsing_done_workfn(struct work_struct *work);
static DECLARE_WORK(parsing_done_work, parsing_done_workfn);

static int
init_cpu_capacity_callback(struct notifier_block *nb,
			   unsigned long val,
			   void *data)
{
	struct cpufreq_policy *policy = data;
	int cpu;

	if (!raw_capacity)
		return 0;

	if (val != CPUFREQ_CREATE_POLICY)
		return 0;

	pr_debug("cpu_capacity: init cpu capacity for CPUs [%*pbl] (to_visit=%*pbl)\n",
		 cpumask_pr_args(policy->related_cpus),
		 cpumask_pr_args(cpus_to_visit));

	cpumask_andnot(cpus_to_visit, cpus_to_visit, policy->related_cpus);

	for_each_cpu(cpu, policy->related_cpus)
		per_cpu(freq_factor, cpu) = policy->cpuinfo.max_freq / 1000;

	if (cpumask_empty(cpus_to_visit)) {
		topology_normalize_cpu_scale();
		schedule_work(&update_topology_flags_work);
		free_raw_capacity();
		pr_debug("cpu_capacity: parsing done\n");
		schedule_work(&parsing_done_work);
	}

	return 0;
}

static struct notifier_block init_cpu_capacity_notifier = {
	.notifier_call = init_cpu_capacity_callback,
};

static int __init register_cpufreq_notifier(void)
{
	int ret;

	/*
	 * On ACPI-based systems we need to use the default cpu capacity
	 * until we have the necessary code to parse the cpu capacity, so
	 * skip registering the cpufreq notifier.
	 */
	if (!acpi_disabled || !raw_capacity)
		return -EINVAL;

	if (!alloc_cpumask_var(&cpus_to_visit, GFP_KERNEL))
		return -ENOMEM;

	cpumask_copy(cpus_to_visit, cpu_possible_mask);

	ret = cpufreq_register_notifier(&init_cpu_capacity_notifier,
					CPUFREQ_POLICY_NOTIFIER);

	if (ret)
		free_cpumask_var(cpus_to_visit);

	return ret;
}
core_initcall(register_cpufreq_notifier);

static void parsing_done_workfn(struct work_struct *work)
{
	cpufreq_unregister_notifier(&init_cpu_capacity_notifier,
				    CPUFREQ_POLICY_NOTIFIER);
	free_cpumask_var(cpus_to_visit);
}

#else
core_initcall(free_raw_capacity);
#endif
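
/*
 * Illustrative (made-up) device tree fragment consumed by the capacity and
 * topology parsing in this file: "capacity-dmips-mhz" is read by
 * topology_parse_cpu_capacity() above, while the cpu-map hierarchy is
 * walked by parse_cluster()/parse_core() below:
 *
 *	cpus {
 *		cpu0: cpu@0 {
 *			capacity-dmips-mhz = <512>;
 *		};
 *
 *		cpu-map {
 *			cluster0 {
 *				core0 { cpu = <&cpu0>; };
 *				core1 { cpu = <&cpu1>; };
 *			};
 *			cluster1 {
 *				core0 { cpu = <&cpu2>; };
 *				core1 { cpu = <&cpu3>; };
 *			};
 *		};
 *	};
 */
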
#if defined(CONFIG_ARM64) || defined(CONFIG_RISCV)
/*
 * This function returns the logical CPU number of the node.
 * There are basically three kinds of return values:
 * (1) logical CPU number, which is >= 0.
 * (2) -ENODEV when the device tree (DT) node is valid and found in the DT
 *     but there is no possible logical CPU in the kernel to match. This
 *     happens when CONFIG_NR_CPUS is configured to be smaller than the
 *     number of CPU nodes in the DT. We need to just ignore this case.
 * (3) -1 if the node does not exist in the device tree.
 */
static int __init get_cpu_for_node(struct device_node *node)
{
	struct device_node *cpu_node;
	int cpu;

	cpu_node = of_parse_phandle(node, "cpu", 0);
	if (!cpu_node)
		return -1;

	cpu = of_cpu_node_to_id(cpu_node);
	if (cpu >= 0)
		topology_parse_cpu_capacity(cpu_node, cpu);
	else
		pr_info("CPU node for %pOF exists but the possible cpu range is: %*pbl\n",
			cpu_node, cpumask_pr_args(cpu_possible_mask));

	of_node_put(cpu_node);
	return cpu;
}

static int __init parse_core(struct device_node *core, int package_id,
			     int core_id)
{
	char name[20];
	bool leaf = true;
	int i = 0;
	int cpu;
	struct device_node *t;

	do {
		snprintf(name, sizeof(name), "thread%d", i);
		t = of_get_child_by_name(core, name);
		if (t) {
			leaf = false;
			cpu = get_cpu_for_node(t);
			if (cpu >= 0) {
				cpu_topology[cpu].package_id = package_id;
				cpu_topology[cpu].core_id = core_id;
				cpu_topology[cpu].thread_id = i;
			} else if (cpu != -ENODEV) {
				pr_err("%pOF: Can't get CPU for thread\n", t);
				of_node_put(t);
				return -EINVAL;
			}
			of_node_put(t);
		}
		i++;
	} while (t);

	cpu = get_cpu_for_node(core);
	if (cpu >= 0) {
		if (!leaf) {
			pr_err("%pOF: Core has both threads and CPU\n",
			       core);
			return -EINVAL;
		}

		cpu_topology[cpu].package_id = package_id;
		cpu_topology[cpu].core_id = core_id;
	} else if (leaf && cpu != -ENODEV) {
		pr_err("%pOF: Can't get CPU for leaf core\n", core);
		return -EINVAL;
	}

	return 0;
}

static int __init parse_cluster(struct device_node *cluster, int depth)
{
	char name[20];
	bool leaf = true;
	bool has_cores = false;
	struct device_node *c;
	static int package_id __initdata;
	int core_id = 0;
	int i, ret;

	/*
	 * First check for child clusters; we currently ignore any
	 * information about the nesting of clusters and present the
	 * scheduler with a flat list of them.
	 */
	i = 0;
	do {
		snprintf(name, sizeof(name), "cluster%d", i);
		c = of_get_child_by_name(cluster, name);
		if (c) {
			leaf = false;
			ret = parse_cluster(c, depth + 1);
			of_node_put(c);
			if (ret != 0)
				return ret;
		}
		i++;
	} while (c);

	/* Now check for cores */
	i = 0;
	do {
		snprintf(name, sizeof(name), "core%d", i);
		c = of_get_child_by_name(cluster, name);
		if (c) {
			has_cores = true;

			if (depth == 0) {
				pr_err("%pOF: cpu-map children should be clusters\n",
				       c);
				of_node_put(c);
				return -EINVAL;
			}

			if (leaf) {
				ret = parse_core(c, package_id, core_id++);
			} else {
				pr_err("%pOF: Non-leaf cluster with core %s\n",
				       cluster, name);
				ret = -EINVAL;
			}

			of_node_put(c);
			if (ret != 0)
				return ret;
		}
		i++;
	} while (c);

	if (leaf && !has_cores)
		pr_warn("%pOF: empty cluster\n", cluster);

	if (leaf)
		package_id++;

	return 0;
}
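
/*
 * For the illustrative cpu-map fragment shown earlier, parsing assigns
 * cluster0 -> package_id 0 and cluster1 -> package_id 1 (package_id is a
 * static counter bumped once per leaf cluster), core_id 0 and 1 within
 * each cluster, and leaves thread_id at -1 since no thread nodes exist.
 * Nested clusters are flattened, as noted in parse_cluster() above.
 */
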
static int __init parse_dt_topology(void)
{
	struct device_node *cn, *map;
	int ret = 0;
	int cpu;

	cn = of_find_node_by_path("/cpus");
	if (!cn) {
		pr_err("No CPU information found in DT\n");
		return 0;
	}

	/*
	 * When topology is provided, cpu-map is essentially a root
	 * cluster with restricted subnodes.
	 */
	map = of_get_child_by_name(cn, "cpu-map");
	if (!map)
		goto out;

	ret = parse_cluster(map, 0);
	if (ret != 0)
		goto out_map;

	topology_normalize_cpu_scale();

	/*
	 * Check that all cores are in the topology; the SMP code will
	 * only mark cores described in the DT as possible.
	 */
	for_each_possible_cpu(cpu)
		if (cpu_topology[cpu].package_id == -1)
			ret = -EINVAL;

out_map:
	of_node_put(map);
out:
	of_node_put(cn);
	return ret;
}
#endif

/*
 * cpu topology table
 */
struct cpu_topology cpu_topology[NR_CPUS];
EXPORT_SYMBOL_GPL(cpu_topology);

const struct cpumask *cpu_coregroup_mask(int cpu)
{
	const cpumask_t *core_mask = cpumask_of_node(cpu_to_node(cpu));

	/* Find the smallest of the NUMA, core or LLC sibling masks */
	if (cpumask_subset(&cpu_topology[cpu].core_sibling, core_mask)) {
		/* not NUMA in package, let's use the package siblings */
		core_mask = &cpu_topology[cpu].core_sibling;
	}
	if (cpu_topology[cpu].llc_id != -1) {
		if (cpumask_subset(&cpu_topology[cpu].llc_sibling, core_mask))
			core_mask = &cpu_topology[cpu].llc_sibling;
	}

	return core_mask;
}
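
/*
 * Example of the selection above (illustrative): on a single NUMA node
 * system cpumask_of_node() spans every CPU, so the smaller package sibling
 * mask is preferred; if firmware also reported a valid llc_id and the LLC
 * siblings form a subset of that, the mask shrinks further to the LLC
 * siblings.
 */
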
void update_siblings_masks(unsigned int cpuid)
{
	struct cpu_topology *cpu_topo, *cpuid_topo = &cpu_topology[cpuid];
	int cpu;

	/* update core and thread sibling masks */
	for_each_online_cpu(cpu) {
		cpu_topo = &cpu_topology[cpu];

		if (cpuid_topo->llc_id == cpu_topo->llc_id) {
			cpumask_set_cpu(cpu, &cpuid_topo->llc_sibling);
			cpumask_set_cpu(cpuid, &cpu_topo->llc_sibling);
		}

		if (cpuid_topo->package_id != cpu_topo->package_id)
			continue;

		cpumask_set_cpu(cpuid, &cpu_topo->core_sibling);
		cpumask_set_cpu(cpu, &cpuid_topo->core_sibling);

		if (cpuid_topo->core_id != cpu_topo->core_id)
			continue;

		cpumask_set_cpu(cpuid, &cpu_topo->thread_sibling);
		cpumask_set_cpu(cpu, &cpuid_topo->thread_sibling);
	}
}

static void clear_cpu_topology(int cpu)
{
	struct cpu_topology *cpu_topo = &cpu_topology[cpu];

	cpumask_clear(&cpu_topo->llc_sibling);
	cpumask_set_cpu(cpu, &cpu_topo->llc_sibling);

	cpumask_clear(&cpu_topo->core_sibling);
	cpumask_set_cpu(cpu, &cpu_topo->core_sibling);
	cpumask_clear(&cpu_topo->thread_sibling);
	cpumask_set_cpu(cpu, &cpu_topo->thread_sibling);
}

void __init reset_cpu_topology(void)
{
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		struct cpu_topology *cpu_topo = &cpu_topology[cpu];

		cpu_topo->thread_id = -1;
		cpu_topo->core_id = -1;
		cpu_topo->package_id = -1;
		cpu_topo->llc_id = -1;

		clear_cpu_topology(cpu);
	}
}

void remove_cpu_topology(unsigned int cpu)
{
	int sibling;

	for_each_cpu(sibling, topology_core_cpumask(cpu))
		cpumask_clear_cpu(cpu, topology_core_cpumask(sibling));
	for_each_cpu(sibling, topology_sibling_cpumask(cpu))
		cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling));
	for_each_cpu(sibling, topology_llc_cpumask(cpu))
		cpumask_clear_cpu(cpu, topology_llc_cpumask(sibling));

	clear_cpu_topology(cpu);
}

__weak int __init parse_acpi_topology(void)
{
	return 0;
}

#if defined(CONFIG_ARM64) || defined(CONFIG_RISCV)
void __init init_cpu_topology(void)
{
	reset_cpu_topology();

	/*
	 * Discard anything that was parsed if we hit an error so we
	 * don't use partial information.
	 */
	if (parse_acpi_topology())
		reset_cpu_topology();
	else if (of_have_populated_dt() && parse_dt_topology())
		reset_cpu_topology();
}
#endif
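
/*
 * Rough sketch of how an architecture is expected to wire up the hooks in
 * this file (call sites are arch specific; arm64 is one example user):
 *
 *	// early SMP bring-up
 *	init_cpu_topology();		// reset, then parse ACPI or DT
 *
 *	// as each CPU comes online
 *	update_siblings_masks(cpu);	// fill core/thread/LLC sibling masks
 *
 *	// on hot-unplug
 *	remove_cpu_topology(cpu);	// drop the CPU from sibling masks
 *
 * parse_acpi_topology() is a __weak stub that architectures may override
 * to fill cpu_topology[] from firmware tables.
 */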