// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * pSeries NUMA support
 *
 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
 */
#define pr_fmt(fmt) "numa: " fmt

#include <linux/threads.h>
#include <linux/memblock.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/export.h>
#include <linux/nodemask.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/of.h>
#include <linux/pfn.h>
#include <linux/cpuset.h>
#include <linux/node.h>
#include <linux/stop_machine.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <asm/cputhreads.h>
#include <asm/sparsemem.h>
#include <asm/prom.h>
#include <asm/smp.h>
#include <asm/topology.h>
#include <asm/firmware.h>
#include <asm/paca.h>
#include <asm/hvcall.h>
#include <asm/setup.h>
#include <asm/vdso.h>
#include <asm/drmem.h>

static int numa_enabled = 1;

static char *cmdline __initdata;

static int numa_debug;
#define dbg(args...) if (numa_debug) { printk(KERN_INFO args); }

int numa_cpu_lookup_table[NR_CPUS];
cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
struct pglist_data *node_data[MAX_NUMNODES];

EXPORT_SYMBOL(numa_cpu_lookup_table);
EXPORT_SYMBOL(node_to_cpumask_map);
EXPORT_SYMBOL(node_data);

static int min_common_depth;
static int n_mem_addr_cells, n_mem_size_cells;
static int form1_affinity;

#define MAX_DISTANCE_REF_POINTS 4
static int distance_ref_points_depth;
static const __be32 *distance_ref_points;
static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];

/*
 * Allocate node_to_cpumask_map based on number of available nodes
 * Requires node_possible_map to be valid.
 *
 * Note: cpumask_of_node() is not valid until after this is done.
 */
static void __init setup_node_to_cpumask_map(void)
{
	unsigned int node;

	/* setup nr_node_ids if not done yet */
	if (nr_node_ids == MAX_NUMNODES)
		setup_nr_node_ids();

	/* allocate the map */
	for_each_node(node)
		alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);

	/* cpumask_of_node() will now work */
	dbg("Node to cpumask map for %u nodes\n", nr_node_ids);
}

static int __init fake_numa_create_new_node(unsigned long end_pfn,
						unsigned int *nid)
{
	unsigned long long mem;
	char *p = cmdline;
	static unsigned int fake_nid;
	static unsigned long long curr_boundary;

	/*
	 * Modify node id, iff we started creating NUMA nodes
	 * We want to continue from where we left off the last time
	 */
	if (fake_nid)
		*nid = fake_nid;
	/*
	 * In case there are no more arguments to parse, the
	 * node_id should be the same as the last fake node id
	 * (we've handled this above).
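	 *
	 * The "numa=fake=" values are absolute address boundaries (as
	 * accepted by memparse()), given in increasing order: whenever the
	 * region being added ends above the current boundary, a new fake
	 * node id is handed out and the parser advances to the next value.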
	 */
	if (!p)
		return 0;

	mem = memparse(p, &p);
	if (!mem)
		return 0;

	if (mem < curr_boundary)
		return 0;

	curr_boundary = mem;

	if ((end_pfn << PAGE_SHIFT) > mem) {
		/*
		 * Skip commas and spaces
		 */
		while (*p == ',' || *p == ' ' || *p == '\t')
			p++;

		cmdline = p;
		fake_nid++;
		*nid = fake_nid;
		dbg("created new fake_node with id %d\n", fake_nid);
		return 1;
	}
	return 0;
}

static void reset_numa_cpu_lookup_table(void)
{
	unsigned int cpu;

	for_each_possible_cpu(cpu)
		numa_cpu_lookup_table[cpu] = -1;
}

static void map_cpu_to_node(int cpu, int node)
{
	update_numa_cpu_lookup_table(cpu, node);

	dbg("adding cpu %d to node %d\n", cpu, node);

	if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node])))
		cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
}

#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR)
static void unmap_cpu_from_node(unsigned long cpu)
{
	int node = numa_cpu_lookup_table[cpu];

	dbg("removing cpu %lu from node %d\n", cpu, node);

	if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) {
		cpumask_clear_cpu(cpu, node_to_cpumask_map[node]);
	} else {
		printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
		       cpu, node);
	}
}
#endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */

int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
{
	int dist = 0;

	int i, index;

	for (i = 0; i < distance_ref_points_depth; i++) {
		index = be32_to_cpu(distance_ref_points[i]);
		if (cpu1_assoc[index] == cpu2_assoc[index])
			break;
		dist++;
	}

	return dist;
}

/* must hold reference to node during call */
static const __be32 *of_get_associativity(struct device_node *dev)
{
	return of_get_property(dev, "ibm,associativity", NULL);
}

int __node_distance(int a, int b)
{
	int i;
	int distance = LOCAL_DISTANCE;

	if (!form1_affinity)
		return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);

	for (i = 0; i < distance_ref_points_depth; i++) {
		if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
			break;

		/* Double the distance for each NUMA level */
		distance *= 2;
	}

	return distance;
}
EXPORT_SYMBOL(__node_distance);

static void initialize_distance_lookup_table(int nid,
		const __be32 *associativity)
{
	int i;

	if (!form1_affinity)
		return;

	for (i = 0; i < distance_ref_points_depth; i++) {
		const __be32 *entry;

		entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
		distance_lookup_table[nid][i] = of_read_number(entry, 1);
	}
}

/* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
 * info is found.
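 *
 * The ibm,associativity property is a length cell followed by that many
 * domain ids; the node id is the entry at index min_common_depth, with
 * 0xffff treated as "no node".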
 */
static int associativity_to_nid(const __be32 *associativity)
{
	int nid = NUMA_NO_NODE;

	if (!numa_enabled)
		goto out;

	if (of_read_number(associativity, 1) >= min_common_depth)
		nid = of_read_number(&associativity[min_common_depth], 1);

	/* POWER4 LPAR uses 0xffff as invalid node */
	if (nid == 0xffff || nid >= MAX_NUMNODES)
		nid = NUMA_NO_NODE;

	if (nid > 0 &&
		of_read_number(associativity, 1) >= distance_ref_points_depth) {
		/*
		 * Skip the length field and send start of associativity array
		 */
		initialize_distance_lookup_table(nid, associativity + 1);
	}

out:
	return nid;
}

/* Returns the nid associated with the given device tree node,
 * or -1 if not found.
 */
static int of_node_to_nid_single(struct device_node *device)
{
	int nid = NUMA_NO_NODE;
	const __be32 *tmp;

	tmp = of_get_associativity(device);
	if (tmp)
		nid = associativity_to_nid(tmp);
	return nid;
}

/* Walk the device tree upwards, looking for an associativity id */
int of_node_to_nid(struct device_node *device)
{
	int nid = NUMA_NO_NODE;

	of_node_get(device);
	while (device) {
		nid = of_node_to_nid_single(device);
		if (nid != -1)
			break;

		device = of_get_next_parent(device);
	}
	of_node_put(device);

	return nid;
}
EXPORT_SYMBOL(of_node_to_nid);

static int __init find_min_common_depth(void)
{
	int depth;
	struct device_node *root;

	if (firmware_has_feature(FW_FEATURE_OPAL))
		root = of_find_node_by_path("/ibm,opal");
	else
		root = of_find_node_by_path("/rtas");
	if (!root)
		root = of_find_node_by_path("/");

	/*
	 * This property is a set of 32-bit integers, each representing
	 * an index into the ibm,associativity nodes.
	 *
	 * With form 0 affinity the first integer is for an SMP configuration
	 * (should be all 0's) and the second is for a normal NUMA
	 * configuration. We have only one level of NUMA.
	 *
	 * With form 1 affinity the first integer is the most significant
	 * NUMA boundary and the following are progressively less significant
	 * boundaries. There can be more than one level of NUMA.
	 */
	distance_ref_points = of_get_property(root,
					"ibm,associativity-reference-points",
					&distance_ref_points_depth);

	if (!distance_ref_points) {
		dbg("NUMA: ibm,associativity-reference-points not found.\n");
		goto err;
	}

	distance_ref_points_depth /= sizeof(int);

	if (firmware_has_feature(FW_FEATURE_OPAL) ||
	    firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) {
		dbg("Using form 1 affinity\n");
		form1_affinity = 1;
	}

	if (form1_affinity) {
		depth = of_read_number(distance_ref_points, 1);
	} else {
		if (distance_ref_points_depth < 2) {
			printk(KERN_WARNING "NUMA: "
				"short ibm,associativity-reference-points\n");
			goto err;
		}

		depth = of_read_number(&distance_ref_points[1], 1);
	}

	/*
	 * Warn and cap if the hardware supports more than
	 * MAX_DISTANCE_REF_POINTS domains.
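	 * distance_lookup_table only has MAX_DISTANCE_REF_POINTS slots per
	 * node, so reference points beyond that cannot be recorded anyway.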
	 */
	if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) {
		printk(KERN_WARNING "NUMA: distance array capped at "
			"%d entries\n", MAX_DISTANCE_REF_POINTS);
		distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
	}

	of_node_put(root);
	return depth;

err:
	of_node_put(root);
	return -1;
}

static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
{
	struct device_node *memory = NULL;

	memory = of_find_node_by_type(memory, "memory");
	if (!memory)
		panic("numa.c: No memory nodes found!");

	*n_addr_cells = of_n_addr_cells(memory);
	*n_size_cells = of_n_size_cells(memory);
	of_node_put(memory);
}

static unsigned long read_n_cells(int n, const __be32 **buf)
{
	unsigned long result = 0;

	while (n--) {
		result = (result << 32) | of_read_number(*buf, 1);
		(*buf)++;
	}
	return result;
}

struct assoc_arrays {
	u32	n_arrays;
	u32	array_sz;
	const __be32 *arrays;
};

/*
 * Retrieve and validate the list of associativity arrays for drconf
 * memory from the ibm,associativity-lookup-arrays property of the
 * device tree.
 *
 * The layout of the ibm,associativity-lookup-arrays property is a number N
 * indicating the number of associativity arrays, followed by a number M
 * indicating the size of each associativity array, followed by a list
 * of N associativity arrays.
 */
static int of_get_assoc_arrays(struct assoc_arrays *aa)
{
	struct device_node *memory;
	const __be32 *prop;
	u32 len;

	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (!memory)
		return -1;

	prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len);
	if (!prop || len < 2 * sizeof(unsigned int)) {
		of_node_put(memory);
		return -1;
	}

	aa->n_arrays = of_read_number(prop++, 1);
	aa->array_sz = of_read_number(prop++, 1);

	of_node_put(memory);

	/* Now that we know the number of arrays and size of each array,
	 * revalidate the size of the property read in.
	 */
	if (len < (aa->n_arrays * aa->array_sz + 2) * sizeof(unsigned int))
		return -1;

	aa->arrays = prop;
	return 0;
}

/*
 * This is like of_node_to_nid_single() for memory represented in the
 * ibm,dynamic-reconfiguration-memory node.
 */
static int of_drconf_to_nid_single(struct drmem_lmb *lmb)
{
	struct assoc_arrays aa = { .arrays = NULL };
	int default_nid = NUMA_NO_NODE;
	int nid = default_nid;
	int rc, index;

	if ((min_common_depth < 0) || !numa_enabled)
		return default_nid;

	rc = of_get_assoc_arrays(&aa);
	if (rc)
		return default_nid;

	if (min_common_depth <= aa.array_sz &&
	    !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
		index = lmb->aa_index * aa.array_sz + min_common_depth - 1;
		nid = of_read_number(&aa.arrays[index], 1);

		if (nid == 0xffff || nid >= MAX_NUMNODES)
			nid = default_nid;

		if (nid > 0) {
			index = lmb->aa_index * aa.array_sz;
			initialize_distance_lookup_table(nid,
							&aa.arrays[index]);
		}
	}

	return nid;
}

/*
 * Figure out to which domain a cpu belongs and stick it there.
 * Return the id of the domain used.
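 *
 * An existing valid entry in numa_cpu_lookup_table (e.g. one reported
 * via VPHN) is reused as-is; otherwise the node id is read from the
 * CPU's device tree node, falling back to first_online_node when no
 * usable node is found.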
 */
static int numa_setup_cpu(unsigned long lcpu)
{
	int nid = NUMA_NO_NODE;
	struct device_node *cpu;

	/*
	 * If a valid cpu-to-node mapping is already available, use it
	 * directly instead of querying the firmware, since it represents
	 * the most recent mapping notified to us by the platform (eg: VPHN).
	 */
	if ((nid = numa_cpu_lookup_table[lcpu]) >= 0) {
		map_cpu_to_node(lcpu, nid);
		return nid;
	}

	cpu = of_get_cpu_node(lcpu, NULL);

	if (!cpu) {
		WARN_ON(1);
		if (cpu_present(lcpu))
			goto out_present;
		else
			goto out;
	}

	nid = of_node_to_nid_single(cpu);

out_present:
	if (nid < 0 || !node_possible(nid))
		nid = first_online_node;

	map_cpu_to_node(lcpu, nid);
	of_node_put(cpu);
out:
	return nid;
}

static void verify_cpu_node_mapping(int cpu, int node)
{
	int base, sibling, i;

	/* Verify that all the threads in the core belong to the same node */
	base = cpu_first_thread_sibling(cpu);

	for (i = 0; i < threads_per_core; i++) {
		sibling = base + i;

		if (sibling == cpu || cpu_is_offline(sibling))
			continue;

		if (cpu_to_node(sibling) != node) {
			WARN(1, "CPU thread siblings %d and %d don't belong"
				" to the same node!\n", cpu, sibling);
			break;
		}
	}
}

/* Must run before sched domains notifier. */
static int ppc_numa_cpu_prepare(unsigned int cpu)
{
	int nid;

	nid = numa_setup_cpu(cpu);
	verify_cpu_node_mapping(cpu, nid);
	return 0;
}

static int ppc_numa_cpu_dead(unsigned int cpu)
{
#ifdef CONFIG_HOTPLUG_CPU
	unmap_cpu_from_node(cpu);
#endif
	return 0;
}

/*
 * Check and possibly modify a memory region to enforce the memory limit.
 *
 * Returns the size the region should have to enforce the memory limit.
 * This will either be the original value of size, a truncated value,
 * or zero. If the returned value of size is 0 the region should be
 * discarded as it lies wholly above the memory limit.
 */
static unsigned long __init numa_enforce_memory_limit(unsigned long start,
						      unsigned long size)
{
	/*
	 * We use memblock_end_of_DRAM() in here instead of memory_limit because
	 * we've already adjusted it for the limit and it takes care of
	 * having memory holes below the limit. Also, in the case of
	 * iommu_is_off, memory_limit is not set but is implicitly enforced.
	 */

	if (start + size <= memblock_end_of_DRAM())
		return size;

	if (start >= memblock_end_of_DRAM())
		return 0;

	return memblock_end_of_DRAM() - start;
}

/*
 * Reads the counter for a given entry in
 * linux,drconf-usable-memory property
 */
static inline int __init read_usm_ranges(const __be32 **usm)
{
	/*
	 * For each lmb in ibm,dynamic-memory a corresponding
	 * entry in linux,drconf-usable-memory property contains
	 * a counter followed by that many (base, size) tuples.
	 * Read the counter from linux,drconf-usable-memory.
	 */
	return read_n_cells(n_mem_size_cells, usm);
}

/*
 * Extract NUMA information from the ibm,dynamic-reconfiguration-memory
 * node. This assumes n_mem_{addr,size}_cells have been set.
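 *
 * When a linux,drconf-usable-memory property is supplied (kexec/kdump),
 * only the (base, size) ranges listed there are added for each LMB
 * instead of the whole LMB.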
 */
static void __init numa_setup_drmem_lmb(struct drmem_lmb *lmb,
					const __be32 **usm)
{
	unsigned int ranges, is_kexec_kdump = 0;
	unsigned long base, size, sz;
	int nid;

	/*
	 * Skip this block if the reserved bit is set in flags (0x80)
	 * or if the block is not assigned to this partition (0x8)
	 */
	if ((lmb->flags & DRCONF_MEM_RESERVED)
	    || !(lmb->flags & DRCONF_MEM_ASSIGNED))
		return;

	if (*usm)
		is_kexec_kdump = 1;

	base = lmb->base_addr;
	size = drmem_lmb_size();
	ranges = 1;

	if (is_kexec_kdump) {
		ranges = read_usm_ranges(usm);
		if (!ranges) /* there are no (base, size) tuples */
			return;
	}

	do {
		if (is_kexec_kdump) {
			base = read_n_cells(n_mem_addr_cells, usm);
			size = read_n_cells(n_mem_size_cells, usm);
		}

		nid = of_drconf_to_nid_single(lmb);
		fake_numa_create_new_node(((base + size) >> PAGE_SHIFT),
					  &nid);
		node_set_online(nid);
		sz = numa_enforce_memory_limit(base, size);
		if (sz)
			memblock_set_node(base, sz, &memblock.memory, nid);
	} while (--ranges);
}

static int __init parse_numa_properties(void)
{
	struct device_node *memory;
	int default_nid = 0;
	unsigned long i;

	if (numa_enabled == 0) {
		printk(KERN_WARNING "NUMA disabled by user\n");
		return -1;
	}

	min_common_depth = find_min_common_depth();

	if (min_common_depth < 0) {
		/*
		 * If we fail to parse min_common_depth from the device
		 * tree, mark NUMA as disabled and boot with NUMA disabled.
		 */
		numa_enabled = false;
		return min_common_depth;
	}

	dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);

	/*
	 * Even though we connect cpus to numa domains later in SMP
	 * init, we need to know the node ids now. This is because
	 * each node to be onlined must have NODE_DATA etc backing it.
	 */
	for_each_present_cpu(i) {
		struct device_node *cpu;
		int nid;

		cpu = of_get_cpu_node(i, NULL);
		BUG_ON(!cpu);
		nid = of_node_to_nid_single(cpu);
		of_node_put(cpu);

		/*
		 * Don't fall back to default_nid yet -- we will plug
		 * cpus into nodes once the memory scan has discovered
		 * the topology.
		 */
		if (nid < 0)
			continue;
		node_set_online(nid);
	}

	get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells);

	for_each_node_by_type(memory, "memory") {
		unsigned long start;
		unsigned long size;
		int nid;
		int ranges;
		const __be32 *memcell_buf;
		unsigned int len;

		memcell_buf = of_get_property(memory,
			"linux,usable-memory", &len);
		if (!memcell_buf || len <= 0)
			memcell_buf = of_get_property(memory, "reg", &len);
		if (!memcell_buf || len <= 0)
			continue;

		/* ranges in cell */
		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
new_range:
		/* these are order-sensitive, and modify the buffer pointer */
		start = read_n_cells(n_mem_addr_cells, &memcell_buf);
		size = read_n_cells(n_mem_size_cells, &memcell_buf);

		/*
		 * Assumption: either all memory nodes or none will
		 * have associativity properties. If none, then
		 * everything goes to default_nid.
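		 * default_nid is 0, so such memory simply lands on node 0.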
		 */
		nid = of_node_to_nid_single(memory);
		if (nid < 0)
			nid = default_nid;

		fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
		node_set_online(nid);

		size = numa_enforce_memory_limit(start, size);
		if (size)
			memblock_set_node(start, size, &memblock.memory, nid);

		if (--ranges)
			goto new_range;
	}

	/*
	 * Now do the same thing for each MEMBLOCK listed in the
	 * ibm,dynamic-memory property in the
	 * ibm,dynamic-reconfiguration-memory node.
	 */
	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (memory) {
		walk_drmem_lmbs(memory, numa_setup_drmem_lmb);
		of_node_put(memory);
	}

	return 0;
}

static void __init setup_nonnuma(void)
{
	unsigned long top_of_ram = memblock_end_of_DRAM();
	unsigned long total_ram = memblock_phys_mem_size();
	unsigned long start_pfn, end_pfn;
	unsigned int nid = 0;
	struct memblock_region *reg;

	printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
	       top_of_ram, total_ram);
	printk(KERN_DEBUG "Memory hole size: %ldMB\n",
	       (top_of_ram - total_ram) >> 20);

	for_each_memblock(memory, reg) {
		start_pfn = memblock_region_memory_base_pfn(reg);
		end_pfn = memblock_region_memory_end_pfn(reg);

		fake_numa_create_new_node(end_pfn, &nid);
		memblock_set_node(PFN_PHYS(start_pfn),
				  PFN_PHYS(end_pfn - start_pfn),
				  &memblock.memory, nid);
		node_set_online(nid);
	}
}

void __init dump_numa_cpu_topology(void)
{
	unsigned int node;
	unsigned int cpu, count;

	if (!numa_enabled)
		return;

	for_each_online_node(node) {
		pr_info("Node %d CPUs:", node);

		count = 0;
		/*
		 * If we used a CPU iterator here we would miss printing
		 * the holes in the cpumap.
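		 * Instead, scan every possible CPU id and print runs of
		 * consecutive CPUs in this node as "first-last" ranges.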
		 */
		for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
			if (cpumask_test_cpu(cpu,
					node_to_cpumask_map[node])) {
				if (count == 0)
					pr_cont(" %u", cpu);
				++count;
			} else {
				if (count > 1)
					pr_cont("-%u", cpu - 1);
				count = 0;
			}
		}

		if (count > 1)
			pr_cont("-%u", nr_cpu_ids - 1);
		pr_cont("\n");
	}
}

/* Initialize NODE_DATA for a node on the local memory */
static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
{
	u64 spanned_pages = end_pfn - start_pfn;
	const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES);
	u64 nd_pa;
	void *nd;
	int tnid;

	nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
	if (!nd_pa)
		panic("Cannot allocate %zu bytes for node %d data\n",
		      nd_size, nid);

	nd = __va(nd_pa);

	/* report and initialize */
	pr_info("  NODE_DATA [mem %#010Lx-%#010Lx]\n",
		nd_pa, nd_pa + nd_size - 1);
	tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
	if (tnid != nid)
		pr_info("    NODE_DATA(%d) on node %d\n", nid, tnid);

	node_data[nid] = nd;
	memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
	NODE_DATA(nid)->node_id = nid;
	NODE_DATA(nid)->node_start_pfn = start_pfn;
	NODE_DATA(nid)->node_spanned_pages = spanned_pages;
}

static void __init find_possible_nodes(void)
{
	struct device_node *rtas;
	u32 numnodes, i;

	if (!numa_enabled)
		return;

	rtas = of_find_node_by_path("/rtas");
	if (!rtas)
		return;

	if (of_property_read_u32_index(rtas,
				"ibm,max-associativity-domains",
				min_common_depth, &numnodes))
		goto out;

	for (i = 0; i < numnodes; i++) {
		if (!node_possible(i))
			node_set(i, node_possible_map);
	}

out:
	of_node_put(rtas);
}

void __init mem_topology_setup(void)
{
	int cpu;

	if (parse_numa_properties())
		setup_nonnuma();

	/*
	 * Modify the set of possible NUMA nodes to reflect information
	 * available about the set of online nodes, and the set of nodes
	 * that we expect to make use of for this platform's affinity
	 * calculations.
	 */
	nodes_and(node_possible_map, node_possible_map, node_online_map);

	find_possible_nodes();

	setup_node_to_cpumask_map();

	reset_numa_cpu_lookup_table();

	for_each_present_cpu(cpu)
		numa_setup_cpu(cpu);
}

void __init initmem_init(void)
{
	int nid;

	max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
	max_pfn = max_low_pfn;

	memblock_dump_all();

	for_each_online_node(nid) {
		unsigned long start_pfn, end_pfn;

		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
		setup_node_data(nid, start_pfn, end_pfn);
		sparse_memory_present_with_active_regions(nid);
	}

	sparse_init();

	/*
	 * We need the numa_cpu_lookup_table to be accurate for all CPUs,
	 * even before we online them, so that we can use cpu_to_{node,mem}
	 * early in boot, cf. smp_prepare_cpus().
	 * _nocalls() + manual invocation is used because cpuhp is not yet
	 * initialized for the boot CPU.
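	 * (The manual invocation is the numa_setup_cpu() loop over present
	 * CPUs in mem_topology_setup().)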
	 */
	cpuhp_setup_state_nocalls(CPUHP_POWER_NUMA_PREPARE, "powerpc/numa:prepare",
				  ppc_numa_cpu_prepare, ppc_numa_cpu_dead);
}

static int __init early_numa(char *p)
{
	if (!p)
		return 0;

	if (strstr(p, "off"))
		numa_enabled = 0;

	if (strstr(p, "debug"))
		numa_debug = 1;

	p = strstr(p, "fake=");
	if (p)
		cmdline = p + strlen("fake=");

	return 0;
}
early_param("numa", early_numa);

/*
 * The platform can inform us through one of several mechanisms
 * (post-migration device tree updates, PRRN or VPHN) that the NUMA
 * assignment of a resource has changed. This controls whether we act
 * on that. Disabled by default.
 */
static bool topology_updates_enabled;

static int __init early_topology_updates(char *p)
{
	if (!p)
		return 0;

	if (!strcmp(p, "on")) {
		pr_warn("Caution: enabling topology updates\n");
		topology_updates_enabled = true;
	}

	return 0;
}
early_param("topology_updates", early_topology_updates);

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Find the node associated with a hot added memory section for
 * memory represented in the device tree by the property
 * ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory.
 */
static int hot_add_drconf_scn_to_nid(unsigned long scn_addr)
{
	struct drmem_lmb *lmb;
	unsigned long lmb_size;
	int nid = NUMA_NO_NODE;

	lmb_size = drmem_lmb_size();

	for_each_drmem_lmb(lmb) {
		/* skip this block if it is reserved or not assigned to
		 * this partition */
		if ((lmb->flags & DRCONF_MEM_RESERVED)
		    || !(lmb->flags & DRCONF_MEM_ASSIGNED))
			continue;

		if ((scn_addr < lmb->base_addr)
		    || (scn_addr >= (lmb->base_addr + lmb_size)))
			continue;

		nid = of_drconf_to_nid_single(lmb);
		break;
	}

	return nid;
}

/*
 * Find the node associated with a hot added memory section for memory
 * represented in the device tree as a node (i.e. memory@XXXX) for
 * each memblock.
 */
static int hot_add_node_scn_to_nid(unsigned long scn_addr)
{
	struct device_node *memory;
	int nid = NUMA_NO_NODE;

	for_each_node_by_type(memory, "memory") {
		unsigned long start, size;
		int ranges;
		const __be32 *memcell_buf;
		unsigned int len;

		memcell_buf = of_get_property(memory, "reg", &len);
		if (!memcell_buf || len <= 0)
			continue;

		/* ranges in cell */
		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);

		while (ranges--) {
			start = read_n_cells(n_mem_addr_cells, &memcell_buf);
			size = read_n_cells(n_mem_size_cells, &memcell_buf);

			if ((scn_addr < start) || (scn_addr >= (start + size)))
				continue;

			nid = of_node_to_nid_single(memory);
			break;
		}

		if (nid >= 0)
			break;
	}

	of_node_put(memory);

	return nid;
}

/*
 * Find the node associated with a hot added memory section. Section
 * corresponds to a SPARSEMEM section, not a MEMBLOCK. It is assumed that
 * sections are fully contained within a single MEMBLOCK.
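 *
 * The lookup goes through the drconf LMBs when an
 * ibm,dynamic-reconfiguration-memory node is present, and through the
 * memory@ device tree nodes otherwise.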
 */
int hot_add_scn_to_nid(unsigned long scn_addr)
{
	struct device_node *memory = NULL;
	int nid;

	if (!numa_enabled)
		return first_online_node;

	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (memory) {
		nid = hot_add_drconf_scn_to_nid(scn_addr);
		of_node_put(memory);
	} else {
		nid = hot_add_node_scn_to_nid(scn_addr);
	}

	if (nid < 0 || !node_possible(nid))
		nid = first_online_node;

	return nid;
}

static u64 hot_add_drconf_memory_max(void)
{
	struct device_node *memory = NULL;
	struct device_node *dn = NULL;
	const __be64 *lrdr = NULL;

	dn = of_find_node_by_path("/rtas");
	if (dn) {
		lrdr = of_get_property(dn, "ibm,lrdr-capacity", NULL);
		of_node_put(dn);
		if (lrdr)
			return be64_to_cpup(lrdr);
	}

	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (memory) {
		of_node_put(memory);
		return drmem_lmb_memory_max();
	}
	return 0;
}

/*
 * memory_hotplug_max - return max address of memory that may be added
 *
 * This is currently only used on systems that support drconfig memory
 * hotplug.
 */
u64 memory_hotplug_max(void)
{
	return max(hot_add_drconf_memory_max(), memblock_end_of_DRAM());
}
#endif /* CONFIG_MEMORY_HOTPLUG */

/* Virtual Processor Home Node (VPHN) support */
#ifdef CONFIG_PPC_SPLPAR
struct topology_update_data {
	struct topology_update_data *next;
	unsigned int cpu;
	int old_nid;
	int new_nid;
};

#define TOPOLOGY_DEF_TIMER_SECS	60

static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS];
static cpumask_t cpu_associativity_changes_mask;
static int vphn_enabled;
static int prrn_enabled;
static void reset_topology_timer(void);
static int topology_timer_secs = 1;
static int topology_inited;

/*
 * Change polling interval for associativity changes.
 */
int timed_topology_update(int nsecs)
{
	if (vphn_enabled) {
		if (nsecs > 0)
			topology_timer_secs = nsecs;
		else
			topology_timer_secs = TOPOLOGY_DEF_TIMER_SECS;

		reset_topology_timer();
	}

	return 0;
}

/*
 * Store the current values of the associativity change counters in the
 * hypervisor.
 */
static void setup_cpu_associativity_change_counters(void)
{
	int cpu;

	/* The VPHN feature supports a maximum of 8 reference points */
	BUILD_BUG_ON(MAX_DISTANCE_REF_POINTS > 8);

	for_each_possible_cpu(cpu) {
		int i;
		u8 *counts = vphn_cpu_change_counts[cpu];
		volatile u8 *hypervisor_counts = lppaca_of(cpu).vphn_assoc_counts;

		for (i = 0; i < distance_ref_points_depth; i++)
			counts[i] = hypervisor_counts[i];
	}
}

/*
 * The hypervisor maintains a set of 8 associativity change counters in
 * the VPA of each cpu that correspond to the associativity levels in the
 * ibm,associativity-reference-points property. When an associativity
 * level changes, the corresponding counter is incremented.
 *
 * Set a bit in cpu_associativity_changes_mask for each cpu whose home
 * node associativity levels have changed.
 *
 * Returns the number of cpus with unhandled associativity changes.
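 *
 * The counters are compared against the snapshot taken by
 * setup_cpu_associativity_change_counters(); any mismatch flags the whole
 * core (all sibling threads) for an update.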
 */
static int update_cpu_associativity_changes_mask(void)
{
	int cpu;
	cpumask_t *changes = &cpu_associativity_changes_mask;

	for_each_possible_cpu(cpu) {
		int i, changed = 0;
		u8 *counts = vphn_cpu_change_counts[cpu];
		volatile u8 *hypervisor_counts = lppaca_of(cpu).vphn_assoc_counts;

		for (i = 0; i < distance_ref_points_depth; i++) {
			if (hypervisor_counts[i] != counts[i]) {
				counts[i] = hypervisor_counts[i];
				changed = 1;
			}
		}
		if (changed) {
			cpumask_or(changes, changes, cpu_sibling_mask(cpu));
			cpu = cpu_last_thread_sibling(cpu);
		}
	}

	return cpumask_weight(changes);
}

/*
 * Retrieve the new associativity information for a virtual processor's
 * home node.
 */
static long vphn_get_associativity(unsigned long cpu,
					__be32 *associativity)
{
	long rc;

	rc = hcall_vphn(get_hard_smp_processor_id(cpu),
			VPHN_FLAG_VCPU, associativity);

	switch (rc) {
	case H_FUNCTION:
		printk_once(KERN_INFO
			"VPHN is not supported. Disabling polling...\n");
		stop_topology_update();
		break;
	case H_HARDWARE:
		printk(KERN_ERR
			"hcall_vphn() experienced a hardware fault "
			"preventing VPHN. Disabling polling...\n");
		stop_topology_update();
		break;
	case H_SUCCESS:
		dbg("VPHN hcall succeeded. Reset polling...\n");
		timed_topology_update(0);
		break;
	}

	return rc;
}

int find_and_online_cpu_nid(int cpu)
{
	__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
	int new_nid;

	/* Use associativity from first thread for all siblings */
	if (vphn_get_associativity(cpu, associativity))
		return cpu_to_node(cpu);

	new_nid = associativity_to_nid(associativity);
	if (new_nid < 0 || !node_possible(new_nid))
		new_nid = first_online_node;

	if (NODE_DATA(new_nid) == NULL) {
#ifdef CONFIG_MEMORY_HOTPLUG
		/*
		 * Need to ensure that NODE_DATA is initialized for a node from
		 * available memory (see memblock_alloc_try_nid). If unable to
		 * init the node, then default to nearest node that has memory
		 * installed. Skip onlining a node if the subsystems are not
		 * yet initialized.
		 */
		if (!topology_inited || try_online_node(new_nid))
			new_nid = first_online_node;
#else
		/*
		 * Default to using the nearest node that has memory installed.
		 * Otherwise, it would be necessary to patch the kernel MM code
		 * to deal with more memoryless-node error conditions.
		 */
		new_nid = first_online_node;
#endif
	}

	pr_debug("%s:%d cpu %d nid %d\n", __FUNCTION__, __LINE__,
		cpu, new_nid);
	return new_nid;
}

/*
 * Update the CPU maps and sysfs entries for a single CPU when its NUMA
 * characteristics change. This function doesn't perform any locking and is
 * only safe to call from stop_machine().
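 *
 * Each CPU acts only on the update entries that name it, so the same
 * update list can be passed to every CPU in the updated mask.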
 */
static int update_cpu_topology(void *data)
{
	struct topology_update_data *update;
	unsigned long cpu;

	if (!data)
		return -EINVAL;

	cpu = smp_processor_id();

	for (update = data; update; update = update->next) {
		int new_nid = update->new_nid;
		if (cpu != update->cpu)
			continue;

		unmap_cpu_from_node(cpu);
		map_cpu_to_node(cpu, new_nid);
		set_cpu_numa_node(cpu, new_nid);
		set_cpu_numa_mem(cpu, local_memory_node(new_nid));
		vdso_getcpu_init();
	}

	return 0;
}

static int update_lookup_table(void *data)
{
	struct topology_update_data *update;

	if (!data)
		return -EINVAL;

	/*
	 * Upon topology update, the numa-cpu lookup table needs to be updated
	 * for all threads in the core, including offline CPUs, to ensure that
	 * future hotplug operations respect the cpu-to-node associativity
	 * properly.
	 */
	for (update = data; update; update = update->next) {
		int nid, base, j;

		nid = update->new_nid;
		base = cpu_first_thread_sibling(update->cpu);

		for (j = 0; j < threads_per_core; j++) {
			update_numa_cpu_lookup_table(base + j, nid);
		}
	}

	return 0;
}

/*
 * Update the node maps and sysfs entries for each cpu whose home node
 * has changed. Returns 1 when the topology has changed, and 0 otherwise.
 *
 * cpus_locked says whether we already hold cpu_hotplug_lock.
 */
int numa_update_cpu_topology(bool cpus_locked)
{
	unsigned int cpu, sibling, changed = 0;
	struct topology_update_data *updates, *ud;
	cpumask_t updated_cpus;
	struct device *dev;
	int weight, new_nid, i = 0;

	if (!prrn_enabled && !vphn_enabled && topology_inited)
		return 0;

	weight = cpumask_weight(&cpu_associativity_changes_mask);
	if (!weight)
		return 0;

	updates = kcalloc(weight, sizeof(*updates), GFP_KERNEL);
	if (!updates)
		return 0;

	cpumask_clear(&updated_cpus);

	for_each_cpu(cpu, &cpu_associativity_changes_mask) {
		/*
		 * If siblings aren't flagged for changes, updates list
		 * will be too short. Skip on this update and set for next
		 * update.
		 */
		if (!cpumask_subset(cpu_sibling_mask(cpu),
					&cpu_associativity_changes_mask)) {
			pr_info("Sibling bits not set for associativity "
					"change, cpu%d\n", cpu);
			cpumask_or(&cpu_associativity_changes_mask,
					&cpu_associativity_changes_mask,
					cpu_sibling_mask(cpu));
			cpu = cpu_last_thread_sibling(cpu);
			continue;
		}

		new_nid = find_and_online_cpu_nid(cpu);

		if (new_nid == numa_cpu_lookup_table[cpu]) {
			cpumask_andnot(&cpu_associativity_changes_mask,
					&cpu_associativity_changes_mask,
					cpu_sibling_mask(cpu));
			dbg("Assoc chg gives same node %d for cpu%d\n",
					new_nid, cpu);
			cpu = cpu_last_thread_sibling(cpu);
			continue;
		}

		for_each_cpu(sibling, cpu_sibling_mask(cpu)) {
			ud = &updates[i++];
			ud->next = &updates[i];
			ud->cpu = sibling;
			ud->new_nid = new_nid;
			ud->old_nid = numa_cpu_lookup_table[sibling];
			cpumask_set_cpu(sibling, &updated_cpus);
		}
		cpu = cpu_last_thread_sibling(cpu);
	}

	/*
	 * The loop above links each entry to the one after it; terminate
	 * the list at the last entry actually filled in so that walking
	 * 'updates' does not run past the end of the array.
	 */
	if (i)
		updates[i-1].next = NULL;

	pr_debug("Topology update for the following CPUs:\n");
	if (cpumask_weight(&updated_cpus)) {
		for (ud = &updates[0]; ud; ud = ud->next) {
			pr_debug("cpu %d moving from node %d "
					  "to %d\n", ud->cpu,
					  ud->old_nid, ud->new_nid);
		}
	}

	/*
	 * In cases where we have nothing to update (because the updates list
	 * is too short or because the new topology is same as the old one),
	 * skip invoking update_cpu_topology() via stop-machine(). This is
	 * necessary (and not just a fast-path optimization) since stop-machine
	 * can end up electing a random CPU to run update_cpu_topology(), and
	 * thus trick us into setting up incorrect cpu-node mappings (since
	 * 'updates' is kzalloc()'ed).
	 *
	 * And for a similar reason, we will skip all the following updating.
	 */
	if (!cpumask_weight(&updated_cpus))
		goto out;

	if (cpus_locked)
		stop_machine_cpuslocked(update_cpu_topology, &updates[0],
					&updated_cpus);
	else
		stop_machine(update_cpu_topology, &updates[0], &updated_cpus);

	/*
	 * Update the numa-cpu lookup table with the new mappings, even for
	 * offline CPUs. It is best to perform this update from the stop-
	 * machine context.
	 */
	if (cpus_locked)
		stop_machine_cpuslocked(update_lookup_table, &updates[0],
					cpumask_of(raw_smp_processor_id()));
	else
		stop_machine(update_lookup_table, &updates[0],
			     cpumask_of(raw_smp_processor_id()));

	for (ud = &updates[0]; ud; ud = ud->next) {
		unregister_cpu_under_node(ud->cpu, ud->old_nid);
		register_cpu_under_node(ud->cpu, ud->new_nid);

		dev = get_cpu_device(ud->cpu);
		if (dev)
			kobject_uevent(&dev->kobj, KOBJ_CHANGE);
		cpumask_clear_cpu(ud->cpu, &cpu_associativity_changes_mask);
		changed = 1;
	}

out:
	kfree(updates);
	return changed;
}

int arch_update_cpu_topology(void)
{
	return numa_update_cpu_topology(true);
}

static void topology_work_fn(struct work_struct *work)
{
	rebuild_sched_domains();
}
static DECLARE_WORK(topology_work, topology_work_fn);

static void topology_schedule_update(void)
{
	schedule_work(&topology_work);
}

static void topology_timer_fn(struct timer_list *unused)
{
	if (prrn_enabled && cpumask_weight(&cpu_associativity_changes_mask))
		topology_schedule_update();
	else if (vphn_enabled) {
		if (update_cpu_associativity_changes_mask() > 0)
			topology_schedule_update();
		reset_topology_timer();
	}
}
static struct timer_list topology_timer;

static void reset_topology_timer(void)
{
	if (vphn_enabled)
		mod_timer(&topology_timer, jiffies + topology_timer_secs * HZ);
}

#ifdef CONFIG_SMP

static int dt_update_callback(struct notifier_block *nb,
				unsigned long action, void *data)
{
	struct of_reconfig_data *update = data;
	int rc = NOTIFY_DONE;

	switch (action) {
	case OF_RECONFIG_UPDATE_PROPERTY:
		if (of_node_is_type(update->dn, "cpu") &&
		    !of_prop_cmp(update->prop->name, "ibm,associativity")) {
			u32 core_id;
			of_property_read_u32(update->dn, "reg", &core_id);
			rc = dlpar_cpu_readd(core_id);
			rc = NOTIFY_OK;
		}
		break;
	}

	return rc;
}

static struct notifier_block dt_update_nb = {
	.notifier_call = dt_update_callback,
};

#endif

/*
 * Start polling for associativity changes.
 */
int start_topology_update(void)
{
	int rc = 0;

	if (!topology_updates_enabled)
		return 0;

	if (firmware_has_feature(FW_FEATURE_PRRN)) {
		if (!prrn_enabled) {
			prrn_enabled = 1;
#ifdef CONFIG_SMP
			rc = of_reconfig_notifier_register(&dt_update_nb);
#endif
		}
	}
	if (firmware_has_feature(FW_FEATURE_VPHN) &&
	    lppaca_shared_proc(get_lppaca())) {
		if (!vphn_enabled) {
			vphn_enabled = 1;
			setup_cpu_associativity_change_counters();
			timer_setup(&topology_timer, topology_timer_fn,
				    TIMER_DEFERRABLE);
			reset_topology_timer();
		}
	}

	pr_info("Starting topology update%s%s\n",
		(prrn_enabled ? " prrn_enabled" : ""),
		(vphn_enabled ? " vphn_enabled" : ""));

	return rc;
}

/*
 * Disable polling for VPHN associativity changes.
 */
int stop_topology_update(void)
{
	int rc = 0;

	if (!topology_updates_enabled)
		return 0;

	if (prrn_enabled) {
		prrn_enabled = 0;
#ifdef CONFIG_SMP
		rc = of_reconfig_notifier_unregister(&dt_update_nb);
#endif
	}
	if (vphn_enabled) {
		vphn_enabled = 0;
		rc = del_timer_sync(&topology_timer);
	}

	pr_info("Stopping topology update\n");

	return rc;
}

int prrn_is_enabled(void)
{
	return prrn_enabled;
}

void __init shared_proc_topology_init(void)
{
	if (lppaca_shared_proc(get_lppaca())) {
		bitmap_fill(cpumask_bits(&cpu_associativity_changes_mask),
			    nr_cpumask_bits);
		numa_update_cpu_topology(false);
	}
}

static int topology_read(struct seq_file *file, void *v)
{
	if (vphn_enabled || prrn_enabled)
		seq_puts(file, "on\n");
	else
		seq_puts(file, "off\n");

	return 0;
}

static int topology_open(struct inode *inode, struct file *file)
{
	return single_open(file, topology_read, NULL);
}

static ssize_t topology_write(struct file *file, const char __user *buf,
			      size_t count, loff_t *off)
{
	char kbuf[4]; /* "on" or "off" plus null. */
	int read_len;

	read_len = count < 3 ? count : 3;
	if (copy_from_user(kbuf, buf, read_len))
		return -EINVAL;

	kbuf[read_len] = '\0';

	if (!strncmp(kbuf, "on", 2)) {
		topology_updates_enabled = true;
		start_topology_update();
	} else if (!strncmp(kbuf, "off", 3)) {
		stop_topology_update();
		topology_updates_enabled = false;
	} else
		return -EINVAL;

	return count;
}

static const struct file_operations topology_ops = {
	.read = seq_read,
	.write = topology_write,
	.open = topology_open,
	.release = single_release
};

static int topology_update_init(void)
{
	start_topology_update();

	if (vphn_enabled)
		topology_schedule_update();

	if (!proc_create("powerpc/topology_updates", 0644, NULL, &topology_ops))
		return -ENOMEM;

	topology_inited = 1;
	return 0;
}
device_initcall(topology_update_init);
#endif /* CONFIG_PPC_SPLPAR */