// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * pSeries NUMA support
 *
 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
 */
#define pr_fmt(fmt) "numa: " fmt

#include <linux/threads.h>
#include <linux/memblock.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/export.h>
#include <linux/nodemask.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/of.h>
#include <linux/pfn.h>
#include <linux/cpuset.h>
#include <linux/node.h>
#include <linux/stop_machine.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <asm/cputhreads.h>
#include <asm/sparsemem.h>
#include <asm/prom.h>
#include <asm/smp.h>
#include <asm/topology.h>
#include <asm/firmware.h>
#include <asm/paca.h>
#include <asm/hvcall.h>
#include <asm/setup.h>
#include <asm/vdso.h>
#include <asm/drmem.h>

static int numa_enabled = 1;

static char *cmdline __initdata;

static int numa_debug;
#define dbg(args...) if (numa_debug) { printk(KERN_INFO args); }

int numa_cpu_lookup_table[NR_CPUS];
cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
struct pglist_data *node_data[MAX_NUMNODES];

EXPORT_SYMBOL(numa_cpu_lookup_table);
EXPORT_SYMBOL(node_to_cpumask_map);
EXPORT_SYMBOL(node_data);

static int min_common_depth;
static int n_mem_addr_cells, n_mem_size_cells;
static int form1_affinity;

#define MAX_DISTANCE_REF_POINTS 4
static int distance_ref_points_depth;
static const __be32 *distance_ref_points;
static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];

/*
 * Allocate node_to_cpumask_map based on number of available nodes
 * Requires node_possible_map to be valid.
 *
 * Note: cpumask_of_node() is not valid until after this is done.
 */
static void __init setup_node_to_cpumask_map(void)
{
	unsigned int node;

	/* setup nr_node_ids if not done yet */
	if (nr_node_ids == MAX_NUMNODES)
		setup_nr_node_ids();

	/* allocate the map */
	for_each_node(node)
		alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);

	/* cpumask_of_node() will now work */
	dbg("Node to cpumask map for %u nodes\n", nr_node_ids);
}

static int __init fake_numa_create_new_node(unsigned long end_pfn,
					    unsigned int *nid)
{
	unsigned long long mem;
	char *p = cmdline;
	static unsigned int fake_nid;
	static unsigned long long curr_boundary;

	/*
	 * Modify node id, iff we started creating NUMA nodes
	 * We want to continue from where we left off the last time
	 */
	if (fake_nid)
		*nid = fake_nid;
	/*
	 * In case there are no more arguments to parse, the
	 * node_id should be the same as the last fake node id
	 * (we've handled this above).
	 */
	if (!p)
		return 0;

	mem = memparse(p, &p);
	if (!mem)
		return 0;

	if (mem < curr_boundary)
		return 0;

	curr_boundary = mem;

	if ((end_pfn << PAGE_SHIFT) > mem) {
		/*
		 * Skip commas and spaces
		 */
		while (*p == ',' || *p == ' ' || *p == '\t')
			p++;

		cmdline = p;
		fake_nid++;
		*nid = fake_nid;
		dbg("created new fake_node with id %d\n", fake_nid);
		return 1;
	}
	return 0;
}

static void reset_numa_cpu_lookup_table(void)
{
	unsigned int cpu;

	for_each_possible_cpu(cpu)
		numa_cpu_lookup_table[cpu] = -1;
}

static void map_cpu_to_node(int cpu, int node)
{
	update_numa_cpu_lookup_table(cpu, node);

	dbg("adding cpu %d to node %d\n", cpu, node);

	if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node])))
		cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
}

#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR)
static void unmap_cpu_from_node(unsigned long cpu)
{
	int node = numa_cpu_lookup_table[cpu];

	dbg("removing cpu %lu from node %d\n", cpu, node);

	if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) {
		cpumask_clear_cpu(cpu, node_to_cpumask_map[node]);
	} else {
		printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
		       cpu, node);
	}
}
#endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */

int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
{
	int dist = 0;

	int i, index;

	for (i = 0; i < distance_ref_points_depth; i++) {
		index = be32_to_cpu(distance_ref_points[i]);
		if (cpu1_assoc[index] == cpu2_assoc[index])
			break;
		dist++;
	}

	return dist;
}

/* must hold reference to node during call */
static const __be32 *of_get_associativity(struct device_node *dev)
{
	return of_get_property(dev, "ibm,associativity", NULL);
}

int __node_distance(int a, int b)
{
	int i;
	int distance = LOCAL_DISTANCE;

	if (!form1_affinity)
		return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);

	for (i = 0; i < distance_ref_points_depth; i++) {
		if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
			break;

		/* Double the distance for each NUMA level */
		distance *= 2;
	}

	return distance;
}
EXPORT_SYMBOL(__node_distance);

static void initialize_distance_lookup_table(int nid,
		const __be32 *associativity)
{
	int i;

	if (!form1_affinity)
		return;

	for (i = 0; i < distance_ref_points_depth; i++) {
		const __be32 *entry;

		entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
		distance_lookup_table[nid][i] = of_read_number(entry, 1);
	}
}

/* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
 * info is found.
 */
static int associativity_to_nid(const __be32 *associativity)
{
	int nid = NUMA_NO_NODE;

	if (!numa_enabled)
		goto out;

	if (of_read_number(associativity, 1) >= min_common_depth)
		nid = of_read_number(&associativity[min_common_depth], 1);

	/* POWER4 LPAR uses 0xffff as invalid node */
	if (nid == 0xffff || nid >= MAX_NUMNODES)
		nid = NUMA_NO_NODE;

	if (nid > 0 &&
	    of_read_number(associativity, 1) >= distance_ref_points_depth) {
		/*
		 * Skip the length field and send start of associativity array
		 */
		initialize_distance_lookup_table(nid, associativity + 1);
	}

out:
	return nid;
}

/* Returns the nid associated with the given device tree node,
 * or -1 if not found.
 */
static int of_node_to_nid_single(struct device_node *device)
{
	int nid = NUMA_NO_NODE;
	const __be32 *tmp;

	tmp = of_get_associativity(device);
	if (tmp)
		nid = associativity_to_nid(tmp);
	return nid;
}

/* Walk the device tree upwards, looking for an associativity id */
int of_node_to_nid(struct device_node *device)
{
	int nid = NUMA_NO_NODE;

	of_node_get(device);
	while (device) {
		nid = of_node_to_nid_single(device);
		if (nid != -1)
			break;

		device = of_get_next_parent(device);
	}
	of_node_put(device);

	return nid;
}
EXPORT_SYMBOL(of_node_to_nid);

static int __init find_min_common_depth(void)
{
	int depth;
	struct device_node *root;

	if (firmware_has_feature(FW_FEATURE_OPAL))
		root = of_find_node_by_path("/ibm,opal");
	else
		root = of_find_node_by_path("/rtas");
	if (!root)
		root = of_find_node_by_path("/");

	/*
	 * This property is a set of 32-bit integers, each representing
	 * an index into the ibm,associativity nodes.
	 *
	 * With form 0 affinity the first integer is for an SMP configuration
	 * (should be all 0's) and the second is for a normal NUMA
	 * configuration. We have only one level of NUMA.
	 *
	 * With form 1 affinity the first integer is the most significant
	 * NUMA boundary and the following are progressively less significant
	 * boundaries. There can be more than one level of NUMA.
	 */
	distance_ref_points = of_get_property(root,
					"ibm,associativity-reference-points",
					&distance_ref_points_depth);

	if (!distance_ref_points) {
		dbg("NUMA: ibm,associativity-reference-points not found.\n");
		goto err;
	}

	distance_ref_points_depth /= sizeof(int);

	if (firmware_has_feature(FW_FEATURE_OPAL) ||
	    firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) {
		dbg("Using form 1 affinity\n");
		form1_affinity = 1;
	}

	if (form1_affinity) {
		depth = of_read_number(distance_ref_points, 1);
	} else {
		if (distance_ref_points_depth < 2) {
			printk(KERN_WARNING "NUMA: "
				"short ibm,associativity-reference-points\n");
			goto err;
		}

		depth = of_read_number(&distance_ref_points[1], 1);
	}

	/*
	 * Warn and cap if the hardware supports more than
	 * MAX_DISTANCE_REF_POINTS domains.
	 */
	if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) {
		printk(KERN_WARNING "NUMA: distance array capped at "
			"%d entries\n", MAX_DISTANCE_REF_POINTS);
		distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
	}

	of_node_put(root);
	return depth;

err:
	of_node_put(root);
	return -1;
}

static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
{
	struct device_node *memory = NULL;

	memory = of_find_node_by_type(memory, "memory");
	if (!memory)
		panic("numa.c: No memory nodes found!");

	*n_addr_cells = of_n_addr_cells(memory);
	*n_size_cells = of_n_size_cells(memory);
	of_node_put(memory);
}

static unsigned long read_n_cells(int n, const __be32 **buf)
{
	unsigned long result = 0;

	while (n--) {
		result = (result << 32) | of_read_number(*buf, 1);
		(*buf)++;
	}
	return result;
}

struct assoc_arrays {
	u32	n_arrays;
	u32	array_sz;
	const __be32 *arrays;
};

/*
 * Retrieve and validate the list of associativity arrays for drconf
 * memory from the ibm,associativity-lookup-arrays property of the
 * device tree.
 *
 * The layout of the ibm,associativity-lookup-arrays property is a number N
 * indicating the number of associativity arrays, followed by a number M
 * indicating the size of each associativity array, followed by a list
 * of N associativity arrays.
 */
static int of_get_assoc_arrays(struct assoc_arrays *aa)
{
	struct device_node *memory;
	const __be32 *prop;
	u32 len;

	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (!memory)
		return -1;

	prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len);
	if (!prop || len < 2 * sizeof(unsigned int)) {
		of_node_put(memory);
		return -1;
	}

	aa->n_arrays = of_read_number(prop++, 1);
	aa->array_sz = of_read_number(prop++, 1);

	of_node_put(memory);

	/* Now that we know the number of arrays and size of each array,
	 * revalidate the size of the property read in.
	 */
	if (len < (aa->n_arrays * aa->array_sz + 2) * sizeof(unsigned int))
		return -1;

	aa->arrays = prop;
	return 0;
}

/*
 * This is like of_node_to_nid_single() for memory represented in the
 * ibm,dynamic-reconfiguration-memory node.
 */
static int of_drconf_to_nid_single(struct drmem_lmb *lmb)
{
	struct assoc_arrays aa = { .arrays = NULL };
	int default_nid = NUMA_NO_NODE;
	int nid = default_nid;
	int rc, index;

	if ((min_common_depth < 0) || !numa_enabled)
		return default_nid;

	rc = of_get_assoc_arrays(&aa);
	if (rc)
		return default_nid;

	if (min_common_depth <= aa.array_sz &&
	    !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
		index = lmb->aa_index * aa.array_sz + min_common_depth - 1;
		nid = of_read_number(&aa.arrays[index], 1);

		if (nid == 0xffff || nid >= MAX_NUMNODES)
			nid = default_nid;

		if (nid > 0) {
			index = lmb->aa_index * aa.array_sz;
			initialize_distance_lookup_table(nid,
							&aa.arrays[index]);
		}
	}

	return nid;
}

#ifdef CONFIG_PPC_SPLPAR
static int vphn_get_nid(long lcpu)
{
	__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
	long rc, hwid;

	/*
	 * On a shared lpar, device tree will not have node associativity.
	 * At this time lppaca, or its __old_status field may not be
	 * updated. Hence the kernel cannot detect if it's on a shared lpar.
	 * So request an explicit associativity irrespective of whether the
	 * lpar is shared or dedicated. Use the device tree property as a
	 * fallback. cpu_to_phys_id is only valid between
	 * smp_setup_cpu_maps() and smp_setup_pacas().
	 */
	if (firmware_has_feature(FW_FEATURE_VPHN)) {
		if (cpu_to_phys_id)
			hwid = cpu_to_phys_id[lcpu];
		else
			hwid = get_hard_smp_processor_id(lcpu);

		rc = hcall_vphn(hwid, VPHN_FLAG_VCPU, associativity);
		if (rc == H_SUCCESS)
			return associativity_to_nid(associativity);
	}

	return NUMA_NO_NODE;
}
#else
static int vphn_get_nid(long unused)
{
	return NUMA_NO_NODE;
}
#endif  /* CONFIG_PPC_SPLPAR */

/*
 * Figure out to which domain a cpu belongs and stick it there.
 * Return the id of the domain used.
 */
static int numa_setup_cpu(unsigned long lcpu)
{
	struct device_node *cpu;
	int fcpu = cpu_first_thread_sibling(lcpu);
	int nid = NUMA_NO_NODE;

	/*
	 * If a valid cpu-to-node mapping is already available, use it
	 * directly instead of querying the firmware, since it represents
	 * the most recent mapping notified to us by the platform (eg: VPHN).
	 * The cpu_to_node binding is the same for all threads in a core,
	 * so if a valid mapping already exists for the first thread in
	 * the core, use it.
	 */
	nid = numa_cpu_lookup_table[fcpu];
	if (nid >= 0) {
		map_cpu_to_node(lcpu, nid);
		return nid;
	}

	nid = vphn_get_nid(lcpu);
	if (nid != NUMA_NO_NODE)
		goto out_present;

	cpu = of_get_cpu_node(lcpu, NULL);

	if (!cpu) {
		WARN_ON(1);
		if (cpu_present(lcpu))
			goto out_present;
		else
			goto out;
	}

	nid = of_node_to_nid_single(cpu);
	of_node_put(cpu);

out_present:
	if (nid < 0 || !node_possible(nid))
		nid = first_online_node;

	/*
	 * Update for the first thread of the core. All threads of a core
	 * have to be part of the same node. This not only avoids querying
	 * the firmware for every other thread in the core, but also avoids
	 * the case where a virtual node associativity change causes
	 * subsequent threads of a core to be associated with a different
	 * nid. However, if the first thread is already online, expect it
	 * to have a valid mapping.
	 */
	if (fcpu != lcpu) {
		WARN_ON(cpu_online(fcpu));
		map_cpu_to_node(fcpu, nid);
	}

	map_cpu_to_node(lcpu, nid);
out:
	return nid;
}

static void verify_cpu_node_mapping(int cpu, int node)
{
	int base, sibling, i;

	/* Verify that all the threads in the core belong to the same node */
	base = cpu_first_thread_sibling(cpu);

	for (i = 0; i < threads_per_core; i++) {
		sibling = base + i;

		if (sibling == cpu || cpu_is_offline(sibling))
			continue;

		if (cpu_to_node(sibling) != node) {
			WARN(1, "CPU thread siblings %d and %d don't belong"
				" to the same node!\n", cpu, sibling);
			break;
		}
	}
}

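/*
 * cpuhp callbacks, registered from initmem_init(): ppc_numa_cpu_prepare()
 * places an incoming CPU on its NUMA node and ppc_numa_cpu_dead() removes
 * a dead CPU from its node's cpumask.
 */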
/* Must run before sched domains notifier. */
static int ppc_numa_cpu_prepare(unsigned int cpu)
{
	int nid;

	nid = numa_setup_cpu(cpu);
	verify_cpu_node_mapping(cpu, nid);
	return 0;
}

static int ppc_numa_cpu_dead(unsigned int cpu)
{
#ifdef CONFIG_HOTPLUG_CPU
	unmap_cpu_from_node(cpu);
#endif
	return 0;
}

/*
 * Check and possibly modify a memory region to enforce the memory limit.
 *
 * Returns the size the region should have to enforce the memory limit.
 * This will either be the original value of size, a truncated value,
 * or zero. If the returned value of size is 0 the region should be
 * discarded as it lies wholly above the memory limit.
 */
static unsigned long __init numa_enforce_memory_limit(unsigned long start,
						      unsigned long size)
{
	/*
	 * We use memblock_end_of_DRAM() in here instead of memory_limit because
	 * we've already adjusted it for the limit and it takes care of
	 * having memory holes below the limit. Also, in the case of
	 * iommu_is_off, memory_limit is not set but is implicitly enforced.
	 */

	if (start + size <= memblock_end_of_DRAM())
		return size;

	if (start >= memblock_end_of_DRAM())
		return 0;

	return memblock_end_of_DRAM() - start;
}

/*
 * Reads the counter for a given entry in
 * linux,drconf-usable-memory property
 */
static inline int __init read_usm_ranges(const __be32 **usm)
{
	/*
	 * For each lmb in ibm,dynamic-memory a corresponding
	 * entry in linux,drconf-usable-memory property contains
	 * a counter followed by that many (base, size) tuples.
	 * Read the counter from linux,drconf-usable-memory.
	 */
	return read_n_cells(n_mem_size_cells, usm);
}

/*
 * Extract NUMA information from the ibm,dynamic-reconfiguration-memory
 * node. This assumes n_mem_{addr,size}_cells have been set.
 */
static void __init numa_setup_drmem_lmb(struct drmem_lmb *lmb,
					const __be32 **usm)
{
	unsigned int ranges, is_kexec_kdump = 0;
	unsigned long base, size, sz;
	int nid;

	/*
	 * Skip this block if the reserved bit is set in flags (0x80)
	 * or if the block is not assigned to this partition (0x8)
	 */
	if ((lmb->flags & DRCONF_MEM_RESERVED)
	    || !(lmb->flags & DRCONF_MEM_ASSIGNED))
		return;

	if (*usm)
		is_kexec_kdump = 1;

	base = lmb->base_addr;
	size = drmem_lmb_size();
	ranges = 1;

	if (is_kexec_kdump) {
		ranges = read_usm_ranges(usm);
		if (!ranges) /* there are no (base, size) tuples */
			return;
	}

	do {
		if (is_kexec_kdump) {
			base = read_n_cells(n_mem_addr_cells, usm);
			size = read_n_cells(n_mem_size_cells, usm);
		}

		nid = of_drconf_to_nid_single(lmb);
		fake_numa_create_new_node(((base + size) >> PAGE_SHIFT),
					  &nid);
		node_set_online(nid);
		sz = numa_enforce_memory_limit(base, size);
		if (sz)
			memblock_set_node(base, sz, &memblock.memory, nid);
	} while (--ranges);
}

static int __init parse_numa_properties(void)
{
	struct device_node *memory;
	int default_nid = 0;
	unsigned long i;

	if (numa_enabled == 0) {
		printk(KERN_WARNING "NUMA disabled by user\n");
		return -1;
	}

	min_common_depth = find_min_common_depth();

	if (min_common_depth < 0) {
		/*
		 * If we fail to parse min_common_depth from the device
		 * tree, mark NUMA disabled and boot with NUMA disabled.
		 */
		numa_enabled = false;
		return min_common_depth;
	}

	dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);

	/*
	 * Even though we connect cpus to numa domains later in SMP
	 * init, we need to know the node ids now. This is because
	 * each node to be onlined must have NODE_DATA etc backing it.
	 */
	for_each_present_cpu(i) {
		struct device_node *cpu;
		int nid;

		cpu = of_get_cpu_node(i, NULL);
		BUG_ON(!cpu);
		nid = of_node_to_nid_single(cpu);
		of_node_put(cpu);

		/*
		 * Don't fall back to default_nid yet -- we will plug
		 * cpus into nodes once the memory scan has discovered
		 * the topology.
		 */
		if (nid < 0)
			continue;
		node_set_online(nid);
	}

	get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells);

	for_each_node_by_type(memory, "memory") {
		unsigned long start;
		unsigned long size;
		int nid;
		int ranges;
		const __be32 *memcell_buf;
		unsigned int len;

		memcell_buf = of_get_property(memory,
			"linux,usable-memory", &len);
		if (!memcell_buf || len <= 0)
			memcell_buf = of_get_property(memory, "reg", &len);
		if (!memcell_buf || len <= 0)
			continue;

		/* ranges in cell */
		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
new_range:
		/* these are order-sensitive, and modify the buffer pointer */
		start = read_n_cells(n_mem_addr_cells, &memcell_buf);
		size = read_n_cells(n_mem_size_cells, &memcell_buf);

		/*
		 * Assumption: either all memory nodes or none will
		 * have associativity properties. If none, then
		 * everything goes to default_nid.
		 */
		nid = of_node_to_nid_single(memory);
		if (nid < 0)
			nid = default_nid;

		fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
		node_set_online(nid);

		size = numa_enforce_memory_limit(start, size);
		if (size)
			memblock_set_node(start, size, &memblock.memory, nid);

		if (--ranges)
			goto new_range;
	}

	/*
	 * Now do the same thing for each MEMBLOCK listed in the
	 * ibm,dynamic-memory property in the
	 * ibm,dynamic-reconfiguration-memory node.
	 */
	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (memory) {
		walk_drmem_lmbs(memory, numa_setup_drmem_lmb);
		of_node_put(memory);
	}

	return 0;
}

static void __init setup_nonnuma(void)
{
	unsigned long top_of_ram = memblock_end_of_DRAM();
	unsigned long total_ram = memblock_phys_mem_size();
	unsigned long start_pfn, end_pfn;
	unsigned int nid = 0;
	struct memblock_region *reg;

	printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
	       top_of_ram, total_ram);
	printk(KERN_DEBUG "Memory hole size: %ldMB\n",
	       (top_of_ram - total_ram) >> 20);

	for_each_memblock(memory, reg) {
		start_pfn = memblock_region_memory_base_pfn(reg);
		end_pfn = memblock_region_memory_end_pfn(reg);

		fake_numa_create_new_node(end_pfn, &nid);
		memblock_set_node(PFN_PHYS(start_pfn),
				  PFN_PHYS(end_pfn - start_pfn),
				  &memblock.memory, nid);
		node_set_online(nid);
	}
}

void __init dump_numa_cpu_topology(void)
{
	unsigned int node;
	unsigned int cpu, count;

	if (!numa_enabled)
		return;

	for_each_online_node(node) {
		pr_info("Node %d CPUs:", node);

		count = 0;
		/*
		 * If we used a CPU iterator here we would miss printing
		 * the holes in the cpumap.
		 */
		for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
			if (cpumask_test_cpu(cpu,
					node_to_cpumask_map[node])) {
				if (count == 0)
					pr_cont(" %u", cpu);
				++count;
			} else {
				if (count > 1)
					pr_cont("-%u", cpu - 1);
				count = 0;
			}
		}

		if (count > 1)
			pr_cont("-%u", nr_cpu_ids - 1);
		pr_cont("\n");
	}
}

/* Initialize NODE_DATA for a node on the local memory */
static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
{
	u64 spanned_pages = end_pfn - start_pfn;
	const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES);
	u64 nd_pa;
	void *nd;
	int tnid;

	nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
	if (!nd_pa)
		panic("Cannot allocate %zu bytes for node %d data\n",
		      nd_size, nid);

	nd = __va(nd_pa);

	/* report and initialize */
	pr_info("  NODE_DATA [mem %#010Lx-%#010Lx]\n",
		nd_pa, nd_pa + nd_size - 1);
	tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
	if (tnid != nid)
		pr_info("    NODE_DATA(%d) on node %d\n", nid, tnid);

	node_data[nid] = nd;
	memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
	NODE_DATA(nid)->node_id = nid;
	NODE_DATA(nid)->node_start_pfn = start_pfn;
	NODE_DATA(nid)->node_spanned_pages = spanned_pages;
}

static void __init find_possible_nodes(void)
{
	struct device_node *rtas;
	u32 numnodes, i;

	if (!numa_enabled)
		return;

	rtas = of_find_node_by_path("/rtas");
	if (!rtas)
		return;

	if (of_property_read_u32_index(rtas,
				"ibm,max-associativity-domains",
				min_common_depth, &numnodes))
		goto out;

	for (i = 0; i < numnodes; i++) {
		if (!node_possible(i))
			node_set(i, node_possible_map);
	}

out:
	of_node_put(rtas);
}

void __init mem_topology_setup(void)
{
	int cpu;

	if (parse_numa_properties())
		setup_nonnuma();

	/*
	 * Modify the set of possible NUMA nodes to reflect information
	 * available about the set of online nodes, and the set of nodes
	 * that we expect to make use of for this platform's affinity
	 * calculations.
	 */
	nodes_and(node_possible_map, node_possible_map, node_online_map);

	find_possible_nodes();

	setup_node_to_cpumask_map();

	reset_numa_cpu_lookup_table();

	for_each_present_cpu(cpu)
		numa_setup_cpu(cpu);
}

void __init initmem_init(void)
{
	int nid;

	max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
	max_pfn = max_low_pfn;

	memblock_dump_all();

	for_each_online_node(nid) {
		unsigned long start_pfn, end_pfn;

		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
		setup_node_data(nid, start_pfn, end_pfn);
		sparse_memory_present_with_active_regions(nid);
	}

	sparse_init();

	/*
	 * We need the numa_cpu_lookup_table to be accurate for all CPUs,
	 * even before we online them, so that we can use cpu_to_{node,mem}
	 * early in boot, cf. smp_prepare_cpus().
	 * _nocalls() + manual invocation is used because cpuhp is not yet
	 * initialized for the boot CPU.
	 */
	cpuhp_setup_state_nocalls(CPUHP_POWER_NUMA_PREPARE, "powerpc/numa:prepare",
				  ppc_numa_cpu_prepare, ppc_numa_cpu_dead);
}

static int __init early_numa(char *p)
{
	if (!p)
		return 0;

	if (strstr(p, "off"))
		numa_enabled = 0;

	if (strstr(p, "debug"))
		numa_debug = 1;

	p = strstr(p, "fake=");
	if (p)
		cmdline = p + strlen("fake=");

	return 0;
}
early_param("numa", early_numa);

/*
 * The platform can inform us through one of several mechanisms
 * (post-migration device tree updates, PRRN or VPHN) that the NUMA
 * assignment of a resource has changed. This controls whether we act
 * on that. Disabled by default.
 */
static bool topology_updates_enabled;

static int __init early_topology_updates(char *p)
{
	if (!p)
		return 0;

	if (!strcmp(p, "on")) {
		pr_warn("Caution: enabling topology updates\n");
		topology_updates_enabled = true;
	}

	return 0;
}
early_param("topology_updates", early_topology_updates);

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Find the node associated with a hot added memory section for
 * memory represented in the device tree by the property
 * ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory.
 */
static int hot_add_drconf_scn_to_nid(unsigned long scn_addr)
{
	struct drmem_lmb *lmb;
	unsigned long lmb_size;
	int nid = NUMA_NO_NODE;

	lmb_size = drmem_lmb_size();

	for_each_drmem_lmb(lmb) {
		/* skip this block if it is reserved or not assigned to
		 * this partition */
		if ((lmb->flags & DRCONF_MEM_RESERVED)
		    || !(lmb->flags & DRCONF_MEM_ASSIGNED))
			continue;

		if ((scn_addr < lmb->base_addr)
		    || (scn_addr >= (lmb->base_addr + lmb_size)))
			continue;

		nid = of_drconf_to_nid_single(lmb);
		break;
	}

	return nid;
}

/*
 * Find the node associated with a hot added memory section for memory
 * represented in the device tree as a node (i.e. memory@XXXX) for
 * each memblock.
 */
static int hot_add_node_scn_to_nid(unsigned long scn_addr)
{
	struct device_node *memory;
	int nid = NUMA_NO_NODE;

	for_each_node_by_type(memory, "memory") {
		unsigned long start, size;
		int ranges;
		const __be32 *memcell_buf;
		unsigned int len;

		memcell_buf = of_get_property(memory, "reg", &len);
		if (!memcell_buf || len <= 0)
			continue;

		/* ranges in cell */
		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);

		while (ranges--) {
			start = read_n_cells(n_mem_addr_cells, &memcell_buf);
			size = read_n_cells(n_mem_size_cells, &memcell_buf);

			if ((scn_addr < start) || (scn_addr >= (start + size)))
				continue;

			nid = of_node_to_nid_single(memory);
			break;
		}

		if (nid >= 0)
			break;
	}

	of_node_put(memory);

	return nid;
}

/*
 * Find the node associated with a hot added memory section. Section
 * corresponds to a SPARSEMEM section, not a MEMBLOCK. It is assumed that
 * sections are fully contained within a single MEMBLOCK.
 */
int hot_add_scn_to_nid(unsigned long scn_addr)
{
	struct device_node *memory = NULL;
	int nid;

	if (!numa_enabled)
		return first_online_node;

	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (memory) {
		nid = hot_add_drconf_scn_to_nid(scn_addr);
		of_node_put(memory);
	} else {
		nid = hot_add_node_scn_to_nid(scn_addr);
	}

	if (nid < 0 || !node_possible(nid))
		nid = first_online_node;

	return nid;
}

static u64 hot_add_drconf_memory_max(void)
{
	struct device_node *memory = NULL;
	struct device_node *dn = NULL;
	const __be64 *lrdr = NULL;

	dn = of_find_node_by_path("/rtas");
	if (dn) {
		lrdr = of_get_property(dn, "ibm,lrdr-capacity", NULL);
		of_node_put(dn);
		if (lrdr)
			return be64_to_cpup(lrdr);
	}

	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (memory) {
		of_node_put(memory);
		return drmem_lmb_memory_max();
	}
	return 0;
}

/*
 * memory_hotplug_max - return max address of memory that may be added
 *
 * This is currently only used on systems that support drconfig memory
 * hotplug.
 */
u64 memory_hotplug_max(void)
{
	return max(hot_add_drconf_memory_max(), memblock_end_of_DRAM());
}
#endif /* CONFIG_MEMORY_HOTPLUG */

/* Virtual Processor Home Node (VPHN) support */
#ifdef CONFIG_PPC_SPLPAR
struct topology_update_data {
	struct topology_update_data *next;
	unsigned int cpu;
	int old_nid;
	int new_nid;
};

#define TOPOLOGY_DEF_TIMER_SECS	60

static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS];
static cpumask_t cpu_associativity_changes_mask;
static int vphn_enabled;
static int prrn_enabled;
static void reset_topology_timer(void);
static int topology_timer_secs = 1;
static int topology_inited;

/*
 * Change polling interval for associativity changes.
 */
int timed_topology_update(int nsecs)
{
	if (vphn_enabled) {
		if (nsecs > 0)
			topology_timer_secs = nsecs;
		else
			topology_timer_secs = TOPOLOGY_DEF_TIMER_SECS;

		reset_topology_timer();
	}

	return 0;
}

/*
 * Store the current values of the associativity change counters
 * maintained by the hypervisor.
 */
static void setup_cpu_associativity_change_counters(void)
{
	int cpu;

	/* The VPHN feature supports a maximum of 8 reference points */
	BUILD_BUG_ON(MAX_DISTANCE_REF_POINTS > 8);

	for_each_possible_cpu(cpu) {
		int i;
		u8 *counts = vphn_cpu_change_counts[cpu];
		volatile u8 *hypervisor_counts = lppaca_of(cpu).vphn_assoc_counts;

		for (i = 0; i < distance_ref_points_depth; i++)
			counts[i] = hypervisor_counts[i];
	}
}

/*
 * The hypervisor maintains a set of 8 associativity change counters in
 * the VPA of each cpu that correspond to the associativity levels in the
 * ibm,associativity-reference-points property. When an associativity
 * level changes, the corresponding counter is incremented.
 *
 * Set a bit in cpu_associativity_changes_mask for each cpu whose home
 * node associativity levels have changed.
 *
 * Returns the number of cpus with unhandled associativity changes.
 */
static int update_cpu_associativity_changes_mask(void)
{
	int cpu;
	cpumask_t *changes = &cpu_associativity_changes_mask;

	for_each_possible_cpu(cpu) {
		int i, changed = 0;
		u8 *counts = vphn_cpu_change_counts[cpu];
		volatile u8 *hypervisor_counts = lppaca_of(cpu).vphn_assoc_counts;

		for (i = 0; i < distance_ref_points_depth; i++) {
			if (hypervisor_counts[i] != counts[i]) {
				counts[i] = hypervisor_counts[i];
				changed = 1;
			}
		}
		if (changed) {
			cpumask_or(changes, changes, cpu_sibling_mask(cpu));
			cpu = cpu_last_thread_sibling(cpu);
		}
	}

	return cpumask_weight(changes);
}

/*
 * Retrieve the new associativity information for a virtual processor's
 * home node.
 */
static long vphn_get_associativity(unsigned long cpu,
					__be32 *associativity)
{
	long rc;

	rc = hcall_vphn(get_hard_smp_processor_id(cpu),
				VPHN_FLAG_VCPU, associativity);

	switch (rc) {
	case H_SUCCESS:
		dbg("VPHN hcall succeeded. Reset polling...\n");
		timed_topology_update(0);
		goto out;

	case H_FUNCTION:
		pr_err_ratelimited("VPHN unsupported. Disabling polling...\n");
		break;
	case H_HARDWARE:
		pr_err_ratelimited("hcall_vphn() experienced a hardware fault "
			"preventing VPHN. Disabling polling...\n");
		break;
	case H_PARAMETER:
		pr_err_ratelimited("hcall_vphn() was passed an invalid parameter. "
			"Disabling polling...\n");
		break;
	default:
		pr_err_ratelimited("hcall_vphn() returned %ld. Disabling polling...\n",
			rc);
		break;
	}

	stop_topology_update();
out:
	return rc;
}

int find_and_online_cpu_nid(int cpu)
{
	__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
	int new_nid;

	/* Use associativity from first thread for all siblings */
	if (vphn_get_associativity(cpu, associativity))
		return cpu_to_node(cpu);

	new_nid = associativity_to_nid(associativity);
	if (new_nid < 0 || !node_possible(new_nid))
		new_nid = first_online_node;

	if (NODE_DATA(new_nid) == NULL) {
#ifdef CONFIG_MEMORY_HOTPLUG
		/*
		 * Need to ensure that NODE_DATA is initialized for a node from
		 * available memory (see memblock_alloc_try_nid). If unable to
		 * init the node, then default to nearest node that has memory
		 * installed. Skip onlining a node if the subsystems are not
		 * yet initialized.
		 */
		if (!topology_inited || try_online_node(new_nid))
			new_nid = first_online_node;
#else
		/*
		 * Default to using the nearest node that has memory installed.
		 * Otherwise, it would be necessary to patch the kernel MM code
		 * to deal with more memoryless-node error conditions.
		 */
		new_nid = first_online_node;
#endif
	}

	pr_debug("%s:%d cpu %d nid %d\n", __FUNCTION__, __LINE__,
		cpu, new_nid);
	return new_nid;
}

/*
 * Update the CPU maps and sysfs entries for a single CPU when its NUMA
 * characteristics change. This function doesn't perform any locking and is
 * only safe to call from stop_machine().
 */
static int update_cpu_topology(void *data)
{
	struct topology_update_data *update;
	unsigned long cpu;

	if (!data)
		return -EINVAL;

	cpu = smp_processor_id();

	for (update = data; update; update = update->next) {
		int new_nid = update->new_nid;
		if (cpu != update->cpu)
			continue;

		unmap_cpu_from_node(cpu);
		map_cpu_to_node(cpu, new_nid);
		set_cpu_numa_node(cpu, new_nid);
		set_cpu_numa_mem(cpu, local_memory_node(new_nid));
		vdso_getcpu_init();
	}

	return 0;
}

static int update_lookup_table(void *data)
{
	struct topology_update_data *update;

	if (!data)
		return -EINVAL;

	/*
	 * Upon topology update, the numa-cpu lookup table needs to be updated
	 * for all threads in the core, including offline CPUs, to ensure that
	 * future hotplug operations respect the cpu-to-node associativity
	 * properly.
	 */
	for (update = data; update; update = update->next) {
		int nid, base, j;

		nid = update->new_nid;
		base = cpu_first_thread_sibling(update->cpu);

		for (j = 0; j < threads_per_core; j++) {
			update_numa_cpu_lookup_table(base + j, nid);
		}
	}

	return 0;
}

/*
 * Update the node maps and sysfs entries for each cpu whose home node
 * has changed. Returns 1 when the topology has changed, and 0 otherwise.
 *
 * cpus_locked says whether we already hold cpu_hotplug_lock.
 */
int numa_update_cpu_topology(bool cpus_locked)
{
	unsigned int cpu, sibling, changed = 0;
	struct topology_update_data *updates, *ud;
	cpumask_t updated_cpus;
	struct device *dev;
	int weight, new_nid, i = 0;

	if (!prrn_enabled && !vphn_enabled && topology_inited)
		return 0;

	weight = cpumask_weight(&cpu_associativity_changes_mask);
	if (!weight)
		return 0;

	updates = kcalloc(weight, sizeof(*updates), GFP_KERNEL);
	if (!updates)
		return 0;

	cpumask_clear(&updated_cpus);

	for_each_cpu(cpu, &cpu_associativity_changes_mask) {
		/*
		 * If siblings aren't flagged for changes, updates list
		 * will be too short. Skip on this update and set for next
		 * update.
		 */
		if (!cpumask_subset(cpu_sibling_mask(cpu),
					&cpu_associativity_changes_mask)) {
			pr_info("Sibling bits not set for associativity "
					"change, cpu%d\n", cpu);
			cpumask_or(&cpu_associativity_changes_mask,
					&cpu_associativity_changes_mask,
					cpu_sibling_mask(cpu));
			cpu = cpu_last_thread_sibling(cpu);
			continue;
		}

		new_nid = find_and_online_cpu_nid(cpu);

		if (new_nid == numa_cpu_lookup_table[cpu]) {
			cpumask_andnot(&cpu_associativity_changes_mask,
					&cpu_associativity_changes_mask,
					cpu_sibling_mask(cpu));
			dbg("Assoc chg gives same node %d for cpu%d\n",
					new_nid, cpu);
			cpu = cpu_last_thread_sibling(cpu);
			continue;
		}

		for_each_cpu(sibling, cpu_sibling_mask(cpu)) {
			ud = &updates[i++];
			ud->next = &updates[i];
			ud->cpu = sibling;
			ud->new_nid = new_nid;
			ud->old_nid = numa_cpu_lookup_table[sibling];
			cpumask_set_cpu(sibling, &updated_cpus);
		}
		cpu = cpu_last_thread_sibling(cpu);
	}

	/*
	 * Prevent processing of 'updates' from overflowing array
	 * where last entry filled in a 'next' pointer.
	 */
	if (i)
		updates[i-1].next = NULL;

	pr_debug("Topology update for the following CPUs:\n");
	if (cpumask_weight(&updated_cpus)) {
		for (ud = &updates[0]; ud; ud = ud->next) {
			pr_debug("cpu %d moving from node %d "
					  "to %d\n", ud->cpu,
					  ud->old_nid, ud->new_nid);
		}
	}

	/*
	 * In cases where we have nothing to update (because the updates list
	 * is too short or because the new topology is same as the old one),
	 * skip invoking update_cpu_topology() via stop-machine(). This is
	 * necessary (and not just a fast-path optimization) since stop-machine
	 * can end up electing a random CPU to run update_cpu_topology(), and
	 * thus trick us into setting up incorrect cpu-node mappings (since
	 * 'updates' is kcalloc()'ed).
	 *
	 * And for a similar reason, we will skip all the following updates.
	 */
	if (!cpumask_weight(&updated_cpus))
		goto out;

	if (cpus_locked)
		stop_machine_cpuslocked(update_cpu_topology, &updates[0],
					&updated_cpus);
	else
		stop_machine(update_cpu_topology, &updates[0], &updated_cpus);

	/*
	 * Update the numa-cpu lookup table with the new mappings, even for
	 * offline CPUs. It is best to perform this update from the stop-
	 * machine context.
	 */
	if (cpus_locked)
		stop_machine_cpuslocked(update_lookup_table, &updates[0],
					cpumask_of(raw_smp_processor_id()));
	else
		stop_machine(update_lookup_table, &updates[0],
			     cpumask_of(raw_smp_processor_id()));

	for (ud = &updates[0]; ud; ud = ud->next) {
		unregister_cpu_under_node(ud->cpu, ud->old_nid);
		register_cpu_under_node(ud->cpu, ud->new_nid);

		dev = get_cpu_device(ud->cpu);
		if (dev)
			kobject_uevent(&dev->kobj, KOBJ_CHANGE);
		cpumask_clear_cpu(ud->cpu, &cpu_associativity_changes_mask);
		changed = 1;
	}

out:
	kfree(updates);
	return changed;
}

int arch_update_cpu_topology(void)
{
	return numa_update_cpu_topology(true);
}

static void topology_work_fn(struct work_struct *work)
{
	rebuild_sched_domains();
}
static DECLARE_WORK(topology_work, topology_work_fn);

static void topology_schedule_update(void)
{
	schedule_work(&topology_work);
}

static void topology_timer_fn(struct timer_list *unused)
{
	if (prrn_enabled && cpumask_weight(&cpu_associativity_changes_mask))
		topology_schedule_update();
	else if (vphn_enabled) {
		if (update_cpu_associativity_changes_mask() > 0)
			topology_schedule_update();
		reset_topology_timer();
	}
}
static struct timer_list topology_timer;

static void reset_topology_timer(void)
{
	if (vphn_enabled)
		mod_timer(&topology_timer, jiffies + topology_timer_secs * HZ);
}

#ifdef CONFIG_SMP

static int dt_update_callback(struct notifier_block *nb,
				unsigned long action, void *data)
{
	struct of_reconfig_data *update = data;
	int rc = NOTIFY_DONE;

	switch (action) {
	case OF_RECONFIG_UPDATE_PROPERTY:
		if (of_node_is_type(update->dn, "cpu") &&
		    !of_prop_cmp(update->prop->name, "ibm,associativity")) {
			u32 core_id;
			of_property_read_u32(update->dn, "reg", &core_id);
			rc = dlpar_cpu_readd(core_id);
			rc = NOTIFY_OK;
		}
		break;
	}

	return rc;
}

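/*
 * Device tree reconfiguration notifier: re-adds a core when its
 * ibm,associativity property is updated. Registered by
 * start_topology_update() and unregistered by stop_topology_update()
 * when PRRN is in use.
 */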
static struct notifier_block dt_update_nb = {
	.notifier_call = dt_update_callback,
};

#endif

/*
 * Start polling for associativity changes.
 */
int start_topology_update(void)
{
	int rc = 0;

	if (!topology_updates_enabled)
		return 0;

	if (firmware_has_feature(FW_FEATURE_PRRN)) {
		if (!prrn_enabled) {
			prrn_enabled = 1;
#ifdef CONFIG_SMP
			rc = of_reconfig_notifier_register(&dt_update_nb);
#endif
		}
	}
	if (firmware_has_feature(FW_FEATURE_VPHN) &&
		   lppaca_shared_proc(get_lppaca())) {
		if (!vphn_enabled) {
			vphn_enabled = 1;
			setup_cpu_associativity_change_counters();
			timer_setup(&topology_timer, topology_timer_fn,
				    TIMER_DEFERRABLE);
			reset_topology_timer();
		}
	}

	pr_info("Starting topology update%s%s\n",
		(prrn_enabled ? " prrn_enabled" : ""),
		(vphn_enabled ? " vphn_enabled" : ""));

	return rc;
}

/*
 * Disable polling for VPHN associativity changes.
 */
int stop_topology_update(void)
{
	int rc = 0;

	if (!topology_updates_enabled)
		return 0;

	if (prrn_enabled) {
		prrn_enabled = 0;
#ifdef CONFIG_SMP
		rc = of_reconfig_notifier_unregister(&dt_update_nb);
#endif
	}
	if (vphn_enabled) {
		vphn_enabled = 0;
		rc = del_timer_sync(&topology_timer);
	}

	pr_info("Stopping topology update\n");

	return rc;
}

int prrn_is_enabled(void)
{
	return prrn_enabled;
}

static int topology_read(struct seq_file *file, void *v)
{
	if (vphn_enabled || prrn_enabled)
		seq_puts(file, "on\n");
	else
		seq_puts(file, "off\n");

	return 0;
}

static int topology_open(struct inode *inode, struct file *file)
{
	return single_open(file, topology_read, NULL);
}

static ssize_t topology_write(struct file *file, const char __user *buf,
			      size_t count, loff_t *off)
{
	char kbuf[4]; /* "on" or "off" plus null. */
	int read_len;

	read_len = count < 3 ? count : 3;
	if (copy_from_user(kbuf, buf, read_len))
		return -EINVAL;

	kbuf[read_len] = '\0';

	if (!strncmp(kbuf, "on", 2)) {
		topology_updates_enabled = true;
		start_topology_update();
	} else if (!strncmp(kbuf, "off", 3)) {
		stop_topology_update();
		topology_updates_enabled = false;
	} else
		return -EINVAL;

	return count;
}

static const struct proc_ops topology_proc_ops = {
	.proc_read	= seq_read,
	.proc_write	= topology_write,
	.proc_open	= topology_open,
	.proc_release	= single_release,
};

static int topology_update_init(void)
{
	start_topology_update();

	if (vphn_enabled)
		topology_schedule_update();

	if (!proc_create("powerpc/topology_updates", 0644, NULL, &topology_proc_ops))
		return -ENOMEM;

	topology_inited = 1;
	return 0;
}
device_initcall(topology_update_init);
#endif /* CONFIG_PPC_SPLPAR */