// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * pSeries NUMA support
 *
 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
 */
#define pr_fmt(fmt) "numa: " fmt

#include <linux/threads.h>
#include <linux/memblock.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/export.h>
#include <linux/nodemask.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/of.h>
#include <linux/pfn.h>
#include <linux/cpuset.h>
#include <linux/node.h>
#include <linux/stop_machine.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <asm/cputhreads.h>
#include <asm/sparsemem.h>
#include <asm/prom.h>
#include <asm/smp.h>
#include <asm/topology.h>
#include <asm/firmware.h>
#include <asm/paca.h>
#include <asm/hvcall.h>
#include <asm/setup.h>
#include <asm/vdso.h>
#include <asm/drmem.h>

static int numa_enabled = 1;

static char *cmdline __initdata;

static int numa_debug;
#define dbg(args...) if (numa_debug) { printk(KERN_INFO args); }

int numa_cpu_lookup_table[NR_CPUS];
cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
struct pglist_data *node_data[MAX_NUMNODES];

EXPORT_SYMBOL(numa_cpu_lookup_table);
EXPORT_SYMBOL(node_to_cpumask_map);
EXPORT_SYMBOL(node_data);

static int min_common_depth;
static int n_mem_addr_cells, n_mem_size_cells;
static int form1_affinity;

#define MAX_DISTANCE_REF_POINTS 4
static int distance_ref_points_depth;
static const __be32 *distance_ref_points;
static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];

/*
 * Allocate node_to_cpumask_map based on number of available nodes
 * Requires node_possible_map to be valid.
 *
 * Note: cpumask_of_node() is not valid until after this is done.
 */
static void __init setup_node_to_cpumask_map(void)
{
	unsigned int node;

	/* setup nr_node_ids if not done yet */
	if (nr_node_ids == MAX_NUMNODES)
		setup_nr_node_ids();

	/* allocate the map */
	for_each_node(node)
		alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);

	/* cpumask_of_node() will now work */
	dbg("Node to cpumask map for %u nodes\n", nr_node_ids);
}

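/*
 * Split memory into fake NUMA nodes at the boundaries given by the
 * "numa=fake=" command line option (a comma-separated list of increasing
 * addresses, parsed by memparse()). For example (illustrative values),
 * "numa=fake=1G,3G" places memory below 1GB in node 0, memory from 1GB up
 * to 3GB in node 1 and the remainder in node 2, at the granularity of the
 * memory ranges this is called with. Returns 1 if a new fake node was
 * created and assigned to *nid, 0 otherwise.
 */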
static int __init fake_numa_create_new_node(unsigned long end_pfn,
						unsigned int *nid)
{
	unsigned long long mem;
	char *p = cmdline;
	static unsigned int fake_nid;
	static unsigned long long curr_boundary;

	/*
	 * Modify node id, iff we started creating NUMA nodes
	 * We want to continue from where we left off the last time
	 */
	if (fake_nid)
		*nid = fake_nid;
	/*
	 * In case there are no more arguments to parse, the
	 * node_id should be the same as the last fake node id
	 * (we've handled this above).
	 */
	if (!p)
		return 0;

	mem = memparse(p, &p);
	if (!mem)
		return 0;

	if (mem < curr_boundary)
		return 0;

	curr_boundary = mem;

	if ((end_pfn << PAGE_SHIFT) > mem) {
		/*
		 * Skip commas and spaces
		 */
		while (*p == ',' || *p == ' ' || *p == '\t')
			p++;

		cmdline = p;
		fake_nid++;
		*nid = fake_nid;
		dbg("created new fake_node with id %d\n", fake_nid);
		return 1;
	}
	return 0;
}

static void reset_numa_cpu_lookup_table(void)
{
	unsigned int cpu;

	for_each_possible_cpu(cpu)
		numa_cpu_lookup_table[cpu] = -1;
}

static void map_cpu_to_node(int cpu, int node)
{
	update_numa_cpu_lookup_table(cpu, node);

	dbg("adding cpu %d to node %d\n", cpu, node);

	if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node])))
		cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
}

#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR)
static void unmap_cpu_from_node(unsigned long cpu)
{
	int node = numa_cpu_lookup_table[cpu];

	dbg("removing cpu %lu from node %d\n", cpu, node);

	if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) {
		cpumask_clear_cpu(cpu, node_to_cpumask_map[node]);
	} else {
		printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
		       cpu, node);
	}
}
#endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */

int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
{
	int dist = 0;

	int i, index;

	for (i = 0; i < distance_ref_points_depth; i++) {
		index = be32_to_cpu(distance_ref_points[i]);
		if (cpu1_assoc[index] == cpu2_assoc[index])
			break;
		dist++;
	}

	return dist;
}

/* must hold reference to node during call */
static const __be32 *of_get_associativity(struct device_node *dev)
{
	return of_get_property(dev, "ibm,associativity", NULL);
}

int __node_distance(int a, int b)
{
	int i;
	int distance = LOCAL_DISTANCE;

	if (!form1_affinity)
		return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);

	for (i = 0; i < distance_ref_points_depth; i++) {
		if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
			break;

		/* Double the distance for each NUMA level */
		distance *= 2;
	}

	return distance;
}
EXPORT_SYMBOL(__node_distance);

static void initialize_distance_lookup_table(int nid,
		const __be32 *associativity)
{
	int i;

	if (!form1_affinity)
		return;

	for (i = 0; i < distance_ref_points_depth; i++) {
		const __be32 *entry;

		entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
		distance_lookup_table[nid][i] = of_read_number(entry, 1);
	}
}

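/*
 * An "ibm,associativity" property is a length cell followed by that many
 * associativity domain ids; the node id is the entry at index
 * min_common_depth. For example (illustrative values), with
 * min_common_depth == 4 a property of <5 0 0 0 2 7> yields node 2.
 */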
/* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
 * info is found.
 */
static int associativity_to_nid(const __be32 *associativity)
{
	int nid = NUMA_NO_NODE;

	if (!numa_enabled)
		goto out;

	if (of_read_number(associativity, 1) >= min_common_depth)
		nid = of_read_number(&associativity[min_common_depth], 1);

	/* POWER4 LPAR uses 0xffff as invalid node */
	if (nid == 0xffff || nid >= MAX_NUMNODES)
		nid = NUMA_NO_NODE;

	if (nid > 0 &&
	    of_read_number(associativity, 1) >= distance_ref_points_depth) {
		/*
		 * Skip the length field and send start of associativity array
		 */
		initialize_distance_lookup_table(nid, associativity + 1);
	}

out:
	return nid;
}

/* Returns the nid associated with the given device tree node,
 * or -1 if not found.
 */
static int of_node_to_nid_single(struct device_node *device)
{
	int nid = NUMA_NO_NODE;
	const __be32 *tmp;

	tmp = of_get_associativity(device);
	if (tmp)
		nid = associativity_to_nid(tmp);
	return nid;
}

/* Walk the device tree upwards, looking for an associativity id */
int of_node_to_nid(struct device_node *device)
{
	int nid = NUMA_NO_NODE;

	of_node_get(device);
	while (device) {
		nid = of_node_to_nid_single(device);
		if (nid != -1)
			break;

		device = of_get_next_parent(device);
	}
	of_node_put(device);

	return nid;
}
EXPORT_SYMBOL(of_node_to_nid);

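/*
 * Parse ibm,associativity-reference-points to find the associativity depth
 * that identifies the node, which becomes min_common_depth. For example
 * (illustrative values), with form 1 affinity a property of <4 2> gives a
 * depth of 4 and two NUMA distance levels.
 */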
static int __init find_min_common_depth(void)
{
	int depth;
	struct device_node *root;

	if (firmware_has_feature(FW_FEATURE_OPAL))
		root = of_find_node_by_path("/ibm,opal");
	else
		root = of_find_node_by_path("/rtas");
	if (!root)
		root = of_find_node_by_path("/");

	/*
	 * This property is a set of 32-bit integers, each representing
	 * an index into the ibm,associativity nodes.
	 *
	 * With form 0 affinity the first integer is for an SMP configuration
	 * (should be all 0's) and the second is for a normal NUMA
	 * configuration. We have only one level of NUMA.
	 *
	 * With form 1 affinity the first integer is the most significant
	 * NUMA boundary and the following are progressively less significant
	 * boundaries. There can be more than one level of NUMA.
	 */
	distance_ref_points = of_get_property(root,
					"ibm,associativity-reference-points",
					&distance_ref_points_depth);

	if (!distance_ref_points) {
		dbg("NUMA: ibm,associativity-reference-points not found.\n");
		goto err;
	}

	distance_ref_points_depth /= sizeof(int);

	if (firmware_has_feature(FW_FEATURE_OPAL) ||
	    firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) {
		dbg("Using form 1 affinity\n");
		form1_affinity = 1;
	}

	if (form1_affinity) {
		depth = of_read_number(distance_ref_points, 1);
	} else {
		if (distance_ref_points_depth < 2) {
			printk(KERN_WARNING "NUMA: "
				"short ibm,associativity-reference-points\n");
			goto err;
		}

		depth = of_read_number(&distance_ref_points[1], 1);
	}

	/*
	 * Warn and cap if the hardware supports more than
	 * MAX_DISTANCE_REF_POINTS domains.
	 */
	if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) {
		printk(KERN_WARNING "NUMA: distance array capped at "
			"%d entries\n", MAX_DISTANCE_REF_POINTS);
		distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
	}

	of_node_put(root);
	return depth;

err:
	of_node_put(root);
	return -1;
}

static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
{
	struct device_node *memory = NULL;

	memory = of_find_node_by_type(memory, "memory");
	if (!memory)
		panic("numa.c: No memory nodes found!");

	*n_addr_cells = of_n_addr_cells(memory);
	*n_size_cells = of_n_size_cells(memory);
	of_node_put(memory);
}

static unsigned long read_n_cells(int n, const __be32 **buf)
{
	unsigned long result = 0;

	while (n--) {
		result = (result << 32) | of_read_number(*buf, 1);
		(*buf)++;
	}
	return result;
}

struct assoc_arrays {
	u32	n_arrays;
	u32	array_sz;
	const __be32 *arrays;
};

/*
 * Retrieve and validate the list of associativity arrays for drconf
 * memory from the ibm,associativity-lookup-arrays property of the
 * device tree.
 *
 * The layout of the ibm,associativity-lookup-arrays property is a number N
 * indicating the number of associativity arrays, followed by a number M
 * indicating the size of each associativity array, followed by a list
 * of N associativity arrays.
 */
static int of_get_assoc_arrays(struct assoc_arrays *aa)
{
	struct device_node *memory;
	const __be32 *prop;
	u32 len;

	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (!memory)
		return -1;

	prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len);
	if (!prop || len < 2 * sizeof(unsigned int)) {
		of_node_put(memory);
		return -1;
	}

	aa->n_arrays = of_read_number(prop++, 1);
	aa->array_sz = of_read_number(prop++, 1);

	of_node_put(memory);

	/* Now that we know the number of arrays and size of each array,
	 * revalidate the size of the property read in.
	 */
	if (len < (aa->n_arrays * aa->array_sz + 2) * sizeof(unsigned int))
		return -1;

	aa->arrays = prop;
	return 0;
}

/*
 * This is like of_node_to_nid_single() for memory represented in the
 * ibm,dynamic-reconfiguration-memory node.
 */
static int of_drconf_to_nid_single(struct drmem_lmb *lmb)
{
	struct assoc_arrays aa = { .arrays = NULL };
	int default_nid = NUMA_NO_NODE;
	int nid = default_nid;
	int rc, index;

	if ((min_common_depth < 0) || !numa_enabled)
		return default_nid;

	rc = of_get_assoc_arrays(&aa);
	if (rc)
		return default_nid;

	if (min_common_depth <= aa.array_sz &&
	    !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
		index = lmb->aa_index * aa.array_sz + min_common_depth - 1;
		nid = of_read_number(&aa.arrays[index], 1);

		if (nid == 0xffff || nid >= MAX_NUMNODES)
			nid = default_nid;

		if (nid > 0) {
			index = lmb->aa_index * aa.array_sz;
			initialize_distance_lookup_table(nid,
							&aa.arrays[index]);
		}
	}

	return nid;
}

#ifdef CONFIG_PPC_SPLPAR
static int vphn_get_nid(long lcpu)
{
	__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
	long rc, hwid;

	/*
	 * On a shared LPAR, the device tree will not have node associativity.
	 * At this time the lppaca, or its __old_status field, may not be
	 * updated, so the kernel cannot detect whether it is on a shared
	 * LPAR. Therefore request an explicit associativity irrespective of
	 * whether the LPAR is shared or dedicated, and use the device tree
	 * property as a fallback. cpu_to_phys_id is only valid between
	 * smp_setup_cpu_maps() and smp_setup_pacas().
	 */
	if (firmware_has_feature(FW_FEATURE_VPHN)) {
		if (cpu_to_phys_id)
			hwid = cpu_to_phys_id[lcpu];
		else
			hwid = get_hard_smp_processor_id(lcpu);

		rc = hcall_vphn(hwid, VPHN_FLAG_VCPU, associativity);
		if (rc == H_SUCCESS)
			return associativity_to_nid(associativity);
	}

	return NUMA_NO_NODE;
}
#else
static int vphn_get_nid(long unused)
{
	return NUMA_NO_NODE;
}
#endif  /* CONFIG_PPC_SPLPAR */

/*
 * Figure out to which domain a cpu belongs and stick it there.
 * Return the id of the domain used.
 */
static int numa_setup_cpu(unsigned long lcpu)
{
	struct device_node *cpu;
	int fcpu = cpu_first_thread_sibling(lcpu);
	int nid = NUMA_NO_NODE;

	/*
	 * If a valid cpu-to-node mapping is already available, use it
	 * directly instead of querying the firmware, since it represents
	 * the most recent mapping notified to us by the platform (eg: VPHN).
	 * Since the cpu-to-node binding is the same for all threads in a
	 * core, a valid mapping for the first thread in the core can be
	 * used for this cpu as well.
	 */
	nid = numa_cpu_lookup_table[fcpu];
	if (nid >= 0) {
		map_cpu_to_node(lcpu, nid);
		return nid;
	}

	nid = vphn_get_nid(lcpu);
	if (nid != NUMA_NO_NODE)
		goto out_present;

	cpu = of_get_cpu_node(lcpu, NULL);

	if (!cpu) {
		WARN_ON(1);
		if (cpu_present(lcpu))
			goto out_present;
		else
			goto out;
	}

	nid = of_node_to_nid_single(cpu);
	of_node_put(cpu);

out_present:
	if (nid < 0 || !node_possible(nid))
		nid = first_online_node;

	/*
	 * Update for the first thread of the core. All threads of a core
	 * have to be part of the same node. This not only avoids querying
	 * for every other thread in the core, but also avoids a case where
	 * a virtual node associativity change causes subsequent threads of
	 * a core to be associated with different nids. However, if the
	 * first thread is already online, expect it to have a valid mapping.
	 */
	if (fcpu != lcpu) {
		WARN_ON(cpu_online(fcpu));
		map_cpu_to_node(fcpu, nid);
	}

	map_cpu_to_node(lcpu, nid);
out:
	return nid;
}

static void verify_cpu_node_mapping(int cpu, int node)
{
	int base, sibling, i;

	/* Verify that all the threads in the core belong to the same node */
	base = cpu_first_thread_sibling(cpu);

	for (i = 0; i < threads_per_core; i++) {
		sibling = base + i;

		if (sibling == cpu || cpu_is_offline(sibling))
			continue;

		if (cpu_to_node(sibling) != node) {
			WARN(1, "CPU thread siblings %d and %d don't belong"
				" to the same node!\n", cpu, sibling);
			break;
		}
	}
}

/* Must run before sched domains notifier. */
static int ppc_numa_cpu_prepare(unsigned int cpu)
{
	int nid;

	nid = numa_setup_cpu(cpu);
	verify_cpu_node_mapping(cpu, nid);
	return 0;
}

static int ppc_numa_cpu_dead(unsigned int cpu)
{
#ifdef CONFIG_HOTPLUG_CPU
	unmap_cpu_from_node(cpu);
#endif
	return 0;
}

/*
 * Check and possibly modify a memory region to enforce the memory limit.
 *
 * Returns the size the region should have to enforce the memory limit.
 * This will either be the original value of size, a truncated value,
 * or zero. If the returned value of size is 0 the region should be
 * discarded as it lies wholly above the memory limit.
 */
static unsigned long __init numa_enforce_memory_limit(unsigned long start,
						      unsigned long size)
{
	/*
	 * We use memblock_end_of_DRAM() in here instead of memory_limit because
	 * we've already adjusted it for the limit and it takes care of
	 * having memory holes below the limit. Also, in the case of
	 * iommu_is_off, memory_limit is not set but is implicitly enforced.
	 */

	if (start + size <= memblock_end_of_DRAM())
		return size;

	if (start >= memblock_end_of_DRAM())
		return 0;

	return memblock_end_of_DRAM() - start;
}

/*
 * Reads the counter for a given entry in
 * linux,drconf-usable-memory property
 */
static inline int __init read_usm_ranges(const __be32 **usm)
{
	/*
	 * For each lmb in ibm,dynamic-memory a corresponding
	 * entry in linux,drconf-usable-memory property contains
	 * a counter followed by that many (base, size) pairs.
	 * Read the counter from linux,drconf-usable-memory.
	 */
	return read_n_cells(n_mem_size_cells, usm);
}

/*
 * Extract NUMA information from the ibm,dynamic-reconfiguration-memory
 * node. This assumes n_mem_{addr,size}_cells have been set.
 */
static void __init numa_setup_drmem_lmb(struct drmem_lmb *lmb,
					const __be32 **usm)
{
	unsigned int ranges, is_kexec_kdump = 0;
	unsigned long base, size, sz;
	int nid;

	/*
	 * Skip this block if the reserved bit is set in flags (0x80)
	 * or if the block is not assigned to this partition (0x8)
	 */
	if ((lmb->flags & DRCONF_MEM_RESERVED)
	    || !(lmb->flags & DRCONF_MEM_ASSIGNED))
		return;

	if (*usm)
		is_kexec_kdump = 1;

	base = lmb->base_addr;
	size = drmem_lmb_size();
	ranges = 1;

	if (is_kexec_kdump) {
		ranges = read_usm_ranges(usm);
		if (!ranges) /* there are no (base, size) pairs */
			return;
	}

	do {
		if (is_kexec_kdump) {
			base = read_n_cells(n_mem_addr_cells, usm);
			size = read_n_cells(n_mem_size_cells, usm);
		}

		nid = of_drconf_to_nid_single(lmb);
		fake_numa_create_new_node(((base + size) >> PAGE_SHIFT),
					  &nid);
		node_set_online(nid);
		sz = numa_enforce_memory_limit(base, size);
		if (sz)
			memblock_set_node(base, sz, &memblock.memory, nid);
	} while (--ranges);
}

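/*
 * Build the boot-time NUMA topology from the device tree: determine
 * min_common_depth, online the nodes referenced by present cpus and by
 * memory nodes, and assign each memblock region to its node. Returns a
 * nonzero value if NUMA is disabled or cannot be parsed, in which case
 * the caller falls back to setup_nonnuma().
 */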
static int __init parse_numa_properties(void)
{
	struct device_node *memory;
	int default_nid = 0;
	unsigned long i;

	if (numa_enabled == 0) {
		printk(KERN_WARNING "NUMA disabled by user\n");
		return -1;
	}

	min_common_depth = find_min_common_depth();

	if (min_common_depth < 0) {
		/*
		 * If we fail to parse min_common_depth from the device
		 * tree, mark NUMA as disabled and boot without it.
		 */
		numa_enabled = false;
		return min_common_depth;
	}

	dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);

	/*
	 * Even though we connect cpus to numa domains later in SMP
	 * init, we need to know the node ids now. This is because
	 * each node to be onlined must have NODE_DATA etc backing it.
	 */
	for_each_present_cpu(i) {
		struct device_node *cpu;
		int nid;

		cpu = of_get_cpu_node(i, NULL);
		BUG_ON(!cpu);
		nid = of_node_to_nid_single(cpu);
		of_node_put(cpu);

		/*
		 * Don't fall back to default_nid yet -- we will plug
		 * cpus into nodes once the memory scan has discovered
		 * the topology.
		 */
		if (nid < 0)
			continue;
		node_set_online(nid);
	}

	get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells);

	for_each_node_by_type(memory, "memory") {
		unsigned long start;
		unsigned long size;
		int nid;
		int ranges;
		const __be32 *memcell_buf;
		unsigned int len;

		memcell_buf = of_get_property(memory,
			"linux,usable-memory", &len);
		if (!memcell_buf || len <= 0)
			memcell_buf = of_get_property(memory, "reg", &len);
		if (!memcell_buf || len <= 0)
			continue;

		/* ranges in cell */
		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
new_range:
		/* these are order-sensitive, and modify the buffer pointer */
		start = read_n_cells(n_mem_addr_cells, &memcell_buf);
		size = read_n_cells(n_mem_size_cells, &memcell_buf);

		/*
		 * Assumption: either all memory nodes or none will
		 * have associativity properties. If none, then
		 * everything goes to default_nid.
		 */
		nid = of_node_to_nid_single(memory);
		if (nid < 0)
			nid = default_nid;

		fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
		node_set_online(nid);

		size = numa_enforce_memory_limit(start, size);
		if (size)
			memblock_set_node(start, size, &memblock.memory, nid);

		if (--ranges)
			goto new_range;
	}

	/*
	 * Now do the same thing for each MEMBLOCK listed in the
	 * ibm,dynamic-memory property in the
	 * ibm,dynamic-reconfiguration-memory node.
	 */
	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (memory) {
		walk_drmem_lmbs(memory, numa_setup_drmem_lmb);
		of_node_put(memory);
	}

	return 0;
}

static void __init setup_nonnuma(void)
{
	unsigned long top_of_ram = memblock_end_of_DRAM();
	unsigned long total_ram = memblock_phys_mem_size();
	unsigned long start_pfn, end_pfn;
	unsigned int nid = 0;
	struct memblock_region *reg;

	printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
	       top_of_ram, total_ram);
	printk(KERN_DEBUG "Memory hole size: %ldMB\n",
	       (top_of_ram - total_ram) >> 20);

	for_each_memblock(memory, reg) {
		start_pfn = memblock_region_memory_base_pfn(reg);
		end_pfn = memblock_region_memory_end_pfn(reg);

		fake_numa_create_new_node(end_pfn, &nid);
		memblock_set_node(PFN_PHYS(start_pfn),
				  PFN_PHYS(end_pfn - start_pfn),
				  &memblock.memory, nid);
		node_set_online(nid);
	}
}

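/*
 * Print the CPUs of each online node, collapsing consecutive CPU ids
 * into ranges (e.g. "Node 0 CPUs: 0-7").
 */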
void __init dump_numa_cpu_topology(void)
{
	unsigned int node;
	unsigned int cpu, count;

	if (!numa_enabled)
		return;

	for_each_online_node(node) {
		pr_info("Node %d CPUs:", node);

		count = 0;
		/*
		 * If we used a CPU iterator here we would miss printing
		 * the holes in the cpumap.
		 */
		for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
			if (cpumask_test_cpu(cpu,
					node_to_cpumask_map[node])) {
				if (count == 0)
					pr_cont(" %u", cpu);
				++count;
			} else {
				if (count > 1)
					pr_cont("-%u", cpu - 1);
				count = 0;
			}
		}

		if (count > 1)
			pr_cont("-%u", nr_cpu_ids - 1);
		pr_cont("\n");
	}
}

/* Initialize NODE_DATA for a node on the local memory */
static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
{
	u64 spanned_pages = end_pfn - start_pfn;
	const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES);
	u64 nd_pa;
	void *nd;
	int tnid;

	nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
	if (!nd_pa)
		panic("Cannot allocate %zu bytes for node %d data\n",
		      nd_size, nid);

	nd = __va(nd_pa);

	/* report and initialize */
	pr_info(" NODE_DATA [mem %#010Lx-%#010Lx]\n",
		nd_pa, nd_pa + nd_size - 1);
	tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
	if (tnid != nid)
		pr_info(" NODE_DATA(%d) on node %d\n", nid, tnid);

	node_data[nid] = nd;
	memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
	NODE_DATA(nid)->node_id = nid;
	NODE_DATA(nid)->node_start_pfn = start_pfn;
	NODE_DATA(nid)->node_spanned_pages = spanned_pages;
}

static void __init find_possible_nodes(void)
{
	struct device_node *rtas;
	u32 numnodes, i;

	if (!numa_enabled)
		return;

	rtas = of_find_node_by_path("/rtas");
	if (!rtas)
		return;

	if (of_property_read_u32_index(rtas,
				"ibm,max-associativity-domains",
				min_common_depth, &numnodes))
		goto out;

	for (i = 0; i < numnodes; i++) {
		if (!node_possible(i))
			node_set(i, node_possible_map);
	}

out:
	of_node_put(rtas);
}

void __init mem_topology_setup(void)
{
	int cpu;

	if (parse_numa_properties())
		setup_nonnuma();

	/*
	 * Modify the set of possible NUMA nodes to reflect information
	 * available about the set of online nodes, and the set of nodes
	 * that we expect to make use of for this platform's affinity
	 * calculations.
	 */
	nodes_and(node_possible_map, node_possible_map, node_online_map);

	find_possible_nodes();

	setup_node_to_cpumask_map();

	reset_numa_cpu_lookup_table();

	for_each_present_cpu(cpu)
		numa_setup_cpu(cpu);
}

void __init initmem_init(void)
{
	int nid;

	max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
	max_pfn = max_low_pfn;

	memblock_dump_all();

	for_each_online_node(nid) {
		unsigned long start_pfn, end_pfn;

		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
		setup_node_data(nid, start_pfn, end_pfn);
		sparse_memory_present_with_active_regions(nid);
	}

	sparse_init();

	/*
	 * We need the numa_cpu_lookup_table to be accurate for all CPUs,
	 * even before we online them, so that we can use cpu_to_{node,mem}
	 * early in boot, cf. smp_prepare_cpus().
	 * _nocalls() + manual invocation is used because cpuhp is not yet
	 * initialized for the boot CPU.
	 */
	cpuhp_setup_state_nocalls(CPUHP_POWER_NUMA_PREPARE, "powerpc/numa:prepare",
				  ppc_numa_cpu_prepare, ppc_numa_cpu_dead);
}

static int __init early_numa(char *p)
{
	if (!p)
		return 0;

	if (strstr(p, "off"))
		numa_enabled = 0;

	if (strstr(p, "debug"))
		numa_debug = 1;

	p = strstr(p, "fake=");
	if (p)
		cmdline = p + strlen("fake=");

	return 0;
}
early_param("numa", early_numa);

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Find the node associated with a hot added memory section for
 * memory represented in the device tree by the property
 * ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory.
 */
static int hot_add_drconf_scn_to_nid(unsigned long scn_addr)
{
	struct drmem_lmb *lmb;
	unsigned long lmb_size;
	int nid = NUMA_NO_NODE;

	lmb_size = drmem_lmb_size();

	for_each_drmem_lmb(lmb) {
		/* skip this block if it is reserved or not assigned to
		 * this partition */
		if ((lmb->flags & DRCONF_MEM_RESERVED)
		    || !(lmb->flags & DRCONF_MEM_ASSIGNED))
			continue;

		if ((scn_addr < lmb->base_addr)
		    || (scn_addr >= (lmb->base_addr + lmb_size)))
			continue;

		nid = of_drconf_to_nid_single(lmb);
		break;
	}

	return nid;
}

/*
 * Find the node associated with a hot added memory section for memory
 * represented in the device tree as a node (i.e. memory@XXXX) for
 * each memblock.
 */
static int hot_add_node_scn_to_nid(unsigned long scn_addr)
{
	struct device_node *memory;
	int nid = NUMA_NO_NODE;

	for_each_node_by_type(memory, "memory") {
		unsigned long start, size;
		int ranges;
		const __be32 *memcell_buf;
		unsigned int len;

		memcell_buf = of_get_property(memory, "reg", &len);
		if (!memcell_buf || len <= 0)
			continue;

		/* ranges in cell */
		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);

		while (ranges--) {
			start = read_n_cells(n_mem_addr_cells, &memcell_buf);
			size = read_n_cells(n_mem_size_cells, &memcell_buf);

			if ((scn_addr < start) || (scn_addr >= (start + size)))
				continue;

			nid = of_node_to_nid_single(memory);
			break;
		}

		if (nid >= 0)
			break;
	}

	of_node_put(memory);

	return nid;
}

/*
 * Find the node associated with a hot added memory section. Section
 * corresponds to a SPARSEMEM section, not a MEMBLOCK. It is assumed that
 * sections are fully contained within a single MEMBLOCK.
 */
int hot_add_scn_to_nid(unsigned long scn_addr)
{
	struct device_node *memory = NULL;
	int nid;

	if (!numa_enabled)
		return first_online_node;

	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (memory) {
		nid = hot_add_drconf_scn_to_nid(scn_addr);
		of_node_put(memory);
	} else {
		nid = hot_add_node_scn_to_nid(scn_addr);
	}

	if (nid < 0 || !node_possible(nid))
		nid = first_online_node;

	return nid;
}

static u64 hot_add_drconf_memory_max(void)
{
	struct device_node *memory = NULL;
	struct device_node *dn = NULL;
	const __be64 *lrdr = NULL;

	dn = of_find_node_by_path("/rtas");
	if (dn) {
		lrdr = of_get_property(dn, "ibm,lrdr-capacity", NULL);
		of_node_put(dn);
		if (lrdr)
			return be64_to_cpup(lrdr);
	}

	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (memory) {
		of_node_put(memory);
		return drmem_lmb_memory_max();
	}
	return 0;
}

/*
 * memory_hotplug_max - return max address of memory that may be added
 *
 * This is currently only used on systems that support drconfig memory
 * hotplug.
 */
u64 memory_hotplug_max(void)
{
	return max(hot_add_drconf_memory_max(), memblock_end_of_DRAM());
}
#endif /* CONFIG_MEMORY_HOTPLUG */

/* Virtual Processor Home Node (VPHN) support */
#ifdef CONFIG_PPC_SPLPAR
static int topology_inited;

/*
 * Retrieve the new associativity information for a virtual processor's
 * home node.
 */
static long vphn_get_associativity(unsigned long cpu,
					__be32 *associativity)
{
	long rc;

	rc = hcall_vphn(get_hard_smp_processor_id(cpu),
			VPHN_FLAG_VCPU, associativity);

	switch (rc) {
	case H_SUCCESS:
		dbg("VPHN hcall succeeded. Reset polling...\n");
		goto out;

	case H_FUNCTION:
		pr_err_ratelimited("VPHN unsupported. Disabling polling...\n");
		break;
	case H_HARDWARE:
		pr_err_ratelimited("hcall_vphn() experienced a hardware fault "
			"preventing VPHN. Disabling polling...\n");
		break;
	case H_PARAMETER:
		pr_err_ratelimited("hcall_vphn() was passed an invalid parameter. "
			"Disabling polling...\n");
		break;
	default:
		pr_err_ratelimited("hcall_vphn() returned %ld. Disabling polling...\n",
			rc);
		break;
	}
out:
	return rc;
}

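/*
 * Look up the current VPHN associativity for a cpu and return the
 * corresponding node id, onlining that node if necessary. Falls back to
 * cpu_to_node() when the hcall fails, and to first_online_node when the
 * reported node is not usable.
 */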
int find_and_online_cpu_nid(int cpu)
{
	__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
	int new_nid;

	/* Use associativity from first thread for all siblings */
	if (vphn_get_associativity(cpu, associativity))
		return cpu_to_node(cpu);

	new_nid = associativity_to_nid(associativity);
	if (new_nid < 0 || !node_possible(new_nid))
		new_nid = first_online_node;

	if (NODE_DATA(new_nid) == NULL) {
#ifdef CONFIG_MEMORY_HOTPLUG
		/*
		 * Need to ensure that NODE_DATA is initialized for a node from
		 * available memory (see memblock_alloc_try_nid). If unable to
		 * init the node, then default to nearest node that has memory
		 * installed. Skip onlining a node if the subsystems are not
		 * yet initialized.
		 */
		if (!topology_inited || try_online_node(new_nid))
			new_nid = first_online_node;
#else
		/*
		 * Default to using the nearest node that has memory installed.
		 * Otherwise, it would be necessary to patch the kernel MM code
		 * to deal with more memoryless-node error conditions.
		 */
		new_nid = first_online_node;
#endif
	}

	pr_debug("%s:%d cpu %d nid %d\n", __FUNCTION__, __LINE__,
		 cpu, new_nid);
	return new_nid;
}

static int topology_update_init(void)
{
	topology_inited = 1;
	return 0;
}
device_initcall(topology_update_init);
#endif /* CONFIG_PPC_SPLPAR */