/*
 * pSeries NUMA support
 *
 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <linux/threads.h>
#include <linux/bootmem.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/module.h>
#include <linux/nodemask.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/lmb.h>
#include <linux/of.h>
#include <asm/sparsemem.h>
#include <asm/prom.h>
#include <asm/system.h>
#include <asm/smp.h>

static int numa_enabled = 1;

static char *cmdline __initdata;

static int numa_debug;
#define dbg(args...) if (numa_debug) { printk(KERN_INFO args); }

int numa_cpu_lookup_table[NR_CPUS];
cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES];
struct pglist_data *node_data[MAX_NUMNODES];

EXPORT_SYMBOL(numa_cpu_lookup_table);
EXPORT_SYMBOL(numa_cpumask_lookup_table);
EXPORT_SYMBOL(node_data);

static int min_common_depth;
static int n_mem_addr_cells, n_mem_size_cells;

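/*
 * Hypothetical example of the fake NUMA boundaries handled by
 * fake_numa_create_new_node() below: booting with "numa=fake=1G,4G"
 * (made-up values) roughly leaves memory regions ending below 1G in
 * their real node, places regions ending between 1G and 4G in fake
 * node 1, and regions ending above 4G in fake node 2.  Boundaries are
 * applied per region end address, not per page.
 */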
static int __cpuinit fake_numa_create_new_node(unsigned long end_pfn,
						unsigned int *nid)
{
	unsigned long long mem;
	char *p = cmdline;
	static unsigned int fake_nid;
	static unsigned long long curr_boundary;

	/*
	 * Modify node id, iff we started creating NUMA nodes
	 * We want to continue from where we left off the last time
	 */
	if (fake_nid)
		*nid = fake_nid;
	/*
	 * In case there are no more arguments to parse, the
	 * node_id should be the same as the last fake node id
	 * (we've handled this above).
	 */
	if (!p)
		return 0;

	mem = memparse(p, &p);
	if (!mem)
		return 0;

	if (mem < curr_boundary)
		return 0;

	curr_boundary = mem;

	if ((end_pfn << PAGE_SHIFT) > mem) {
		/*
		 * Skip commas and spaces
		 */
		while (*p == ',' || *p == ' ' || *p == '\t')
			p++;

		cmdline = p;
		fake_nid++;
		*nid = fake_nid;
		dbg("created new fake_node with id %d\n", fake_nid);
		return 1;
	}
	return 0;
}

static void __cpuinit map_cpu_to_node(int cpu, int node)
{
	numa_cpu_lookup_table[cpu] = node;

	dbg("adding cpu %d to node %d\n", cpu, node);

	if (!(cpu_isset(cpu, numa_cpumask_lookup_table[node])))
		cpu_set(cpu, numa_cpumask_lookup_table[node]);
}

#ifdef CONFIG_HOTPLUG_CPU
static void unmap_cpu_from_node(unsigned long cpu)
{
	int node = numa_cpu_lookup_table[cpu];

	dbg("removing cpu %lu from node %d\n", cpu, node);

	if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
		cpu_clear(cpu, numa_cpumask_lookup_table[node]);
	} else {
		printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
		       cpu, node);
	}
}
#endif /* CONFIG_HOTPLUG_CPU */

static struct device_node * __cpuinit find_cpu_node(unsigned int cpu)
{
	unsigned int hw_cpuid = get_hard_smp_processor_id(cpu);
	struct device_node *cpu_node = NULL;
	const unsigned int *interrupt_server, *reg;
	int len;

	while ((cpu_node = of_find_node_by_type(cpu_node, "cpu")) != NULL) {
		/* Try interrupt server first */
		interrupt_server = of_get_property(cpu_node,
					"ibm,ppc-interrupt-server#s", &len);

		len = len / sizeof(u32);

		if (interrupt_server && (len > 0)) {
			while (len--) {
				if (interrupt_server[len] == hw_cpuid)
					return cpu_node;
			}
		} else {
			reg = of_get_property(cpu_node, "reg", &len);
			if (reg && (len > 0) && (reg[0] == hw_cpuid))
				return cpu_node;
		}
	}

	return NULL;
}

/* must hold reference to node during call */
static const int *of_get_associativity(struct device_node *dev)
{
	return of_get_property(dev, "ibm,associativity", NULL);
}

/* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
 * info is found.
 */
static int of_node_to_nid_single(struct device_node *device)
{
	int nid = -1;
	const unsigned int *tmp;

	if (min_common_depth == -1)
		goto out;

	tmp = of_get_associativity(device);
	if (!tmp)
		goto out;

	if (tmp[0] >= min_common_depth)
		nid = tmp[min_common_depth];

	/* POWER4 LPAR uses 0xffff as invalid node */
	if (nid == 0xffff || nid >= MAX_NUMNODES)
		nid = -1;
out:
	return nid;
}

/* Walk the device tree upwards, looking for an associativity id */
int of_node_to_nid(struct device_node *device)
{
	struct device_node *tmp;
	int nid = -1;

	of_node_get(device);
	while (device) {
		nid = of_node_to_nid_single(device);
		if (nid != -1)
			break;

		tmp = device;
		device = of_get_parent(tmp);
		of_node_put(tmp);
	}
	of_node_put(device);

	return nid;
}
EXPORT_SYMBOL_GPL(of_node_to_nid);

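/*
 * Hypothetical example of the lookup above: an "ibm,associativity"
 * property of <4 0 0 0 3> holds four entries (the first cell is the
 * entry count); with a min_common_depth of 4 (taken from
 * ibm,associativity-reference-points, see find_min_common_depth()
 * below) of_node_to_nid_single() returns tmp[4], i.e. node 3.  The
 * values here are made up purely for illustration.
 */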
/*
 * In theory, the "ibm,associativity" property may contain multiple
 * associativity lists because a resource may be multiply connected
 * into the machine.  This resource then has different associativity
 * characteristics relative to its multiple connections.  We ignore
 * this for now.  We also assume that all cpu and memory sets have
 * their distances represented at a common level.  This won't be
 * true for hierarchical NUMA.
 *
 * In any case the ibm,associativity-reference-points should give
 * the correct depth for a normal NUMA system.
 *
 * - Dave Hansen <haveblue@us.ibm.com>
 */
static int __init find_min_common_depth(void)
{
	int depth;
	const unsigned int *ref_points;
	struct device_node *rtas_root;
	unsigned int len;

	rtas_root = of_find_node_by_path("/rtas");

	if (!rtas_root)
		return -1;

	/*
	 * This property is 2 32-bit integers, each representing a level of
	 * depth in the associativity nodes.  The first is for an SMP
	 * configuration (should be all 0's) and the second is for a normal
	 * NUMA configuration.
	 */
	ref_points = of_get_property(rtas_root,
			"ibm,associativity-reference-points", &len);

	if (ref_points && (len >= 2 * sizeof(unsigned int))) {
		depth = ref_points[1];
	} else {
		dbg("NUMA: ibm,associativity-reference-points not found.\n");
		depth = -1;
	}
	of_node_put(rtas_root);

	return depth;
}

static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
{
	struct device_node *memory = NULL;

	memory = of_find_node_by_type(memory, "memory");
	if (!memory)
		panic("numa.c: No memory nodes found!");

	*n_addr_cells = of_n_addr_cells(memory);
	*n_size_cells = of_n_size_cells(memory);
	of_node_put(memory);
}

static unsigned long __devinit read_n_cells(int n, const unsigned int **buf)
{
	unsigned long result = 0;

	while (n--) {
		result = (result << 32) | **buf;
		(*buf)++;
	}
	return result;
}

struct of_drconf_cell {
	u64	base_addr;
	u32	drc_index;
	u32	reserved;
	u32	aa_index;
	u32	flags;
};

#define DRCONF_MEM_ASSIGNED	0x00000008
#define DRCONF_MEM_AI_INVALID	0x00000040
#define DRCONF_MEM_RESERVED	0x00000080

/*
 * Read the next lmb list entry from the ibm,dynamic-memory property
 * and return the information in the provided of_drconf_cell structure.
 */
static void read_drconf_cell(struct of_drconf_cell *drmem, const u32 **cellp)
{
	const u32 *cp;

	drmem->base_addr = read_n_cells(n_mem_addr_cells, cellp);

	cp = *cellp;
	drmem->drc_index = cp[0];
	drmem->reserved = cp[1];
	drmem->aa_index = cp[2];
	drmem->flags = cp[3];

	*cellp = cp + 4;
}

/*
 * Retrieve and validate the ibm,dynamic-memory property of the device tree.
 *
 * The layout of the ibm,dynamic-memory property is a number N of lmb
 * list entries followed by N lmb list entries.  Each lmb list entry
 * contains information as laid out in the of_drconf_cell struct above.
 */
static int of_get_drconf_memory(struct device_node *memory, const u32 **dm)
{
	const u32 *prop;
	u32 len, entries;

	prop = of_get_property(memory, "ibm,dynamic-memory", &len);
	if (!prop || len < sizeof(unsigned int))
		return 0;

	entries = *prop++;

	/* Now that we know the number of entries, revalidate the size
	 * of the property read in to ensure we have everything
	 */
	if (len < (entries * (n_mem_addr_cells + 4) + 1) * sizeof(unsigned int))
		return 0;

	*dm = prop;
	return entries;
}

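/*
 * Illustrative layout (values are hypothetical): with n_mem_addr_cells = 2,
 * each list entry occupies six cells -- two cells of base address (read
 * most-significant word first by read_n_cells()), then drc_index, reserved,
 * aa_index and flags.  A property describing two assigned LMBs at 0 and
 * 0x10000000 might look like:
 *
 *	ibm,dynamic-memory = <2
 *		0x0 0x00000000  0x80000001 0x0 0x0 0x00000008
 *		0x0 0x10000000  0x80000002 0x0 0x0 0x00000008>;
 */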
/*
 * Retrieve and validate the ibm,lmb-size property for drconf memory
 * from the device tree.
 */
static u64 of_get_lmb_size(struct device_node *memory)
{
	const u32 *prop;
	u32 len;

	prop = of_get_property(memory, "ibm,lmb-size", &len);
	if (!prop || len < sizeof(unsigned int))
		return 0;

	return read_n_cells(n_mem_size_cells, &prop);
}

struct assoc_arrays {
	u32	n_arrays;
	u32	array_sz;
	const u32 *arrays;
};

/*
 * Retrieve and validate the list of associativity arrays for drconf
 * memory from the ibm,associativity-lookup-arrays property of the
 * device tree.
 *
 * The layout of the ibm,associativity-lookup-arrays property is a number N
 * indicating the number of associativity arrays, followed by a number M
 * indicating the size of each associativity array, followed by a list
 * of N associativity arrays.
 */
static int of_get_assoc_arrays(struct device_node *memory,
			       struct assoc_arrays *aa)
{
	const u32 *prop;
	u32 len;

	prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len);
	if (!prop || len < 2 * sizeof(unsigned int))
		return -1;

	aa->n_arrays = *prop++;
	aa->array_sz = *prop++;

	/* Now that we know the number of arrays and size of each array,
	 * revalidate the size of the property read in.
	 */
	if (len < (aa->n_arrays * aa->array_sz + 2) * sizeof(unsigned int))
		return -1;

	aa->arrays = prop;
	return 0;
}

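/*
 * Worked example with made-up values: a property of
 *
 *	ibm,associativity-lookup-arrays = <2 4  0 0 0 0  0 0 0 1>;
 *
 * describes two arrays (N = 2) of four entries each (M = 4).  An LMB with
 * aa_index = 1 and min_common_depth = 4 would use index 1 * 4 + 4 - 1 = 7
 * in of_drconf_to_nid_single() below, i.e. the last cell, giving node 1.
 */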
/*
 * This is like of_node_to_nid_single() for memory represented in the
 * ibm,dynamic-reconfiguration-memory node.
 */
static int of_drconf_to_nid_single(struct of_drconf_cell *drmem,
				   struct assoc_arrays *aa)
{
	int default_nid = 0;
	int nid = default_nid;
	int index;

	if (min_common_depth > 0 && min_common_depth <= aa->array_sz &&
	    !(drmem->flags & DRCONF_MEM_AI_INVALID) &&
	    drmem->aa_index < aa->n_arrays) {
		index = drmem->aa_index * aa->array_sz + min_common_depth - 1;
		nid = aa->arrays[index];

		if (nid == 0xffff || nid >= MAX_NUMNODES)
			nid = default_nid;
	}

	return nid;
}

/*
 * Figure out to which domain a cpu belongs and stick it there.
 * Return the id of the domain used.
 */
static int __cpuinit numa_setup_cpu(unsigned long lcpu)
{
	int nid = 0;
	struct device_node *cpu = find_cpu_node(lcpu);

	if (!cpu) {
		WARN_ON(1);
		goto out;
	}

	nid = of_node_to_nid_single(cpu);

	if (nid < 0 || !node_online(nid))
		nid = any_online_node(NODE_MASK_ALL);
out:
	map_cpu_to_node(lcpu, nid);

	of_node_put(cpu);

	return nid;
}

static int __cpuinit cpu_numa_callback(struct notifier_block *nfb,
				       unsigned long action,
				       void *hcpu)
{
	unsigned long lcpu = (unsigned long)hcpu;
	int ret = NOTIFY_DONE;

	switch (action) {
	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		numa_setup_cpu(lcpu);
		ret = NOTIFY_OK;
		break;
#ifdef CONFIG_HOTPLUG_CPU
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
		unmap_cpu_from_node(lcpu);
		ret = NOTIFY_OK;
		break;
#endif
	}
	return ret;
}

/*
 * Check and possibly modify a memory region to enforce the memory limit.
 *
 * Returns the size the region should have to enforce the memory limit.
 * This will either be the original value of size, a truncated value,
 * or zero.  If the returned value of size is 0 the region should be
 * discarded as it lies wholly above the memory limit.
 */
static unsigned long __init numa_enforce_memory_limit(unsigned long start,
						      unsigned long size)
{
	/*
	 * We use lmb_end_of_DRAM() in here instead of memory_limit because
	 * we've already adjusted it for the limit and it takes care of
	 * having memory holes below the limit.
	 */

	if (! memory_limit)
		return size;

	if (start + size <= lmb_end_of_DRAM())
		return size;

	if (start >= lmb_end_of_DRAM())
		return 0;

	return lmb_end_of_DRAM() - start;
}

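/*
 * For example (hypothetical numbers): if a memory limit leaves
 * lmb_end_of_DRAM() at 0xc0000000, numa_enforce_memory_limit() trims a
 * region starting at 0x80000000 with size 0x80000000 down to 0x40000000,
 * and drops a region starting at or above 0xc0000000 entirely.
 */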
/*
 * Extract NUMA information from the ibm,dynamic-reconfiguration-memory
 * node.  This assumes n_mem_{addr,size}_cells have been set.
 */
static void __init parse_drconf_memory(struct device_node *memory)
{
	const u32 *dm;
	unsigned int n, rc;
	unsigned long lmb_size, size;
	int nid;
	struct assoc_arrays aa;

	n = of_get_drconf_memory(memory, &dm);
	if (!n)
		return;

	lmb_size = of_get_lmb_size(memory);
	if (!lmb_size)
		return;

	rc = of_get_assoc_arrays(memory, &aa);
	if (rc)
		return;

	for (; n != 0; --n) {
		struct of_drconf_cell drmem;

		read_drconf_cell(&drmem, &dm);

		/* skip this block if the reserved bit is set in flags (0x80)
		   or if the block is not assigned to this partition (0x8) */
		if ((drmem.flags & DRCONF_MEM_RESERVED)
		    || !(drmem.flags & DRCONF_MEM_ASSIGNED))
			continue;

		nid = of_drconf_to_nid_single(&drmem, &aa);

		fake_numa_create_new_node(
				((drmem.base_addr + lmb_size) >> PAGE_SHIFT),
				&nid);

		node_set_online(nid);

		size = numa_enforce_memory_limit(drmem.base_addr, lmb_size);
		if (!size)
			continue;

		add_active_range(nid, drmem.base_addr >> PAGE_SHIFT,
				 (drmem.base_addr >> PAGE_SHIFT)
				 + (size >> PAGE_SHIFT));
	}
}

static int __init parse_numa_properties(void)
{
	struct device_node *cpu = NULL;
	struct device_node *memory = NULL;
	int default_nid = 0;
	unsigned long i;

	if (numa_enabled == 0) {
		printk(KERN_WARNING "NUMA disabled by user\n");
		return -1;
	}

	min_common_depth = find_min_common_depth();

	if (min_common_depth < 0)
		return min_common_depth;

	dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);

	/*
	 * Even though we connect cpus to numa domains later in SMP
	 * init, we need to know the node ids now.  This is because
	 * each node to be onlined must have NODE_DATA etc backing it.
	 */
	for_each_present_cpu(i) {
		int nid;

		cpu = find_cpu_node(i);
		BUG_ON(!cpu);
		nid = of_node_to_nid_single(cpu);
		of_node_put(cpu);

		/*
		 * Don't fall back to default_nid yet -- we will plug
		 * cpus into nodes once the memory scan has discovered
		 * the topology.
		 */
		if (nid < 0)
			continue;
		node_set_online(nid);
	}

	get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells);
	memory = NULL;
	while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
		unsigned long start;
		unsigned long size;
		int nid;
		int ranges;
		const unsigned int *memcell_buf;
		unsigned int len;

		memcell_buf = of_get_property(memory,
			"linux,usable-memory", &len);
		if (!memcell_buf || len <= 0)
			memcell_buf = of_get_property(memory, "reg", &len);
		if (!memcell_buf || len <= 0)
			continue;

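		/*
		 * Each range in the property is an (address, size) pair
		 * of n_mem_addr_cells + n_mem_size_cells 32-bit cells; a
		 * 32-byte "reg" with 2 + 2 cells per range, for instance,
		 * describes two ranges (hypothetical sizes, for
		 * illustration only).
		 */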
		/* ranges in cell */
		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
new_range:
		/* these are order-sensitive, and modify the buffer pointer */
		start = read_n_cells(n_mem_addr_cells, &memcell_buf);
		size = read_n_cells(n_mem_size_cells, &memcell_buf);

		/*
		 * Assumption: either all memory nodes or none will
		 * have associativity properties.  If none, then
		 * everything goes to default_nid.
		 */
		nid = of_node_to_nid_single(memory);
		if (nid < 0)
			nid = default_nid;

		fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
		node_set_online(nid);

		if (!(size = numa_enforce_memory_limit(start, size))) {
			if (--ranges)
				goto new_range;
			else
				continue;
		}

		add_active_range(nid, start >> PAGE_SHIFT,
				 (start >> PAGE_SHIFT) + (size >> PAGE_SHIFT));

		if (--ranges)
			goto new_range;
	}

	/*
	 * Now do the same thing for each LMB listed in the ibm,dynamic-memory
	 * property in the ibm,dynamic-reconfiguration-memory node.
	 */
	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (memory)
		parse_drconf_memory(memory);

	return 0;
}

static void __init setup_nonnuma(void)
{
	unsigned long top_of_ram = lmb_end_of_DRAM();
	unsigned long total_ram = lmb_phys_mem_size();
	unsigned long start_pfn, end_pfn;
	unsigned int i, nid = 0;

	printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
	       top_of_ram, total_ram);
	printk(KERN_DEBUG "Memory hole size: %ldMB\n",
	       (top_of_ram - total_ram) >> 20);

	for (i = 0; i < lmb.memory.cnt; ++i) {
		start_pfn = lmb.memory.region[i].base >> PAGE_SHIFT;
		end_pfn = start_pfn + lmb_size_pages(&lmb.memory, i);

		fake_numa_create_new_node(end_pfn, &nid);
		add_active_range(nid, start_pfn, end_pfn);
		node_set_online(nid);
	}
}

void __init dump_numa_cpu_topology(void)
{
	unsigned int node;
	unsigned int cpu, count;

	if (min_common_depth == -1 || !numa_enabled)
		return;

	for_each_online_node(node) {
		printk(KERN_DEBUG "Node %d CPUs:", node);

		count = 0;
		/*
		 * If we used a CPU iterator here we would miss printing
		 * the holes in the cpumap.
		 */
		for (cpu = 0; cpu < NR_CPUS; cpu++) {
			if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
				if (count == 0)
					printk(" %u", cpu);
				++count;
			} else {
				if (count > 1)
					printk("-%u", cpu - 1);
				count = 0;
			}
		}

		if (count > 1)
			printk("-%u", NR_CPUS - 1);
		printk("\n");
	}
}

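/*
 * dump_numa_cpu_topology() above prints contiguous CPU ranges, e.g.
 * (hypothetical output):
 *
 *	Node 0 CPUs: 0-7
 *	Node 1 CPUs: 8-15
 */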
static void __init dump_numa_memory_topology(void)
{
	unsigned int node;
	unsigned int count;

	if (min_common_depth == -1 || !numa_enabled)
		return;

	for_each_online_node(node) {
		unsigned long i;

		printk(KERN_DEBUG "Node %d Memory:", node);

		count = 0;

		for (i = 0; i < lmb_end_of_DRAM();
		     i += (1 << SECTION_SIZE_BITS)) {
			if (early_pfn_to_nid(i >> PAGE_SHIFT) == node) {
				if (count == 0)
					printk(" 0x%lx", i);
				++count;
			} else {
				if (count > 0)
					printk("-0x%lx", i);
				count = 0;
			}
		}

		if (count > 0)
			printk("-0x%lx", i);
		printk("\n");
	}
}

/*
 * Allocate some memory, satisfying the lmb or bootmem allocator where
 * required.  nid is the preferred node and end is the physical address of
 * the highest address in the node.
 *
 * Returns the physical address of the memory.
 */
static void __init *careful_allocation(int nid, unsigned long size,
				       unsigned long align,
				       unsigned long end_pfn)
{
	int new_nid;
	unsigned long ret = __lmb_alloc_base(size, align, end_pfn << PAGE_SHIFT);

	/* retry over all memory */
	if (!ret)
		ret = __lmb_alloc_base(size, align, lmb_end_of_DRAM());

	if (!ret)
		panic("numa.c: cannot allocate %lu bytes on node %d",
		      size, nid);

	/*
	 * If the memory came from a previously allocated node, we must
	 * retry with the bootmem allocator.
	 */
	new_nid = early_pfn_to_nid(ret >> PAGE_SHIFT);
	if (new_nid < nid) {
		ret = (unsigned long)__alloc_bootmem_node(NODE_DATA(new_nid),
				size, align, 0);

		if (!ret)
			panic("numa.c: cannot allocate %lu bytes on node %d",
			      size, new_nid);

		ret = __pa(ret);

		dbg("alloc_bootmem %lx %lx\n", ret, size);
	}

	return (void *)ret;
}

static struct notifier_block __cpuinitdata ppc64_numa_nb = {
	.notifier_call = cpu_numa_callback,
	.priority = 1 /* Must run before sched domains notifier. */
};

void __init do_init_bootmem(void)
{
	int nid;
	unsigned int i;

	min_low_pfn = 0;
	max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
	max_pfn = max_low_pfn;

	if (parse_numa_properties())
		setup_nonnuma();
	else
		dump_numa_memory_topology();

	register_cpu_notifier(&ppc64_numa_nb);
	cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE,
			  (void *)(unsigned long)boot_cpuid);

	for_each_online_node(nid) {
		unsigned long start_pfn, end_pfn;
		unsigned long bootmem_paddr;
		unsigned long bootmap_pages;

		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);

		/* Allocate the node structure node local if possible */
		NODE_DATA(nid) = careful_allocation(nid,
					sizeof(struct pglist_data),
					SMP_CACHE_BYTES, end_pfn);
		NODE_DATA(nid) = __va(NODE_DATA(nid));
		memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));

		dbg("node %d\n", nid);
		dbg("NODE_DATA() = %p\n", NODE_DATA(nid));

		NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
		NODE_DATA(nid)->node_start_pfn = start_pfn;
		NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;

		if (NODE_DATA(nid)->node_spanned_pages == 0)
			continue;

		dbg("start_paddr = %lx\n", start_pfn << PAGE_SHIFT);
		dbg("end_paddr = %lx\n", end_pfn << PAGE_SHIFT);

		bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
		bootmem_paddr = (unsigned long)careful_allocation(nid,
					bootmap_pages << PAGE_SHIFT,
					PAGE_SIZE, end_pfn);
		memset(__va(bootmem_paddr), 0, bootmap_pages << PAGE_SHIFT);

		dbg("bootmap_paddr = %lx\n", bootmem_paddr);

		init_bootmem_node(NODE_DATA(nid), bootmem_paddr >> PAGE_SHIFT,
				  start_pfn, end_pfn);

		free_bootmem_with_active_regions(nid, end_pfn);

		/* Mark reserved regions on this node */
		for (i = 0; i < lmb.reserved.cnt; i++) {
			unsigned long physbase = lmb.reserved.region[i].base;
			unsigned long size = lmb.reserved.region[i].size;
			unsigned long start_paddr = start_pfn << PAGE_SHIFT;
			unsigned long end_paddr = end_pfn << PAGE_SHIFT;

			if (early_pfn_to_nid(physbase >> PAGE_SHIFT) != nid &&
			    early_pfn_to_nid((physbase+size-1) >> PAGE_SHIFT) != nid)
				continue;

			if (physbase < end_paddr &&
			    (physbase+size) > start_paddr) {
				/* overlaps */
				if (physbase < start_paddr) {
					size -= start_paddr - physbase;
					physbase = start_paddr;
				}

				if (size > end_paddr - physbase)
					size = end_paddr - physbase;

				dbg("reserve_bootmem %lx %lx\n", physbase,
				    size);
				reserve_bootmem_node(NODE_DATA(nid), physbase,
						     size, BOOTMEM_DEFAULT);
			}
		}

		sparse_memory_present_with_active_regions(nid);
	}
}

void __init paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];
	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA] = lmb_end_of_DRAM() >> PAGE_SHIFT;
	free_area_init_nodes(max_zone_pfns);
}

static int __init early_numa(char *p)
{
	if (!p)
		return 0;

	if (strstr(p, "off"))
		numa_enabled = 0;

	if (strstr(p, "debug"))
		numa_debug = 1;

	p = strstr(p, "fake=");
	if (p)
		cmdline = p + strlen("fake=");

	return 0;
}
early_param("numa", early_numa);

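/*
 * The "numa=" early parameter accepts, as parsed by early_numa() above,
 * "numa=off" to disable NUMA, "numa=debug" to enable the dbg() messages
 * in this file, and "numa=fake=<size>[,<size>...]" to set the fake node
 * boundaries consumed by fake_numa_create_new_node().
 */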
#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Validate the node associated with the memory section we are
 * trying to add.
 */
int valid_hot_add_scn(int *nid, unsigned long start, u32 lmb_size,
		      unsigned long scn_addr)
{
	nodemask_t nodes;

	if (*nid < 0 || !node_online(*nid))
		*nid = any_online_node(NODE_MASK_ALL);

	if ((scn_addr >= start) && (scn_addr < (start + lmb_size))) {
		nodes_setall(nodes);
		while (NODE_DATA(*nid)->node_spanned_pages == 0) {
			node_clear(*nid, nodes);
			*nid = any_online_node(nodes);
		}

		return 1;
	}

	return 0;
}

/*
 * Find the node associated with a hot added memory section represented
 * by the ibm,dynamic-reconfiguration-memory node.
 */
static int hot_add_drconf_scn_to_nid(struct device_node *memory,
				     unsigned long scn_addr)
{
	const u32 *dm;
	unsigned int n, rc;
	unsigned long lmb_size;
	int default_nid = any_online_node(NODE_MASK_ALL);
	int nid;
	struct assoc_arrays aa;

	n = of_get_drconf_memory(memory, &dm);
	if (!n)
		return default_nid;

	lmb_size = of_get_lmb_size(memory);
	if (!lmb_size)
		return default_nid;

	rc = of_get_assoc_arrays(memory, &aa);
	if (rc)
		return default_nid;

	for (; n != 0; --n) {
		struct of_drconf_cell drmem;

		read_drconf_cell(&drmem, &dm);

		/* skip this block if it is reserved or not assigned to
		 * this partition */
		if ((drmem.flags & DRCONF_MEM_RESERVED)
		    || !(drmem.flags & DRCONF_MEM_ASSIGNED))
			continue;

		nid = of_drconf_to_nid_single(&drmem, &aa);

		if (valid_hot_add_scn(&nid, drmem.base_addr, lmb_size,
				      scn_addr))
			return nid;
	}

	BUG();	/* section address should be found above */
	return 0;
}

/*
 * Find the node associated with a hot added memory section.  Section
 * corresponds to a SPARSEMEM section, not an LMB.  It is assumed that
 * sections are fully contained within a single LMB.
 */
int hot_add_scn_to_nid(unsigned long scn_addr)
{
	struct device_node *memory = NULL;
	int nid;

	if (!numa_enabled || (min_common_depth < 0))
		return any_online_node(NODE_MASK_ALL);

	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (memory) {
		nid = hot_add_drconf_scn_to_nid(memory, scn_addr);
		of_node_put(memory);
		return nid;
	}

	while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
		unsigned long start, size;
		int ranges;
		const unsigned int *memcell_buf;
		unsigned int len;

		memcell_buf = of_get_property(memory, "reg", &len);
		if (!memcell_buf || len <= 0)
			continue;

		/* ranges in cell */
		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
ha_new_range:
		start = read_n_cells(n_mem_addr_cells, &memcell_buf);
		size = read_n_cells(n_mem_size_cells, &memcell_buf);
		nid = of_node_to_nid_single(memory);

		if (valid_hot_add_scn(&nid, start, size, scn_addr)) {
			of_node_put(memory);
			return nid;
		}

		if (--ranges)		/* process all ranges in cell */
			goto ha_new_range;
	}
	BUG();	/* section address should be found above */
	return 0;
}
#endif /* CONFIG_MEMORY_HOTPLUG */