/*
 * pSeries NUMA support
 *
 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <linux/threads.h>
#include <linux/bootmem.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/module.h>
#include <linux/nodemask.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/lmb.h>
#include <linux/of.h>
#include <linux/pfn.h>
#include <asm/sparsemem.h>
#include <asm/prom.h>
#include <asm/system.h>
#include <asm/smp.h>

static int numa_enabled = 1;

static char *cmdline __initdata;

static int numa_debug;
#define dbg(args...) if (numa_debug) { printk(KERN_INFO args); }

int numa_cpu_lookup_table[NR_CPUS];
cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES];
struct pglist_data *node_data[MAX_NUMNODES];

EXPORT_SYMBOL(numa_cpu_lookup_table);
EXPORT_SYMBOL(numa_cpumask_lookup_table);
EXPORT_SYMBOL(node_data);

static int min_common_depth;
static int n_mem_addr_cells, n_mem_size_cells;

static int __cpuinit fake_numa_create_new_node(unsigned long end_pfn,
						unsigned int *nid)
{
	unsigned long long mem;
	char *p = cmdline;
	static unsigned int fake_nid;
	static unsigned long long curr_boundary;

	/*
	 * Modify node id, iff we started creating NUMA nodes
	 * We want to continue from where we left off the last time
	 */
	if (fake_nid)
		*nid = fake_nid;
	/*
	 * In case there are no more arguments to parse, the
	 * node_id should be the same as the last fake node id
	 * (we've handled this above).
	 */
	if (!p)
		return 0;

	mem = memparse(p, &p);
	if (!mem)
		return 0;

	if (mem < curr_boundary)
		return 0;

	curr_boundary = mem;

	if ((end_pfn << PAGE_SHIFT) > mem) {
		/*
		 * Skip commas and spaces
		 */
		while (*p == ',' || *p == ' ' || *p == '\t')
			p++;

		cmdline = p;
		fake_nid++;
		*nid = fake_nid;
		dbg("created new fake_node with id %d\n", fake_nid);
		return 1;
	}
	return 0;
}
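
/*
 * Illustrative example (not from the original source): booting with
 * "numa=fake=1G,2G,4G" leaves cmdline pointing at "1G,2G,4G".  Each call
 * parses the next boundary with memparse(); as soon as the end of the
 * range currently being registered crosses that boundary, a new fake
 * node id is handed out and cmdline is advanced past the consumed value.
 * On a machine with more than 4GB of RAM this yields four fake nodes:
 * roughly [0, 1G) -> node 0, [1G, 2G) -> node 1, [2G, 4G) -> node 2 and
 * everything above 4G -> node 3 (the split granularity is that of the
 * ranges passed to add_active_range()).
 */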

/*
 * get_active_region_work_fn - A helper function for get_node_active_region
 *	Sets datax->start_pfn and ->end_pfn to the bounds of the active
 *	region if that region contains the initial value of datax->start_pfn.
 * @start_pfn: start page (inclusive) of region to check
 * @end_pfn: end page (exclusive) of region to check
 * @datax: comes in with ->start_pfn set to the value to search for and
 *	goes out with the active range if it contains it
 * Returns 1 if the search value is in range, else 0
 */
static int __init get_active_region_work_fn(unsigned long start_pfn,
					unsigned long end_pfn, void *datax)
{
	struct node_active_region *data;
	data = (struct node_active_region *)datax;

	if (start_pfn <= data->start_pfn && end_pfn > data->start_pfn) {
		data->start_pfn = start_pfn;
		data->end_pfn = end_pfn;
		return 1;
	}
	return 0;
}

/*
 * get_node_active_region - Return active region containing start_pfn
 * Active range returned is empty if none found.
 * @start_pfn: The page to return the region for.
 * @node_ar: Returned set to the active region containing start_pfn
 */
static void __init get_node_active_region(unsigned long start_pfn,
					  struct node_active_region *node_ar)
{
	int nid = early_pfn_to_nid(start_pfn);

	node_ar->nid = nid;
	node_ar->start_pfn = start_pfn;
	node_ar->end_pfn = start_pfn;
	work_with_active_regions(nid, get_active_region_work_fn, node_ar);
}

static void __cpuinit map_cpu_to_node(int cpu, int node)
{
	numa_cpu_lookup_table[cpu] = node;

	dbg("adding cpu %d to node %d\n", cpu, node);

	if (!(cpu_isset(cpu, numa_cpumask_lookup_table[node])))
		cpu_set(cpu, numa_cpumask_lookup_table[node]);
}

#ifdef CONFIG_HOTPLUG_CPU
static void unmap_cpu_from_node(unsigned long cpu)
{
	int node = numa_cpu_lookup_table[cpu];

	dbg("removing cpu %lu from node %d\n", cpu, node);

	if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
		cpu_clear(cpu, numa_cpumask_lookup_table[node]);
	} else {
		printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
		       cpu, node);
	}
}
#endif /* CONFIG_HOTPLUG_CPU */

/* must hold reference to node during call */
static const int *of_get_associativity(struct device_node *dev)
{
	return of_get_property(dev, "ibm,associativity", NULL);
}

/*
 * Returns the property linux,drconf-usable-memory if
 * it exists (the property exists only in kexec/kdump kernels,
 * added by kexec-tools)
 */
static const u32 *of_get_usable_memory(struct device_node *memory)
{
	const u32 *prop;
	u32 len;
	prop = of_get_property(memory, "linux,drconf-usable-memory", &len);
	if (!prop || len < sizeof(unsigned int))
		return 0;
	return prop;
}

/* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
 * info is found.
 */
static int of_node_to_nid_single(struct device_node *device)
{
	int nid = -1;
	const unsigned int *tmp;

	if (min_common_depth == -1)
		goto out;

	tmp = of_get_associativity(device);
	if (!tmp)
		goto out;

	if (tmp[0] >= min_common_depth)
		nid = tmp[min_common_depth];

	/* POWER4 LPAR uses 0xffff as invalid node */
	if (nid == 0xffff || nid >= MAX_NUMNODES)
		nid = -1;
out:
	return nid;
}

/* Walk the device tree upwards, looking for an associativity id */
int of_node_to_nid(struct device_node *device)
{
	struct device_node *tmp;
	int nid = -1;

	of_node_get(device);
	while (device) {
		nid = of_node_to_nid_single(device);
		if (nid != -1)
			break;

		tmp = device;
		device = of_get_parent(tmp);
		of_node_put(tmp);
	}
	of_node_put(device);

	return nid;
}
EXPORT_SYMBOL_GPL(of_node_to_nid);
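
/*
 * Illustrative example (hypothetical values, not from any real device
 * tree): a cpu node might carry
 *
 *	ibm,associativity = <4 0 0 0 2>;
 *
 * The first cell is the number of associativity domains that follow.
 * With min_common_depth == 4, of_node_to_nid_single() reads cell 4 and
 * returns node 2; with min_common_depth == -1 (no NUMA information) it
 * always returns -1, as it does for the invalid values 0xffff and
 * anything >= MAX_NUMNODES.
 */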

/*
 * In theory, the "ibm,associativity" property may contain multiple
 * associativity lists because a resource may be multiply connected
 * into the machine.  This resource then has different associativity
 * characteristics relative to its multiple connections.  We ignore
 * this for now.  We also assume that all cpu and memory sets have
 * their distances represented at a common level.  This won't be
 * true for hierarchical NUMA.
 *
 * In any case the ibm,associativity-reference-points should give
 * the correct depth for a normal NUMA system.
 *
 * - Dave Hansen <haveblue@us.ibm.com>
 */
static int __init find_min_common_depth(void)
{
	int depth;
	const unsigned int *ref_points;
	struct device_node *rtas_root;
	unsigned int len;

	rtas_root = of_find_node_by_path("/rtas");

	if (!rtas_root)
		return -1;

	/*
	 * This property is 2 32-bit integers, each representing a level of
	 * depth in the associativity nodes.  The first is for an SMP
	 * configuration (should be all 0's) and the second is for a normal
	 * NUMA configuration.
	 */
	ref_points = of_get_property(rtas_root,
			"ibm,associativity-reference-points", &len);

	if ((len >= 2 * sizeof(unsigned int)) && ref_points) {
		depth = ref_points[1];
	} else {
		dbg("NUMA: ibm,associativity-reference-points not found.\n");
		depth = -1;
	}
	of_node_put(rtas_root);

	return depth;
}

static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
{
	struct device_node *memory = NULL;

	memory = of_find_node_by_type(memory, "memory");
	if (!memory)
		panic("numa.c: No memory nodes found!");

	*n_addr_cells = of_n_addr_cells(memory);
	*n_size_cells = of_n_size_cells(memory);
	of_node_put(memory);
}

static unsigned long __devinit read_n_cells(int n, const unsigned int **buf)
{
	unsigned long result = 0;

	while (n--) {
		result = (result << 32) | **buf;
		(*buf)++;
	}
	return result;
}

struct of_drconf_cell {
	u64	base_addr;
	u32	drc_index;
	u32	reserved;
	u32	aa_index;
	u32	flags;
};

#define DRCONF_MEM_ASSIGNED	0x00000008
#define DRCONF_MEM_AI_INVALID	0x00000040
#define DRCONF_MEM_RESERVED	0x00000080

/*
 * Read the next lmb list entry from the ibm,dynamic-memory property
 * and return the information in the provided of_drconf_cell structure.
 */
static void read_drconf_cell(struct of_drconf_cell *drmem, const u32 **cellp)
{
	const u32 *cp;

	drmem->base_addr = read_n_cells(n_mem_addr_cells, cellp);

	cp = *cellp;
	drmem->drc_index = cp[0];
	drmem->reserved = cp[1];
	drmem->aa_index = cp[2];
	drmem->flags = cp[3];

	*cellp = cp + 4;
}

/*
 * Retrieve and validate the ibm,dynamic-memory property of the device tree.
 *
 * The layout of the ibm,dynamic-memory property is a number N, followed
 * by N lmb list entries.  Each lmb list entry contains information as
 * laid out in the of_drconf_cell struct above.
 */
static int of_get_drconf_memory(struct device_node *memory, const u32 **dm)
{
	const u32 *prop;
	u32 len, entries;

	prop = of_get_property(memory, "ibm,dynamic-memory", &len);
	if (!prop || len < sizeof(unsigned int))
		return 0;

	entries = *prop++;

	/* Now that we know the number of entries, revalidate the size
	 * of the property read in to ensure we have everything
	 */
	if (len < (entries * (n_mem_addr_cells + 4) + 1) * sizeof(unsigned int))
		return 0;

	*dm = prop;
	return entries;
}
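
/*
 * Illustrative layout (hypothetical values): with n_mem_addr_cells == 2,
 * an ibm,dynamic-memory property describing two LMBs might decode as
 *
 *	<2			number of entries
 *	 0x0 0x00000000		entry 0: base_addr
 *	 0x80000001 0x0 0x1 0x8		 drc_index, reserved, aa_index, flags
 *	 0x0 0x10000000		entry 1: base_addr
 *	 0x80000002 0x0 0x1 0x8		 drc_index, reserved, aa_index, flags>
 *
 * read_drconf_cell() consumes one such entry per call and advances
 * *cellp by n_mem_addr_cells + 4 cells.
 */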

/*
 * Retrieve and validate the ibm,lmb-size property for drconf memory
 * from the device tree.
 */
static u64 of_get_lmb_size(struct device_node *memory)
{
	const u32 *prop;
	u32 len;

	prop = of_get_property(memory, "ibm,lmb-size", &len);
	if (!prop || len < sizeof(unsigned int))
		return 0;

	return read_n_cells(n_mem_size_cells, &prop);
}

struct assoc_arrays {
	u32	n_arrays;
	u32	array_sz;
	const u32 *arrays;
};

/*
 * Retrieve and validate the list of associativity arrays for drconf
 * memory from the ibm,associativity-lookup-arrays property of the
 * device tree.
 *
 * The layout of the ibm,associativity-lookup-arrays property is a number N
 * indicating the number of associativity arrays, followed by a number M
 * indicating the size of each associativity array, followed by a list
 * of N associativity arrays.
 */
static int of_get_assoc_arrays(struct device_node *memory,
			       struct assoc_arrays *aa)
{
	const u32 *prop;
	u32 len;

	prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len);
	if (!prop || len < 2 * sizeof(unsigned int))
		return -1;

	aa->n_arrays = *prop++;
	aa->array_sz = *prop++;

	/* Now that we know the number of arrays and size of each array,
	 * revalidate the size of the property read in.
	 */
	if (len < (aa->n_arrays * aa->array_sz + 2) * sizeof(unsigned int))
		return -1;

	aa->arrays = prop;
	return 0;
}

/*
 * This is like of_node_to_nid_single() for memory represented in the
 * ibm,dynamic-reconfiguration-memory node.
 */
static int of_drconf_to_nid_single(struct of_drconf_cell *drmem,
				   struct assoc_arrays *aa)
{
	int default_nid = 0;
	int nid = default_nid;
	int index;

	if (min_common_depth > 0 && min_common_depth <= aa->array_sz &&
	    !(drmem->flags & DRCONF_MEM_AI_INVALID) &&
	    drmem->aa_index < aa->n_arrays) {
		index = drmem->aa_index * aa->array_sz + min_common_depth - 1;
		nid = aa->arrays[index];

		if (nid == 0xffff || nid >= MAX_NUMNODES)
			nid = default_nid;
	}

	return nid;
}
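
/*
 * Worked example (hypothetical values): with min_common_depth == 4 and an
 * ibm,associativity-lookup-arrays property of
 *
 *	<2 4			N = 2 arrays, M = 4 cells each
 *	 0 0 0 0		array 0
 *	 0 0 0 1>		array 1
 *
 * an LMB whose aa_index is 1 resolves to
 * index = 1 * 4 + 4 - 1 = 7, i.e. aa->arrays[7] == 1, so the LMB is
 * placed on node 1.  LMBs with DRCONF_MEM_AI_INVALID set, or with an
 * out-of-range aa_index, fall back to node 0.
 */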

/*
 * Figure out to which domain a cpu belongs and stick it there.
 * Return the id of the domain used.
 */
static int __cpuinit numa_setup_cpu(unsigned long lcpu)
{
	int nid = 0;
	struct device_node *cpu = of_get_cpu_node(lcpu, NULL);

	if (!cpu) {
		WARN_ON(1);
		goto out;
	}

	nid = of_node_to_nid_single(cpu);

	if (nid < 0 || !node_online(nid))
		nid = any_online_node(NODE_MASK_ALL);
out:
	map_cpu_to_node(lcpu, nid);

	of_node_put(cpu);

	return nid;
}

static int __cpuinit cpu_numa_callback(struct notifier_block *nfb,
				       unsigned long action,
				       void *hcpu)
{
	unsigned long lcpu = (unsigned long)hcpu;
	int ret = NOTIFY_DONE;

	switch (action) {
	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		numa_setup_cpu(lcpu);
		ret = NOTIFY_OK;
		break;
#ifdef CONFIG_HOTPLUG_CPU
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
		unmap_cpu_from_node(lcpu);
		ret = NOTIFY_OK;
		break;
#endif
	}
	return ret;
}

/*
 * Check and possibly modify a memory region to enforce the memory limit.
 *
 * Returns the size the region should have to enforce the memory limit.
 * This will either be the original value of size, a truncated value,
 * or zero.  If the returned value of size is 0 the region should be
 * discarded as it lies wholly above the memory limit.
 */
static unsigned long __init numa_enforce_memory_limit(unsigned long start,
						      unsigned long size)
{
	/*
	 * We use lmb_end_of_DRAM() in here instead of memory_limit because
	 * we've already adjusted it for the limit and it takes care of
	 * having memory holes below the limit.  Also, in the case of
	 * iommu_is_off, memory_limit is not set but is implicitly enforced.
	 */

	if (start + size <= lmb_end_of_DRAM())
		return size;

	if (start >= lmb_end_of_DRAM())
		return 0;

	return lmb_end_of_DRAM() - start;
}

/*
 * Reads the counter for a given entry in
 * linux,drconf-usable-memory property
 */
static inline int __init read_usm_ranges(const u32 **usm)
{
	/*
	 * For each lmb in ibm,dynamic-memory a corresponding
	 * entry in linux,drconf-usable-memory property contains
	 * a counter followed by that many (base, size) pairs.
	 * Read the counter from linux,drconf-usable-memory.
	 */
	return read_n_cells(n_mem_size_cells, usm);
}

/*
 * Extract NUMA information from the ibm,dynamic-reconfiguration-memory
 * node.  This assumes n_mem_{addr,size}_cells have been set.
 */
static void __init parse_drconf_memory(struct device_node *memory)
{
	const u32 *dm, *usm;
	unsigned int n, rc, ranges, is_kexec_kdump = 0;
	unsigned long lmb_size, base, size, sz;
	int nid;
	struct assoc_arrays aa;

	n = of_get_drconf_memory(memory, &dm);
	if (!n)
		return;

	lmb_size = of_get_lmb_size(memory);
	if (!lmb_size)
		return;

	rc = of_get_assoc_arrays(memory, &aa);
	if (rc)
		return;

	/* check if this is a kexec/kdump kernel */
	usm = of_get_usable_memory(memory);
	if (usm != NULL)
		is_kexec_kdump = 1;

	for (; n != 0; --n) {
		struct of_drconf_cell drmem;

		read_drconf_cell(&drmem, &dm);

		/* skip this block if the reserved bit is set in flags (0x80)
		   or if the block is not assigned to this partition (0x8) */
		if ((drmem.flags & DRCONF_MEM_RESERVED)
		    || !(drmem.flags & DRCONF_MEM_ASSIGNED))
			continue;

		base = drmem.base_addr;
		size = lmb_size;
		ranges = 1;

		if (is_kexec_kdump) {
			ranges = read_usm_ranges(&usm);
			if (!ranges) /* there are no (base, size) pairs */
				continue;
		}
		do {
			if (is_kexec_kdump) {
				base = read_n_cells(n_mem_addr_cells, &usm);
				size = read_n_cells(n_mem_size_cells, &usm);
			}
			nid = of_drconf_to_nid_single(&drmem, &aa);
			fake_numa_create_new_node(
					((base + size) >> PAGE_SHIFT), &nid);
			node_set_online(nid);
			sz = numa_enforce_memory_limit(base, size);
			if (sz)
				add_active_range(nid, base >> PAGE_SHIFT,
						 (base >> PAGE_SHIFT)
						 + (sz >> PAGE_SHIFT));
		} while (--ranges);
	}
}
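
/*
 * Illustrative note (hypothetical scenario): in a kdump kernel,
 * kexec-tools adds a linux,drconf-usable-memory property with one entry
 * per LMB in ibm,dynamic-memory.  For an LMB of which only the first
 * 64MB is usable, the entry is a counter of 1 followed by one
 * (base, size) pair covering those 64MB, and the loop above registers
 * just that sub-range instead of the whole LMB.  A counter of 0 means
 * the LMB is skipped entirely.
 */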

static int __init parse_numa_properties(void)
{
	struct device_node *cpu = NULL;
	struct device_node *memory = NULL;
	int default_nid = 0;
	unsigned long i;

	if (numa_enabled == 0) {
		printk(KERN_WARNING "NUMA disabled by user\n");
		return -1;
	}

	min_common_depth = find_min_common_depth();

	if (min_common_depth < 0)
		return min_common_depth;

	dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);

	/*
	 * Even though we connect cpus to numa domains later in SMP
	 * init, we need to know the node ids now.  This is because
	 * each node to be onlined must have NODE_DATA etc backing it.
	 */
	for_each_present_cpu(i) {
		int nid;

		cpu = of_get_cpu_node(i, NULL);
		BUG_ON(!cpu);
		nid = of_node_to_nid_single(cpu);
		of_node_put(cpu);

		/*
		 * Don't fall back to default_nid yet -- we will plug
		 * cpus into nodes once the memory scan has discovered
		 * the topology.
		 */
		if (nid < 0)
			continue;
		node_set_online(nid);
	}

	get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells);
	memory = NULL;
	while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
		unsigned long start;
		unsigned long size;
		int nid;
		int ranges;
		const unsigned int *memcell_buf;
		unsigned int len;

		memcell_buf = of_get_property(memory,
			"linux,usable-memory", &len);
		if (!memcell_buf || len <= 0)
			memcell_buf = of_get_property(memory, "reg", &len);
		if (!memcell_buf || len <= 0)
			continue;

		/* ranges in cell */
		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
new_range:
		/* these are order-sensitive, and modify the buffer pointer */
		start = read_n_cells(n_mem_addr_cells, &memcell_buf);
		size = read_n_cells(n_mem_size_cells, &memcell_buf);

		/*
		 * Assumption: either all memory nodes or none will
		 * have associativity properties.  If none, then
		 * everything goes to default_nid.
		 */
		nid = of_node_to_nid_single(memory);
		if (nid < 0)
			nid = default_nid;

		fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
		node_set_online(nid);

		if (!(size = numa_enforce_memory_limit(start, size))) {
			if (--ranges)
				goto new_range;
			else
				continue;
		}

		add_active_range(nid, start >> PAGE_SHIFT,
				 (start >> PAGE_SHIFT) + (size >> PAGE_SHIFT));

		if (--ranges)
			goto new_range;
	}

	/*
	 * Now do the same thing for each LMB listed in the ibm,dynamic-memory
	 * property in the ibm,dynamic-reconfiguration-memory node.
	 */
	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (memory)
		parse_drconf_memory(memory);

	return 0;
}
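
/*
 * Illustrative example (hypothetical values): with #address-cells == 2
 * and #size-cells == 2 on the memory nodes, a property of
 *
 *	reg = <0x0 0x00000000 0x0 0x80000000
 *	       0x1 0x00000000 0x0 0x80000000>;
 *
 * is 8 cells (32 bytes) long, so ranges = (32 >> 2) / (2 + 2) = 2, and
 * the memory loop above registers (start 0, size 2GB) and
 * (start 4GB, size 2GB) as two separate active ranges.
 */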

static void __init setup_nonnuma(void)
{
	unsigned long top_of_ram = lmb_end_of_DRAM();
	unsigned long total_ram = lmb_phys_mem_size();
	unsigned long start_pfn, end_pfn;
	unsigned int i, nid = 0;

	printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
	       top_of_ram, total_ram);
	printk(KERN_DEBUG "Memory hole size: %ldMB\n",
	       (top_of_ram - total_ram) >> 20);

	for (i = 0; i < lmb.memory.cnt; ++i) {
		start_pfn = lmb.memory.region[i].base >> PAGE_SHIFT;
		end_pfn = start_pfn + lmb_size_pages(&lmb.memory, i);

		fake_numa_create_new_node(end_pfn, &nid);
		add_active_range(nid, start_pfn, end_pfn);
		node_set_online(nid);
	}
}

void __init dump_numa_cpu_topology(void)
{
	unsigned int node;
	unsigned int cpu, count;

	if (min_common_depth == -1 || !numa_enabled)
		return;

	for_each_online_node(node) {
		printk(KERN_DEBUG "Node %d CPUs:", node);

		count = 0;
		/*
		 * If we used a CPU iterator here we would miss printing
		 * the holes in the cpumap.
		 */
		for (cpu = 0; cpu < NR_CPUS; cpu++) {
			if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
				if (count == 0)
					printk(" %u", cpu);
				++count;
			} else {
				if (count > 1)
					printk("-%u", cpu - 1);
				count = 0;
			}
		}

		if (count > 1)
			printk("-%u", NR_CPUS - 1);
		printk("\n");
	}
}

static void __init dump_numa_memory_topology(void)
{
	unsigned int node;
	unsigned int count;

	if (min_common_depth == -1 || !numa_enabled)
		return;

	for_each_online_node(node) {
		unsigned long i;

		printk(KERN_DEBUG "Node %d Memory:", node);

		count = 0;

		for (i = 0; i < lmb_end_of_DRAM();
		     i += (1 << SECTION_SIZE_BITS)) {
			if (early_pfn_to_nid(i >> PAGE_SHIFT) == node) {
				if (count == 0)
					printk(" 0x%lx", i);
				++count;
			} else {
				if (count > 0)
					printk("-0x%lx", i);
				count = 0;
			}
		}

		if (count > 0)
			printk("-0x%lx", i);
		printk("\n");
	}
}

/*
 * Allocate some memory, using either the lmb or the bootmem allocator as
 * required.  nid is the preferred node and end is the physical address of
 * the highest address in the node.
 *
 * Returns the virtual address of the memory.
 */
static void __init *careful_zallocation(int nid, unsigned long size,
					unsigned long align,
					unsigned long end_pfn)
{
	void *ret;
	int new_nid;
	unsigned long ret_paddr;

	ret_paddr = __lmb_alloc_base(size, align, end_pfn << PAGE_SHIFT);

	/* retry over all memory */
	if (!ret_paddr)
		ret_paddr = __lmb_alloc_base(size, align, lmb_end_of_DRAM());

	if (!ret_paddr)
		panic("numa.c: cannot allocate %lu bytes for node %d",
		      size, nid);

	ret = __va(ret_paddr);

	/*
	 * We initialize the nodes in numeric order: 0, 1, 2...
	 * and hand over control from the LMB allocator to the
	 * bootmem allocator.  If this function is called for
	 * node 5, then we know that all nodes <5 are using the
	 * bootmem allocator instead of the LMB allocator.
	 *
	 * So, check the nid from which this allocation came
	 * and double check to see if we need to use bootmem
	 * instead of the LMB.  We don't free the LMB memory
	 * since it would be useless.
	 */
	new_nid = early_pfn_to_nid(ret_paddr >> PAGE_SHIFT);
	if (new_nid < nid) {
		ret = __alloc_bootmem_node(NODE_DATA(new_nid),
					   size, align, 0);

		dbg("alloc_bootmem %p %lx\n", ret, size);
	}

	memset(ret, 0, size);
	return ret;
}
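
/*
 * Illustrative scenario (not from the original source): suppose
 * do_init_bootmem() is setting up node 2 and __lmb_alloc_base() can only
 * satisfy the request from memory that early_pfn_to_nid() maps to node 0.
 * Node 0 has already been handed over to bootmem (nodes are initialized
 * in ascending order), so the LMB result is discarded in favour of
 * __alloc_bootmem_node() on node 0; the LMB reservation is simply left
 * in place.
 */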

static struct notifier_block __cpuinitdata ppc64_numa_nb = {
	.notifier_call = cpu_numa_callback,
	.priority = 1 /* Must run before sched domains notifier. */
};

static void mark_reserved_regions_for_nid(int nid)
{
	struct pglist_data *node = NODE_DATA(nid);
	int i;

	for (i = 0; i < lmb.reserved.cnt; i++) {
		unsigned long physbase = lmb.reserved.region[i].base;
		unsigned long size = lmb.reserved.region[i].size;
		unsigned long start_pfn = physbase >> PAGE_SHIFT;
		unsigned long end_pfn = PFN_UP(physbase + size);
		struct node_active_region node_ar;
		unsigned long node_end_pfn = node->node_start_pfn +
					     node->node_spanned_pages;

		/*
		 * Check to make sure that this lmb.reserved area is
		 * within the bounds of the node that we care about.
		 * Checking the nid of the start and end points is not
		 * sufficient because the reserved area could span the
		 * entire node.
		 */
		if (end_pfn <= node->node_start_pfn ||
		    start_pfn >= node_end_pfn)
			continue;

		get_node_active_region(start_pfn, &node_ar);
		while (start_pfn < end_pfn &&
		       node_ar.start_pfn < node_ar.end_pfn) {
			unsigned long reserve_size = size;
			/*
			 * if reserved region extends past active region
			 * then trim size to active region
			 */
			if (end_pfn > node_ar.end_pfn)
				reserve_size = (node_ar.end_pfn << PAGE_SHIFT)
					- physbase;
			/*
			 * Only worry about *this* node, others may not
			 * yet have valid NODE_DATA().
			 */
			if (node_ar.nid == nid) {
				dbg("reserve_bootmem %lx %lx nid=%d\n",
					physbase, reserve_size, node_ar.nid);
				reserve_bootmem_node(NODE_DATA(node_ar.nid),
						physbase, reserve_size,
						BOOTMEM_DEFAULT);
			}
			/*
			 * if reserved region is contained in the active region
			 * then done.
			 */
			if (end_pfn <= node_ar.end_pfn)
				break;

			/*
			 * reserved region extends past the active region
			 *   get next active region that contains this
			 *   reserved region
			 */
			start_pfn = node_ar.end_pfn;
			physbase = start_pfn << PAGE_SHIFT;
			size = size - reserve_size;
			get_node_active_region(start_pfn, &node_ar);
		}
	}
}
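
/*
 * Worked example (hypothetical layout): a reserved lmb region spanning
 * [96MB, 160MB) where node 0's memory ends at 128MB and node 1's begins
 * there.  When called for nid 1, the first pass finds node 0's active
 * region, computes a 32MB reserve_size but skips the actual reservation
 * because node_ar.nid != nid; start_pfn then advances to 128MB and the
 * remaining 32MB is reserved against node 1's bootmem on the second
 * pass.  The [96MB, 128MB) piece is handled when this function runs for
 * nid 0.
 */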

void __init do_init_bootmem(void)
{
	int nid;

	min_low_pfn = 0;
	max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
	max_pfn = max_low_pfn;

	if (parse_numa_properties())
		setup_nonnuma();
	else
		dump_numa_memory_topology();

	register_cpu_notifier(&ppc64_numa_nb);
	cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE,
			  (void *)(unsigned long)boot_cpuid);

	for_each_online_node(nid) {
		unsigned long start_pfn, end_pfn;
		void *bootmem_vaddr;
		unsigned long bootmap_pages;

		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);

		/*
		 * Allocate the node structure node local if possible
		 *
		 * Be careful moving this around, as it relies on all
		 * previous nodes' bootmem to be initialized and have
		 * all reserved areas marked.
		 */
		NODE_DATA(nid) = careful_zallocation(nid,
					sizeof(struct pglist_data),
					SMP_CACHE_BYTES, end_pfn);

		dbg("node %d\n", nid);
		dbg("NODE_DATA() = %p\n", NODE_DATA(nid));

		NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
		NODE_DATA(nid)->node_start_pfn = start_pfn;
		NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;

		if (NODE_DATA(nid)->node_spanned_pages == 0)
			continue;

		dbg("start_paddr = %lx\n", start_pfn << PAGE_SHIFT);
		dbg("end_paddr = %lx\n", end_pfn << PAGE_SHIFT);

		bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
		bootmem_vaddr = careful_zallocation(nid,
					bootmap_pages << PAGE_SHIFT,
					PAGE_SIZE, end_pfn);

		dbg("bootmap_vaddr = %p\n", bootmem_vaddr);

		init_bootmem_node(NODE_DATA(nid),
				  __pa(bootmem_vaddr) >> PAGE_SHIFT,
				  start_pfn, end_pfn);

		free_bootmem_with_active_regions(nid, end_pfn);
		/*
		 * Be very careful about moving this around.  Future
		 * calls to careful_zallocation() depend on this getting
		 * done correctly.
		 */
		mark_reserved_regions_for_nid(nid);
		sparse_memory_present_with_active_regions(nid);
	}

	init_bootmem_done = 1;
}

void __init paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];
	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA] = lmb_end_of_DRAM() >> PAGE_SHIFT;
	free_area_init_nodes(max_zone_pfns);
}

static int __init early_numa(char *p)
{
	if (!p)
		return 0;

	if (strstr(p, "off"))
		numa_enabled = 0;

	if (strstr(p, "debug"))
		numa_debug = 1;

	p = strstr(p, "fake=");
	if (p)
		cmdline = p + strlen("fake=");

	return 0;
}
early_param("numa", early_numa);
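
/*
 * Summary of the "numa=" options handled by early_numa() above (the
 * handler sees whatever follows "numa=" on the command line):
 *
 *	numa=off	disable NUMA; do_init_bootmem() falls back to
 *			setup_nonnuma()
 *	numa=debug	enable the dbg() messages in this file
 *	numa=fake=...	hand the boundary list to
 *			fake_numa_create_new_node()
 *
 * Matching is done with strstr(), so the options can be combined in a
 * single string, e.g. "numa=debug,fake=1G,2G".
 */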

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Find the node associated with a hot added memory section for
 * memory represented in the device tree by the property
 * ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory.
 */
static int hot_add_drconf_scn_to_nid(struct device_node *memory,
				     unsigned long scn_addr)
{
	const u32 *dm;
	unsigned int drconf_cell_cnt, rc;
	unsigned long lmb_size;
	struct assoc_arrays aa;
	int nid = -1;

	drconf_cell_cnt = of_get_drconf_memory(memory, &dm);
	if (!drconf_cell_cnt)
		return -1;

	lmb_size = of_get_lmb_size(memory);
	if (!lmb_size)
		return -1;

	rc = of_get_assoc_arrays(memory, &aa);
	if (rc)
		return -1;

	for (; drconf_cell_cnt != 0; --drconf_cell_cnt) {
		struct of_drconf_cell drmem;

		read_drconf_cell(&drmem, &dm);

		/* skip this block if it is reserved or not assigned to
		 * this partition */
		if ((drmem.flags & DRCONF_MEM_RESERVED)
		    || !(drmem.flags & DRCONF_MEM_ASSIGNED))
			continue;

		if ((scn_addr < drmem.base_addr)
		    || (scn_addr >= (drmem.base_addr + lmb_size)))
			continue;

		nid = of_drconf_to_nid_single(&drmem, &aa);
		break;
	}

	return nid;
}

/*
 * Find the node associated with a hot added memory section for memory
 * represented in the device tree as a node (i.e. memory@XXXX) for
 * each lmb.
 */
int hot_add_node_scn_to_nid(unsigned long scn_addr)
{
	struct device_node *memory = NULL;
	int nid = -1;

	while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
		unsigned long start, size;
		int ranges;
		const unsigned int *memcell_buf;
		unsigned int len;

		memcell_buf = of_get_property(memory, "reg", &len);
		if (!memcell_buf || len <= 0)
			continue;

		/* ranges in cell */
		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);

		while (ranges--) {
			start = read_n_cells(n_mem_addr_cells, &memcell_buf);
			size = read_n_cells(n_mem_size_cells, &memcell_buf);

			if ((scn_addr < start) || (scn_addr >= (start + size)))
				continue;

			nid = of_node_to_nid_single(memory);
			break;
		}

		of_node_put(memory);
		if (nid >= 0)
			break;
	}

	return nid;
}

/*
 * Find the node associated with a hot added memory section.  Section
 * corresponds to a SPARSEMEM section, not an LMB.  It is assumed that
 * sections are fully contained within a single LMB.
 */
int hot_add_scn_to_nid(unsigned long scn_addr)
{
	struct device_node *memory = NULL;
	int nid, found = 0;

	if (!numa_enabled || (min_common_depth < 0))
		return any_online_node(NODE_MASK_ALL);

	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (memory) {
		nid = hot_add_drconf_scn_to_nid(memory, scn_addr);
		of_node_put(memory);
	} else {
		nid = hot_add_node_scn_to_nid(scn_addr);
	}

	if (nid < 0 || !node_online(nid))
		nid = any_online_node(NODE_MASK_ALL);

	if (NODE_DATA(nid)->node_spanned_pages)
		return nid;

	for_each_online_node(nid) {
		if (NODE_DATA(nid)->node_spanned_pages) {
			found = 1;
			break;
		}
	}

	BUG_ON(!found);
	return nid;
}

#endif /* CONFIG_MEMORY_HOTPLUG */