/*
 * pSeries NUMA support
 *
 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <linux/threads.h>
#include <linux/bootmem.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/module.h>
#include <linux/nodemask.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <asm/sparsemem.h>
#include <asm/lmb.h>
#include <asm/system.h>
#include <asm/smp.h>

static int numa_enabled = 1;

static int numa_debug;
#define dbg(args...) if (numa_debug) { printk(KERN_INFO args); }

int numa_cpu_lookup_table[NR_CPUS];
cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES];
struct pglist_data *node_data[MAX_NUMNODES];

EXPORT_SYMBOL(numa_cpu_lookup_table);
EXPORT_SYMBOL(numa_cpumask_lookup_table);
EXPORT_SYMBOL(node_data);

static bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES];
static int min_common_depth;

/*
 * We need somewhere to store start/end/node for each region until we have
 * allocated the real node_data structures.
 */
#define MAX_REGIONS	(MAX_LMB_REGIONS*2)
static struct {
	unsigned long start_pfn;
	unsigned long end_pfn;
	int nid;
} init_node_data[MAX_REGIONS] __initdata;

int __init early_pfn_to_nid(unsigned long pfn)
{
	unsigned int i;

	for (i = 0; init_node_data[i].end_pfn; i++) {
		unsigned long start_pfn = init_node_data[i].start_pfn;
		unsigned long end_pfn = init_node_data[i].end_pfn;

		if ((start_pfn <= pfn) && (pfn < end_pfn))
			return init_node_data[i].nid;
	}

	return -1;
}
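
/*
 * Record a range of physical memory for a node during early boot.  If the
 * new range is adjacent to an existing entry for the same node it is
 * coalesced with that entry; otherwise a fresh init_node_data entry is
 * appended.
 */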
void __init add_region(unsigned int nid, unsigned long start_pfn,
		       unsigned long pages)
{
	unsigned int i;

	dbg("add_region nid %d start_pfn 0x%lx pages 0x%lx\n",
		nid, start_pfn, pages);

	for (i = 0; init_node_data[i].end_pfn; i++) {
		if (init_node_data[i].nid != nid)
			continue;
		if (init_node_data[i].end_pfn == start_pfn) {
			init_node_data[i].end_pfn += pages;
			return;
		}
		if (init_node_data[i].start_pfn == (start_pfn + pages)) {
			init_node_data[i].start_pfn -= pages;
			return;
		}
	}

	/*
	 * Leave the last entry NULL so we don't iterate off the end (we use
	 * entry.end_pfn to terminate the walk).
	 */
	if (i >= (MAX_REGIONS - 1)) {
		printk(KERN_ERR "WARNING: too many memory regions in "
				"numa code, truncating\n");
		return;
	}

	init_node_data[i].start_pfn = start_pfn;
	init_node_data[i].end_pfn = start_pfn + pages;
	init_node_data[i].nid = nid;
}

/* We assume init_node_data has no overlapping regions */
void __init get_region(unsigned int nid, unsigned long *start_pfn,
		       unsigned long *end_pfn, unsigned long *pages_present)
{
	unsigned int i;

	*start_pfn = -1UL;
	*end_pfn = *pages_present = 0;

	for (i = 0; init_node_data[i].end_pfn; i++) {
		if (init_node_data[i].nid != nid)
			continue;

		*pages_present += init_node_data[i].end_pfn -
			init_node_data[i].start_pfn;

		if (init_node_data[i].start_pfn < *start_pfn)
			*start_pfn = init_node_data[i].start_pfn;

		if (init_node_data[i].end_pfn > *end_pfn)
			*end_pfn = init_node_data[i].end_pfn;
	}

	/* We didn't find a matching region, return start/end as 0 */
	if (*start_pfn == -1UL)
		*start_pfn = 0;
}

static inline void map_cpu_to_node(int cpu, int node)
{
	numa_cpu_lookup_table[cpu] = node;

	if (!(cpu_isset(cpu, numa_cpumask_lookup_table[node])))
		cpu_set(cpu, numa_cpumask_lookup_table[node]);
}

#ifdef CONFIG_HOTPLUG_CPU
static void unmap_cpu_from_node(unsigned long cpu)
{
	int node = numa_cpu_lookup_table[cpu];

	dbg("removing cpu %lu from node %d\n", cpu, node);

	if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
		cpu_clear(cpu, numa_cpumask_lookup_table[node]);
	} else {
		printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
		       cpu, node);
	}
}
#endif /* CONFIG_HOTPLUG_CPU */

static struct device_node *find_cpu_node(unsigned int cpu)
{
	unsigned int hw_cpuid = get_hard_smp_processor_id(cpu);
	struct device_node *cpu_node = NULL;
	unsigned int *interrupt_server, *reg;
	int len;

	while ((cpu_node = of_find_node_by_type(cpu_node, "cpu")) != NULL) {
		/* Try interrupt server first */
		interrupt_server = (unsigned int *)get_property(cpu_node,
					"ibm,ppc-interrupt-server#s", &len);

		len = len / sizeof(u32);

		if (interrupt_server && (len > 0)) {
			while (len--) {
				if (interrupt_server[len] == hw_cpuid)
					return cpu_node;
			}
		} else {
			reg = (unsigned int *)get_property(cpu_node,
							   "reg", &len);
			if (reg && (len > 0) && (reg[0] == hw_cpuid))
				return cpu_node;
		}
	}

	return NULL;
}

/* must hold reference to node during call */
static int *of_get_associativity(struct device_node *dev)
{
	return (unsigned int *)get_property(dev, "ibm,associativity", NULL);
}
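
/*
 * Look up the NUMA domain of a device from its "ibm,associativity"
 * property, using the entry at min_common_depth.  Devices without usable
 * associativity information fall back to domain 0.
 */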
static int of_node_numa_domain(struct device_node *device)
{
	int numa_domain;
	unsigned int *tmp;

	if (min_common_depth == -1)
		return 0;

	tmp = of_get_associativity(device);
	if (tmp && (tmp[0] >= min_common_depth)) {
		numa_domain = tmp[min_common_depth];
	} else {
		dbg("WARNING: no NUMA information for %s\n",
		    device->full_name);
		numa_domain = 0;
	}
	return numa_domain;
}

/*
 * In theory, the "ibm,associativity" property may contain multiple
 * associativity lists because a resource may be multiply connected
 * into the machine.  This resource then has different associativity
 * characteristics relative to its multiple connections.  We ignore
 * this for now.  We also assume that all cpu and memory sets have
 * their distances represented at a common level.  This won't be
 * true for hierarchical NUMA.
 *
 * In any case the ibm,associativity-reference-points should give
 * the correct depth for a normal NUMA system.
 *
 * - Dave Hansen <haveblue@us.ibm.com>
 */
static int __init find_min_common_depth(void)
{
	int depth;
	unsigned int *ref_points;
	struct device_node *rtas_root;
	unsigned int len;

	rtas_root = of_find_node_by_path("/rtas");

	if (!rtas_root)
		return -1;

	/*
	 * this property is 2 32-bit integers, each representing a level of
	 * depth in the associativity nodes.  The first is for an SMP
	 * configuration (should be all 0's) and the second is for a normal
	 * NUMA configuration.
	 */
	ref_points = (unsigned int *)get_property(rtas_root,
			"ibm,associativity-reference-points", &len);

	if ((len >= 1) && ref_points) {
		depth = ref_points[1];
	} else {
		dbg("WARNING: could not find NUMA "
		    "associativity reference point\n");
		depth = -1;
	}
	of_node_put(rtas_root);

	return depth;
}

static int __init get_mem_addr_cells(void)
{
	struct device_node *memory = NULL;
	int rc;

	memory = of_find_node_by_type(memory, "memory");
	if (!memory)
		return 0; /* it won't matter */

	rc = prom_n_addr_cells(memory);
	return rc;
}

static int __init get_mem_size_cells(void)
{
	struct device_node *memory = NULL;
	int rc;

	memory = of_find_node_by_type(memory, "memory");
	if (!memory)
		return 0; /* it won't matter */
	rc = prom_n_size_cells(memory);
	return rc;
}
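
/*
 * Read "n" 32-bit cells from *buf into a single unsigned long, most
 * significant cell first, and advance the buffer pointer past the cells
 * consumed.
 */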
static unsigned long __init read_n_cells(int n, unsigned int **buf)
{
	unsigned long result = 0;

	while (n--) {
		result = (result << 32) | **buf;
		(*buf)++;
	}
	return result;
}

/*
 * Figure out to which domain a cpu belongs and stick it there.
 * Return the id of the domain used.
 */
static int numa_setup_cpu(unsigned long lcpu)
{
	int numa_domain = 0;
	struct device_node *cpu = find_cpu_node(lcpu);

	if (!cpu) {
		WARN_ON(1);
		goto out;
	}

	numa_domain = of_node_numa_domain(cpu);

	if (numa_domain >= num_online_nodes()) {
		/*
		 * POWER4 LPAR uses 0xffff as invalid node,
		 * don't warn in this case.
		 */
		if (numa_domain != 0xffff)
			printk(KERN_ERR "WARNING: cpu %ld "
			       "maps to invalid NUMA node %d\n",
			       lcpu, numa_domain);
		numa_domain = 0;
	}
out:
	node_set_online(numa_domain);

	map_cpu_to_node(lcpu, numa_domain);

	of_node_put(cpu);

	return numa_domain;
}

static int cpu_numa_callback(struct notifier_block *nfb,
			     unsigned long action,
			     void *hcpu)
{
	unsigned long lcpu = (unsigned long)hcpu;
	int ret = NOTIFY_DONE;

	switch (action) {
	case CPU_UP_PREPARE:
		if (min_common_depth == -1 || !numa_enabled)
			map_cpu_to_node(lcpu, 0);
		else
			numa_setup_cpu(lcpu);
		ret = NOTIFY_OK;
		break;
#ifdef CONFIG_HOTPLUG_CPU
	case CPU_DEAD:
	case CPU_UP_CANCELED:
		unmap_cpu_from_node(lcpu);
		ret = NOTIFY_OK;
		break;
#endif
	}
	return ret;
}

/*
 * Check and possibly modify a memory region to enforce the memory limit.
 *
 * Returns the size the region should have to enforce the memory limit.
 * This will either be the original value of size, a truncated value,
 * or zero. If the returned value of size is 0 the region should be
 * discarded as it lies wholly above the memory limit.
 */
static unsigned long __init numa_enforce_memory_limit(unsigned long start,
						      unsigned long size)
{
	/*
	 * We use lmb_end_of_DRAM() in here instead of memory_limit because
	 * we've already adjusted it for the limit and it takes care of
	 * having memory holes below the limit.
	 */

	if (! memory_limit)
		return size;

	if (start + size <= lmb_end_of_DRAM())
		return size;

	if (start >= lmb_end_of_DRAM())
		return 0;

	return lmb_end_of_DRAM() - start;
}

static int __init parse_numa_properties(void)
{
	struct device_node *cpu = NULL;
	struct device_node *memory = NULL;
	int addr_cells, size_cells;
	int max_domain;
	unsigned long i;

	if (numa_enabled == 0) {
		printk(KERN_WARNING "NUMA disabled by user\n");
		return -1;
	}

	min_common_depth = find_min_common_depth();

	dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
	if (min_common_depth < 0)
		return min_common_depth;

	max_domain = numa_setup_cpu(boot_cpuid);

	/*
	 * Even though we connect cpus to numa domains later in SMP init,
	 * we need to know the maximum node id now. This is because each
	 * node id must have NODE_DATA etc backing it.
	 * As a result of hotplug we could still have cpus appear later on
	 * with larger node ids. In that case we force the cpu into node 0.
	 */
	for_each_cpu(i) {
		int numa_domain;

		cpu = find_cpu_node(i);

		if (cpu) {
			numa_domain = of_node_numa_domain(cpu);
			of_node_put(cpu);

			if (numa_domain < MAX_NUMNODES &&
			    max_domain < numa_domain)
				max_domain = numa_domain;
		}
	}

	addr_cells = get_mem_addr_cells();
	size_cells = get_mem_size_cells();
	memory = NULL;
	while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
		unsigned long start;
		unsigned long size;
		int numa_domain;
		int ranges;
		unsigned int *memcell_buf;
		unsigned int len;

		memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
		if (!memcell_buf || len <= 0)
			continue;

		ranges = memory->n_addrs;
new_range:
		/* these are order-sensitive, and modify the buffer pointer */
		start = read_n_cells(addr_cells, &memcell_buf);
		size = read_n_cells(size_cells, &memcell_buf);

		numa_domain = of_node_numa_domain(memory);

		if (numa_domain >= MAX_NUMNODES) {
			if (numa_domain != 0xffff)
				printk(KERN_ERR "WARNING: memory at %lx maps "
				       "to invalid NUMA node %d\n", start,
				       numa_domain);
			numa_domain = 0;
		}

		if (max_domain < numa_domain)
			max_domain = numa_domain;

		if (!(size = numa_enforce_memory_limit(start, size))) {
			if (--ranges)
				goto new_range;
			else
				continue;
		}

		add_region(numa_domain, start >> PAGE_SHIFT,
			   size >> PAGE_SHIFT);

		if (--ranges)
			goto new_range;
	}

	for (i = 0; i <= max_domain; i++)
		node_set_online(i);

	return 0;
}
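
/*
 * Fallback used when no usable NUMA information is found: place all of
 * memory and the boot cpu in node 0.
 */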
static void __init setup_nonnuma(void)
{
	unsigned long top_of_ram = lmb_end_of_DRAM();
	unsigned long total_ram = lmb_phys_mem_size();

	printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
	       top_of_ram, total_ram);
	printk(KERN_INFO "Memory hole size: %ldMB\n",
	       (top_of_ram - total_ram) >> 20);

	map_cpu_to_node(boot_cpuid, 0);
	add_region(0, 0, lmb_end_of_DRAM() >> PAGE_SHIFT);
	node_set_online(0);
}

static void __init dump_numa_topology(void)
{
	unsigned int node;
	unsigned int count;

	if (min_common_depth == -1 || !numa_enabled)
		return;

	for_each_online_node(node) {
		unsigned long i;

		printk(KERN_INFO "Node %d Memory:", node);

		count = 0;

		for (i = 0; i < lmb_end_of_DRAM();
		     i += (1 << SECTION_SIZE_BITS)) {
			if (early_pfn_to_nid(i >> PAGE_SHIFT) == node) {
				if (count == 0)
					printk(" 0x%lx", i);
				++count;
			} else {
				if (count > 0)
					printk("-0x%lx", i);
				count = 0;
			}
		}

		if (count > 0)
			printk("-0x%lx", i);
		printk("\n");
	}
	return;
}

/*
 * Allocate some memory, satisfying the lmb or bootmem allocator where
 * required. nid is the preferred node and end is the physical address of
 * the highest address in the node.
 *
 * Returns the physical address of the memory.
 */
static void __init *careful_allocation(int nid, unsigned long size,
				       unsigned long align,
				       unsigned long end_pfn)
{
	int new_nid;
	unsigned long ret = lmb_alloc_base(size, align, end_pfn << PAGE_SHIFT);

	/* retry over all memory */
	if (!ret)
		ret = lmb_alloc_base(size, align, lmb_end_of_DRAM());

	if (!ret)
		panic("numa.c: cannot allocate %lu bytes on node %d",
		      size, nid);

	/*
	 * If the memory came from a previously allocated node, we must
	 * retry with the bootmem allocator.
	 */
	new_nid = early_pfn_to_nid(ret >> PAGE_SHIFT);
	if (new_nid < nid) {
		ret = (unsigned long)__alloc_bootmem_node(NODE_DATA(new_nid),
				size, align, 0);

		if (!ret)
			panic("numa.c: cannot allocate %lu bytes on node %d",
			      size, new_nid);

		ret = __pa(ret);

		dbg("alloc_bootmem %lx %lx\n", ret, size);
	}

	return (void *)ret;
}
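
/*
 * Set up bootmem for every online node: determine each node's memory from
 * the regions recorded in init_node_data, allocate the pglist_data and
 * bootmem bitmap as node-locally as possible, then register the free,
 * reserved and sparsemem regions for that node.
 */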
void __init do_init_bootmem(void)
{
	int nid;
	unsigned int i;
	static struct notifier_block ppc64_numa_nb = {
		.notifier_call = cpu_numa_callback,
		.priority = 1 /* Must run before sched domains notifier. */
	};

	min_low_pfn = 0;
	max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
	max_pfn = max_low_pfn;

	if (parse_numa_properties())
		setup_nonnuma();
	else
		dump_numa_topology();

	register_cpu_notifier(&ppc64_numa_nb);

	for_each_online_node(nid) {
		unsigned long start_pfn, end_pfn, pages_present;
		unsigned long bootmem_paddr;
		unsigned long bootmap_pages;

		get_region(nid, &start_pfn, &end_pfn, &pages_present);

		/* Allocate the node structure node local if possible */
		NODE_DATA(nid) = careful_allocation(nid,
					sizeof(struct pglist_data),
					SMP_CACHE_BYTES, end_pfn);
		NODE_DATA(nid) = __va(NODE_DATA(nid));
		memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));

		dbg("node %d\n", nid);
		dbg("NODE_DATA() = %p\n", NODE_DATA(nid));

		NODE_DATA(nid)->bdata = &plat_node_bdata[nid];
		NODE_DATA(nid)->node_start_pfn = start_pfn;
		NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;

		if (NODE_DATA(nid)->node_spanned_pages == 0)
			continue;

		dbg("start_paddr = %lx\n", start_pfn << PAGE_SHIFT);
		dbg("end_paddr = %lx\n", end_pfn << PAGE_SHIFT);

		bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
		bootmem_paddr = (unsigned long)careful_allocation(nid,
					bootmap_pages << PAGE_SHIFT,
					PAGE_SIZE, end_pfn);
		memset(__va(bootmem_paddr), 0, bootmap_pages << PAGE_SHIFT);

		dbg("bootmap_paddr = %lx\n", bootmem_paddr);

		init_bootmem_node(NODE_DATA(nid), bootmem_paddr >> PAGE_SHIFT,
				  start_pfn, end_pfn);

		/* Add free regions on this node */
		for (i = 0; init_node_data[i].end_pfn; i++) {
			unsigned long start, end;

			if (init_node_data[i].nid != nid)
				continue;

			start = init_node_data[i].start_pfn << PAGE_SHIFT;
			end = init_node_data[i].end_pfn << PAGE_SHIFT;

			dbg("free_bootmem %lx %lx\n", start, end - start);
			free_bootmem_node(NODE_DATA(nid), start, end - start);
		}

		/* Mark reserved regions on this node */
		for (i = 0; i < lmb.reserved.cnt; i++) {
			unsigned long physbase = lmb.reserved.region[i].base;
			unsigned long size = lmb.reserved.region[i].size;
			unsigned long start_paddr = start_pfn << PAGE_SHIFT;
			unsigned long end_paddr = end_pfn << PAGE_SHIFT;

			if (early_pfn_to_nid(physbase >> PAGE_SHIFT) != nid &&
			    early_pfn_to_nid((physbase+size-1) >> PAGE_SHIFT) != nid)
				continue;

			if (physbase < end_paddr &&
			    (physbase+size) > start_paddr) {
				/* overlaps */
				if (physbase < start_paddr) {
					size -= start_paddr - physbase;
					physbase = start_paddr;
				}

				if (size > end_paddr - physbase)
					size = end_paddr - physbase;

				dbg("reserve_bootmem %lx %lx\n", physbase,
				    size);
				reserve_bootmem_node(NODE_DATA(nid), physbase,
						     size);
			}
		}

		/* Add regions into sparsemem */
		for (i = 0; init_node_data[i].end_pfn; i++) {
			unsigned long start, end;

			if (init_node_data[i].nid != nid)
				continue;

			start = init_node_data[i].start_pfn;
			end = init_node_data[i].end_pfn;

			memory_present(nid, start, end);
		}
	}
}
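
/*
 * Build the zone layout for each online node.  All of a node's memory is
 * placed in ZONE_DMA; the difference between the spanned range and the
 * pages actually present is reported as holes.
 */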
void __init paging_init(void)
{
	unsigned long zones_size[MAX_NR_ZONES];
	unsigned long zholes_size[MAX_NR_ZONES];
	int nid;

	memset(zones_size, 0, sizeof(zones_size));
	memset(zholes_size, 0, sizeof(zholes_size));

	for_each_online_node(nid) {
		unsigned long start_pfn, end_pfn, pages_present;

		get_region(nid, &start_pfn, &end_pfn, &pages_present);

		zones_size[ZONE_DMA] = end_pfn - start_pfn;
		zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] - pages_present;

		dbg("free_area_init node %d %lx %lx (hole: %lx)\n", nid,
		    zones_size[ZONE_DMA], start_pfn, zholes_size[ZONE_DMA]);

		free_area_init_node(nid, NODE_DATA(nid), zones_size, start_pfn,
				    zholes_size);
	}
}

static int __init early_numa(char *p)
{
	if (!p)
		return 0;

	if (strstr(p, "off"))
		numa_enabled = 0;

	if (strstr(p, "debug"))
		numa_debug = 1;

	return 0;
}
early_param("numa", early_numa);