1 /* 2 * Generic VM initialization for x86-64 NUMA setups. 3 * Copyright 2002,2003 Andi Kleen, SuSE Labs. 4 */ 5 #include <linux/kernel.h> 6 #include <linux/mm.h> 7 #include <linux/string.h> 8 #include <linux/init.h> 9 #include <linux/bootmem.h> 10 #include <linux/mmzone.h> 11 #include <linux/ctype.h> 12 #include <linux/module.h> 13 #include <linux/nodemask.h> 14 #include <linux/sched.h> 15 16 #include <asm/e820.h> 17 #include <asm/proto.h> 18 #include <asm/dma.h> 19 #include <asm/numa.h> 20 #include <asm/acpi.h> 21 #include <asm/k8.h> 22 23 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 24 EXPORT_SYMBOL(node_data); 25 26 struct memnode memnode; 27 28 s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { 29 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE 30 }; 31 32 int numa_off __initdata; 33 static unsigned long __initdata nodemap_addr; 34 static unsigned long __initdata nodemap_size; 35 36 /* 37 * Map cpu index to node index 38 */ 39 DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); 40 EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); 41 42 /* 43 * Given a shift value, try to populate memnodemap[] 44 * Returns : 45 * 1 if OK 46 * 0 if memnodmap[] too small (of shift too small) 47 * -1 if node overlap or lost ram (shift too big) 48 */ 49 static int __init populate_memnodemap(const struct bootnode *nodes, 50 int numnodes, int shift, int *nodeids) 51 { 52 unsigned long addr, end; 53 int i, res = -1; 54 55 memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize); 56 for (i = 0; i < numnodes; i++) { 57 addr = nodes[i].start; 58 end = nodes[i].end; 59 if (addr >= end) 60 continue; 61 if ((end >> shift) >= memnodemapsize) 62 return 0; 63 do { 64 if (memnodemap[addr >> shift] != NUMA_NO_NODE) 65 return -1; 66 67 if (!nodeids) 68 memnodemap[addr >> shift] = i; 69 else 70 memnodemap[addr >> shift] = nodeids[i]; 71 72 addr += (1UL << shift); 73 } while (addr < end); 74 res = 1; 75 } 76 return res; 77 } 78 79 static int __init allocate_cachealigned_memnodemap(void) 80 { 81 unsigned long addr; 82 83 memnodemap = memnode.embedded_map; 84 if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map)) 85 return 0; 86 87 addr = 0x8000; 88 nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); 89 nodemap_addr = find_e820_area(addr, max_pfn<<PAGE_SHIFT, 90 nodemap_size, L1_CACHE_BYTES); 91 if (nodemap_addr == -1UL) { 92 printk(KERN_ERR 93 "NUMA: Unable to allocate Memory to Node hash map\n"); 94 nodemap_addr = nodemap_size = 0; 95 return -1; 96 } 97 memnodemap = phys_to_virt(nodemap_addr); 98 reserve_early(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP"); 99 100 printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n", 101 nodemap_addr, nodemap_addr + nodemap_size); 102 return 0; 103 } 104 105 /* 106 * The LSB of all start and end addresses in the node map is the value of the 107 * maximum possible shift. 108 */ 109 static int __init extract_lsb_from_nodes(const struct bootnode *nodes, 110 int numnodes) 111 { 112 int i, nodes_used = 0; 113 unsigned long start, end; 114 unsigned long bitfield = 0, memtop = 0; 115 116 for (i = 0; i < numnodes; i++) { 117 start = nodes[i].start; 118 end = nodes[i].end; 119 if (start >= end) 120 continue; 121 bitfield |= start; 122 nodes_used++; 123 if (end > memtop) 124 memtop = end; 125 } 126 if (nodes_used <= 1) 127 i = 63; 128 else 129 i = find_first_bit(&bitfield, sizeof(unsigned long)*8); 130 memnodemapsize = (memtop >> i)+1; 131 return i; 132 } 133 134 int __init compute_hash_shift(struct bootnode *nodes, int numnodes, 135 int *nodeids) 136 { 137 int shift; 138 139 shift = extract_lsb_from_nodes(nodes, numnodes); 140 if (allocate_cachealigned_memnodemap()) 141 return -1; 142 printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", 143 shift); 144 145 if (populate_memnodemap(nodes, numnodes, shift, nodeids) != 1) { 146 printk(KERN_INFO "Your memory is not aligned you need to " 147 "rebuild your kernel with a bigger NODEMAPSIZE " 148 "shift=%d\n", shift); 149 return -1; 150 } 151 return shift; 152 } 153 154 int __meminit __early_pfn_to_nid(unsigned long pfn) 155 { 156 return phys_to_nid(pfn << PAGE_SHIFT); 157 } 158 159 static void * __init early_node_mem(int nodeid, unsigned long start, 160 unsigned long end, unsigned long size, 161 unsigned long align) 162 { 163 unsigned long mem; 164 165 /* 166 * put it on high as possible 167 * something will go with NODE_DATA 168 */ 169 if (start < (MAX_DMA_PFN<<PAGE_SHIFT)) 170 start = MAX_DMA_PFN<<PAGE_SHIFT; 171 if (start < (MAX_DMA32_PFN<<PAGE_SHIFT) && 172 end > (MAX_DMA32_PFN<<PAGE_SHIFT)) 173 start = MAX_DMA32_PFN<<PAGE_SHIFT; 174 mem = find_e820_area(start, end, size, align); 175 if (mem != -1L) 176 return __va(mem); 177 178 /* extend the search scope */ 179 end = max_pfn_mapped << PAGE_SHIFT; 180 if (end > (MAX_DMA32_PFN<<PAGE_SHIFT)) 181 start = MAX_DMA32_PFN<<PAGE_SHIFT; 182 else 183 start = MAX_DMA_PFN<<PAGE_SHIFT; 184 mem = find_e820_area(start, end, size, align); 185 if (mem != -1L) 186 return __va(mem); 187 188 printk(KERN_ERR "Cannot find %lu bytes in node %d\n", 189 size, nodeid); 190 191 return NULL; 192 } 193 194 /* Initialize bootmem allocator for a node */ 195 void __init 196 setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) 197 { 198 unsigned long start_pfn, last_pfn, nodedata_phys; 199 const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE); 200 int nid; 201 #ifndef CONFIG_NO_BOOTMEM 202 unsigned long bootmap_start, bootmap_pages, bootmap_size; 203 void *bootmap; 204 #endif 205 206 if (!end) 207 return; 208 209 /* 210 * Don't confuse VM with a node that doesn't have the 211 * minimum amount of memory: 212 */ 213 if (end && (end - start) < NODE_MIN_SIZE) 214 return; 215 216 start = roundup(start, ZONE_ALIGN); 217 218 printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", nodeid, 219 start, end); 220 221 start_pfn = start >> PAGE_SHIFT; 222 last_pfn = end >> PAGE_SHIFT; 223 224 node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size, 225 SMP_CACHE_BYTES); 226 if (node_data[nodeid] == NULL) 227 return; 228 nodedata_phys = __pa(node_data[nodeid]); 229 reserve_early(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA"); 230 printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys, 231 nodedata_phys + pgdat_size - 1); 232 nid = phys_to_nid(nodedata_phys); 233 if (nid != nodeid) 234 printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid); 235 236 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); 237 NODE_DATA(nodeid)->node_id = nodeid; 238 NODE_DATA(nodeid)->node_start_pfn = start_pfn; 239 NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn; 240 241 #ifndef CONFIG_NO_BOOTMEM 242 NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid]; 243 244 /* 245 * Find a place for the bootmem map 246 * nodedata_phys could be on other nodes by alloc_bootmem, 247 * so need to sure bootmap_start not to be small, otherwise 248 * early_node_mem will get that with find_e820_area instead 249 * of alloc_bootmem, that could clash with reserved range 250 */ 251 bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn); 252 bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE); 253 /* 254 * SMP_CACHE_BYTES could be enough, but init_bootmem_node like 255 * to use that to align to PAGE_SIZE 256 */ 257 bootmap = early_node_mem(nodeid, bootmap_start, end, 258 bootmap_pages<<PAGE_SHIFT, PAGE_SIZE); 259 if (bootmap == NULL) { 260 free_early(nodedata_phys, nodedata_phys + pgdat_size); 261 node_data[nodeid] = NULL; 262 return; 263 } 264 bootmap_start = __pa(bootmap); 265 reserve_early(bootmap_start, bootmap_start+(bootmap_pages<<PAGE_SHIFT), 266 "BOOTMAP"); 267 268 bootmap_size = init_bootmem_node(NODE_DATA(nodeid), 269 bootmap_start >> PAGE_SHIFT, 270 start_pfn, last_pfn); 271 272 printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n", 273 bootmap_start, bootmap_start + bootmap_size - 1, 274 bootmap_pages); 275 nid = phys_to_nid(bootmap_start); 276 if (nid != nodeid) 277 printk(KERN_INFO " bootmap(%d) on node %d\n", nodeid, nid); 278 279 free_bootmem_with_active_regions(nodeid, end); 280 #endif 281 282 node_set_online(nodeid); 283 } 284 285 /* 286 * There are unfortunately some poorly designed mainboards around that 287 * only connect memory to a single CPU. This breaks the 1:1 cpu->node 288 * mapping. To avoid this fill in the mapping for all possible CPUs, 289 * as the number of CPUs is not known yet. We round robin the existing 290 * nodes. 291 */ 292 void __init numa_init_array(void) 293 { 294 int rr, i; 295 296 rr = first_node(node_online_map); 297 for (i = 0; i < nr_cpu_ids; i++) { 298 if (early_cpu_to_node(i) != NUMA_NO_NODE) 299 continue; 300 numa_set_node(i, rr); 301 rr = next_node(rr, node_online_map); 302 if (rr == MAX_NUMNODES) 303 rr = first_node(node_online_map); 304 } 305 } 306 307 #ifdef CONFIG_NUMA_EMU 308 /* Numa emulation */ 309 static struct bootnode nodes[MAX_NUMNODES] __initdata; 310 static struct bootnode physnodes[MAX_NUMNODES] __initdata; 311 static char *cmdline __initdata; 312 313 static int __init setup_physnodes(unsigned long start, unsigned long end, 314 int acpi, int k8) 315 { 316 int nr_nodes = 0; 317 int ret = 0; 318 int i; 319 320 #ifdef CONFIG_ACPI_NUMA 321 if (acpi) 322 nr_nodes = acpi_get_nodes(physnodes); 323 #endif 324 #ifdef CONFIG_K8_NUMA 325 if (k8) 326 nr_nodes = k8_get_nodes(physnodes); 327 #endif 328 /* 329 * Basic sanity checking on the physical node map: there may be errors 330 * if the SRAT or K8 incorrectly reported the topology or the mem= 331 * kernel parameter is used. 332 */ 333 for (i = 0; i < nr_nodes; i++) { 334 if (physnodes[i].start == physnodes[i].end) 335 continue; 336 if (physnodes[i].start > end) { 337 physnodes[i].end = physnodes[i].start; 338 continue; 339 } 340 if (physnodes[i].end < start) { 341 physnodes[i].start = physnodes[i].end; 342 continue; 343 } 344 if (physnodes[i].start < start) 345 physnodes[i].start = start; 346 if (physnodes[i].end > end) 347 physnodes[i].end = end; 348 } 349 350 /* 351 * Remove all nodes that have no memory or were truncated because of the 352 * limited address range. 353 */ 354 for (i = 0; i < nr_nodes; i++) { 355 if (physnodes[i].start == physnodes[i].end) 356 continue; 357 physnodes[ret].start = physnodes[i].start; 358 physnodes[ret].end = physnodes[i].end; 359 ret++; 360 } 361 362 /* 363 * If no physical topology was detected, a single node is faked to cover 364 * the entire address space. 365 */ 366 if (!ret) { 367 physnodes[ret].start = start; 368 physnodes[ret].end = end; 369 ret = 1; 370 } 371 return ret; 372 } 373 374 /* 375 * Setups up nid to range from addr to addr + size. If the end 376 * boundary is greater than max_addr, then max_addr is used instead. 377 * The return value is 0 if there is additional memory left for 378 * allocation past addr and -1 otherwise. addr is adjusted to be at 379 * the end of the node. 380 */ 381 static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr) 382 { 383 int ret = 0; 384 nodes[nid].start = *addr; 385 *addr += size; 386 if (*addr >= max_addr) { 387 *addr = max_addr; 388 ret = -1; 389 } 390 nodes[nid].end = *addr; 391 node_set(nid, node_possible_map); 392 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid, 393 nodes[nid].start, nodes[nid].end, 394 (nodes[nid].end - nodes[nid].start) >> 20); 395 return ret; 396 } 397 398 /* 399 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr 400 * to max_addr. The return value is the number of nodes allocated. 401 */ 402 static int __init split_nodes_interleave(u64 addr, u64 max_addr, 403 int nr_phys_nodes, int nr_nodes) 404 { 405 nodemask_t physnode_mask = NODE_MASK_NONE; 406 u64 size; 407 int big; 408 int ret = 0; 409 int i; 410 411 if (nr_nodes <= 0) 412 return -1; 413 if (nr_nodes > MAX_NUMNODES) { 414 pr_info("numa=fake=%d too large, reducing to %d\n", 415 nr_nodes, MAX_NUMNODES); 416 nr_nodes = MAX_NUMNODES; 417 } 418 419 size = (max_addr - addr - e820_hole_size(addr, max_addr)) / nr_nodes; 420 /* 421 * Calculate the number of big nodes that can be allocated as a result 422 * of consolidating the remainder. 423 */ 424 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) / 425 FAKE_NODE_MIN_SIZE; 426 427 size &= FAKE_NODE_MIN_HASH_MASK; 428 if (!size) { 429 pr_err("Not enough memory for each node. " 430 "NUMA emulation disabled.\n"); 431 return -1; 432 } 433 434 for (i = 0; i < nr_phys_nodes; i++) 435 if (physnodes[i].start != physnodes[i].end) 436 node_set(i, physnode_mask); 437 438 /* 439 * Continue to fill physical nodes with fake nodes until there is no 440 * memory left on any of them. 441 */ 442 while (nodes_weight(physnode_mask)) { 443 for_each_node_mask(i, physnode_mask) { 444 u64 end = physnodes[i].start + size; 445 u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); 446 447 if (ret < big) 448 end += FAKE_NODE_MIN_SIZE; 449 450 /* 451 * Continue to add memory to this fake node if its 452 * non-reserved memory is less than the per-node size. 453 */ 454 while (end - physnodes[i].start - 455 e820_hole_size(physnodes[i].start, end) < size) { 456 end += FAKE_NODE_MIN_SIZE; 457 if (end > physnodes[i].end) { 458 end = physnodes[i].end; 459 break; 460 } 461 } 462 463 /* 464 * If there won't be at least FAKE_NODE_MIN_SIZE of 465 * non-reserved memory in ZONE_DMA32 for the next node, 466 * this one must extend to the boundary. 467 */ 468 if (end < dma32_end && dma32_end - end - 469 e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) 470 end = dma32_end; 471 472 /* 473 * If there won't be enough non-reserved memory for the 474 * next node, this one must extend to the end of the 475 * physical node. 476 */ 477 if (physnodes[i].end - end - 478 e820_hole_size(end, physnodes[i].end) < size) 479 end = physnodes[i].end; 480 481 /* 482 * Avoid allocating more nodes than requested, which can 483 * happen as a result of rounding down each node's size 484 * to FAKE_NODE_MIN_SIZE. 485 */ 486 if (nodes_weight(physnode_mask) + ret >= nr_nodes) 487 end = physnodes[i].end; 488 489 if (setup_node_range(ret++, &physnodes[i].start, 490 end - physnodes[i].start, 491 physnodes[i].end) < 0) 492 node_clear(i, physnode_mask); 493 } 494 } 495 return ret; 496 } 497 498 /* 499 * Returns the end address of a node so that there is at least `size' amount of 500 * non-reserved memory or `max_addr' is reached. 501 */ 502 static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) 503 { 504 u64 end = start + size; 505 506 while (end - start - e820_hole_size(start, end) < size) { 507 end += FAKE_NODE_MIN_SIZE; 508 if (end > max_addr) { 509 end = max_addr; 510 break; 511 } 512 } 513 return end; 514 } 515 516 /* 517 * Sets up fake nodes of `size' interleaved over physical nodes ranging from 518 * `addr' to `max_addr'. The return value is the number of nodes allocated. 519 */ 520 static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) 521 { 522 nodemask_t physnode_mask = NODE_MASK_NONE; 523 u64 min_size; 524 int ret = 0; 525 int i; 526 527 if (!size) 528 return -1; 529 /* 530 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is 531 * increased accordingly if the requested size is too small. This 532 * creates a uniform distribution of node sizes across the entire 533 * machine (but not necessarily over physical nodes). 534 */ 535 min_size = (max_addr - addr - e820_hole_size(addr, max_addr)) / 536 MAX_NUMNODES; 537 min_size = max(min_size, FAKE_NODE_MIN_SIZE); 538 if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size) 539 min_size = (min_size + FAKE_NODE_MIN_SIZE) & 540 FAKE_NODE_MIN_HASH_MASK; 541 if (size < min_size) { 542 pr_err("Fake node size %LuMB too small, increasing to %LuMB\n", 543 size >> 20, min_size >> 20); 544 size = min_size; 545 } 546 size &= FAKE_NODE_MIN_HASH_MASK; 547 548 for (i = 0; i < MAX_NUMNODES; i++) 549 if (physnodes[i].start != physnodes[i].end) 550 node_set(i, physnode_mask); 551 /* 552 * Fill physical nodes with fake nodes of size until there is no memory 553 * left on any of them. 554 */ 555 while (nodes_weight(physnode_mask)) { 556 for_each_node_mask(i, physnode_mask) { 557 u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT; 558 u64 end; 559 560 end = find_end_of_node(physnodes[i].start, 561 physnodes[i].end, size); 562 /* 563 * If there won't be at least FAKE_NODE_MIN_SIZE of 564 * non-reserved memory in ZONE_DMA32 for the next node, 565 * this one must extend to the boundary. 566 */ 567 if (end < dma32_end && dma32_end - end - 568 e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) 569 end = dma32_end; 570 571 /* 572 * If there won't be enough non-reserved memory for the 573 * next node, this one must extend to the end of the 574 * physical node. 575 */ 576 if (physnodes[i].end - end - 577 e820_hole_size(end, physnodes[i].end) < size) 578 end = physnodes[i].end; 579 580 /* 581 * Setup the fake node that will be allocated as bootmem 582 * later. If setup_node_range() returns non-zero, there 583 * is no more memory available on this physical node. 584 */ 585 if (setup_node_range(ret++, &physnodes[i].start, 586 end - physnodes[i].start, 587 physnodes[i].end) < 0) 588 node_clear(i, physnode_mask); 589 } 590 } 591 return ret; 592 } 593 594 /* 595 * Sets up the system RAM area from start_pfn to last_pfn according to the 596 * numa=fake command-line option. 597 */ 598 static int __init numa_emulation(unsigned long start_pfn, 599 unsigned long last_pfn, int acpi, int k8) 600 { 601 u64 addr = start_pfn << PAGE_SHIFT; 602 u64 max_addr = last_pfn << PAGE_SHIFT; 603 int num_phys_nodes; 604 int num_nodes; 605 int i; 606 607 num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8); 608 /* 609 * If the numa=fake command-line contains a 'M' or 'G', it represents 610 * the fixed node size. Otherwise, if it is just a single number N, 611 * split the system RAM into N fake nodes. 612 */ 613 if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) { 614 u64 size; 615 616 size = memparse(cmdline, &cmdline); 617 num_nodes = split_nodes_size_interleave(addr, max_addr, size); 618 } else { 619 unsigned long n; 620 621 n = simple_strtoul(cmdline, NULL, 0); 622 num_nodes = split_nodes_interleave(addr, max_addr, num_phys_nodes, n); 623 } 624 625 if (num_nodes < 0) 626 return num_nodes; 627 memnode_shift = compute_hash_shift(nodes, num_nodes, NULL); 628 if (memnode_shift < 0) { 629 memnode_shift = 0; 630 printk(KERN_ERR "No NUMA hash function found. NUMA emulation " 631 "disabled.\n"); 632 return -1; 633 } 634 635 /* 636 * We need to vacate all active ranges that may have been registered for 637 * the e820 memory map. 638 */ 639 remove_all_active_ranges(); 640 for_each_node_mask(i, node_possible_map) { 641 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, 642 nodes[i].end >> PAGE_SHIFT); 643 setup_node_bootmem(i, nodes[i].start, nodes[i].end); 644 } 645 acpi_fake_nodes(nodes, num_nodes); 646 numa_init_array(); 647 return 0; 648 } 649 #endif /* CONFIG_NUMA_EMU */ 650 651 void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, 652 int acpi, int k8) 653 { 654 int i; 655 656 nodes_clear(node_possible_map); 657 nodes_clear(node_online_map); 658 659 #ifdef CONFIG_NUMA_EMU 660 if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8)) 661 return; 662 nodes_clear(node_possible_map); 663 nodes_clear(node_online_map); 664 #endif 665 666 #ifdef CONFIG_ACPI_NUMA 667 if (!numa_off && acpi && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, 668 last_pfn << PAGE_SHIFT)) 669 return; 670 nodes_clear(node_possible_map); 671 nodes_clear(node_online_map); 672 #endif 673 674 #ifdef CONFIG_K8_NUMA 675 if (!numa_off && k8 && !k8_scan_nodes()) 676 return; 677 nodes_clear(node_possible_map); 678 nodes_clear(node_online_map); 679 #endif 680 printk(KERN_INFO "%s\n", 681 numa_off ? "NUMA turned off" : "No NUMA configuration found"); 682 683 printk(KERN_INFO "Faking a node at %016lx-%016lx\n", 684 start_pfn << PAGE_SHIFT, 685 last_pfn << PAGE_SHIFT); 686 /* setup dummy node covering all memory */ 687 memnode_shift = 63; 688 memnodemap = memnode.embedded_map; 689 memnodemap[0] = 0; 690 node_set_online(0); 691 node_set(0, node_possible_map); 692 for (i = 0; i < nr_cpu_ids; i++) 693 numa_set_node(i, 0); 694 e820_register_active_regions(0, start_pfn, last_pfn); 695 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT); 696 } 697 698 unsigned long __init numa_free_all_bootmem(void) 699 { 700 unsigned long pages = 0; 701 int i; 702 703 for_each_online_node(i) 704 pages += free_all_bootmem_node(NODE_DATA(i)); 705 706 #ifdef CONFIG_NO_BOOTMEM 707 pages += free_all_memory_core_early(MAX_NUMNODES); 708 #endif 709 710 return pages; 711 } 712 713 static __init int numa_setup(char *opt) 714 { 715 if (!opt) 716 return -EINVAL; 717 if (!strncmp(opt, "off", 3)) 718 numa_off = 1; 719 #ifdef CONFIG_NUMA_EMU 720 if (!strncmp(opt, "fake=", 5)) 721 cmdline = opt + 5; 722 #endif 723 #ifdef CONFIG_ACPI_NUMA 724 if (!strncmp(opt, "noacpi", 6)) 725 acpi_numa = -1; 726 #endif 727 return 0; 728 } 729 early_param("numa", numa_setup); 730 731 #ifdef CONFIG_NUMA 732 733 static __init int find_near_online_node(int node) 734 { 735 int n, val; 736 int min_val = INT_MAX; 737 int best_node = -1; 738 739 for_each_online_node(n) { 740 val = node_distance(node, n); 741 742 if (val < min_val) { 743 min_val = val; 744 best_node = n; 745 } 746 } 747 748 return best_node; 749 } 750 751 /* 752 * Setup early cpu_to_node. 753 * 754 * Populate cpu_to_node[] only if x86_cpu_to_apicid[], 755 * and apicid_to_node[] tables have valid entries for a CPU. 756 * This means we skip cpu_to_node[] initialisation for NUMA 757 * emulation and faking node case (when running a kernel compiled 758 * for NUMA on a non NUMA box), which is OK as cpu_to_node[] 759 * is already initialized in a round robin manner at numa_init_array, 760 * prior to this call, and this initialization is good enough 761 * for the fake NUMA cases. 762 * 763 * Called before the per_cpu areas are setup. 764 */ 765 void __init init_cpu_to_node(void) 766 { 767 int cpu; 768 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); 769 770 BUG_ON(cpu_to_apicid == NULL); 771 772 for_each_possible_cpu(cpu) { 773 int node; 774 u16 apicid = cpu_to_apicid[cpu]; 775 776 if (apicid == BAD_APICID) 777 continue; 778 node = apicid_to_node[apicid]; 779 if (node == NUMA_NO_NODE) 780 continue; 781 if (!node_online(node)) 782 node = find_near_online_node(node); 783 numa_set_node(cpu, node); 784 } 785 } 786 #endif 787 788 789 void __cpuinit numa_set_node(int cpu, int node) 790 { 791 int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); 792 793 /* early setting, no percpu area yet */ 794 if (cpu_to_node_map) { 795 cpu_to_node_map[cpu] = node; 796 return; 797 } 798 799 #ifdef CONFIG_DEBUG_PER_CPU_MAPS 800 if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) { 801 printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu); 802 dump_stack(); 803 return; 804 } 805 #endif 806 per_cpu(x86_cpu_to_node_map, cpu) = node; 807 808 if (node != NUMA_NO_NODE) 809 set_cpu_numa_node(cpu, node); 810 } 811 812 void __cpuinit numa_clear_node(int cpu) 813 { 814 numa_set_node(cpu, NUMA_NO_NODE); 815 } 816 817 #ifndef CONFIG_DEBUG_PER_CPU_MAPS 818 819 void __cpuinit numa_add_cpu(int cpu) 820 { 821 cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); 822 } 823 824 void __cpuinit numa_remove_cpu(int cpu) 825 { 826 cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); 827 } 828 829 #else /* CONFIG_DEBUG_PER_CPU_MAPS */ 830 831 /* 832 * --------- debug versions of the numa functions --------- 833 */ 834 static void __cpuinit numa_set_cpumask(int cpu, int enable) 835 { 836 int node = early_cpu_to_node(cpu); 837 struct cpumask *mask; 838 char buf[64]; 839 840 mask = node_to_cpumask_map[node]; 841 if (mask == NULL) { 842 printk(KERN_ERR "node_to_cpumask_map[%i] NULL\n", node); 843 dump_stack(); 844 return; 845 } 846 847 if (enable) 848 cpumask_set_cpu(cpu, mask); 849 else 850 cpumask_clear_cpu(cpu, mask); 851 852 cpulist_scnprintf(buf, sizeof(buf), mask); 853 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", 854 enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf); 855 } 856 857 void __cpuinit numa_add_cpu(int cpu) 858 { 859 numa_set_cpumask(cpu, 1); 860 } 861 862 void __cpuinit numa_remove_cpu(int cpu) 863 { 864 numa_set_cpumask(cpu, 0); 865 } 866 867 int __cpu_to_node(int cpu) 868 { 869 if (early_per_cpu_ptr(x86_cpu_to_node_map)) { 870 printk(KERN_WARNING 871 "cpu_to_node(%d): usage too early!\n", cpu); 872 dump_stack(); 873 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; 874 } 875 return per_cpu(x86_cpu_to_node_map, cpu); 876 } 877 EXPORT_SYMBOL(__cpu_to_node); 878 879 /* 880 * Same function as cpu_to_node() but used if called before the 881 * per_cpu areas are setup. 882 */ 883 int early_cpu_to_node(int cpu) 884 { 885 if (early_per_cpu_ptr(x86_cpu_to_node_map)) 886 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; 887 888 if (!cpu_possible(cpu)) { 889 printk(KERN_WARNING 890 "early_cpu_to_node(%d): no per_cpu area!\n", cpu); 891 dump_stack(); 892 return NUMA_NO_NODE; 893 } 894 return per_cpu(x86_cpu_to_node_map, cpu); 895 } 896 897 /* 898 * --------- end of debug versions of the numa functions --------- 899 */ 900 901 #endif /* CONFIG_DEBUG_PER_CPU_MAPS */ 902