/*
 * Generic VM initialization for x86-64 NUMA setups.
 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/module.h>
#include <linux/nodemask.h>
#include <linux/sched.h>

#include <asm/e820.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/numa.h>
#include <asm/acpi.h>
#include <asm/k8.h>

struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);

struct memnode memnode;

s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};

int numa_off __initdata;
static unsigned long __initdata nodemap_addr;
static unsigned long __initdata nodemap_size;

DEFINE_PER_CPU(int, node_number) = 0;
EXPORT_PER_CPU_SYMBOL(node_number);

/*
 * Map cpu index to node index
 */
DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);

/*
 * Given a shift value, try to populate memnodemap[]
 * Returns :
 * 1 if OK
 * 0 if memnodemap[] too small (or shift too small)
 * -1 if node overlap or lost ram (shift too big)
 */
static int __init populate_memnodemap(const struct bootnode *nodes,
				      int numnodes, int shift, int *nodeids)
{
	unsigned long addr, end;
	int i, res = -1;

	memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize);
	for (i = 0; i < numnodes; i++) {
		addr = nodes[i].start;
		end = nodes[i].end;
		if (addr >= end)
			continue;
		if ((end >> shift) >= memnodemapsize)
			return 0;
		do {
			if (memnodemap[addr >> shift] != NUMA_NO_NODE)
				return -1;

			if (!nodeids)
				memnodemap[addr >> shift] = i;
			else
				memnodemap[addr >> shift] = nodeids[i];

			addr += (1UL << shift);
		} while (addr < end);
		res = 1;
	}
	return res;
}

static int __init allocate_cachealigned_memnodemap(void)
{
	unsigned long addr;

	memnodemap = memnode.embedded_map;
	if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map))
		return 0;

	addr = 0x8000;
	nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
	nodemap_addr = find_e820_area(addr, max_pfn<<PAGE_SHIFT,
				      nodemap_size, L1_CACHE_BYTES);
	if (nodemap_addr == -1UL) {
		printk(KERN_ERR
		       "NUMA: Unable to allocate Memory to Node hash map\n");
		nodemap_addr = nodemap_size = 0;
		return -1;
	}
	memnodemap = phys_to_virt(nodemap_addr);
	reserve_early(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP");

	printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
	       nodemap_addr, nodemap_addr + nodemap_size);
	return 0;
}
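
/*
 * Illustrative example (hypothetical layout, not from real hardware): with
 * node 0 covering [0, 2GB) and node 1 covering [2GB, 4GB), the lowest set
 * bit across the node boundaries is bit 31, so the hash shift becomes 31
 * and memnodemapsize is (4GB >> 31) + 1 = 3.  populate_memnodemap() then
 * stores node 0 in memnodemap[0] and node 1 in memnodemap[1], and
 * phys_to_nid(addr) reduces to a memnodemap[addr >> 31] lookup.
 */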

/*
 * The LSB of all start and end addresses in the node map is the value of the
 * maximum possible shift.
 */
static int __init extract_lsb_from_nodes(const struct bootnode *nodes,
					 int numnodes)
{
	int i, nodes_used = 0;
	unsigned long start, end;
	unsigned long bitfield = 0, memtop = 0;

	for (i = 0; i < numnodes; i++) {
		start = nodes[i].start;
		end = nodes[i].end;
		if (start >= end)
			continue;
		bitfield |= start;
		nodes_used++;
		if (end > memtop)
			memtop = end;
	}
	if (nodes_used <= 1)
		i = 63;
	else
		i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
	memnodemapsize = (memtop >> i)+1;
	return i;
}

int __init compute_hash_shift(struct bootnode *nodes, int numnodes,
			      int *nodeids)
{
	int shift;

	shift = extract_lsb_from_nodes(nodes, numnodes);
	if (allocate_cachealigned_memnodemap())
		return -1;
	printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
	       shift);

	if (populate_memnodemap(nodes, numnodes, shift, nodeids) != 1) {
		printk(KERN_INFO "Your memory is not aligned you need to "
		       "rebuild your kernel with a bigger NODEMAPSIZE "
		       "shift=%d\n", shift);
		return -1;
	}
	return shift;
}

int __meminit __early_pfn_to_nid(unsigned long pfn)
{
	return phys_to_nid(pfn << PAGE_SHIFT);
}

static void * __init early_node_mem(int nodeid, unsigned long start,
				    unsigned long end, unsigned long size,
				    unsigned long align)
{
	unsigned long mem = find_e820_area(start, end, size, align);
	void *ptr;

	if (mem != -1L)
		return __va(mem);

	ptr = __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS));
	if (ptr == NULL) {
		printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
		       size, nodeid);
		return NULL;
	}
	return ptr;
}

/* Initialize bootmem allocator for a node */
void __init
setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
{
	unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size;
	const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
	unsigned long bootmap_start, nodedata_phys;
	void *bootmap;
	int nid;

	if (!end)
		return;

	/*
	 * Don't confuse VM with a node that doesn't have the
	 * minimum amount of memory:
	 */
	if (end && (end - start) < NODE_MIN_SIZE)
		return;

	start = roundup(start, ZONE_ALIGN);

	printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid,
	       start, end);

	start_pfn = start >> PAGE_SHIFT;
	last_pfn = end >> PAGE_SHIFT;

	node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
					   SMP_CACHE_BYTES);
	if (node_data[nodeid] == NULL)
		return;
	nodedata_phys = __pa(node_data[nodeid]);
	printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
	       nodedata_phys + pgdat_size - 1);

	memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
	NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
	NODE_DATA(nodeid)->node_start_pfn = start_pfn;
	NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
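
	/*
	 * Rough sizing example (illustrative numbers only): the bootmem
	 * bitmap uses one bit per 4KB page, so a node spanning 4GB
	 * (1M pages) needs a bitmap of about 128KB, for which
	 * bootmem_bootmap_pages() reports 32 pages.  That allocation is
	 * what the code below tries to place inside the node itself.
	 */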

	/*
	 * Find a place for the bootmem map.
	 * nodedata_phys could have been allocated on another node by
	 * alloc_bootmem, so make sure bootmap_start is not too low;
	 * otherwise early_node_mem will satisfy it with find_e820_area
	 * instead of alloc_bootmem, which could clash with the reserved
	 * range.
	 */
	bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
	nid = phys_to_nid(nodedata_phys);
	if (nid == nodeid)
		bootmap_start = roundup(nodedata_phys + pgdat_size,
					PAGE_SIZE);
	else
		bootmap_start = roundup(start, PAGE_SIZE);
	/*
	 * SMP_CACHE_BYTES could be enough, but init_bootmem_node likes
	 * the bootmap to be PAGE_SIZE aligned.
	 */
	bootmap = early_node_mem(nodeid, bootmap_start, end,
				 bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
	if (bootmap == NULL) {
		if (nodedata_phys < start || nodedata_phys >= end) {
			/*
			 * Only need to free it if it came from another
			 * node's bootmem.
			 */
			if (nid != nodeid)
				free_bootmem(nodedata_phys, pgdat_size);
		}
		node_data[nodeid] = NULL;
		return;
	}
	bootmap_start = __pa(bootmap);

	bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
					 bootmap_start >> PAGE_SHIFT,
					 start_pfn, last_pfn);

	printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n",
	       bootmap_start, bootmap_start + bootmap_size - 1,
	       bootmap_pages);

	free_bootmem_with_active_regions(nodeid, end);

	/*
	 * Convert the early reservations to bootmem reservations now,
	 * otherwise early_node_mem could end up using early-reserved
	 * memory on a previous node.
	 */
	early_res_to_bootmem(start, end);

	/*
	 * In some cases early_node_mem uses alloc_bootmem to get a range
	 * on another node; don't reserve that range again here.
	 */
	if (nid != nodeid)
		printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid);
	else
		reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys,
				     pgdat_size, BOOTMEM_DEFAULT);
	nid = phys_to_nid(bootmap_start);
	if (nid != nodeid)
		printk(KERN_INFO " bootmap(%d) on node %d\n", nodeid, nid);
	else
		reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start,
				     bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT);

	node_set_online(nodeid);
}

/*
 * There are unfortunately some poorly designed mainboards around that
 * only connect memory to a single CPU. This breaks the 1:1 cpu->node
 * mapping. To avoid this fill in the mapping for all possible CPUs,
 * as the number of CPUs is not known yet. We round robin the existing
 * nodes.
 */
void __init numa_init_array(void)
{
	int rr, i;

	rr = first_node(node_online_map);
	for (i = 0; i < nr_cpu_ids; i++) {
		if (early_cpu_to_node(i) != NUMA_NO_NODE)
			continue;
		numa_set_node(i, rr);
		rr = next_node(rr, node_online_map);
		if (rr == MAX_NUMNODES)
			rr = first_node(node_online_map);
	}
}

#ifdef CONFIG_NUMA_EMU
/* Numa emulation */
static struct bootnode nodes[MAX_NUMNODES] __initdata;
static struct bootnode physnodes[MAX_NUMNODES] __initdata;
static char *cmdline __initdata;
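
/*
 * "cmdline" holds the argument of the numa=fake= boot option.  Forms accepted
 * by the parser in numa_emulation() below (the examples are illustrative):
 *   numa=fake=8           eight fake nodes interleaved across the physical
 *                         nodes
 *   numa=fake=2*512,1024  two 512MB nodes, then one 1024MB node, with any
 *                         remaining RAM placed in one final node
 *   numa=fake=4*512,      a trailing ',' leaves the remaining RAM unassigned
 */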

static int __init setup_physnodes(unsigned long start, unsigned long end,
				  int acpi, int k8)
{
	int nr_nodes = 0;
	int ret = 0;
	int i;

#ifdef CONFIG_ACPI_NUMA
	if (acpi)
		nr_nodes = acpi_get_nodes(physnodes);
#endif
#ifdef CONFIG_K8_NUMA
	if (k8)
		nr_nodes = k8_get_nodes(physnodes);
#endif
	/*
	 * Basic sanity checking on the physical node map: there may be errors
	 * if the SRAT or K8 incorrectly reported the topology or the mem=
	 * kernel parameter is used.
	 */
	for (i = 0; i < nr_nodes; i++) {
		if (physnodes[i].start == physnodes[i].end)
			continue;
		if (physnodes[i].start > end) {
			physnodes[i].end = physnodes[i].start;
			continue;
		}
		if (physnodes[i].end < start) {
			physnodes[i].start = physnodes[i].end;
			continue;
		}
		if (physnodes[i].start < start)
			physnodes[i].start = start;
		if (physnodes[i].end > end)
			physnodes[i].end = end;
	}

	/*
	 * Remove all nodes that have no memory or were truncated because of the
	 * limited address range.
	 */
	for (i = 0; i < nr_nodes; i++) {
		if (physnodes[i].start == physnodes[i].end)
			continue;
		physnodes[ret].start = physnodes[i].start;
		physnodes[ret].end = physnodes[i].end;
		ret++;
	}

	/*
	 * If no physical topology was detected, a single node is faked to cover
	 * the entire address space.
	 */
	if (!ret) {
		physnodes[ret].start = start;
		physnodes[ret].end = end;
		ret = 1;
	}
	return ret;
}

/*
 * Sets up nid to the range from addr to addr + size.  If the end
 * boundary is greater than max_addr, then max_addr is used instead.
 * The return value is 0 if there is additional memory left for
 * allocation past addr and -1 otherwise.  addr is adjusted to be at
 * the end of the node.
 */
static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
{
	int ret = 0;
	nodes[nid].start = *addr;
	*addr += size;
	if (*addr >= max_addr) {
		*addr = max_addr;
		ret = -1;
	}
	nodes[nid].end = *addr;
	node_set(nid, node_possible_map);
	printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
	       nodes[nid].start, nodes[nid].end,
	       (nodes[nid].end - nodes[nid].start) >> 20);
	return ret;
}
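
/*
 * Illustrative behaviour of the interleaving below (hypothetical layout):
 * with two equally sized physical nodes and numa=fake=4, the first pass over
 * physnode_mask places fake node 0 on physical node 0 and fake node 1 on
 * physical node 1; the second pass places fake nodes 2 and 3 the same way,
 * so each physical node ends up backing two of the four fake nodes.
 */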

/*
 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
 * to max_addr.  The return value is the number of nodes allocated.
 */
static int __init split_nodes_interleave(u64 addr, u64 max_addr,
					 int nr_phys_nodes, int nr_nodes)
{
	nodemask_t physnode_mask = NODE_MASK_NONE;
	u64 size;
	int big;
	int ret = 0;
	int i;

	if (nr_nodes <= 0)
		return -1;
	if (nr_nodes > MAX_NUMNODES) {
		pr_info("numa=fake=%d too large, reducing to %d\n",
			nr_nodes, MAX_NUMNODES);
		nr_nodes = MAX_NUMNODES;
	}

	size = (max_addr - addr - e820_hole_size(addr, max_addr)) / nr_nodes;
	/*
	 * Calculate the number of big nodes that can be allocated as a result
	 * of consolidating the remainder.
	 */
	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
		FAKE_NODE_MIN_SIZE;

	size &= FAKE_NODE_MIN_HASH_MASK;
	if (!size) {
		pr_err("Not enough memory for each node. "
			"NUMA emulation disabled.\n");
		return -1;
	}

	for (i = 0; i < nr_phys_nodes; i++)
		if (physnodes[i].start != physnodes[i].end)
			node_set(i, physnode_mask);

	/*
	 * Continue to fill physical nodes with fake nodes until there is no
	 * memory left on any of them.
	 */
	while (nodes_weight(physnode_mask)) {
		for_each_node_mask(i, physnode_mask) {
			u64 end = physnodes[i].start + size;
			u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);

			if (ret < big)
				end += FAKE_NODE_MIN_SIZE;

			/*
			 * Continue to add memory to this fake node if its
			 * non-reserved memory is less than the per-node size.
			 */
			while (end - physnodes[i].start -
				e820_hole_size(physnodes[i].start, end) < size) {
				end += FAKE_NODE_MIN_SIZE;
				if (end > physnodes[i].end) {
					end = physnodes[i].end;
					break;
				}
			}

			/*
			 * If there won't be at least FAKE_NODE_MIN_SIZE of
			 * non-reserved memory in ZONE_DMA32 for the next node,
			 * this one must extend to the boundary.
			 */
			if (end < dma32_end && dma32_end - end -
			    e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
				end = dma32_end;

			/*
			 * If there won't be enough non-reserved memory for the
			 * next node, this one must extend to the end of the
			 * physical node.
			 */
			if (physnodes[i].end - end -
			    e820_hole_size(end, physnodes[i].end) < size)
				end = physnodes[i].end;

			/*
			 * Avoid allocating more nodes than requested, which can
			 * happen as a result of rounding down each node's size
			 * to FAKE_NODE_MIN_SIZE.
			 */
			if (nodes_weight(physnode_mask) + ret >= nr_nodes)
				end = physnodes[i].end;

			if (setup_node_range(ret++, &physnodes[i].start,
					     end - physnodes[i].start,
					     physnodes[i].end) < 0)
				node_clear(i, physnode_mask);
		}
	}
	return ret;
}
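
/*
 * Worked example for the equal split below (illustrative, and assuming
 * FAKE_NODE_MIN_SIZE is 64MB): splitting 4GB with no e820 holes into three
 * nodes gives a raw size of about 1365MB per node, which rounds down to
 * 1344MB; the leftovers are too small to form an extra "big" node, so nodes
 * 0 and 1 get 1344MB each and the final node absorbs the remaining 1408MB,
 * since the last node is always extended to max_addr.
 */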

/*
 * Splits num_nodes nodes up equally starting at node_start.  The return value
 * is the number of nodes split up and addr is adjusted to be at the end of the
 * last node allocated.
 */
static int __init split_nodes_equally(u64 *addr, u64 max_addr, int node_start,
				      int num_nodes)
{
	unsigned int big;
	u64 size;
	int i;

	if (num_nodes <= 0)
		return -1;
	if (num_nodes > MAX_NUMNODES)
		num_nodes = MAX_NUMNODES;
	size = (max_addr - *addr - e820_hole_size(*addr, max_addr)) /
		num_nodes;
	/*
	 * Calculate the number of big nodes that can be allocated as a result
	 * of consolidating the leftovers.
	 */
	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) /
		FAKE_NODE_MIN_SIZE;

	/* Round down to nearest FAKE_NODE_MIN_SIZE. */
	size &= FAKE_NODE_MIN_HASH_MASK;
	if (!size) {
		printk(KERN_ERR "Not enough memory for each node. "
			"NUMA emulation disabled.\n");
		return -1;
	}

	for (i = node_start; i < num_nodes + node_start; i++) {
		u64 end = *addr + size;

		if (i < big)
			end += FAKE_NODE_MIN_SIZE;
		/*
		 * The final node can have the remaining system RAM.  Other
		 * nodes receive roughly the same amount of available pages.
		 */
		if (i == num_nodes + node_start - 1)
			end = max_addr;
		else
			while (end - *addr - e820_hole_size(*addr, end) <
			       size) {
				end += FAKE_NODE_MIN_SIZE;
				if (end > max_addr) {
					end = max_addr;
					break;
				}
			}
		if (setup_node_range(i, addr, end - *addr, max_addr) < 0)
			break;
	}
	return i - node_start + 1;
}

/*
 * Splits the remaining system RAM into chunks of size.  The remaining memory is
 * always assigned to a final node and can be asymmetric.  Returns the number of
 * nodes split.
 */
static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start,
				      u64 size)
{
	int i = node_start;
	size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;
	while (!setup_node_range(i++, addr, size, max_addr))
		;
	return i - node_start;
}

/*
 * Sets up the system RAM area from start_pfn to last_pfn according to the
 * numa=fake command-line option.
 */
static int __init numa_emulation(unsigned long start_pfn,
				 unsigned long last_pfn, int acpi, int k8)
{
	u64 size, addr = start_pfn << PAGE_SHIFT;
	u64 max_addr = last_pfn << PAGE_SHIFT;
	int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
	int num_phys_nodes;

	num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8);
	/*
	 * If the numa=fake command-line is just a single number N, split the
	 * system RAM into N fake nodes.
	 */
	if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
		long n = simple_strtol(cmdline, NULL, 0);

		num_nodes = split_nodes_interleave(addr, max_addr,
						   num_phys_nodes, n);
		if (num_nodes < 0)
			return num_nodes;
		goto out;
	}
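
	/*
	 * What follows is the parser for the coefficient form of numa=fake.
	 * An illustrative trace (not an exhaustive specification): for
	 * "2*512,1024" it builds two 512MB nodes and then one 1024MB node;
	 * whatever RAM is left is handled after the loop depending on the
	 * last character parsed, so a trailing ',' leaves it unallocated,
	 * a trailing '*' splits it into coeff equal nodes, and anything
	 * else puts it all in one final node.
	 */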

	/* Parse the command line. */
	for (coeff_flag = 0; ; cmdline++) {
		if (*cmdline && isdigit(*cmdline)) {
			num = num * 10 + *cmdline - '0';
			continue;
		}
		if (*cmdline == '*') {
			if (num > 0)
				coeff = num;
			coeff_flag = 1;
		}
		if (!*cmdline || *cmdline == ',') {
			if (!coeff_flag)
				coeff = 1;
			/*
			 * Round down to the nearest FAKE_NODE_MIN_SIZE.
			 * Command-line coefficients are in megabytes.
			 */
			size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
			if (size)
				for (i = 0; i < coeff; i++, num_nodes++)
					if (setup_node_range(num_nodes, &addr,
						size, max_addr) < 0)
						goto done;
			if (!*cmdline)
				break;
			coeff_flag = 0;
			coeff = -1;
		}
		num = 0;
	}
done:
	if (!num_nodes)
		return -1;
	/* Fill remainder of system RAM, if appropriate. */
	if (addr < max_addr) {
		if (coeff_flag && coeff < 0) {
			/* Split remaining nodes into num-sized chunks */
			num_nodes += split_nodes_by_size(&addr, max_addr,
							 num_nodes, num);
			goto out;
		}
		switch (*(cmdline - 1)) {
		case '*':
			/* Split remaining nodes into coeff chunks */
			if (coeff <= 0)
				break;
			num_nodes += split_nodes_equally(&addr, max_addr,
							 num_nodes, coeff);
			break;
		case ',':
			/* Do not allocate remaining system RAM */
			break;
		default:
			/* Give one final node */
			setup_node_range(num_nodes, &addr, max_addr - addr,
					 max_addr);
			num_nodes++;
		}
	}
out:
	memnode_shift = compute_hash_shift(nodes, num_nodes, NULL);
	if (memnode_shift < 0) {
		memnode_shift = 0;
		printk(KERN_ERR "No NUMA hash function found. NUMA emulation "
		       "disabled.\n");
		return -1;
	}

	/*
	 * We need to vacate all active ranges that may have been registered for
	 * the e820 memory map.
	 */
	remove_all_active_ranges();
	for_each_node_mask(i, node_possible_map) {
		e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
						nodes[i].end >> PAGE_SHIFT);
		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
	}
	acpi_fake_nodes(nodes, num_nodes);
	numa_init_array();
	return 0;
}
#endif /* CONFIG_NUMA_EMU */

void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
			 int acpi, int k8)
{
	int i;

	nodes_clear(node_possible_map);
	nodes_clear(node_online_map);

#ifdef CONFIG_NUMA_EMU
	if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8))
		return;
	nodes_clear(node_possible_map);
	nodes_clear(node_online_map);
#endif

#ifdef CONFIG_ACPI_NUMA
	if (!numa_off && acpi && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
						  last_pfn << PAGE_SHIFT))
		return;
	nodes_clear(node_possible_map);
	nodes_clear(node_online_map);
#endif

#ifdef CONFIG_K8_NUMA
	if (!numa_off && k8 && !k8_scan_nodes())
		return;
	nodes_clear(node_possible_map);
	nodes_clear(node_online_map);
#endif
	printk(KERN_INFO "%s\n",
	       numa_off ? "NUMA turned off" : "No NUMA configuration found");

	printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
	       start_pfn << PAGE_SHIFT,
	       last_pfn << PAGE_SHIFT);
	/* setup dummy node covering all memory */
	memnode_shift = 63;
	memnodemap = memnode.embedded_map;
	memnodemap[0] = 0;
	node_set_online(0);
	node_set(0, node_possible_map);
	for (i = 0; i < nr_cpu_ids; i++)
		numa_set_node(i, 0);
	e820_register_active_regions(0, start_pfn, last_pfn);
	setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT);
}

unsigned long __init numa_free_all_bootmem(void)
{
	unsigned long pages = 0;
	int i;

	for_each_online_node(i)
		pages += free_all_bootmem_node(NODE_DATA(i));

	return pages;
}
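
/*
 * Summary of the "numa=" early parameter handled below (the list simply
 * mirrors the strncmp() checks and is descriptive only):
 *   numa=off       disable NUMA detection and fall back to a single node
 *   numa=fake=...  enable NUMA emulation (CONFIG_NUMA_EMU), e.g. numa=fake=4
 *   numa=noacpi    do not use the ACPI SRAT for NUMA setup (CONFIG_ACPI_NUMA)
 */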

static __init int numa_setup(char *opt)
{
	if (!opt)
		return -EINVAL;
	if (!strncmp(opt, "off", 3))
		numa_off = 1;
#ifdef CONFIG_NUMA_EMU
	if (!strncmp(opt, "fake=", 5))
		cmdline = opt + 5;
#endif
#ifdef CONFIG_ACPI_NUMA
	if (!strncmp(opt, "noacpi", 6))
		acpi_numa = -1;
#endif
	return 0;
}
early_param("numa", numa_setup);

#ifdef CONFIG_NUMA

static __init int find_near_online_node(int node)
{
	int n, val;
	int min_val = INT_MAX;
	int best_node = -1;

	for_each_online_node(n) {
		val = node_distance(node, n);

		if (val < min_val) {
			min_val = val;
			best_node = n;
		}
	}

	return best_node;
}

/*
 * Setup early cpu_to_node.
 *
 * Populate cpu_to_node[] only if the x86_cpu_to_apicid[] and
 * apicid_to_node[] tables have valid entries for a CPU.
 * This means we skip cpu_to_node[] initialisation for NUMA
 * emulation and faking node case (when running a kernel compiled
 * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
 * is already initialized in a round robin manner at numa_init_array,
 * prior to this call, and this initialization is good enough
 * for the fake NUMA cases.
 *
 * Called before the per_cpu areas are setup.
 */
void __init init_cpu_to_node(void)
{
	int cpu;
	u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);

	BUG_ON(cpu_to_apicid == NULL);

	for_each_possible_cpu(cpu) {
		int node;
		u16 apicid = cpu_to_apicid[cpu];

		if (apicid == BAD_APICID)
			continue;
		node = apicid_to_node[apicid];
		if (node == NUMA_NO_NODE)
			continue;
		if (!node_online(node))
			node = find_near_online_node(node);
		numa_set_node(cpu, node);
	}
}
#endif


void __cpuinit numa_set_node(int cpu, int node)
{
	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);

	/* early setting, no percpu area yet */
	if (cpu_to_node_map) {
		cpu_to_node_map[cpu] = node;
		return;
	}

#ifdef CONFIG_DEBUG_PER_CPU_MAPS
	if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
		printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
		dump_stack();
		return;
	}
#endif
	per_cpu(x86_cpu_to_node_map, cpu) = node;

	if (node != NUMA_NO_NODE)
		per_cpu(node_number, cpu) = node;
}

void __cpuinit numa_clear_node(int cpu)
{
	numa_set_node(cpu, NUMA_NO_NODE);
}

#ifndef CONFIG_DEBUG_PER_CPU_MAPS

void __cpuinit numa_add_cpu(int cpu)
{
	cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}

void __cpuinit numa_remove_cpu(int cpu)
{
	cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}

#else /* CONFIG_DEBUG_PER_CPU_MAPS */

/*
 * --------- debug versions of the numa functions ---------
 */
static void __cpuinit numa_set_cpumask(int cpu, int enable)
{
	int node = early_cpu_to_node(cpu);
	struct cpumask *mask;
	char buf[64];

	mask = node_to_cpumask_map[node];
	if (mask == NULL) {
		printk(KERN_ERR "node_to_cpumask_map[%i] NULL\n", node);
		dump_stack();
		return;
	}

	if (enable)
		cpumask_set_cpu(cpu, mask);
	else
		cpumask_clear_cpu(cpu, mask);

	cpulist_scnprintf(buf, sizeof(buf), mask);
	printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
	       enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf);
}

void __cpuinit numa_add_cpu(int cpu)
{
	numa_set_cpumask(cpu, 1);
}

void __cpuinit numa_remove_cpu(int cpu)
{
	numa_set_cpumask(cpu, 0);
}

int cpu_to_node(int cpu)
{
	if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
		printk(KERN_WARNING
		       "cpu_to_node(%d): usage too early!\n", cpu);
		dump_stack();
		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
	}
	return per_cpu(x86_cpu_to_node_map, cpu);
}
EXPORT_SYMBOL(cpu_to_node);

/*
 * Same function as cpu_to_node() but used if called before the
 * per_cpu areas are setup.
 */
int early_cpu_to_node(int cpu)
{
	if (early_per_cpu_ptr(x86_cpu_to_node_map))
		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];

	if (!cpu_possible(cpu)) {
		printk(KERN_WARNING
		       "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
		dump_stack();
		return NUMA_NO_NODE;
	}
	return per_cpu(x86_cpu_to_node_map, cpu);
}

/*
 * --------- end of debug versions of the numa functions ---------
 */

#endif /* CONFIG_DEBUG_PER_CPU_MAPS */