/*
 * Generic VM initialization for x86-64 NUMA setups.
 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/module.h>
#include <linux/nodemask.h>
#include <linux/sched.h>

#include <asm/e820.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/numa.h>
#include <asm/acpi.h>
#include <asm/k8.h>

#ifdef CONFIG_DEBUG_PER_CPU_MAPS
# define DBG(x...) printk(KERN_DEBUG x)
#else
# define DBG(x...)
#endif

struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);

struct memnode memnode;

s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
        [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};

int numa_off __initdata;
static unsigned long __initdata nodemap_addr;
static unsigned long __initdata nodemap_size;

DEFINE_PER_CPU(int, node_number) = 0;
EXPORT_PER_CPU_SYMBOL(node_number);

/*
 * Map cpu index to node index
 */
DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);

/*
 * Which logical CPUs are on which nodes
 */
cpumask_t *node_to_cpumask_map;
EXPORT_SYMBOL(node_to_cpumask_map);

/*
 * Given a shift value, try to populate memnodemap[]
 * Returns:
 *  1 if OK
 *  0 if memnodemap[] too small (or shift too small)
 * -1 if node overlap or lost ram (shift too big)
 */
static int __init populate_memnodemap(const struct bootnode *nodes,
                                      int numnodes, int shift, int *nodeids)
{
        unsigned long addr, end;
        int i, res = -1;

        memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize);
        for (i = 0; i < numnodes; i++) {
                addr = nodes[i].start;
                end = nodes[i].end;
                if (addr >= end)
                        continue;
                if ((end >> shift) >= memnodemapsize)
                        return 0;
                do {
                        if (memnodemap[addr >> shift] != NUMA_NO_NODE)
                                return -1;

                        if (!nodeids)
                                memnodemap[addr >> shift] = i;
                        else
                                memnodemap[addr >> shift] = nodeids[i];

                        addr += (1UL << shift);
                } while (addr < end);
                res = 1;
        }
        return res;
}

static int __init allocate_cachealigned_memnodemap(void)
{
        unsigned long addr;

        memnodemap = memnode.embedded_map;
        if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map))
                return 0;

        addr = 0x8000;
        nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
        nodemap_addr = find_e820_area(addr, max_pfn<<PAGE_SHIFT,
                                      nodemap_size, L1_CACHE_BYTES);
        if (nodemap_addr == -1UL) {
                printk(KERN_ERR
                       "NUMA: Unable to allocate Memory to Node hash map\n");
                nodemap_addr = nodemap_size = 0;
                return -1;
        }
        memnodemap = phys_to_virt(nodemap_addr);
        reserve_early(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP");

        printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
               nodemap_addr, nodemap_addr + nodemap_size);
        return 0;
}

/*
 * The LSB of all start and end addresses in the node map is the value of the
 * maximum possible shift.
 */
static int __init extract_lsb_from_nodes(const struct bootnode *nodes,
                                         int numnodes)
{
        int i, nodes_used = 0;
        unsigned long start, end;
        unsigned long bitfield = 0, memtop = 0;

        for (i = 0; i < numnodes; i++) {
                start = nodes[i].start;
                end = nodes[i].end;
                if (start >= end)
                        continue;
                bitfield |= start;
                nodes_used++;
                if (end > memtop)
                        memtop = end;
        }
        if (nodes_used <= 1)
                i = 63;
        else
                i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
        memnodemapsize = (memtop >> i)+1;
        return i;
}

int __init compute_hash_shift(struct bootnode *nodes, int numnodes,
                              int *nodeids)
{
        int shift;

        shift = extract_lsb_from_nodes(nodes, numnodes);
        if (allocate_cachealigned_memnodemap())
                return -1;
        printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
               shift);

        if (populate_memnodemap(nodes, numnodes, shift, nodeids) != 1) {
                printk(KERN_INFO "Your memory is not aligned you need to "
                       "rebuild your kernel with a bigger NODEMAPSIZE "
                       "shift=%d\n", shift);
                return -1;
        }
        return shift;
}

int __meminit __early_pfn_to_nid(unsigned long pfn)
{
        return phys_to_nid(pfn << PAGE_SHIFT);
}

static void * __init early_node_mem(int nodeid, unsigned long start,
                                    unsigned long end, unsigned long size,
                                    unsigned long align)
{
        unsigned long mem = find_e820_area(start, end, size, align);
        void *ptr;

        if (mem != -1L)
                return __va(mem);

        ptr = __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS));
        if (ptr == NULL) {
                printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
                       size, nodeid);
                return NULL;
        }
        return ptr;
}

/* Initialize bootmem allocator for a node */
void __init setup_node_bootmem(int nodeid, unsigned long start,
                               unsigned long end)
{
        unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size;
        unsigned long bootmap_start, nodedata_phys;
        void *bootmap;
        const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
        int nid;

        start = roundup(start, ZONE_ALIGN);

        printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid,
               start, end);

        start_pfn = start >> PAGE_SHIFT;
        last_pfn = end >> PAGE_SHIFT;

        node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
                                           SMP_CACHE_BYTES);
        if (node_data[nodeid] == NULL)
                return;
        nodedata_phys = __pa(node_data[nodeid]);
        printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
               nodedata_phys + pgdat_size - 1);

        memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
        NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
        NODE_DATA(nodeid)->node_start_pfn = start_pfn;
        NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;

        /*
         * Find a place for the bootmem map.
         * nodedata_phys could be on another node (allocated via
         * alloc_bootmem), so make sure bootmap_start is not too low;
         * otherwise early_node_mem would satisfy it with find_e820_area
         * instead of alloc_bootmem, which could clash with the reserved
         * range.
         */
        bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
        nid = phys_to_nid(nodedata_phys);
        if (nid == nodeid)
                bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
        else
                bootmap_start = roundup(start, PAGE_SIZE);
        /*
         * SMP_CACHE_BYTES could be enough, but init_bootmem_node likes
         * the bootmap to be PAGE_SIZE aligned.
         */
        bootmap = early_node_mem(nodeid, bootmap_start, end,
                                 bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
        if (bootmap == NULL) {
                if (nodedata_phys < start || nodedata_phys >= end)
                        free_bootmem(nodedata_phys, pgdat_size);
                node_data[nodeid] = NULL;
                return;
        }
        bootmap_start = __pa(bootmap);

        bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
                                         bootmap_start >> PAGE_SHIFT,
                                         start_pfn, last_pfn);

        printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n",
               bootmap_start, bootmap_start + bootmap_size - 1,
               bootmap_pages);

        free_bootmem_with_active_regions(nodeid, end);

        /*
         * Convert early reservations to bootmem reservations now;
         * otherwise early_node_mem could hand out memory that is
         * early-reserved on a previous node.
         */
        early_res_to_bootmem(start, end);

        /*
         * In some cases early_node_mem falls back to alloc_bootmem and
         * gets a range on another node; don't reserve that range again here.
         */
        if (nid != nodeid)
                printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid);
        else
                reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys,
                                     pgdat_size, BOOTMEM_DEFAULT);
        nid = phys_to_nid(bootmap_start);
        if (nid != nodeid)
                printk(KERN_INFO " bootmap(%d) on node %d\n", nodeid, nid);
        else
                reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start,
                                     bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT);

#ifdef CONFIG_ACPI_NUMA
        srat_reserve_add_area(nodeid);
#endif
        node_set_online(nodeid);
}

/*
 * There are unfortunately some poorly designed mainboards around that
 * only connect memory to a single CPU. This breaks the 1:1 cpu->node
 * mapping. To avoid this fill in the mapping for all possible CPUs,
 * as the number of CPUs is not known yet. We round robin the existing
 * nodes.
 */
void __init numa_init_array(void)
{
        int rr, i;

        rr = first_node(node_online_map);
        for (i = 0; i < nr_cpu_ids; i++) {
                if (early_cpu_to_node(i) != NUMA_NO_NODE)
                        continue;
                numa_set_node(i, rr);
                rr = next_node(rr, node_online_map);
                if (rr == MAX_NUMNODES)
                        rr = first_node(node_online_map);
        }
}

#ifdef CONFIG_NUMA_EMU
/* Numa emulation */
static char *cmdline __initdata;

/*
 * Sets up nid to range from addr to addr + size.  If the end
 * boundary is greater than max_addr, then max_addr is used instead.
 * The return value is 0 if there is additional memory left for
 * allocation past addr and -1 otherwise.  addr is adjusted to be at
 * the end of the node.
 */
static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
                                   u64 size, u64 max_addr)
{
        int ret = 0;

        nodes[nid].start = *addr;
        *addr += size;
        if (*addr >= max_addr) {
                *addr = max_addr;
                ret = -1;
        }
        nodes[nid].end = *addr;
        node_set(nid, node_possible_map);
        printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
               nodes[nid].start, nodes[nid].end,
               (nodes[nid].end - nodes[nid].start) >> 20);
        return ret;
}

/*
 * Splits num_nodes nodes up equally starting at node_start. The return value
 * is the number of nodes split up and addr is adjusted to be at the end of the
 * last node allocated.
 */
static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
                                      u64 max_addr, int node_start,
                                      int num_nodes)
{
        unsigned int big;
        u64 size;
        int i;

        if (num_nodes <= 0)
                return -1;
        if (num_nodes > MAX_NUMNODES)
                num_nodes = MAX_NUMNODES;
        size = (max_addr - *addr - e820_hole_size(*addr, max_addr)) /
               num_nodes;
        /*
         * Calculate the number of big nodes that can be allocated as a result
         * of consolidating the leftovers.
         */
        big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) /
              FAKE_NODE_MIN_SIZE;

        /* Round down to nearest FAKE_NODE_MIN_SIZE. */
        size &= FAKE_NODE_MIN_HASH_MASK;
        if (!size) {
                printk(KERN_ERR "Not enough memory for each node. "
                       "NUMA emulation disabled.\n");
                return -1;
        }

        for (i = node_start; i < num_nodes + node_start; i++) {
                u64 end = *addr + size;

                if (i < big)
                        end += FAKE_NODE_MIN_SIZE;
                /*
                 * The final node can have the remaining system RAM.  Other
                 * nodes receive roughly the same amount of available pages.
                 */
                if (i == num_nodes + node_start - 1)
                        end = max_addr;
                else
                        while (end - *addr - e820_hole_size(*addr, end) <
                               size) {
                                end += FAKE_NODE_MIN_SIZE;
                                if (end > max_addr) {
                                        end = max_addr;
                                        break;
                                }
                        }
                if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0)
                        break;
        }
        return i - node_start + 1;
}

/*
 * Splits the remaining system RAM into chunks of size.  The remaining memory
 * is always assigned to a final node and can be asymmetric.  Returns the
 * number of nodes split.
 */
static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
                                      u64 max_addr, int node_start, u64 size)
{
        int i = node_start;
        size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;
        while (!setup_node_range(i++, nodes, addr, size, max_addr))
                ;
        return i - node_start;
}

/*
 * Sets up the system RAM area from start_pfn to last_pfn according to the
 * numa=fake command-line option.
 */
static struct bootnode nodes[MAX_NUMNODES] __initdata;

static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn)
{
        u64 size, addr = start_pfn << PAGE_SHIFT;
        u64 max_addr = last_pfn << PAGE_SHIFT;
        int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;

        memset(&nodes, 0, sizeof(nodes));
        /*
         * If the numa=fake command-line is just a single number N, split the
         * system RAM into N fake nodes.
         */
        if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
                long n = simple_strtol(cmdline, NULL, 0);

                num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n);
                if (num_nodes < 0)
                        return num_nodes;
                goto out;
        }

        /* Parse the command line. */
        for (coeff_flag = 0; ; cmdline++) {
                if (*cmdline && isdigit(*cmdline)) {
                        num = num * 10 + *cmdline - '0';
                        continue;
                }
                if (*cmdline == '*') {
                        if (num > 0)
                                coeff = num;
                        coeff_flag = 1;
                }
                if (!*cmdline || *cmdline == ',') {
                        if (!coeff_flag)
                                coeff = 1;
                        /*
                         * Round down to the nearest FAKE_NODE_MIN_SIZE.
                         * Command-line coefficients are in megabytes.
                         */
                        size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
                        if (size)
                                for (i = 0; i < coeff; i++, num_nodes++)
                                        if (setup_node_range(num_nodes, nodes,
                                                &addr, size, max_addr) < 0)
                                                goto done;
                        if (!*cmdline)
                                break;
                        coeff_flag = 0;
                        coeff = -1;
                }
                num = 0;
        }
done:
        if (!num_nodes)
                return -1;
        /* Fill remainder of system RAM, if appropriate. */
        if (addr < max_addr) {
                if (coeff_flag && coeff < 0) {
                        /* Split remaining nodes into num-sized chunks */
                        num_nodes += split_nodes_by_size(nodes, &addr, max_addr,
                                                         num_nodes, num);
                        goto out;
                }
                switch (*(cmdline - 1)) {
                case '*':
                        /* Split remaining nodes into coeff chunks */
                        if (coeff <= 0)
                                break;
                        num_nodes += split_nodes_equally(nodes, &addr, max_addr,
                                                         num_nodes, coeff);
                        break;
                case ',':
                        /* Do not allocate remaining system RAM */
                        break;
                default:
                        /* Give one final node */
                        setup_node_range(num_nodes, nodes, &addr,
                                         max_addr - addr, max_addr);
                        num_nodes++;
                }
        }
out:
        memnode_shift = compute_hash_shift(nodes, num_nodes, NULL);
        if (memnode_shift < 0) {
                memnode_shift = 0;
                printk(KERN_ERR "No NUMA hash function found. NUMA emulation "
                       "disabled.\n");
                return -1;
        }

        /*
         * We need to vacate all active ranges that may have been registered by
         * SRAT and set acpi_numa to -1 so that srat_disabled() always returns
         * true.  NUMA emulation has succeeded so we will not scan ACPI nodes.
         */
        remove_all_active_ranges();
#ifdef CONFIG_ACPI_NUMA
        acpi_numa = -1;
#endif
        for_each_node_mask(i, node_possible_map) {
                e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
                                             nodes[i].end >> PAGE_SHIFT);
                setup_node_bootmem(i, nodes[i].start, nodes[i].end);
        }
        acpi_fake_nodes(nodes, num_nodes);
        numa_init_array();
        return 0;
}
#endif /* CONFIG_NUMA_EMU */

void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
{
        int i;

        nodes_clear(node_possible_map);
        nodes_clear(node_online_map);

#ifdef CONFIG_NUMA_EMU
        if (cmdline && !numa_emulation(start_pfn, last_pfn))
                return;
        nodes_clear(node_possible_map);
        nodes_clear(node_online_map);
#endif

#ifdef CONFIG_ACPI_NUMA
        if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
                                          last_pfn << PAGE_SHIFT))
                return;
        nodes_clear(node_possible_map);
        nodes_clear(node_online_map);
#endif

#ifdef CONFIG_K8_NUMA
        if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT,
                                        last_pfn<<PAGE_SHIFT))
                return;
        nodes_clear(node_possible_map);
        nodes_clear(node_online_map);
#endif
        printk(KERN_INFO "%s\n",
               numa_off ? "NUMA turned off" : "No NUMA configuration found");

        printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
               start_pfn << PAGE_SHIFT,
               last_pfn << PAGE_SHIFT);
        /* setup dummy node covering all memory */
        memnode_shift = 63;
        memnodemap = memnode.embedded_map;
        memnodemap[0] = 0;
        node_set_online(0);
        node_set(0, node_possible_map);
        for (i = 0; i < nr_cpu_ids; i++)
                numa_set_node(i, 0);
        e820_register_active_regions(0, start_pfn, last_pfn);
        setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT);
}

unsigned long __init numa_free_all_bootmem(void)
{
        unsigned long pages = 0;
        int i;

        for_each_online_node(i)
                pages += free_all_bootmem_node(NODE_DATA(i));

        return pages;
}

void __init paging_init(void)
{
        unsigned long max_zone_pfns[MAX_NR_ZONES];

        memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
        max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
        max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
        max_zone_pfns[ZONE_NORMAL] = max_pfn;

        sparse_memory_present_with_active_regions(MAX_NUMNODES);
        sparse_init();

        free_area_init_nodes(max_zone_pfns);
}

static __init int numa_setup(char *opt)
{
        if (!opt)
                return -EINVAL;
        if (!strncmp(opt, "off", 3))
                numa_off = 1;
#ifdef CONFIG_NUMA_EMU
        if (!strncmp(opt, "fake=", 5))
                cmdline = opt + 5;
#endif
#ifdef CONFIG_ACPI_NUMA
        if (!strncmp(opt, "noacpi", 6))
                acpi_numa = -1;
        if (!strncmp(opt, "hotadd=", 7))
                hotadd_percent = simple_strtoul(opt+7, NULL, 10);
#endif
        return 0;
}
early_param("numa", numa_setup);

#ifdef CONFIG_NUMA
/*
 * Setup early cpu_to_node.
 *
 * Populate cpu_to_node[] only if the x86_cpu_to_apicid[] and
 * apicid_to_node[] tables have valid entries for a CPU.
 * This means we skip cpu_to_node[] initialisation for NUMA
 * emulation and the fake-node case (running a kernel compiled
 * for NUMA on a non-NUMA box), which is OK: cpu_to_node[] has
 * already been initialized in a round-robin manner by
 * numa_init_array() before this call, and that initialization
 * is good enough for the fake NUMA cases.
 *
 * Called before the per_cpu areas are set up.
 */
void __init init_cpu_to_node(void)
{
        int cpu;
        u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);

        BUG_ON(cpu_to_apicid == NULL);

        for_each_possible_cpu(cpu) {
                int node;
                u16 apicid = cpu_to_apicid[cpu];

                if (apicid == BAD_APICID)
                        continue;
                node = apicid_to_node[apicid];
                if (node == NUMA_NO_NODE)
                        continue;
                if (!node_online(node))
                        continue;
                numa_set_node(cpu, node);
        }
}
#endif


/*
 * Allocate node_to_cpumask_map based on number of available nodes.
 * Requires node_possible_map to be valid.
 *
 * Note: node_to_cpumask() is not valid until after this is done.
 * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
 */
void __init setup_node_to_cpumask_map(void)
{
        unsigned int node, num = 0;
        cpumask_t *map;

        /* setup nr_node_ids if not done yet */
        if (nr_node_ids == MAX_NUMNODES) {
                for_each_node_mask(node, node_possible_map)
                        num = node;
                nr_node_ids = num + 1;
        }

        /* allocate the map */
        map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
        DBG("node_to_cpumask_map at %p for %d nodes\n", map, nr_node_ids);

        pr_debug("Node to cpumask map at %p for %d nodes\n",
                 map, nr_node_ids);

        /* node_to_cpumask() will now work */
        node_to_cpumask_map = map;
}

void __cpuinit numa_set_node(int cpu, int node)
{
        int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);

        /* early setting, no percpu area yet */
        if (cpu_to_node_map) {
                cpu_to_node_map[cpu] = node;
                return;
        }

#ifdef CONFIG_DEBUG_PER_CPU_MAPS
        if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
                printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
                dump_stack();
                return;
        }
#endif
        per_cpu(x86_cpu_to_node_map, cpu) = node;

        if (node != NUMA_NO_NODE)
                per_cpu(node_number, cpu) = node;
}

void __cpuinit numa_clear_node(int cpu)
{
        numa_set_node(cpu, NUMA_NO_NODE);
}

#ifndef CONFIG_DEBUG_PER_CPU_MAPS

void __cpuinit numa_add_cpu(int cpu)
{
        cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}

void __cpuinit numa_remove_cpu(int cpu)
{
        cpu_clear(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}

#else /* CONFIG_DEBUG_PER_CPU_MAPS */

/*
 * --------- debug versions of the numa functions ---------
 */
static void __cpuinit numa_set_cpumask(int cpu, int enable)
{
        int node = early_cpu_to_node(cpu);
        cpumask_t *mask;
        char buf[64];

        if (node_to_cpumask_map == NULL) {
                printk(KERN_ERR "node_to_cpumask_map NULL\n");
                dump_stack();
                return;
        }

        mask = &node_to_cpumask_map[node];
        if (enable)
                cpu_set(cpu, *mask);
        else
                cpu_clear(cpu, *mask);

        cpulist_scnprintf(buf, sizeof(buf), mask);
        printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
               enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf);
}

void __cpuinit numa_add_cpu(int cpu)
{
        numa_set_cpumask(cpu, 1);
}

void __cpuinit numa_remove_cpu(int cpu)
{
        numa_set_cpumask(cpu, 0);
}

int cpu_to_node(int cpu)
{
        if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
                printk(KERN_WARNING
                       "cpu_to_node(%d): usage too early!\n", cpu);
                dump_stack();
                return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
        }
        return per_cpu(x86_cpu_to_node_map, cpu);
}
EXPORT_SYMBOL(cpu_to_node);

/*
 * Same function as cpu_to_node() but used if called before the
 * per_cpu areas are setup.
 */
int early_cpu_to_node(int cpu)
{
        if (early_per_cpu_ptr(x86_cpu_to_node_map))
                return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];

        if (!cpu_possible(cpu)) {
                printk(KERN_WARNING
                       "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
                dump_stack();
                return NUMA_NO_NODE;
        }
        return per_cpu(x86_cpu_to_node_map, cpu);
}


/* empty cpumask */
static const cpumask_t cpu_mask_none;

/*
 * Returns a pointer to the bitmask of CPUs on Node 'node'.
 */
const cpumask_t *cpumask_of_node(int node)
{
        if (node_to_cpumask_map == NULL) {
                printk(KERN_WARNING
                       "cpumask_of_node(%d): no node_to_cpumask_map!\n",
                       node);
                dump_stack();
                return (const cpumask_t *)&cpu_online_map;
        }
        if (node >= nr_node_ids) {
                printk(KERN_WARNING
                       "cpumask_of_node(%d): node > nr_node_ids(%d)\n",
                       node, nr_node_ids);
                dump_stack();
                return &cpu_mask_none;
        }
        return &node_to_cpumask_map[node];
}
EXPORT_SYMBOL(cpumask_of_node);

/*
 * Returns a bitmask of CPUs on Node 'node'.
 *
 * Side note: this function creates the returned cpumask on the stack
 * so with a high NR_CPUS count, excessive stack space is used.  The
 * node_to_cpumask_ptr function should be used whenever possible.
 */
cpumask_t node_to_cpumask(int node)
{
        if (node_to_cpumask_map == NULL) {
                printk(KERN_WARNING
                       "node_to_cpumask(%d): no node_to_cpumask_map!\n", node);
                dump_stack();
                return cpu_online_map;
        }
        if (node >= nr_node_ids) {
                printk(KERN_WARNING
                       "node_to_cpumask(%d): node > nr_node_ids(%d)\n",
                       node, nr_node_ids);
                dump_stack();
                return cpu_mask_none;
        }
        return node_to_cpumask_map[node];
}
EXPORT_SYMBOL(node_to_cpumask);

/*
 * --------- end of debug versions of the numa functions ---------
 */

#endif /* CONFIG_DEBUG_PER_CPU_MAPS */