/*
 * Generic VM initialization for x86-64 NUMA setups.
 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/module.h>
#include <linux/nodemask.h>
#include <linux/sched.h>
#include <linux/acpi.h>

#include <asm/e820.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/acpi.h>
#include <asm/amd_nb.h>

#include "numa_internal.h"

struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);

nodemask_t numa_nodes_parsed __initdata;

struct memnode memnode;

static unsigned long __initdata nodemap_addr;
static unsigned long __initdata nodemap_size;

static struct numa_meminfo numa_meminfo __initdata;

static int numa_distance_cnt;
static u8 *numa_distance;

/*
 * Given a shift value, try to populate memnodemap[].
 * Returns:
 * 1 if OK
 * 0 if memnodemap[] is too small (or the shift is too small)
 * -1 if nodes overlap or RAM is lost (the shift is too big)
 */
static int __init populate_memnodemap(const struct numa_meminfo *mi, int shift)
{
	unsigned long addr, end;
	int i, res = -1;

	memset(memnodemap, 0xff, sizeof(s16) * memnodemapsize);
	for (i = 0; i < mi->nr_blks; i++) {
		addr = mi->blk[i].start;
		end = mi->blk[i].end;
		if (addr >= end)
			continue;
		if ((end >> shift) >= memnodemapsize)
			return 0;
		do {
			if (memnodemap[addr >> shift] != NUMA_NO_NODE)
				return -1;
			memnodemap[addr >> shift] = mi->blk[i].nid;
			addr += (1UL << shift);
		} while (addr < end);
		res = 1;
	}
	return res;
}

static int __init allocate_cachealigned_memnodemap(void)
{
	unsigned long addr;

	memnodemap = memnode.embedded_map;
	if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map))
		return 0;

	addr = 0x8000;
	nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
	nodemap_addr = memblock_find_in_range(addr, get_max_mapped(),
					      nodemap_size, L1_CACHE_BYTES);
	if (nodemap_addr == MEMBLOCK_ERROR) {
		printk(KERN_ERR
		       "NUMA: Unable to allocate Memory to Node hash map\n");
		nodemap_addr = nodemap_size = 0;
		return -1;
	}
	memnodemap = phys_to_virt(nodemap_addr);
	memblock_x86_reserve_range(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP");

	printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
	       nodemap_addr, nodemap_addr + nodemap_size);
	return 0;
}
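/*
 * Illustrative sketch (added commentary, not part of the original file):
 * once populate_memnodemap() succeeds, resolving a physical address to
 * its node is a single shift plus an array lookup, along the lines of
 * what phys_to_nid() does:
 *
 *	nid = memnodemap[addr >> memnode_shift];
 *
 * Example, assuming two nodes covering [0, 4G) and [4G, 8G) with
 * shift = 32: memnodemap[0] = 0 and memnodemap[1] = 1, so an address
 * like 0x123456000 (bit 32 set) indexes slot 1 and maps to node 1.
 */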
/*
 * The LSB of all start and end addresses in the node map is the value of the
 * maximum possible shift.
 */
static int __init extract_lsb_from_nodes(const struct numa_meminfo *mi)
{
	int i, nodes_used = 0;
	unsigned long start, end;
	unsigned long bitfield = 0, memtop = 0;

	for (i = 0; i < mi->nr_blks; i++) {
		start = mi->blk[i].start;
		end = mi->blk[i].end;
		if (start >= end)
			continue;
		bitfield |= start;
		nodes_used++;
		if (end > memtop)
			memtop = end;
	}
	if (nodes_used <= 1)
		i = 63;
	else
		i = find_first_bit(&bitfield, sizeof(unsigned long) * 8);
	memnodemapsize = (memtop >> i) + 1;
	return i;
}

static int __init compute_hash_shift(const struct numa_meminfo *mi)
{
	int shift;

	shift = extract_lsb_from_nodes(mi);
	if (allocate_cachealigned_memnodemap())
		return -1;
	printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", shift);

	if (populate_memnodemap(mi, shift) != 1) {
		printk(KERN_INFO "Your memory is not aligned; you need to "
		       "rebuild your kernel with a bigger NODEMAPSIZE, "
		       "shift=%d\n", shift);
		return -1;
	}
	return shift;
}

int __meminit __early_pfn_to_nid(unsigned long pfn)
{
	return phys_to_nid(pfn << PAGE_SHIFT);
}

static void * __init early_node_mem(int nodeid, unsigned long start,
				    unsigned long end, unsigned long size,
				    unsigned long align)
{
	unsigned long mem;

	/*
	 * Put it as high as possible, as other early allocations will
	 * go together with NODE_DATA.
	 */
	if (start < (MAX_DMA_PFN<<PAGE_SHIFT))
		start = MAX_DMA_PFN<<PAGE_SHIFT;
	if (start < (MAX_DMA32_PFN<<PAGE_SHIFT) &&
	    end > (MAX_DMA32_PFN<<PAGE_SHIFT))
		start = MAX_DMA32_PFN<<PAGE_SHIFT;
	mem = memblock_x86_find_in_range_node(nodeid, start, end, size, align);
	if (mem != MEMBLOCK_ERROR)
		return __va(mem);

	/* extend the search scope */
	end = max_pfn_mapped << PAGE_SHIFT;
	start = MAX_DMA_PFN << PAGE_SHIFT;
	mem = memblock_find_in_range(start, end, size, align);
	if (mem != MEMBLOCK_ERROR)
		return __va(mem);

	printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
	       size, nodeid);

	return NULL;
}

static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
				     struct numa_meminfo *mi)
{
	/* ignore zero length blks */
	if (start == end)
		return 0;

	/* whine about and ignore invalid blks */
	if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
		pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n",
			   nid, start, end);
		return 0;
	}

	if (mi->nr_blks >= NR_NODE_MEMBLKS) {
		pr_err("NUMA: too many memblk ranges\n");
		return -EINVAL;
	}

	mi->blk[mi->nr_blks].start = start;
	mi->blk[mi->nr_blks].end = end;
	mi->blk[mi->nr_blks].nid = nid;
	mi->nr_blks++;
	return 0;
}

/**
 * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
 * @idx: Index of memblk to remove
 * @mi: numa_meminfo to remove memblk from
 *
 * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
 * decrementing @mi->nr_blks.
 */
void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
{
	mi->nr_blks--;
	memmove(&mi->blk[idx], &mi->blk[idx + 1],
		(mi->nr_blks - idx) * sizeof(mi->blk[0]));
}
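/*
 * Illustrative usage (hypothetical ranges, not from the original file):
 * a firmware NUMA parser such as the ACPI SRAT or AMD northbridge code
 * feeds ranges into the default numa_meminfo via numa_add_memblk()
 * below, e.g. for two nodes of 4G each:
 *
 *	numa_add_memblk(0, 0x000000000ULL, 0x100000000ULL);
 *	numa_add_memblk(1, 0x100000000ULL, 0x200000000ULL);
 *
 * Zero-length and malformed blocks are whined about and dropped by
 * numa_add_memblk_to(); only exhausting NR_NODE_MEMBLKS is an error.
 */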
/**
 * numa_add_memblk - Add one numa_memblk to numa_meminfo
 * @nid: NUMA node ID of the new memblk
 * @start: Start address of the new memblk
 * @end: End address of the new memblk
 *
 * Add a new memblk to the default numa_meminfo.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_add_memblk(int nid, u64 start, u64 end)
{
	return numa_add_memblk_to(nid, start, end, &numa_meminfo);
}

/* Initialize bootmem allocator for a node */
void __init
setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
{
	unsigned long start_pfn, last_pfn, nodedata_phys;
	const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
	int nid;

	if (!end)
		return;

	/*
	 * Don't confuse the VM with a node that doesn't have the
	 * minimum amount of memory:
	 */
	if (end && (end - start) < NODE_MIN_SIZE)
		return;

	start = roundup(start, ZONE_ALIGN);

	printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", nodeid,
	       start, end);

	start_pfn = start >> PAGE_SHIFT;
	last_pfn = end >> PAGE_SHIFT;

	node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
					   SMP_CACHE_BYTES);
	if (node_data[nodeid] == NULL)
		return;
	nodedata_phys = __pa(node_data[nodeid]);
	memblock_x86_reserve_range(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA");
	printk(KERN_INFO "  NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
	       nodedata_phys + pgdat_size - 1);
	nid = phys_to_nid(nodedata_phys);
	if (nid != nodeid)
		printk(KERN_INFO "    NODE_DATA(%d) on node %d\n", nodeid, nid);

	memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
	NODE_DATA(nodeid)->node_id = nodeid;
	NODE_DATA(nodeid)->node_start_pfn = start_pfn;
	NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;

	node_set_online(nodeid);
}
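/*
 * Worked example for the cleanup pass below (hypothetical layout, not
 * from the original file): if node 0 reports [0, 2G) and [2G, 4G) while
 * node 1 reports [4G, 8G), the two node 0 blocks can be joined because
 * the combined span [0, 4G) intersects no other node's memory:
 *
 *	before: { {0, 2G, nid=0}, {2G, 4G, nid=0}, {4G, 8G, nid=1} }
 *	after:  { {0, 4G, nid=0}, {4G, 8G, nid=1} }
 */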
/**
 * numa_cleanup_meminfo - Cleanup a numa_meminfo
 * @mi: numa_meminfo to clean up
 *
 * Sanitize @mi by merging and removing unnecessary memblks. Also check
 * for conflicts and clear unused memblks.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
{
	const u64 low = 0;
	const u64 high = (u64)max_pfn << PAGE_SHIFT;
	int i, j, k;

	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		/* make sure all blocks are inside the limits */
		bi->start = max(bi->start, low);
		bi->end = min(bi->end, high);

		/* and there's no empty block */
		if (bi->start == bi->end) {
			numa_remove_memblk_from(i--, mi);
			continue;
		}

		for (j = i + 1; j < mi->nr_blks; j++) {
			struct numa_memblk *bj = &mi->blk[j];
			unsigned long start, end;

			/*
			 * See whether there are overlapping blocks. Whine
			 * about but allow overlaps of the same nid. They
			 * will be merged below.
			 */
			if (bi->end > bj->start && bi->start < bj->end) {
				if (bi->nid != bj->nid) {
					pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n",
					       bi->nid, bi->start, bi->end,
					       bj->nid, bj->start, bj->end);
					return -EINVAL;
				}
				pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n",
					   bi->nid, bi->start, bi->end,
					   bj->start, bj->end);
			}

			/*
			 * Join together blocks on the same node, holes
			 * between which don't overlap with memory on other
			 * nodes.
			 */
			if (bi->nid != bj->nid)
				continue;
			start = max(min(bi->start, bj->start), low);
			end = min(max(bi->end, bj->end), high);
			for (k = 0; k < mi->nr_blks; k++) {
				struct numa_memblk *bk = &mi->blk[k];

				if (bi->nid == bk->nid)
					continue;
				if (start < bk->end && end > bk->start)
					break;
			}
			if (k < mi->nr_blks)
				continue;
			printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n",
			       bi->nid, bi->start, bi->end, bj->start, bj->end,
			       start, end);
			bi->start = start;
			bi->end = end;
			numa_remove_memblk_from(j--, mi);
		}
	}

	for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
		mi->blk[i].start = mi->blk[i].end = 0;
		mi->blk[i].nid = NUMA_NO_NODE;
	}

	return 0;
}

/*
 * Set the nodes which have memory in @mi in *@nodemask.
 */
static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
					      const struct numa_meminfo *mi)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
		if (mi->blk[i].start != mi->blk[i].end &&
		    mi->blk[i].nid != NUMA_NO_NODE)
			node_set(mi->blk[i].nid, *nodemask);
}

/**
 * numa_reset_distance - Reset NUMA distance table
 *
 * The current table is freed. The next numa_set_distance() call will
 * create a new one.
 */
void __init numa_reset_distance(void)
{
	size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);

	/* numa_distance could be 1LU marking allocation failure, test cnt */
	if (numa_distance_cnt)
		memblock_x86_free_range(__pa(numa_distance),
					__pa(numa_distance) + size);
	numa_distance_cnt = 0;
	numa_distance = NULL;	/* enable table creation */
}

static int __init numa_alloc_distance(void)
{
	nodemask_t nodes_parsed;
	size_t size;
	int i, j, cnt = 0;
	u64 phys;

	/* size the new table and allocate it */
	nodes_parsed = numa_nodes_parsed;
	numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);

	for_each_node_mask(i, nodes_parsed)
		cnt = i;
	cnt++;
	size = cnt * cnt * sizeof(numa_distance[0]);

	phys = memblock_find_in_range(0, (u64)max_pfn_mapped << PAGE_SHIFT,
				      size, PAGE_SIZE);
	if (phys == MEMBLOCK_ERROR) {
		pr_warning("NUMA: Warning: can't allocate distance table!\n");
		/* don't retry until explicitly reset */
		numa_distance = (void *)1LU;
		return -ENOMEM;
	}
	memblock_x86_reserve_range(phys, phys + size, "NUMA DIST");

	numa_distance = __va(phys);
	numa_distance_cnt = cnt;

	/* fill with the default distances */
	for (i = 0; i < cnt; i++)
		for (j = 0; j < cnt; j++)
			numa_distance[i * cnt + j] = i == j ?
				LOCAL_DISTANCE : REMOTE_DISTANCE;
	printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);

	return 0;
}
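/*
 * Illustrative layout (hypothetical distances, not from the original
 * file): numa_distance is a flat cnt x cnt byte matrix indexed as
 * numa_distance[from * cnt + to]. With cnt = 2, the default fill plus
 * one override via numa_set_distance(0, 1, 21) below gives:
 *
 *	{ LOCAL_DISTANCE, 21,
 *	  REMOTE_DISTANCE, LOCAL_DISTANCE }
 *
 * SLIT tables are usually symmetric, so a parser would normally also
 * call numa_set_distance(1, 0, 21).
 */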
/**
 * numa_set_distance - Set the NUMA distance from one node to another
 * @from: the 'from' node to set distance
 * @to: the 'to' node to set distance
 * @distance: NUMA distance
 *
 * Set the distance from node @from to @to to @distance. If the distance
 * table doesn't exist, one which is large enough to accommodate all the
 * currently known nodes will be created.
 *
 * If such a table cannot be allocated, a warning is printed and further
 * calls are ignored until the distance table is reset with
 * numa_reset_distance().
 *
 * If @from or @to is higher than the highest known node at the time of
 * table creation or @distance doesn't make sense, the call is ignored.
 * This is to allow simplification of specific NUMA config implementations.
 */
void __init numa_set_distance(int from, int to, int distance)
{
	if (!numa_distance && numa_alloc_distance() < 0)
		return;

	if (from >= numa_distance_cnt || to >= numa_distance_cnt) {
		printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n",
			    from, to, distance);
		return;
	}

	if ((u8)distance != distance ||
	    (from == to && distance != LOCAL_DISTANCE)) {
		pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
			     from, to, distance);
		return;
	}

	numa_distance[from * numa_distance_cnt + to] = distance;
}

int __node_distance(int from, int to)
{
	if (from >= numa_distance_cnt || to >= numa_distance_cnt)
		return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
	return numa_distance[from * numa_distance_cnt + to];
}
EXPORT_SYMBOL(__node_distance);

/*
 * Sanity check to catch more bad NUMA configurations (they are amazingly
 * common). Make sure the nodes cover all memory.
 */
static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
{
	unsigned long numaram, e820ram;
	int i;

	numaram = 0;
	for (i = 0; i < mi->nr_blks; i++) {
		unsigned long s = mi->blk[i].start >> PAGE_SHIFT;
		unsigned long e = mi->blk[i].end >> PAGE_SHIFT;
		numaram += e - s;
		numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
		if ((long)numaram < 0)
			numaram = 0;
	}

	e820ram = max_pfn - (memblock_x86_hole_size(0,
					max_pfn << PAGE_SHIFT) >> PAGE_SHIFT);
	/* We seem to lose 3 pages somewhere. Allow 1M of slack. */
	if ((long)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
		printk(KERN_ERR "NUMA: nodes only cover %luMB of your %luMB e820 RAM. Not used.\n",
		       (numaram << PAGE_SHIFT) >> 20,
		       (e820ram << PAGE_SHIFT) >> 20);
		return false;
	}
	return true;
}
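/*
 * Worked example for the coverage check above (hypothetical numbers,
 * not from the original file): with PAGE_SHIFT = 12 the allowed slack
 * is 1 << (20 - 12) = 256 pages. If e820 reports 4194304 usable pages
 * (16G) but the NUMA blocks only cover 4194000 of them, the 304-page
 * shortfall exceeds the slack and the NUMA configuration is rejected.
 */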
static int __init numa_register_memblks(struct numa_meminfo *mi)
{
	int i, nid;

	/* Account for nodes with cpus and no memory */
	node_possible_map = numa_nodes_parsed;
	numa_nodemask_from_meminfo(&node_possible_map, mi);
	if (WARN_ON(nodes_empty(node_possible_map)))
		return -EINVAL;

	memnode_shift = compute_hash_shift(mi);
	if (memnode_shift < 0) {
		printk(KERN_ERR "NUMA: No NUMA node hash function found. Contact maintainer\n");
		return -EINVAL;
	}

	for (i = 0; i < mi->nr_blks; i++)
		memblock_x86_register_active_regions(mi->blk[i].nid,
					mi->blk[i].start >> PAGE_SHIFT,
					mi->blk[i].end >> PAGE_SHIFT);

	/* for out of order entries */
	sort_node_map();
	if (!numa_meminfo_cover_memory(mi))
		return -EINVAL;

	/* Finally register nodes. */
	for_each_node_mask(nid, node_possible_map) {
		u64 start = (u64)max_pfn << PAGE_SHIFT;
		u64 end = 0;

		for (i = 0; i < mi->nr_blks; i++) {
			if (nid != mi->blk[i].nid)
				continue;
			start = min(mi->blk[i].start, start);
			end = max(mi->blk[i].end, end);
		}

		if (start < end)
			setup_node_bootmem(nid, start, end);
	}

	return 0;
}

/**
 * dummy_numa_init - Fallback dummy NUMA init
 *
 * Used if there's no underlying NUMA architecture, NUMA initialization
 * fails, or NUMA is disabled on the command line.
 *
 * Must online at least one node and add memory blocks that cover all
 * allowed memory. This function must not fail.
 */
static int __init dummy_numa_init(void)
{
	printk(KERN_INFO "%s\n",
	       numa_off ? "NUMA turned off" : "No NUMA configuration found");
	printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
	       0LU, max_pfn << PAGE_SHIFT);

	node_set(0, numa_nodes_parsed);
	numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT);

	return 0;
}

static int __init numa_init(int (*init_func)(void))
{
	int i;
	int ret;

	for (i = 0; i < MAX_LOCAL_APIC; i++)
		set_apicid_to_node(i, NUMA_NO_NODE);

	nodes_clear(numa_nodes_parsed);
	nodes_clear(node_possible_map);
	nodes_clear(node_online_map);
	memset(&numa_meminfo, 0, sizeof(numa_meminfo));
	remove_all_active_ranges();
	numa_reset_distance();

	ret = init_func();
	if (ret < 0)
		return ret;
	ret = numa_cleanup_meminfo(&numa_meminfo);
	if (ret < 0)
		return ret;

	numa_emulation(&numa_meminfo, numa_distance_cnt);

	ret = numa_register_memblks(&numa_meminfo);
	if (ret < 0)
		return ret;

	for (i = 0; i < nr_cpu_ids; i++) {
		int nid = early_cpu_to_node(i);

		if (nid == NUMA_NO_NODE)
			continue;
		if (!node_online(nid))
			numa_clear_node(i);
	}
	numa_init_array();
	return 0;
}

void __init initmem_init(void)
{
	int ret;

	if (!numa_off) {
#ifdef CONFIG_ACPI_NUMA
		ret = numa_init(x86_acpi_numa_init);
		if (!ret)
			return;
#endif
#ifdef CONFIG_AMD_NUMA
		ret = numa_init(amd_numa_init);
		if (!ret)
			return;
#endif
	}

	numa_init(dummy_numa_init);
}

unsigned long __init numa_free_all_bootmem(void)
{
	unsigned long pages = 0;
	int i;

	for_each_online_node(i)
		pages += free_all_bootmem_node(NODE_DATA(i));

	pages += free_all_memory_core_early(MAX_NUMNODES);

	return pages;
}

int __cpuinit numa_cpu_node(int cpu)
{
	int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);

	if (apicid != BAD_APICID)
		return __apicid_to_node[apicid];
	return NUMA_NO_NODE;
}
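/*
 * Boot-time call flow, for reference (summary of this file, no new
 * code): initmem_init() tries numa_init(x86_acpi_numa_init) and then
 * numa_init(amd_numa_init), falling back to numa_init(dummy_numa_init),
 * which must not fail. numa_init() resets all NUMA state, runs the
 * parser, sanitizes numa_meminfo via numa_cleanup_meminfo(), applies
 * emulation, and registers the result through numa_register_memblks(),
 * which in turn computes the hash shift and calls setup_node_bootmem()
 * for every populated node.
 */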