/*
 * pSeries NUMA support
 *
 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#define pr_fmt(fmt) "numa: " fmt

#include <linux/threads.h>
#include <linux/bootmem.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/export.h>
#include <linux/nodemask.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/memblock.h>
#include <linux/of.h>
#include <linux/pfn.h>
#include <linux/cpuset.h>
#include <linux/node.h>
#include <linux/stop_machine.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <asm/cputhreads.h>
#include <asm/sparsemem.h>
#include <asm/prom.h>
#include <asm/smp.h>
#include <asm/topology.h>
#include <asm/firmware.h>
#include <asm/paca.h>
#include <asm/hvcall.h>
#include <asm/setup.h>
#include <asm/vdso.h>

static int numa_enabled = 1;

static char *cmdline __initdata;

static int numa_debug;
#define dbg(args...) if (numa_debug) { printk(KERN_INFO args); }

int numa_cpu_lookup_table[NR_CPUS];
cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
struct pglist_data *node_data[MAX_NUMNODES];

EXPORT_SYMBOL(numa_cpu_lookup_table);
EXPORT_SYMBOL(node_to_cpumask_map);
EXPORT_SYMBOL(node_data);

static int min_common_depth;
static int n_mem_addr_cells, n_mem_size_cells;
static int form1_affinity;

#define MAX_DISTANCE_REF_POINTS 4
static int distance_ref_points_depth;
static const __be32 *distance_ref_points;
static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];

/*
 * Allocate node_to_cpumask_map based on number of available nodes
 * Requires node_possible_map to be valid.
 *
 * Note: cpumask_of_node() is not valid until after this is done.
 */
static void __init setup_node_to_cpumask_map(void)
{
	unsigned int node;

	/* setup nr_node_ids if not done yet */
	if (nr_node_ids == MAX_NUMNODES)
		setup_nr_node_ids();

	/* allocate the map */
	for (node = 0; node < nr_node_ids; node++)
		alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);

	/* cpumask_of_node() will now work */
	dbg("Node to cpumask map for %d nodes\n", nr_node_ids);
}

static int __init fake_numa_create_new_node(unsigned long end_pfn,
						unsigned int *nid)
{
	unsigned long long mem;
	char *p = cmdline;
	static unsigned int fake_nid;
	static unsigned long long curr_boundary;

	/*
	 * Modify node id, iff we started creating NUMA nodes
	 * We want to continue from where we left off the last time
	 */
	if (fake_nid)
		*nid = fake_nid;
	/*
	 * In case there are no more arguments to parse, the
	 * node_id should be the same as the last fake node id
	 * (we've handled this above).
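	 *
	 * As an illustrative example, the boundaries come from the
	 * "numa=fake=" command line: roughly, with "numa=fake=512M,1G"
	 * memory below 512M remains in the node it started in (node 0
	 * when NUMA is otherwise disabled), [512M, 1G) becomes fake
	 * node 1 and everything above 1G fake node 2, with cmdline
	 * advanced past each boundary as it is crossed.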
108 */ 109 if (!p) 110 return 0; 111 112 mem = memparse(p, &p); 113 if (!mem) 114 return 0; 115 116 if (mem < curr_boundary) 117 return 0; 118 119 curr_boundary = mem; 120 121 if ((end_pfn << PAGE_SHIFT) > mem) { 122 /* 123 * Skip commas and spaces 124 */ 125 while (*p == ',' || *p == ' ' || *p == '\t') 126 p++; 127 128 cmdline = p; 129 fake_nid++; 130 *nid = fake_nid; 131 dbg("created new fake_node with id %d\n", fake_nid); 132 return 1; 133 } 134 return 0; 135 } 136 137 /* 138 * get_node_active_region - Return active region containing pfn 139 * Active range returned is empty if none found. 140 * @pfn: The page to return the region for 141 * @node_ar: Returned set to the active region containing @pfn 142 */ 143 static void __init get_node_active_region(unsigned long pfn, 144 struct node_active_region *node_ar) 145 { 146 unsigned long start_pfn, end_pfn; 147 int i, nid; 148 149 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { 150 if (pfn >= start_pfn && pfn < end_pfn) { 151 node_ar->nid = nid; 152 node_ar->start_pfn = start_pfn; 153 node_ar->end_pfn = end_pfn; 154 break; 155 } 156 } 157 } 158 159 static void reset_numa_cpu_lookup_table(void) 160 { 161 unsigned int cpu; 162 163 for_each_possible_cpu(cpu) 164 numa_cpu_lookup_table[cpu] = -1; 165 } 166 167 static void update_numa_cpu_lookup_table(unsigned int cpu, int node) 168 { 169 numa_cpu_lookup_table[cpu] = node; 170 } 171 172 static void map_cpu_to_node(int cpu, int node) 173 { 174 update_numa_cpu_lookup_table(cpu, node); 175 176 dbg("adding cpu %d to node %d\n", cpu, node); 177 178 if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node]))) 179 cpumask_set_cpu(cpu, node_to_cpumask_map[node]); 180 } 181 182 #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR) 183 static void unmap_cpu_from_node(unsigned long cpu) 184 { 185 int node = numa_cpu_lookup_table[cpu]; 186 187 dbg("removing cpu %lu from node %d\n", cpu, node); 188 189 if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) { 190 cpumask_clear_cpu(cpu, node_to_cpumask_map[node]); 191 } else { 192 printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n", 193 cpu, node); 194 } 195 } 196 #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */ 197 198 /* must hold reference to node during call */ 199 static const __be32 *of_get_associativity(struct device_node *dev) 200 { 201 return of_get_property(dev, "ibm,associativity", NULL); 202 } 203 204 /* 205 * Returns the property linux,drconf-usable-memory if 206 * it exists (the property exists only in kexec/kdump kernels, 207 * added by kexec-tools) 208 */ 209 static const __be32 *of_get_usable_memory(struct device_node *memory) 210 { 211 const __be32 *prop; 212 u32 len; 213 prop = of_get_property(memory, "linux,drconf-usable-memory", &len); 214 if (!prop || len < sizeof(unsigned int)) 215 return NULL; 216 return prop; 217 } 218 219 int __node_distance(int a, int b) 220 { 221 int i; 222 int distance = LOCAL_DISTANCE; 223 224 if (!form1_affinity) 225 return ((a == b) ? 
LOCAL_DISTANCE : REMOTE_DISTANCE); 226 227 for (i = 0; i < distance_ref_points_depth; i++) { 228 if (distance_lookup_table[a][i] == distance_lookup_table[b][i]) 229 break; 230 231 /* Double the distance for each NUMA level */ 232 distance *= 2; 233 } 234 235 return distance; 236 } 237 EXPORT_SYMBOL(__node_distance); 238 239 static void initialize_distance_lookup_table(int nid, 240 const __be32 *associativity) 241 { 242 int i; 243 244 if (!form1_affinity) 245 return; 246 247 for (i = 0; i < distance_ref_points_depth; i++) { 248 const __be32 *entry; 249 250 entry = &associativity[be32_to_cpu(distance_ref_points[i])]; 251 distance_lookup_table[nid][i] = of_read_number(entry, 1); 252 } 253 } 254 255 /* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa 256 * info is found. 257 */ 258 static int associativity_to_nid(const __be32 *associativity) 259 { 260 int nid = -1; 261 262 if (min_common_depth == -1) 263 goto out; 264 265 if (of_read_number(associativity, 1) >= min_common_depth) 266 nid = of_read_number(&associativity[min_common_depth], 1); 267 268 /* POWER4 LPAR uses 0xffff as invalid node */ 269 if (nid == 0xffff || nid >= MAX_NUMNODES) 270 nid = -1; 271 272 if (nid > 0 && 273 of_read_number(associativity, 1) >= distance_ref_points_depth) 274 initialize_distance_lookup_table(nid, associativity); 275 276 out: 277 return nid; 278 } 279 280 /* Returns the nid associated with the given device tree node, 281 * or -1 if not found. 282 */ 283 static int of_node_to_nid_single(struct device_node *device) 284 { 285 int nid = -1; 286 const __be32 *tmp; 287 288 tmp = of_get_associativity(device); 289 if (tmp) 290 nid = associativity_to_nid(tmp); 291 return nid; 292 } 293 294 /* Walk the device tree upwards, looking for an associativity id */ 295 int of_node_to_nid(struct device_node *device) 296 { 297 struct device_node *tmp; 298 int nid = -1; 299 300 of_node_get(device); 301 while (device) { 302 nid = of_node_to_nid_single(device); 303 if (nid != -1) 304 break; 305 306 tmp = device; 307 device = of_get_parent(tmp); 308 of_node_put(tmp); 309 } 310 of_node_put(device); 311 312 return nid; 313 } 314 EXPORT_SYMBOL_GPL(of_node_to_nid); 315 316 static int __init find_min_common_depth(void) 317 { 318 int depth; 319 struct device_node *root; 320 321 if (firmware_has_feature(FW_FEATURE_OPAL)) 322 root = of_find_node_by_path("/ibm,opal"); 323 else 324 root = of_find_node_by_path("/rtas"); 325 if (!root) 326 root = of_find_node_by_path("/"); 327 328 /* 329 * This property is a set of 32-bit integers, each representing 330 * an index into the ibm,associativity nodes. 331 * 332 * With form 0 affinity the first integer is for an SMP configuration 333 * (should be all 0's) and the second is for a normal NUMA 334 * configuration. We have only one level of NUMA. 335 * 336 * With form 1 affinity the first integer is the most significant 337 * NUMA boundary and the following are progressively less significant 338 * boundaries. There can be more than one level of NUMA. 
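	 *
	 * As an illustrative example, a form 1 machine might report
	 * ibm,associativity-reference-points = <4 4>: the first cell (4)
	 * becomes min_common_depth, so the fourth domain entry of each
	 * ibm,associativity array is used as the node id, and both
	 * reference points index that same entry when building the
	 * distance lookup table.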
339 */ 340 distance_ref_points = of_get_property(root, 341 "ibm,associativity-reference-points", 342 &distance_ref_points_depth); 343 344 if (!distance_ref_points) { 345 dbg("NUMA: ibm,associativity-reference-points not found.\n"); 346 goto err; 347 } 348 349 distance_ref_points_depth /= sizeof(int); 350 351 if (firmware_has_feature(FW_FEATURE_OPAL) || 352 firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) { 353 dbg("Using form 1 affinity\n"); 354 form1_affinity = 1; 355 } 356 357 if (form1_affinity) { 358 depth = of_read_number(distance_ref_points, 1); 359 } else { 360 if (distance_ref_points_depth < 2) { 361 printk(KERN_WARNING "NUMA: " 362 "short ibm,associativity-reference-points\n"); 363 goto err; 364 } 365 366 depth = of_read_number(&distance_ref_points[1], 1); 367 } 368 369 /* 370 * Warn and cap if the hardware supports more than 371 * MAX_DISTANCE_REF_POINTS domains. 372 */ 373 if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) { 374 printk(KERN_WARNING "NUMA: distance array capped at " 375 "%d entries\n", MAX_DISTANCE_REF_POINTS); 376 distance_ref_points_depth = MAX_DISTANCE_REF_POINTS; 377 } 378 379 of_node_put(root); 380 return depth; 381 382 err: 383 of_node_put(root); 384 return -1; 385 } 386 387 static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells) 388 { 389 struct device_node *memory = NULL; 390 391 memory = of_find_node_by_type(memory, "memory"); 392 if (!memory) 393 panic("numa.c: No memory nodes found!"); 394 395 *n_addr_cells = of_n_addr_cells(memory); 396 *n_size_cells = of_n_size_cells(memory); 397 of_node_put(memory); 398 } 399 400 static unsigned long read_n_cells(int n, const __be32 **buf) 401 { 402 unsigned long result = 0; 403 404 while (n--) { 405 result = (result << 32) | of_read_number(*buf, 1); 406 (*buf)++; 407 } 408 return result; 409 } 410 411 /* 412 * Read the next memblock list entry from the ibm,dynamic-memory property 413 * and return the information in the provided of_drconf_cell structure. 414 */ 415 static void read_drconf_cell(struct of_drconf_cell *drmem, const __be32 **cellp) 416 { 417 const __be32 *cp; 418 419 drmem->base_addr = read_n_cells(n_mem_addr_cells, cellp); 420 421 cp = *cellp; 422 drmem->drc_index = of_read_number(cp, 1); 423 drmem->reserved = of_read_number(&cp[1], 1); 424 drmem->aa_index = of_read_number(&cp[2], 1); 425 drmem->flags = of_read_number(&cp[3], 1); 426 427 *cellp = cp + 4; 428 } 429 430 /* 431 * Retrieve and validate the ibm,dynamic-memory property of the device tree. 432 * 433 * The layout of the ibm,dynamic-memory property is a number N of memblock 434 * list entries followed by N memblock list entries. Each memblock list entry 435 * contains information as laid out in the of_drconf_cell struct above. 436 */ 437 static int of_get_drconf_memory(struct device_node *memory, const __be32 **dm) 438 { 439 const __be32 *prop; 440 u32 len, entries; 441 442 prop = of_get_property(memory, "ibm,dynamic-memory", &len); 443 if (!prop || len < sizeof(unsigned int)) 444 return 0; 445 446 entries = of_read_number(prop++, 1); 447 448 /* Now that we know the number of entries, revalidate the size 449 * of the property read in to ensure we have everything 450 */ 451 if (len < (entries * (n_mem_addr_cells + 4) + 1) * sizeof(unsigned int)) 452 return 0; 453 454 *dm = prop; 455 return entries; 456 } 457 458 /* 459 * Retrieve and validate the ibm,lmb-size property for drconf memory 460 * from the device tree. 
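 *
 * The value is read with n_mem_size_cells cells; a common LMB size on
 * pSeries is 256MB (0x10000000), though that is illustrative rather
 * than guaranteed.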
461 */ 462 static u64 of_get_lmb_size(struct device_node *memory) 463 { 464 const __be32 *prop; 465 u32 len; 466 467 prop = of_get_property(memory, "ibm,lmb-size", &len); 468 if (!prop || len < sizeof(unsigned int)) 469 return 0; 470 471 return read_n_cells(n_mem_size_cells, &prop); 472 } 473 474 struct assoc_arrays { 475 u32 n_arrays; 476 u32 array_sz; 477 const __be32 *arrays; 478 }; 479 480 /* 481 * Retrieve and validate the list of associativity arrays for drconf 482 * memory from the ibm,associativity-lookup-arrays property of the 483 * device tree.. 484 * 485 * The layout of the ibm,associativity-lookup-arrays property is a number N 486 * indicating the number of associativity arrays, followed by a number M 487 * indicating the size of each associativity array, followed by a list 488 * of N associativity arrays. 489 */ 490 static int of_get_assoc_arrays(struct device_node *memory, 491 struct assoc_arrays *aa) 492 { 493 const __be32 *prop; 494 u32 len; 495 496 prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len); 497 if (!prop || len < 2 * sizeof(unsigned int)) 498 return -1; 499 500 aa->n_arrays = of_read_number(prop++, 1); 501 aa->array_sz = of_read_number(prop++, 1); 502 503 /* Now that we know the number of arrays and size of each array, 504 * revalidate the size of the property read in. 505 */ 506 if (len < (aa->n_arrays * aa->array_sz + 2) * sizeof(unsigned int)) 507 return -1; 508 509 aa->arrays = prop; 510 return 0; 511 } 512 513 /* 514 * This is like of_node_to_nid_single() for memory represented in the 515 * ibm,dynamic-reconfiguration-memory node. 516 */ 517 static int of_drconf_to_nid_single(struct of_drconf_cell *drmem, 518 struct assoc_arrays *aa) 519 { 520 int default_nid = 0; 521 int nid = default_nid; 522 int index; 523 524 if (min_common_depth > 0 && min_common_depth <= aa->array_sz && 525 !(drmem->flags & DRCONF_MEM_AI_INVALID) && 526 drmem->aa_index < aa->n_arrays) { 527 index = drmem->aa_index * aa->array_sz + min_common_depth - 1; 528 nid = of_read_number(&aa->arrays[index], 1); 529 530 if (nid == 0xffff || nid >= MAX_NUMNODES) 531 nid = default_nid; 532 } 533 534 return nid; 535 } 536 537 /* 538 * Figure out to which domain a cpu belongs and stick it there. 539 * Return the id of the domain used. 540 */ 541 static int numa_setup_cpu(unsigned long lcpu) 542 { 543 int nid = -1; 544 struct device_node *cpu; 545 546 /* 547 * If a valid cpu-to-node mapping is already available, use it 548 * directly instead of querying the firmware, since it represents 549 * the most recent mapping notified to us by the platform (eg: VPHN). 
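	 *
	 * numa_cpu_lookup_table[] is initialised to -1 by
	 * reset_numa_cpu_lookup_table(), so a value >= 0 here means a
	 * mapping has already been established, either at boot or by a
	 * later topology update.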
550 */ 551 if ((nid = numa_cpu_lookup_table[lcpu]) >= 0) { 552 map_cpu_to_node(lcpu, nid); 553 return nid; 554 } 555 556 cpu = of_get_cpu_node(lcpu, NULL); 557 558 if (!cpu) { 559 WARN_ON(1); 560 if (cpu_present(lcpu)) 561 goto out_present; 562 else 563 goto out; 564 } 565 566 nid = of_node_to_nid_single(cpu); 567 568 out_present: 569 if (nid < 0 || !node_online(nid)) 570 nid = first_online_node; 571 572 map_cpu_to_node(lcpu, nid); 573 of_node_put(cpu); 574 out: 575 return nid; 576 } 577 578 static void verify_cpu_node_mapping(int cpu, int node) 579 { 580 int base, sibling, i; 581 582 /* Verify that all the threads in the core belong to the same node */ 583 base = cpu_first_thread_sibling(cpu); 584 585 for (i = 0; i < threads_per_core; i++) { 586 sibling = base + i; 587 588 if (sibling == cpu || cpu_is_offline(sibling)) 589 continue; 590 591 if (cpu_to_node(sibling) != node) { 592 WARN(1, "CPU thread siblings %d and %d don't belong" 593 " to the same node!\n", cpu, sibling); 594 break; 595 } 596 } 597 } 598 599 static int cpu_numa_callback(struct notifier_block *nfb, unsigned long action, 600 void *hcpu) 601 { 602 unsigned long lcpu = (unsigned long)hcpu; 603 int ret = NOTIFY_DONE, nid; 604 605 switch (action) { 606 case CPU_UP_PREPARE: 607 case CPU_UP_PREPARE_FROZEN: 608 nid = numa_setup_cpu(lcpu); 609 verify_cpu_node_mapping((int)lcpu, nid); 610 ret = NOTIFY_OK; 611 break; 612 #ifdef CONFIG_HOTPLUG_CPU 613 case CPU_DEAD: 614 case CPU_DEAD_FROZEN: 615 case CPU_UP_CANCELED: 616 case CPU_UP_CANCELED_FROZEN: 617 unmap_cpu_from_node(lcpu); 618 ret = NOTIFY_OK; 619 break; 620 #endif 621 } 622 return ret; 623 } 624 625 /* 626 * Check and possibly modify a memory region to enforce the memory limit. 627 * 628 * Returns the size the region should have to enforce the memory limit. 629 * This will either be the original value of size, a truncated value, 630 * or zero. If the returned value of size is 0 the region should be 631 * discarded as it lies wholly above the memory limit. 632 */ 633 static unsigned long __init numa_enforce_memory_limit(unsigned long start, 634 unsigned long size) 635 { 636 /* 637 * We use memblock_end_of_DRAM() in here instead of memory_limit because 638 * we've already adjusted it for the limit and it takes care of 639 * having memory holes below the limit. Also, in the case of 640 * iommu_is_off, memory_limit is not set but is implicitly enforced. 641 */ 642 643 if (start + size <= memblock_end_of_DRAM()) 644 return size; 645 646 if (start >= memblock_end_of_DRAM()) 647 return 0; 648 649 return memblock_end_of_DRAM() - start; 650 } 651 652 /* 653 * Reads the counter for a given entry in 654 * linux,drconf-usable-memory property 655 */ 656 static inline int __init read_usm_ranges(const __be32 **usm) 657 { 658 /* 659 * For each lmb in ibm,dynamic-memory a corresponding 660 * entry in linux,drconf-usable-memory property contains 661 * a counter followed by that many (base, size) duple. 662 * read the counter from linux,drconf-usable-memory 663 */ 664 return read_n_cells(n_mem_size_cells, usm); 665 } 666 667 /* 668 * Extract NUMA information from the ibm,dynamic-reconfiguration-memory 669 * node. This assumes n_mem_{addr,size}_cells have been set. 
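 *
 * Each ibm,dynamic-memory entry decoded below is n_mem_addr_cells
 * cells of base address followed by four cells: drc_index, reserved,
 * aa_index and flags (see read_drconf_cell()).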
670 */ 671 static void __init parse_drconf_memory(struct device_node *memory) 672 { 673 const __be32 *uninitialized_var(dm), *usm; 674 unsigned int n, rc, ranges, is_kexec_kdump = 0; 675 unsigned long lmb_size, base, size, sz; 676 int nid; 677 struct assoc_arrays aa = { .arrays = NULL }; 678 679 n = of_get_drconf_memory(memory, &dm); 680 if (!n) 681 return; 682 683 lmb_size = of_get_lmb_size(memory); 684 if (!lmb_size) 685 return; 686 687 rc = of_get_assoc_arrays(memory, &aa); 688 if (rc) 689 return; 690 691 /* check if this is a kexec/kdump kernel */ 692 usm = of_get_usable_memory(memory); 693 if (usm != NULL) 694 is_kexec_kdump = 1; 695 696 for (; n != 0; --n) { 697 struct of_drconf_cell drmem; 698 699 read_drconf_cell(&drmem, &dm); 700 701 /* skip this block if the reserved bit is set in flags (0x80) 702 or if the block is not assigned to this partition (0x8) */ 703 if ((drmem.flags & DRCONF_MEM_RESERVED) 704 || !(drmem.flags & DRCONF_MEM_ASSIGNED)) 705 continue; 706 707 base = drmem.base_addr; 708 size = lmb_size; 709 ranges = 1; 710 711 if (is_kexec_kdump) { 712 ranges = read_usm_ranges(&usm); 713 if (!ranges) /* there are no (base, size) duple */ 714 continue; 715 } 716 do { 717 if (is_kexec_kdump) { 718 base = read_n_cells(n_mem_addr_cells, &usm); 719 size = read_n_cells(n_mem_size_cells, &usm); 720 } 721 nid = of_drconf_to_nid_single(&drmem, &aa); 722 fake_numa_create_new_node( 723 ((base + size) >> PAGE_SHIFT), 724 &nid); 725 node_set_online(nid); 726 sz = numa_enforce_memory_limit(base, size); 727 if (sz) 728 memblock_set_node(base, sz, 729 &memblock.memory, nid); 730 } while (--ranges); 731 } 732 } 733 734 static int __init parse_numa_properties(void) 735 { 736 struct device_node *memory; 737 int default_nid = 0; 738 unsigned long i; 739 740 if (numa_enabled == 0) { 741 printk(KERN_WARNING "NUMA disabled by user\n"); 742 return -1; 743 } 744 745 min_common_depth = find_min_common_depth(); 746 747 if (min_common_depth < 0) 748 return min_common_depth; 749 750 dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth); 751 752 /* 753 * Even though we connect cpus to numa domains later in SMP 754 * init, we need to know the node ids now. This is because 755 * each node to be onlined must have NODE_DATA etc backing it. 756 */ 757 for_each_present_cpu(i) { 758 struct device_node *cpu; 759 int nid; 760 761 cpu = of_get_cpu_node(i, NULL); 762 BUG_ON(!cpu); 763 nid = of_node_to_nid_single(cpu); 764 of_node_put(cpu); 765 766 /* 767 * Don't fall back to default_nid yet -- we will plug 768 * cpus into nodes once the memory scan has discovered 769 * the topology. 770 */ 771 if (nid < 0) 772 continue; 773 node_set_online(nid); 774 } 775 776 get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells); 777 778 for_each_node_by_type(memory, "memory") { 779 unsigned long start; 780 unsigned long size; 781 int nid; 782 int ranges; 783 const __be32 *memcell_buf; 784 unsigned int len; 785 786 memcell_buf = of_get_property(memory, 787 "linux,usable-memory", &len); 788 if (!memcell_buf || len <= 0) 789 memcell_buf = of_get_property(memory, "reg", &len); 790 if (!memcell_buf || len <= 0) 791 continue; 792 793 /* ranges in cell */ 794 ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells); 795 new_range: 796 /* these are order-sensitive, and modify the buffer pointer */ 797 start = read_n_cells(n_mem_addr_cells, &memcell_buf); 798 size = read_n_cells(n_mem_size_cells, &memcell_buf); 799 800 /* 801 * Assumption: either all memory nodes or none will 802 * have associativity properties. 
If none, then 803 * everything goes to default_nid. 804 */ 805 nid = of_node_to_nid_single(memory); 806 if (nid < 0) 807 nid = default_nid; 808 809 fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid); 810 node_set_online(nid); 811 812 if (!(size = numa_enforce_memory_limit(start, size))) { 813 if (--ranges) 814 goto new_range; 815 else 816 continue; 817 } 818 819 memblock_set_node(start, size, &memblock.memory, nid); 820 821 if (--ranges) 822 goto new_range; 823 } 824 825 /* 826 * Now do the same thing for each MEMBLOCK listed in the 827 * ibm,dynamic-memory property in the 828 * ibm,dynamic-reconfiguration-memory node. 829 */ 830 memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); 831 if (memory) 832 parse_drconf_memory(memory); 833 834 return 0; 835 } 836 837 static void __init setup_nonnuma(void) 838 { 839 unsigned long top_of_ram = memblock_end_of_DRAM(); 840 unsigned long total_ram = memblock_phys_mem_size(); 841 unsigned long start_pfn, end_pfn; 842 unsigned int nid = 0; 843 struct memblock_region *reg; 844 845 printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n", 846 top_of_ram, total_ram); 847 printk(KERN_DEBUG "Memory hole size: %ldMB\n", 848 (top_of_ram - total_ram) >> 20); 849 850 for_each_memblock(memory, reg) { 851 start_pfn = memblock_region_memory_base_pfn(reg); 852 end_pfn = memblock_region_memory_end_pfn(reg); 853 854 fake_numa_create_new_node(end_pfn, &nid); 855 memblock_set_node(PFN_PHYS(start_pfn), 856 PFN_PHYS(end_pfn - start_pfn), 857 &memblock.memory, nid); 858 node_set_online(nid); 859 } 860 } 861 862 void __init dump_numa_cpu_topology(void) 863 { 864 unsigned int node; 865 unsigned int cpu, count; 866 867 if (min_common_depth == -1 || !numa_enabled) 868 return; 869 870 for_each_online_node(node) { 871 printk(KERN_DEBUG "Node %d CPUs:", node); 872 873 count = 0; 874 /* 875 * If we used a CPU iterator here we would miss printing 876 * the holes in the cpumap. 877 */ 878 for (cpu = 0; cpu < nr_cpu_ids; cpu++) { 879 if (cpumask_test_cpu(cpu, 880 node_to_cpumask_map[node])) { 881 if (count == 0) 882 printk(" %u", cpu); 883 ++count; 884 } else { 885 if (count > 1) 886 printk("-%u", cpu - 1); 887 count = 0; 888 } 889 } 890 891 if (count > 1) 892 printk("-%u", nr_cpu_ids - 1); 893 printk("\n"); 894 } 895 } 896 897 static void __init dump_numa_memory_topology(void) 898 { 899 unsigned int node; 900 unsigned int count; 901 902 if (min_common_depth == -1 || !numa_enabled) 903 return; 904 905 for_each_online_node(node) { 906 unsigned long i; 907 908 printk(KERN_DEBUG "Node %d Memory:", node); 909 910 count = 0; 911 912 for (i = 0; i < memblock_end_of_DRAM(); 913 i += (1 << SECTION_SIZE_BITS)) { 914 if (early_pfn_to_nid(i >> PAGE_SHIFT) == node) { 915 if (count == 0) 916 printk(" 0x%lx", i); 917 ++count; 918 } else { 919 if (count > 0) 920 printk("-0x%lx", i); 921 count = 0; 922 } 923 } 924 925 if (count > 0) 926 printk("-0x%lx", i); 927 printk("\n"); 928 } 929 } 930 931 /* 932 * Allocate some memory, satisfying the memblock or bootmem allocator where 933 * required. nid is the preferred node and end is the physical address of 934 * the highest address in the node. 935 * 936 * Returns the virtual address of the memory. 
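 *
 * Note that end_pfn is a page frame number rather than a physical
 * address; it is converted to a physical limit with << PAGE_SHIFT
 * before being passed to __memblock_alloc_base().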
937 */ 938 static void __init *careful_zallocation(int nid, unsigned long size, 939 unsigned long align, 940 unsigned long end_pfn) 941 { 942 void *ret; 943 int new_nid; 944 unsigned long ret_paddr; 945 946 ret_paddr = __memblock_alloc_base(size, align, end_pfn << PAGE_SHIFT); 947 948 /* retry over all memory */ 949 if (!ret_paddr) 950 ret_paddr = __memblock_alloc_base(size, align, memblock_end_of_DRAM()); 951 952 if (!ret_paddr) 953 panic("numa.c: cannot allocate %lu bytes for node %d", 954 size, nid); 955 956 ret = __va(ret_paddr); 957 958 /* 959 * We initialize the nodes in numeric order: 0, 1, 2... 960 * and hand over control from the MEMBLOCK allocator to the 961 * bootmem allocator. If this function is called for 962 * node 5, then we know that all nodes <5 are using the 963 * bootmem allocator instead of the MEMBLOCK allocator. 964 * 965 * So, check the nid from which this allocation came 966 * and double check to see if we need to use bootmem 967 * instead of the MEMBLOCK. We don't free the MEMBLOCK memory 968 * since it would be useless. 969 */ 970 new_nid = early_pfn_to_nid(ret_paddr >> PAGE_SHIFT); 971 if (new_nid < nid) { 972 ret = __alloc_bootmem_node(NODE_DATA(new_nid), 973 size, align, 0); 974 975 dbg("alloc_bootmem %p %lx\n", ret, size); 976 } 977 978 memset(ret, 0, size); 979 return ret; 980 } 981 982 static struct notifier_block ppc64_numa_nb = { 983 .notifier_call = cpu_numa_callback, 984 .priority = 1 /* Must run before sched domains notifier. */ 985 }; 986 987 static void __init mark_reserved_regions_for_nid(int nid) 988 { 989 struct pglist_data *node = NODE_DATA(nid); 990 struct memblock_region *reg; 991 992 for_each_memblock(reserved, reg) { 993 unsigned long physbase = reg->base; 994 unsigned long size = reg->size; 995 unsigned long start_pfn = physbase >> PAGE_SHIFT; 996 unsigned long end_pfn = PFN_UP(physbase + size); 997 struct node_active_region node_ar; 998 unsigned long node_end_pfn = pgdat_end_pfn(node); 999 1000 /* 1001 * Check to make sure that this memblock.reserved area is 1002 * within the bounds of the node that we care about. 1003 * Checking the nid of the start and end points is not 1004 * sufficient because the reserved area could span the 1005 * entire node. 1006 */ 1007 if (end_pfn <= node->node_start_pfn || 1008 start_pfn >= node_end_pfn) 1009 continue; 1010 1011 get_node_active_region(start_pfn, &node_ar); 1012 while (start_pfn < end_pfn && 1013 node_ar.start_pfn < node_ar.end_pfn) { 1014 unsigned long reserve_size = size; 1015 /* 1016 * if reserved region extends past active region 1017 * then trim size to active region 1018 */ 1019 if (end_pfn > node_ar.end_pfn) 1020 reserve_size = (node_ar.end_pfn << PAGE_SHIFT) 1021 - physbase; 1022 /* 1023 * Only worry about *this* node, others may not 1024 * yet have valid NODE_DATA(). 1025 */ 1026 if (node_ar.nid == nid) { 1027 dbg("reserve_bootmem %lx %lx nid=%d\n", 1028 physbase, reserve_size, node_ar.nid); 1029 reserve_bootmem_node(NODE_DATA(node_ar.nid), 1030 physbase, reserve_size, 1031 BOOTMEM_DEFAULT); 1032 } 1033 /* 1034 * if reserved region is contained in the active region 1035 * then done. 
1036 */ 1037 if (end_pfn <= node_ar.end_pfn) 1038 break; 1039 1040 /* 1041 * reserved region extends past the active region 1042 * get next active region that contains this 1043 * reserved region 1044 */ 1045 start_pfn = node_ar.end_pfn; 1046 physbase = start_pfn << PAGE_SHIFT; 1047 size = size - reserve_size; 1048 get_node_active_region(start_pfn, &node_ar); 1049 } 1050 } 1051 } 1052 1053 1054 void __init do_init_bootmem(void) 1055 { 1056 int nid, cpu; 1057 1058 min_low_pfn = 0; 1059 max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT; 1060 max_pfn = max_low_pfn; 1061 1062 if (parse_numa_properties()) 1063 setup_nonnuma(); 1064 else 1065 dump_numa_memory_topology(); 1066 1067 for_each_online_node(nid) { 1068 unsigned long start_pfn, end_pfn; 1069 void *bootmem_vaddr; 1070 unsigned long bootmap_pages; 1071 1072 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); 1073 1074 /* 1075 * Allocate the node structure node local if possible 1076 * 1077 * Be careful moving this around, as it relies on all 1078 * previous nodes' bootmem to be initialized and have 1079 * all reserved areas marked. 1080 */ 1081 NODE_DATA(nid) = careful_zallocation(nid, 1082 sizeof(struct pglist_data), 1083 SMP_CACHE_BYTES, end_pfn); 1084 1085 dbg("node %d\n", nid); 1086 dbg("NODE_DATA() = %p\n", NODE_DATA(nid)); 1087 1088 NODE_DATA(nid)->bdata = &bootmem_node_data[nid]; 1089 NODE_DATA(nid)->node_start_pfn = start_pfn; 1090 NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn; 1091 1092 if (NODE_DATA(nid)->node_spanned_pages == 0) 1093 continue; 1094 1095 dbg("start_paddr = %lx\n", start_pfn << PAGE_SHIFT); 1096 dbg("end_paddr = %lx\n", end_pfn << PAGE_SHIFT); 1097 1098 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); 1099 bootmem_vaddr = careful_zallocation(nid, 1100 bootmap_pages << PAGE_SHIFT, 1101 PAGE_SIZE, end_pfn); 1102 1103 dbg("bootmap_vaddr = %p\n", bootmem_vaddr); 1104 1105 init_bootmem_node(NODE_DATA(nid), 1106 __pa(bootmem_vaddr) >> PAGE_SHIFT, 1107 start_pfn, end_pfn); 1108 1109 free_bootmem_with_active_regions(nid, end_pfn); 1110 /* 1111 * Be very careful about moving this around. Future 1112 * calls to careful_zallocation() depend on this getting 1113 * done correctly. 1114 */ 1115 mark_reserved_regions_for_nid(nid); 1116 sparse_memory_present_with_active_regions(nid); 1117 } 1118 1119 init_bootmem_done = 1; 1120 1121 /* 1122 * Now bootmem is initialised we can create the node to cpumask 1123 * lookup tables and setup the cpu callback to populate them. 1124 */ 1125 setup_node_to_cpumask_map(); 1126 1127 reset_numa_cpu_lookup_table(); 1128 register_cpu_notifier(&ppc64_numa_nb); 1129 /* 1130 * We need the numa_cpu_lookup_table to be accurate for all CPUs, 1131 * even before we online them, so that we can use cpu_to_{node,mem} 1132 * early in boot, cf. smp_prepare_cpus(). 
1133 */ 1134 for_each_present_cpu(cpu) { 1135 numa_setup_cpu((unsigned long)cpu); 1136 } 1137 } 1138 1139 static int __init early_numa(char *p) 1140 { 1141 if (!p) 1142 return 0; 1143 1144 if (strstr(p, "off")) 1145 numa_enabled = 0; 1146 1147 if (strstr(p, "debug")) 1148 numa_debug = 1; 1149 1150 p = strstr(p, "fake="); 1151 if (p) 1152 cmdline = p + strlen("fake="); 1153 1154 return 0; 1155 } 1156 early_param("numa", early_numa); 1157 1158 static bool topology_updates_enabled = true; 1159 1160 static int __init early_topology_updates(char *p) 1161 { 1162 if (!p) 1163 return 0; 1164 1165 if (!strcmp(p, "off")) { 1166 pr_info("Disabling topology updates\n"); 1167 topology_updates_enabled = false; 1168 } 1169 1170 return 0; 1171 } 1172 early_param("topology_updates", early_topology_updates); 1173 1174 #ifdef CONFIG_MEMORY_HOTPLUG 1175 /* 1176 * Find the node associated with a hot added memory section for 1177 * memory represented in the device tree by the property 1178 * ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory. 1179 */ 1180 static int hot_add_drconf_scn_to_nid(struct device_node *memory, 1181 unsigned long scn_addr) 1182 { 1183 const __be32 *dm; 1184 unsigned int drconf_cell_cnt, rc; 1185 unsigned long lmb_size; 1186 struct assoc_arrays aa; 1187 int nid = -1; 1188 1189 drconf_cell_cnt = of_get_drconf_memory(memory, &dm); 1190 if (!drconf_cell_cnt) 1191 return -1; 1192 1193 lmb_size = of_get_lmb_size(memory); 1194 if (!lmb_size) 1195 return -1; 1196 1197 rc = of_get_assoc_arrays(memory, &aa); 1198 if (rc) 1199 return -1; 1200 1201 for (; drconf_cell_cnt != 0; --drconf_cell_cnt) { 1202 struct of_drconf_cell drmem; 1203 1204 read_drconf_cell(&drmem, &dm); 1205 1206 /* skip this block if it is reserved or not assigned to 1207 * this partition */ 1208 if ((drmem.flags & DRCONF_MEM_RESERVED) 1209 || !(drmem.flags & DRCONF_MEM_ASSIGNED)) 1210 continue; 1211 1212 if ((scn_addr < drmem.base_addr) 1213 || (scn_addr >= (drmem.base_addr + lmb_size))) 1214 continue; 1215 1216 nid = of_drconf_to_nid_single(&drmem, &aa); 1217 break; 1218 } 1219 1220 return nid; 1221 } 1222 1223 /* 1224 * Find the node associated with a hot added memory section for memory 1225 * represented in the device tree as a node (i.e. memory@XXXX) for 1226 * each memblock. 1227 */ 1228 static int hot_add_node_scn_to_nid(unsigned long scn_addr) 1229 { 1230 struct device_node *memory; 1231 int nid = -1; 1232 1233 for_each_node_by_type(memory, "memory") { 1234 unsigned long start, size; 1235 int ranges; 1236 const __be32 *memcell_buf; 1237 unsigned int len; 1238 1239 memcell_buf = of_get_property(memory, "reg", &len); 1240 if (!memcell_buf || len <= 0) 1241 continue; 1242 1243 /* ranges in cell */ 1244 ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells); 1245 1246 while (ranges--) { 1247 start = read_n_cells(n_mem_addr_cells, &memcell_buf); 1248 size = read_n_cells(n_mem_size_cells, &memcell_buf); 1249 1250 if ((scn_addr < start) || (scn_addr >= (start + size))) 1251 continue; 1252 1253 nid = of_node_to_nid_single(memory); 1254 break; 1255 } 1256 1257 if (nid >= 0) 1258 break; 1259 } 1260 1261 of_node_put(memory); 1262 1263 return nid; 1264 } 1265 1266 /* 1267 * Find the node associated with a hot added memory section. Section 1268 * corresponds to a SPARSEMEM section, not an MEMBLOCK. It is assumed that 1269 * sections are fully contained within a single MEMBLOCK. 
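 *
 * If the lookup fails, or resolves to a node that is offline or has
 * no spanned pages, the fallback logic below returns the first
 * suitable online node instead.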
1270 */ 1271 int hot_add_scn_to_nid(unsigned long scn_addr) 1272 { 1273 struct device_node *memory = NULL; 1274 int nid, found = 0; 1275 1276 if (!numa_enabled || (min_common_depth < 0)) 1277 return first_online_node; 1278 1279 memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); 1280 if (memory) { 1281 nid = hot_add_drconf_scn_to_nid(memory, scn_addr); 1282 of_node_put(memory); 1283 } else { 1284 nid = hot_add_node_scn_to_nid(scn_addr); 1285 } 1286 1287 if (nid < 0 || !node_online(nid)) 1288 nid = first_online_node; 1289 1290 if (NODE_DATA(nid)->node_spanned_pages) 1291 return nid; 1292 1293 for_each_online_node(nid) { 1294 if (NODE_DATA(nid)->node_spanned_pages) { 1295 found = 1; 1296 break; 1297 } 1298 } 1299 1300 BUG_ON(!found); 1301 return nid; 1302 } 1303 1304 static u64 hot_add_drconf_memory_max(void) 1305 { 1306 struct device_node *memory = NULL; 1307 unsigned int drconf_cell_cnt = 0; 1308 u64 lmb_size = 0; 1309 const __be32 *dm = NULL; 1310 1311 memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); 1312 if (memory) { 1313 drconf_cell_cnt = of_get_drconf_memory(memory, &dm); 1314 lmb_size = of_get_lmb_size(memory); 1315 of_node_put(memory); 1316 } 1317 return lmb_size * drconf_cell_cnt; 1318 } 1319 1320 /* 1321 * memory_hotplug_max - return max address of memory that may be added 1322 * 1323 * This is currently only used on systems that support drconfig memory 1324 * hotplug. 1325 */ 1326 u64 memory_hotplug_max(void) 1327 { 1328 return max(hot_add_drconf_memory_max(), memblock_end_of_DRAM()); 1329 } 1330 #endif /* CONFIG_MEMORY_HOTPLUG */ 1331 1332 /* Virtual Processor Home Node (VPHN) support */ 1333 #ifdef CONFIG_PPC_SPLPAR 1334 struct topology_update_data { 1335 struct topology_update_data *next; 1336 unsigned int cpu; 1337 int old_nid; 1338 int new_nid; 1339 }; 1340 1341 static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS]; 1342 static cpumask_t cpu_associativity_changes_mask; 1343 static int vphn_enabled; 1344 static int prrn_enabled; 1345 static void reset_topology_timer(void); 1346 1347 /* 1348 * Store the current values of the associativity change counters in the 1349 * hypervisor. 1350 */ 1351 static void setup_cpu_associativity_change_counters(void) 1352 { 1353 int cpu; 1354 1355 /* The VPHN feature supports a maximum of 8 reference points */ 1356 BUILD_BUG_ON(MAX_DISTANCE_REF_POINTS > 8); 1357 1358 for_each_possible_cpu(cpu) { 1359 int i; 1360 u8 *counts = vphn_cpu_change_counts[cpu]; 1361 volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts; 1362 1363 for (i = 0; i < distance_ref_points_depth; i++) 1364 counts[i] = hypervisor_counts[i]; 1365 } 1366 } 1367 1368 /* 1369 * The hypervisor maintains a set of 8 associativity change counters in 1370 * the VPA of each cpu that correspond to the associativity levels in the 1371 * ibm,associativity-reference-points property. When an associativity 1372 * level changes, the corresponding counter is incremented. 1373 * 1374 * Set a bit in cpu_associativity_changes_mask for each cpu whose home 1375 * node associativity levels have changed. 1376 * 1377 * Returns the number of cpus with unhandled associativity changes. 
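 *
 * All hardware threads of a core are flagged together via
 * cpu_sibling_mask(), since the home node association is handled per
 * core rather than per thread here.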
1378 */ 1379 static int update_cpu_associativity_changes_mask(void) 1380 { 1381 int cpu; 1382 cpumask_t *changes = &cpu_associativity_changes_mask; 1383 1384 for_each_possible_cpu(cpu) { 1385 int i, changed = 0; 1386 u8 *counts = vphn_cpu_change_counts[cpu]; 1387 volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts; 1388 1389 for (i = 0; i < distance_ref_points_depth; i++) { 1390 if (hypervisor_counts[i] != counts[i]) { 1391 counts[i] = hypervisor_counts[i]; 1392 changed = 1; 1393 } 1394 } 1395 if (changed) { 1396 cpumask_or(changes, changes, cpu_sibling_mask(cpu)); 1397 cpu = cpu_last_thread_sibling(cpu); 1398 } 1399 } 1400 1401 return cpumask_weight(changes); 1402 } 1403 1404 /* 1405 * 6 64-bit registers unpacked into 12 32-bit associativity values. To form 1406 * the complete property we have to add the length in the first cell. 1407 */ 1408 #define VPHN_ASSOC_BUFSIZE (6*sizeof(u64)/sizeof(u32) + 1) 1409 1410 /* 1411 * Convert the associativity domain numbers returned from the hypervisor 1412 * to the sequence they would appear in the ibm,associativity property. 1413 */ 1414 static int vphn_unpack_associativity(const long *packed, __be32 *unpacked) 1415 { 1416 int i, nr_assoc_doms = 0; 1417 const __be16 *field = (const __be16 *) packed; 1418 1419 #define VPHN_FIELD_UNUSED (0xffff) 1420 #define VPHN_FIELD_MSB (0x8000) 1421 #define VPHN_FIELD_MASK (~VPHN_FIELD_MSB) 1422 1423 for (i = 1; i < VPHN_ASSOC_BUFSIZE; i++) { 1424 if (be16_to_cpup(field) == VPHN_FIELD_UNUSED) { 1425 /* All significant fields processed, and remaining 1426 * fields contain the reserved value of all 1's. 1427 * Just store them. 1428 */ 1429 unpacked[i] = *((__be32 *)field); 1430 field += 2; 1431 } else if (be16_to_cpup(field) & VPHN_FIELD_MSB) { 1432 /* Data is in the lower 15 bits of this field */ 1433 unpacked[i] = cpu_to_be32( 1434 be16_to_cpup(field) & VPHN_FIELD_MASK); 1435 field++; 1436 nr_assoc_doms++; 1437 } else { 1438 /* Data is in the lower 15 bits of this field 1439 * concatenated with the next 16 bit field 1440 */ 1441 unpacked[i] = *((__be32 *)field); 1442 field += 2; 1443 nr_assoc_doms++; 1444 } 1445 } 1446 1447 /* The first cell contains the length of the property */ 1448 unpacked[0] = cpu_to_be32(nr_assoc_doms); 1449 1450 return nr_assoc_doms; 1451 } 1452 1453 /* 1454 * Retrieve the new associativity information for a virtual processor's 1455 * home node. 1456 */ 1457 static long hcall_vphn(unsigned long cpu, __be32 *associativity) 1458 { 1459 long rc; 1460 long retbuf[PLPAR_HCALL9_BUFSIZE] = {0}; 1461 u64 flags = 1; 1462 int hwcpu = get_hard_smp_processor_id(cpu); 1463 int i; 1464 1465 rc = plpar_hcall9(H_HOME_NODE_ASSOCIATIVITY, retbuf, flags, hwcpu); 1466 for (i = 0; i < 6; i++) 1467 retbuf[i] = cpu_to_be64(retbuf[i]); 1468 vphn_unpack_associativity(retbuf, associativity); 1469 1470 return rc; 1471 } 1472 1473 static long vphn_get_associativity(unsigned long cpu, 1474 __be32 *associativity) 1475 { 1476 long rc; 1477 1478 rc = hcall_vphn(cpu, associativity); 1479 1480 switch (rc) { 1481 case H_FUNCTION: 1482 printk(KERN_INFO 1483 "VPHN is not supported. Disabling polling...\n"); 1484 stop_topology_update(); 1485 break; 1486 case H_HARDWARE: 1487 printk(KERN_ERR 1488 "hcall_vphn() experienced a hardware fault " 1489 "preventing VPHN. Disabling polling...\n"); 1490 stop_topology_update(); 1491 } 1492 1493 return rc; 1494 } 1495 1496 /* 1497 * Update the CPU maps and sysfs entries for a single CPU when its NUMA 1498 * characteristics change. 
This function doesn't perform any locking and is 1499 * only safe to call from stop_machine(). 1500 */ 1501 static int update_cpu_topology(void *data) 1502 { 1503 struct topology_update_data *update; 1504 unsigned long cpu; 1505 1506 if (!data) 1507 return -EINVAL; 1508 1509 cpu = smp_processor_id(); 1510 1511 for (update = data; update; update = update->next) { 1512 int new_nid = update->new_nid; 1513 if (cpu != update->cpu) 1514 continue; 1515 1516 unmap_cpu_from_node(cpu); 1517 map_cpu_to_node(cpu, new_nid); 1518 set_cpu_numa_node(cpu, new_nid); 1519 set_cpu_numa_mem(cpu, local_memory_node(new_nid)); 1520 vdso_getcpu_init(); 1521 } 1522 1523 return 0; 1524 } 1525 1526 static int update_lookup_table(void *data) 1527 { 1528 struct topology_update_data *update; 1529 1530 if (!data) 1531 return -EINVAL; 1532 1533 /* 1534 * Upon topology update, the numa-cpu lookup table needs to be updated 1535 * for all threads in the core, including offline CPUs, to ensure that 1536 * future hotplug operations respect the cpu-to-node associativity 1537 * properly. 1538 */ 1539 for (update = data; update; update = update->next) { 1540 int nid, base, j; 1541 1542 nid = update->new_nid; 1543 base = cpu_first_thread_sibling(update->cpu); 1544 1545 for (j = 0; j < threads_per_core; j++) { 1546 update_numa_cpu_lookup_table(base + j, nid); 1547 } 1548 } 1549 1550 return 0; 1551 } 1552 1553 /* 1554 * Update the node maps and sysfs entries for each cpu whose home node 1555 * has changed. Returns 1 when the topology has changed, and 0 otherwise. 1556 */ 1557 int arch_update_cpu_topology(void) 1558 { 1559 unsigned int cpu, sibling, changed = 0; 1560 struct topology_update_data *updates, *ud; 1561 __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0}; 1562 cpumask_t updated_cpus; 1563 struct device *dev; 1564 int weight, new_nid, i = 0; 1565 1566 if (!prrn_enabled && !vphn_enabled) 1567 return 0; 1568 1569 weight = cpumask_weight(&cpu_associativity_changes_mask); 1570 if (!weight) 1571 return 0; 1572 1573 updates = kzalloc(weight * (sizeof(*updates)), GFP_KERNEL); 1574 if (!updates) 1575 return 0; 1576 1577 cpumask_clear(&updated_cpus); 1578 1579 for_each_cpu(cpu, &cpu_associativity_changes_mask) { 1580 /* 1581 * If siblings aren't flagged for changes, updates list 1582 * will be too short. Skip on this update and set for next 1583 * update. 
1584 */ 1585 if (!cpumask_subset(cpu_sibling_mask(cpu), 1586 &cpu_associativity_changes_mask)) { 1587 pr_info("Sibling bits not set for associativity " 1588 "change, cpu%d\n", cpu); 1589 cpumask_or(&cpu_associativity_changes_mask, 1590 &cpu_associativity_changes_mask, 1591 cpu_sibling_mask(cpu)); 1592 cpu = cpu_last_thread_sibling(cpu); 1593 continue; 1594 } 1595 1596 /* Use associativity from first thread for all siblings */ 1597 vphn_get_associativity(cpu, associativity); 1598 new_nid = associativity_to_nid(associativity); 1599 if (new_nid < 0 || !node_online(new_nid)) 1600 new_nid = first_online_node; 1601 1602 if (new_nid == numa_cpu_lookup_table[cpu]) { 1603 cpumask_andnot(&cpu_associativity_changes_mask, 1604 &cpu_associativity_changes_mask, 1605 cpu_sibling_mask(cpu)); 1606 cpu = cpu_last_thread_sibling(cpu); 1607 continue; 1608 } 1609 1610 for_each_cpu(sibling, cpu_sibling_mask(cpu)) { 1611 ud = &updates[i++]; 1612 ud->cpu = sibling; 1613 ud->new_nid = new_nid; 1614 ud->old_nid = numa_cpu_lookup_table[sibling]; 1615 cpumask_set_cpu(sibling, &updated_cpus); 1616 if (i < weight) 1617 ud->next = &updates[i]; 1618 } 1619 cpu = cpu_last_thread_sibling(cpu); 1620 } 1621 1622 pr_debug("Topology update for the following CPUs:\n"); 1623 if (cpumask_weight(&updated_cpus)) { 1624 for (ud = &updates[0]; ud; ud = ud->next) { 1625 pr_debug("cpu %d moving from node %d " 1626 "to %d\n", ud->cpu, 1627 ud->old_nid, ud->new_nid); 1628 } 1629 } 1630 1631 /* 1632 * In cases where we have nothing to update (because the updates list 1633 * is too short or because the new topology is same as the old one), 1634 * skip invoking update_cpu_topology() via stop-machine(). This is 1635 * necessary (and not just a fast-path optimization) since stop-machine 1636 * can end up electing a random CPU to run update_cpu_topology(), and 1637 * thus trick us into setting up incorrect cpu-node mappings (since 1638 * 'updates' is kzalloc()'ed). 1639 * 1640 * And for the similar reason, we will skip all the following updating. 1641 */ 1642 if (!cpumask_weight(&updated_cpus)) 1643 goto out; 1644 1645 stop_machine(update_cpu_topology, &updates[0], &updated_cpus); 1646 1647 /* 1648 * Update the numa-cpu lookup table with the new mappings, even for 1649 * offline CPUs. It is best to perform this update from the stop- 1650 * machine context. 
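	 *
	 * Running update_lookup_table() on just the current CPU under
	 * stop_machine() is sufficient, since it only writes
	 * numa_cpu_lookup_table[] entries for the threads of each
	 * updated core.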
1651 */ 1652 stop_machine(update_lookup_table, &updates[0], 1653 cpumask_of(raw_smp_processor_id())); 1654 1655 for (ud = &updates[0]; ud; ud = ud->next) { 1656 unregister_cpu_under_node(ud->cpu, ud->old_nid); 1657 register_cpu_under_node(ud->cpu, ud->new_nid); 1658 1659 dev = get_cpu_device(ud->cpu); 1660 if (dev) 1661 kobject_uevent(&dev->kobj, KOBJ_CHANGE); 1662 cpumask_clear_cpu(ud->cpu, &cpu_associativity_changes_mask); 1663 changed = 1; 1664 } 1665 1666 out: 1667 kfree(updates); 1668 return changed; 1669 } 1670 1671 static void topology_work_fn(struct work_struct *work) 1672 { 1673 rebuild_sched_domains(); 1674 } 1675 static DECLARE_WORK(topology_work, topology_work_fn); 1676 1677 static void topology_schedule_update(void) 1678 { 1679 schedule_work(&topology_work); 1680 } 1681 1682 static void topology_timer_fn(unsigned long ignored) 1683 { 1684 if (prrn_enabled && cpumask_weight(&cpu_associativity_changes_mask)) 1685 topology_schedule_update(); 1686 else if (vphn_enabled) { 1687 if (update_cpu_associativity_changes_mask() > 0) 1688 topology_schedule_update(); 1689 reset_topology_timer(); 1690 } 1691 } 1692 static struct timer_list topology_timer = 1693 TIMER_INITIALIZER(topology_timer_fn, 0, 0); 1694 1695 static void reset_topology_timer(void) 1696 { 1697 topology_timer.data = 0; 1698 topology_timer.expires = jiffies + 60 * HZ; 1699 mod_timer(&topology_timer, topology_timer.expires); 1700 } 1701 1702 #ifdef CONFIG_SMP 1703 1704 static void stage_topology_update(int core_id) 1705 { 1706 cpumask_or(&cpu_associativity_changes_mask, 1707 &cpu_associativity_changes_mask, cpu_sibling_mask(core_id)); 1708 reset_topology_timer(); 1709 } 1710 1711 static int dt_update_callback(struct notifier_block *nb, 1712 unsigned long action, void *data) 1713 { 1714 struct of_prop_reconfig *update; 1715 int rc = NOTIFY_DONE; 1716 1717 switch (action) { 1718 case OF_RECONFIG_UPDATE_PROPERTY: 1719 update = (struct of_prop_reconfig *)data; 1720 if (!of_prop_cmp(update->dn->type, "cpu") && 1721 !of_prop_cmp(update->prop->name, "ibm,associativity")) { 1722 u32 core_id; 1723 of_property_read_u32(update->dn, "reg", &core_id); 1724 stage_topology_update(core_id); 1725 rc = NOTIFY_OK; 1726 } 1727 break; 1728 } 1729 1730 return rc; 1731 } 1732 1733 static struct notifier_block dt_update_nb = { 1734 .notifier_call = dt_update_callback, 1735 }; 1736 1737 #endif 1738 1739 /* 1740 * Start polling for associativity changes. 1741 */ 1742 int start_topology_update(void) 1743 { 1744 int rc = 0; 1745 1746 if (firmware_has_feature(FW_FEATURE_PRRN)) { 1747 if (!prrn_enabled) { 1748 prrn_enabled = 1; 1749 vphn_enabled = 0; 1750 #ifdef CONFIG_SMP 1751 rc = of_reconfig_notifier_register(&dt_update_nb); 1752 #endif 1753 } 1754 } else if (firmware_has_feature(FW_FEATURE_VPHN) && 1755 lppaca_shared_proc(get_lppaca())) { 1756 if (!vphn_enabled) { 1757 prrn_enabled = 0; 1758 vphn_enabled = 1; 1759 setup_cpu_associativity_change_counters(); 1760 init_timer_deferrable(&topology_timer); 1761 reset_topology_timer(); 1762 } 1763 } 1764 1765 return rc; 1766 } 1767 1768 /* 1769 * Disable polling for VPHN associativity changes. 
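 *
 * Also unregisters the PRRN device tree notifier when PRRN, rather
 * than VPHN, was the active mechanism.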
1770 */ 1771 int stop_topology_update(void) 1772 { 1773 int rc = 0; 1774 1775 if (prrn_enabled) { 1776 prrn_enabled = 0; 1777 #ifdef CONFIG_SMP 1778 rc = of_reconfig_notifier_unregister(&dt_update_nb); 1779 #endif 1780 } else if (vphn_enabled) { 1781 vphn_enabled = 0; 1782 rc = del_timer_sync(&topology_timer); 1783 } 1784 1785 return rc; 1786 } 1787 1788 int prrn_is_enabled(void) 1789 { 1790 return prrn_enabled; 1791 } 1792 1793 static int topology_read(struct seq_file *file, void *v) 1794 { 1795 if (vphn_enabled || prrn_enabled) 1796 seq_puts(file, "on\n"); 1797 else 1798 seq_puts(file, "off\n"); 1799 1800 return 0; 1801 } 1802 1803 static int topology_open(struct inode *inode, struct file *file) 1804 { 1805 return single_open(file, topology_read, NULL); 1806 } 1807 1808 static ssize_t topology_write(struct file *file, const char __user *buf, 1809 size_t count, loff_t *off) 1810 { 1811 char kbuf[4]; /* "on" or "off" plus null. */ 1812 int read_len; 1813 1814 read_len = count < 3 ? count : 3; 1815 if (copy_from_user(kbuf, buf, read_len)) 1816 return -EINVAL; 1817 1818 kbuf[read_len] = '\0'; 1819 1820 if (!strncmp(kbuf, "on", 2)) 1821 start_topology_update(); 1822 else if (!strncmp(kbuf, "off", 3)) 1823 stop_topology_update(); 1824 else 1825 return -EINVAL; 1826 1827 return count; 1828 } 1829 1830 static const struct file_operations topology_ops = { 1831 .read = seq_read, 1832 .write = topology_write, 1833 .open = topology_open, 1834 .release = single_release 1835 }; 1836 1837 static int topology_update_init(void) 1838 { 1839 /* Do not poll for changes if disabled at boot */ 1840 if (topology_updates_enabled) 1841 start_topology_update(); 1842 1843 if (!proc_create("powerpc/topology_updates", 0644, NULL, &topology_ops)) 1844 return -ENOMEM; 1845 1846 return 0; 1847 } 1848 device_initcall(topology_update_init); 1849 #endif /* CONFIG_PPC_SPLPAR */ 1850