/*
 * pSeries NUMA support
 *
 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#define pr_fmt(fmt) "numa: " fmt

#include <linux/threads.h>
#include <linux/bootmem.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/export.h>
#include <linux/nodemask.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/memblock.h>
#include <linux/of.h>
#include <linux/pfn.h>
#include <linux/cpuset.h>
#include <linux/node.h>
#include <linux/stop_machine.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <asm/cputhreads.h>
#include <asm/sparsemem.h>
#include <asm/prom.h>
#include <asm/smp.h>
#include <asm/topology.h>
#include <asm/firmware.h>
#include <asm/paca.h>
#include <asm/hvcall.h>
#include <asm/setup.h>
#include <asm/vdso.h>
#include <asm/drmem.h>

static int numa_enabled = 1;

static char *cmdline __initdata;

static int numa_debug;
#define dbg(args...) if (numa_debug) { printk(KERN_INFO args); }

int numa_cpu_lookup_table[NR_CPUS];
cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
struct pglist_data *node_data[MAX_NUMNODES];

EXPORT_SYMBOL(numa_cpu_lookup_table);
EXPORT_SYMBOL(node_to_cpumask_map);
EXPORT_SYMBOL(node_data);

static int min_common_depth;
static int n_mem_addr_cells, n_mem_size_cells;
static int form1_affinity;

#define MAX_DISTANCE_REF_POINTS 4
static int distance_ref_points_depth;
static const __be32 *distance_ref_points;
static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];

/*
 * Allocate node_to_cpumask_map based on number of available nodes
 * Requires node_possible_map to be valid.
 *
 * Note: cpumask_of_node() is not valid until after this is done.
 */
static void __init setup_node_to_cpumask_map(void)
{
	unsigned int node;

	/* setup nr_node_ids if not done yet */
	if (nr_node_ids == MAX_NUMNODES)
		setup_nr_node_ids();

	/* allocate the map */
	for_each_node(node)
		alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);

	/* cpumask_of_node() will now work */
	dbg("Node to cpumask map for %d nodes\n", nr_node_ids);
}

static int __init fake_numa_create_new_node(unsigned long end_pfn,
					    unsigned int *nid)
{
	unsigned long long mem;
	char *p = cmdline;
	static unsigned int fake_nid;
	static unsigned long long curr_boundary;

	/*
	 * Modify node id, iff we started creating NUMA nodes
	 * We want to continue from where we left off the last time
	 */
	if (fake_nid)
		*nid = fake_nid;
	/*
	 * In case there are no more arguments to parse, the
	 * node_id should be the same as the last fake node id
	 * (we've handled this above).
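	 *
	 * Illustrative example (derived from the parsing code below, not
	 * part of the original comment): booting with "numa=fake=1G,4G"
	 * leaves cmdline pointing at "1G,4G".  Memory below 1G stays in
	 * fake node 0, memory between 1G and 4G lands in fake node 1, and
	 * everything above 4G in fake node 2.  The boundaries are absolute
	 * addresses and must not decrease.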
	 */
	if (!p)
		return 0;

	mem = memparse(p, &p);
	if (!mem)
		return 0;

	if (mem < curr_boundary)
		return 0;

	curr_boundary = mem;

	if ((end_pfn << PAGE_SHIFT) > mem) {
		/*
		 * Skip commas and spaces
		 */
		while (*p == ',' || *p == ' ' || *p == '\t')
			p++;

		cmdline = p;
		fake_nid++;
		*nid = fake_nid;
		dbg("created new fake_node with id %d\n", fake_nid);
		return 1;
	}
	return 0;
}

static void reset_numa_cpu_lookup_table(void)
{
	unsigned int cpu;

	for_each_possible_cpu(cpu)
		numa_cpu_lookup_table[cpu] = -1;
}

static void update_numa_cpu_lookup_table(unsigned int cpu, int node)
{
	numa_cpu_lookup_table[cpu] = node;
}

static void map_cpu_to_node(int cpu, int node)
{
	update_numa_cpu_lookup_table(cpu, node);

	dbg("adding cpu %d to node %d\n", cpu, node);

	if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node])))
		cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
}

#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR)
static void unmap_cpu_from_node(unsigned long cpu)
{
	int node = numa_cpu_lookup_table[cpu];

	dbg("removing cpu %lu from node %d\n", cpu, node);

	if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) {
		cpumask_clear_cpu(cpu, node_to_cpumask_map[node]);
	} else {
		printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
		       cpu, node);
	}
}
#endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */

/* must hold reference to node during call */
static const __be32 *of_get_associativity(struct device_node *dev)
{
	return of_get_property(dev, "ibm,associativity", NULL);
}

int __node_distance(int a, int b)
{
	int i;
	int distance = LOCAL_DISTANCE;

	if (!form1_affinity)
		return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);

	for (i = 0; i < distance_ref_points_depth; i++) {
		if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
			break;

		/* Double the distance for each NUMA level */
		distance *= 2;
	}

	return distance;
}
EXPORT_SYMBOL(__node_distance);

static void initialize_distance_lookup_table(int nid,
					     const __be32 *associativity)
{
	int i;

	if (!form1_affinity)
		return;

	for (i = 0; i < distance_ref_points_depth; i++) {
		const __be32 *entry;

		entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
		distance_lookup_table[nid][i] = of_read_number(entry, 1);
	}
}

/* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
 * info is found.
 */
static int associativity_to_nid(const __be32 *associativity)
{
	int nid = -1;

	if (min_common_depth == -1)
		goto out;

	if (of_read_number(associativity, 1) >= min_common_depth)
		nid = of_read_number(&associativity[min_common_depth], 1);

	/* POWER4 LPAR uses 0xffff as invalid node */
	if (nid == 0xffff || nid >= MAX_NUMNODES)
		nid = -1;

	if (nid > 0 &&
	    of_read_number(associativity, 1) >= distance_ref_points_depth) {
		/*
		 * Skip the length field and send start of associativity array
		 */
		initialize_distance_lookup_table(nid, associativity + 1);
	}

out:
	return nid;
}

/* Returns the nid associated with the given device tree node,
 * or -1 if not found.
 */
static int of_node_to_nid_single(struct device_node *device)
{
	int nid = -1;
	const __be32 *tmp;

	tmp = of_get_associativity(device);
	if (tmp)
		nid = associativity_to_nid(tmp);
	return nid;
}

/* Walk the device tree upwards, looking for an associativity id */
int of_node_to_nid(struct device_node *device)
{
	int nid = -1;

	of_node_get(device);
	while (device) {
		nid = of_node_to_nid_single(device);
		if (nid != -1)
			break;

		device = of_get_next_parent(device);
	}
	of_node_put(device);

	return nid;
}
EXPORT_SYMBOL(of_node_to_nid);

static int __init find_min_common_depth(void)
{
	int depth;
	struct device_node *root;

	if (firmware_has_feature(FW_FEATURE_OPAL))
		root = of_find_node_by_path("/ibm,opal");
	else
		root = of_find_node_by_path("/rtas");
	if (!root)
		root = of_find_node_by_path("/");

	/*
	 * This property is a set of 32-bit integers, each representing
	 * an index into the ibm,associativity nodes.
	 *
	 * With form 0 affinity the first integer is for an SMP configuration
	 * (should be all 0's) and the second is for a normal NUMA
	 * configuration. We have only one level of NUMA.
	 *
	 * With form 1 affinity the first integer is the most significant
	 * NUMA boundary and the following are progressively less significant
	 * boundaries. There can be more than one level of NUMA.
	 */
	distance_ref_points = of_get_property(root,
					"ibm,associativity-reference-points",
					&distance_ref_points_depth);

	if (!distance_ref_points) {
		dbg("NUMA: ibm,associativity-reference-points not found.\n");
		goto err;
	}

	distance_ref_points_depth /= sizeof(int);

	if (firmware_has_feature(FW_FEATURE_OPAL) ||
	    firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) {
		dbg("Using form 1 affinity\n");
		form1_affinity = 1;
	}

	if (form1_affinity) {
		depth = of_read_number(distance_ref_points, 1);
	} else {
		if (distance_ref_points_depth < 2) {
			printk(KERN_WARNING "NUMA: "
				"short ibm,associativity-reference-points\n");
			goto err;
		}

		depth = of_read_number(&distance_ref_points[1], 1);
	}

	/*
	 * Warn and cap if the hardware supports more than
	 * MAX_DISTANCE_REF_POINTS domains.
	 */
	if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) {
		printk(KERN_WARNING "NUMA: distance array capped at "
			"%d entries\n", MAX_DISTANCE_REF_POINTS);
		distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
	}

	of_node_put(root);
	return depth;

err:
	of_node_put(root);
	return -1;
}

static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
{
	struct device_node *memory = NULL;

	memory = of_find_node_by_type(memory, "memory");
	if (!memory)
		panic("numa.c: No memory nodes found!");

	*n_addr_cells = of_n_addr_cells(memory);
	*n_size_cells = of_n_size_cells(memory);
	of_node_put(memory);
}

static unsigned long read_n_cells(int n, const __be32 **buf)
{
	unsigned long result = 0;

	while (n--) {
		result = (result << 32) | of_read_number(*buf, 1);
		(*buf)++;
	}
	return result;
}

struct assoc_arrays {
	u32	n_arrays;
	u32	array_sz;
	const __be32 *arrays;
};

/*
 * Retrieve and validate the list of associativity arrays for drconf
 * memory from the ibm,associativity-lookup-arrays property of the
 * device tree.
 *
 * The layout of the ibm,associativity-lookup-arrays property is a number N
 * indicating the number of associativity arrays, followed by a number M
 * indicating the size of each associativity array, followed by a list
 * of N associativity arrays.
 */
static int of_get_assoc_arrays(struct assoc_arrays *aa)
{
	struct device_node *memory;
	const __be32 *prop;
	u32 len;

	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (!memory)
		return -1;

	prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len);
	if (!prop || len < 2 * sizeof(unsigned int)) {
		of_node_put(memory);
		return -1;
	}

	aa->n_arrays = of_read_number(prop++, 1);
	aa->array_sz = of_read_number(prop++, 1);

	of_node_put(memory);

	/* Now that we know the number of arrays and size of each array,
	 * revalidate the size of the property read in.
	 */
	if (len < (aa->n_arrays * aa->array_sz + 2) * sizeof(unsigned int))
		return -1;

	aa->arrays = prop;
	return 0;
}

/*
 * This is like of_node_to_nid_single() for memory represented in the
 * ibm,dynamic-reconfiguration-memory node.
 */
static int of_drconf_to_nid_single(struct drmem_lmb *lmb)
{
	struct assoc_arrays aa = { .arrays = NULL };
	int default_nid = 0;
	int nid = default_nid;
	int rc, index;

	rc = of_get_assoc_arrays(&aa);
	if (rc)
		return default_nid;

	if (min_common_depth > 0 && min_common_depth <= aa.array_sz &&
	    !(lmb->flags & DRCONF_MEM_AI_INVALID) &&
	    lmb->aa_index < aa.n_arrays) {
		index = lmb->aa_index * aa.array_sz + min_common_depth - 1;
		nid = of_read_number(&aa.arrays[index], 1);

		if (nid == 0xffff || nid >= MAX_NUMNODES)
			nid = default_nid;

		if (nid > 0) {
			index = lmb->aa_index * aa.array_sz;
			initialize_distance_lookup_table(nid,
							 &aa.arrays[index]);
		}
	}

	return nid;
}

/*
 * Figure out to which domain a cpu belongs and stick it there.
 * Return the id of the domain used.
 */
static int numa_setup_cpu(unsigned long lcpu)
{
	int nid = -1;
	struct device_node *cpu;

	/*
	 * If a valid cpu-to-node mapping is already available, use it
	 * directly instead of querying the firmware, since it represents
	 * the most recent mapping notified to us by the platform (eg: VPHN).
	 */
	if ((nid = numa_cpu_lookup_table[lcpu]) >= 0) {
		map_cpu_to_node(lcpu, nid);
		return nid;
	}

	cpu = of_get_cpu_node(lcpu, NULL);

	if (!cpu) {
		WARN_ON(1);
		if (cpu_present(lcpu))
			goto out_present;
		else
			goto out;
	}

	nid = of_node_to_nid_single(cpu);

out_present:
	if (nid < 0 || !node_possible(nid))
		nid = first_online_node;

	map_cpu_to_node(lcpu, nid);
	of_node_put(cpu);
out:
	return nid;
}

static void verify_cpu_node_mapping(int cpu, int node)
{
	int base, sibling, i;

	/* Verify that all the threads in the core belong to the same node */
	base = cpu_first_thread_sibling(cpu);

	for (i = 0; i < threads_per_core; i++) {
		sibling = base + i;

		if (sibling == cpu || cpu_is_offline(sibling))
			continue;

		if (cpu_to_node(sibling) != node) {
			WARN(1, "CPU thread siblings %d and %d don't belong"
				" to the same node!\n", cpu, sibling);
			break;
		}
	}
}

/* Must run before sched domains notifier. */
static int ppc_numa_cpu_prepare(unsigned int cpu)
{
	int nid;

	nid = numa_setup_cpu(cpu);
	verify_cpu_node_mapping(cpu, nid);
	return 0;
}

static int ppc_numa_cpu_dead(unsigned int cpu)
{
#ifdef CONFIG_HOTPLUG_CPU
	unmap_cpu_from_node(cpu);
#endif
	return 0;
}

/*
 * Check and possibly modify a memory region to enforce the memory limit.
 *
 * Returns the size the region should have to enforce the memory limit.
 * This will either be the original value of size, a truncated value,
 * or zero. If the returned value of size is 0 the region should be
 * discarded as it lies wholly above the memory limit.
 */
static unsigned long __init numa_enforce_memory_limit(unsigned long start,
						      unsigned long size)
{
	/*
	 * We use memblock_end_of_DRAM() in here instead of memory_limit because
	 * we've already adjusted it for the limit and it takes care of
	 * having memory holes below the limit. Also, in the case of
	 * iommu_is_off, memory_limit is not set but is implicitly enforced.
	 */

	if (start + size <= memblock_end_of_DRAM())
		return size;

	if (start >= memblock_end_of_DRAM())
		return 0;

	return memblock_end_of_DRAM() - start;
}

/*
 * Reads the counter for a given entry in
 * linux,drconf-usable-memory property
 */
static inline int __init read_usm_ranges(const __be32 **usm)
{
	/*
	 * For each lmb in ibm,dynamic-memory a corresponding
	 * entry in linux,drconf-usable-memory property contains
	 * a counter followed by that many (base, size) tuples.
	 * Read the counter from linux,drconf-usable-memory.
	 */
	return read_n_cells(n_mem_size_cells, usm);
}

/*
 * Extract NUMA information from the ibm,dynamic-reconfiguration-memory
 * node. This assumes n_mem_{addr,size}_cells have been set.
 */
static void __init numa_setup_drmem_lmb(struct drmem_lmb *lmb,
					const __be32 **usm)
{
	unsigned int ranges, is_kexec_kdump = 0;
	unsigned long base, size, sz;
	int nid;

	/*
	 * Skip this block if the reserved bit is set in flags (0x80)
	 * or if the block is not assigned to this partition (0x8)
	 */
	if ((lmb->flags & DRCONF_MEM_RESERVED)
	    || !(lmb->flags & DRCONF_MEM_ASSIGNED))
		return;

	if (*usm)
		is_kexec_kdump = 1;

	base = lmb->base_addr;
	size = drmem_lmb_size();
	ranges = 1;

	if (is_kexec_kdump) {
		ranges = read_usm_ranges(usm);
		if (!ranges) /* there are no (base, size) tuples */
			return;
	}

	do {
		if (is_kexec_kdump) {
			base = read_n_cells(n_mem_addr_cells, usm);
			size = read_n_cells(n_mem_size_cells, usm);
		}

		nid = of_drconf_to_nid_single(lmb);
		fake_numa_create_new_node(((base + size) >> PAGE_SHIFT),
					  &nid);
		node_set_online(nid);
		sz = numa_enforce_memory_limit(base, size);
		if (sz)
			memblock_set_node(base, sz, &memblock.memory, nid);
	} while (--ranges);
}

static int __init parse_numa_properties(void)
{
	struct device_node *memory;
	int default_nid = 0;
	unsigned long i;

	if (numa_enabled == 0) {
		printk(KERN_WARNING "NUMA disabled by user\n");
		return -1;
	}

	min_common_depth = find_min_common_depth();

	if (min_common_depth < 0)
		return min_common_depth;

	dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);

	/*
	 * Even though we connect cpus to numa domains later in SMP
	 * init, we need to know the node ids now. This is because
	 * each node to be onlined must have NODE_DATA etc backing it.
	 */
	for_each_present_cpu(i) {
		struct device_node *cpu;
		int nid;

		cpu = of_get_cpu_node(i, NULL);
		BUG_ON(!cpu);
		nid = of_node_to_nid_single(cpu);
		of_node_put(cpu);

		/*
		 * Don't fall back to default_nid yet -- we will plug
		 * cpus into nodes once the memory scan has discovered
		 * the topology.
		 */
		if (nid < 0)
			continue;
		node_set_online(nid);
	}

	get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells);

	for_each_node_by_type(memory, "memory") {
		unsigned long start;
		unsigned long size;
		int nid;
		int ranges;
		const __be32 *memcell_buf;
		unsigned int len;

		memcell_buf = of_get_property(memory,
			"linux,usable-memory", &len);
		if (!memcell_buf || len <= 0)
			memcell_buf = of_get_property(memory, "reg", &len);
		if (!memcell_buf || len <= 0)
			continue;

		/* ranges in cell */
		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
new_range:
		/* these are order-sensitive, and modify the buffer pointer */
		start = read_n_cells(n_mem_addr_cells, &memcell_buf);
		size = read_n_cells(n_mem_size_cells, &memcell_buf);

		/*
		 * Assumption: either all memory nodes or none will
		 * have associativity properties. If none, then
		 * everything goes to default_nid.
		 */
		nid = of_node_to_nid_single(memory);
		if (nid < 0)
			nid = default_nid;

		fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
		node_set_online(nid);

		size = numa_enforce_memory_limit(start, size);
		if (size)
			memblock_set_node(start, size, &memblock.memory, nid);

		if (--ranges)
			goto new_range;
	}

	/*
	 * Now do the same thing for each MEMBLOCK listed in the
	 * ibm,dynamic-memory property in the
	 * ibm,dynamic-reconfiguration-memory node.
	 */
	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (memory) {
		walk_drmem_lmbs(memory, numa_setup_drmem_lmb);
		of_node_put(memory);
	}

	return 0;
}

static void __init setup_nonnuma(void)
{
	unsigned long top_of_ram = memblock_end_of_DRAM();
	unsigned long total_ram = memblock_phys_mem_size();
	unsigned long start_pfn, end_pfn;
	unsigned int nid = 0;
	struct memblock_region *reg;

	printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
	       top_of_ram, total_ram);
	printk(KERN_DEBUG "Memory hole size: %ldMB\n",
	       (top_of_ram - total_ram) >> 20);

	for_each_memblock(memory, reg) {
		start_pfn = memblock_region_memory_base_pfn(reg);
		end_pfn = memblock_region_memory_end_pfn(reg);

		fake_numa_create_new_node(end_pfn, &nid);
		memblock_set_node(PFN_PHYS(start_pfn),
				  PFN_PHYS(end_pfn - start_pfn),
				  &memblock.memory, nid);
		node_set_online(nid);
	}
}

void __init dump_numa_cpu_topology(void)
{
	unsigned int node;
	unsigned int cpu, count;

	if (min_common_depth == -1 || !numa_enabled)
		return;

	for_each_online_node(node) {
		pr_info("Node %d CPUs:", node);

		count = 0;
		/*
		 * If we used a CPU iterator here we would miss printing
		 * the holes in the cpumap.
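		 *
		 * Illustrative example (describing the loop below, not part
		 * of the original comment): with CPUs 0-7 and 16-23 mapped
		 * to node 0, this prints "Node 0 CPUs: 0-7 16-23"; a lone
		 * CPU is printed without a trailing range.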
		 */
		for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
			if (cpumask_test_cpu(cpu,
					node_to_cpumask_map[node])) {
				if (count == 0)
					pr_cont(" %u", cpu);
				++count;
			} else {
				if (count > 1)
					pr_cont("-%u", cpu - 1);
				count = 0;
			}
		}

		if (count > 1)
			pr_cont("-%u", nr_cpu_ids - 1);
		pr_cont("\n");
	}
}

/* Initialize NODE_DATA for a node on the local memory */
static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
{
	u64 spanned_pages = end_pfn - start_pfn;
	const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES);
	u64 nd_pa;
	void *nd;
	int tnid;

	nd_pa = memblock_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
	nd = __va(nd_pa);

	/* report and initialize */
	pr_info("  NODE_DATA [mem %#010Lx-%#010Lx]\n",
		nd_pa, nd_pa + nd_size - 1);
	tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
	if (tnid != nid)
		pr_info("    NODE_DATA(%d) on node %d\n", nid, tnid);

	node_data[nid] = nd;
	memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
	NODE_DATA(nid)->node_id = nid;
	NODE_DATA(nid)->node_start_pfn = start_pfn;
	NODE_DATA(nid)->node_spanned_pages = spanned_pages;
}

static void __init find_possible_nodes(void)
{
	struct device_node *rtas;
	u32 numnodes, i;

	if (min_common_depth <= 0)
		return;

	rtas = of_find_node_by_path("/rtas");
	if (!rtas)
		return;

	if (of_property_read_u32_index(rtas,
				"ibm,max-associativity-domains",
				min_common_depth, &numnodes))
		goto out;

	for (i = 0; i < numnodes; i++) {
		if (!node_possible(i))
			node_set(i, node_possible_map);
	}

out:
	of_node_put(rtas);
}

void __init initmem_init(void)
{
	int nid, cpu;

	max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
	max_pfn = max_low_pfn;

	if (parse_numa_properties())
		setup_nonnuma();

	memblock_dump_all();

	/*
	 * Modify the set of possible NUMA nodes to reflect information
	 * available about the set of online nodes, and the set of nodes
	 * that we expect to make use of for this platform's affinity
	 * calculations.
	 */
	nodes_and(node_possible_map, node_possible_map, node_online_map);

	find_possible_nodes();

	for_each_online_node(nid) {
		unsigned long start_pfn, end_pfn;

		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
		setup_node_data(nid, start_pfn, end_pfn);
		sparse_memory_present_with_active_regions(nid);
	}

	sparse_init();

	setup_node_to_cpumask_map();

	reset_numa_cpu_lookup_table();

	/*
	 * We need the numa_cpu_lookup_table to be accurate for all CPUs,
	 * even before we online them, so that we can use cpu_to_{node,mem}
	 * early in boot, cf. smp_prepare_cpus().
	 * _nocalls() + manual invocation is used because cpuhp is not yet
	 * initialized for the boot CPU.
	 */
	cpuhp_setup_state_nocalls(CPUHP_POWER_NUMA_PREPARE, "powerpc/numa:prepare",
				  ppc_numa_cpu_prepare, ppc_numa_cpu_dead);
	for_each_present_cpu(cpu)
		numa_setup_cpu(cpu);
}

static int __init early_numa(char *p)
{
	if (!p)
		return 0;

	if (strstr(p, "off"))
		numa_enabled = 0;

	if (strstr(p, "debug"))
		numa_debug = 1;

	p = strstr(p, "fake=");
	if (p)
		cmdline = p + strlen("fake=");

	return 0;
}
early_param("numa", early_numa);

static bool topology_updates_enabled = true;

static int __init early_topology_updates(char *p)
{
	if (!p)
		return 0;

	if (!strcmp(p, "off")) {
		pr_info("Disabling topology updates\n");
		topology_updates_enabled = false;
	}

	return 0;
}
early_param("topology_updates", early_topology_updates);

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Find the node associated with a hot added memory section for
 * memory represented in the device tree by the property
 * ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory.
 */
static int hot_add_drconf_scn_to_nid(unsigned long scn_addr)
{
	struct drmem_lmb *lmb;
	unsigned long lmb_size;
	int nid = -1;

	lmb_size = drmem_lmb_size();

	for_each_drmem_lmb(lmb) {
		/* skip this block if it is reserved or not assigned to
		 * this partition */
		if ((lmb->flags & DRCONF_MEM_RESERVED)
		    || !(lmb->flags & DRCONF_MEM_ASSIGNED))
			continue;

		if ((scn_addr < lmb->base_addr)
		    || (scn_addr >= (lmb->base_addr + lmb_size)))
			continue;

		nid = of_drconf_to_nid_single(lmb);
		break;
	}

	return nid;
}

/*
 * Find the node associated with a hot added memory section for memory
 * represented in the device tree as a node (i.e. memory@XXXX) for
 * each memblock.
 */
static int hot_add_node_scn_to_nid(unsigned long scn_addr)
{
	struct device_node *memory;
	int nid = -1;

	for_each_node_by_type(memory, "memory") {
		unsigned long start, size;
		int ranges;
		const __be32 *memcell_buf;
		unsigned int len;

		memcell_buf = of_get_property(memory, "reg", &len);
		if (!memcell_buf || len <= 0)
			continue;

		/* ranges in cell */
		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);

		while (ranges--) {
			start = read_n_cells(n_mem_addr_cells, &memcell_buf);
			size = read_n_cells(n_mem_size_cells, &memcell_buf);

			if ((scn_addr < start) || (scn_addr >= (start + size)))
				continue;

			nid = of_node_to_nid_single(memory);
			break;
		}

		if (nid >= 0)
			break;
	}

	of_node_put(memory);

	return nid;
}

/*
 * Find the node associated with a hot added memory section. Section
 * corresponds to a SPARSEMEM section, not a MEMBLOCK. It is assumed that
 * sections are fully contained within a single MEMBLOCK.
 */
int hot_add_scn_to_nid(unsigned long scn_addr)
{
	struct device_node *memory = NULL;
	int nid;

	if (!numa_enabled || (min_common_depth < 0))
		return first_online_node;

	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (memory) {
		nid = hot_add_drconf_scn_to_nid(scn_addr);
		of_node_put(memory);
	} else {
		nid = hot_add_node_scn_to_nid(scn_addr);
	}

	if (nid < 0 || !node_possible(nid))
		nid = first_online_node;

	return nid;
}

static u64 hot_add_drconf_memory_max(void)
{
	struct device_node *memory = NULL;
	struct device_node *dn = NULL;
	const __be64 *lrdr = NULL;

	dn = of_find_node_by_path("/rtas");
	if (dn) {
		lrdr = of_get_property(dn, "ibm,lrdr-capacity", NULL);
		of_node_put(dn);
		if (lrdr)
			return be64_to_cpup(lrdr);
	}

	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (memory) {
		of_node_put(memory);
		return drmem_lmb_memory_max();
	}
	return 0;
}

/*
 * memory_hotplug_max - return max address of memory that may be added
 *
 * This is currently only used on systems that support drconfig memory
 * hotplug.
 */
u64 memory_hotplug_max(void)
{
	return max(hot_add_drconf_memory_max(), memblock_end_of_DRAM());
}
#endif /* CONFIG_MEMORY_HOTPLUG */

/* Virtual Processor Home Node (VPHN) support */
#ifdef CONFIG_PPC_SPLPAR

#include "vphn.h"

struct topology_update_data {
	struct topology_update_data *next;
	unsigned int cpu;
	int old_nid;
	int new_nid;
};

#define TOPOLOGY_DEF_TIMER_SECS	60

static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS];
static cpumask_t cpu_associativity_changes_mask;
static int vphn_enabled;
static int prrn_enabled;
static void reset_topology_timer(void);
static int topology_timer_secs = 1;
static int topology_inited;
static int topology_update_needed;

/*
 * Change polling interval for associativity changes.
 */
int timed_topology_update(int nsecs)
{
	if (vphn_enabled) {
		if (nsecs > 0)
			topology_timer_secs = nsecs;
		else
			topology_timer_secs = TOPOLOGY_DEF_TIMER_SECS;

		reset_topology_timer();
	}

	return 0;
}

/*
 * Snapshot the current values of the associativity change counters
 * provided by the hypervisor.
 */
static void setup_cpu_associativity_change_counters(void)
{
	int cpu;

	/* The VPHN feature supports a maximum of 8 reference points */
	BUILD_BUG_ON(MAX_DISTANCE_REF_POINTS > 8);

	for_each_possible_cpu(cpu) {
		int i;
		u8 *counts = vphn_cpu_change_counts[cpu];
		volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts;

		for (i = 0; i < distance_ref_points_depth; i++)
			counts[i] = hypervisor_counts[i];
	}
}

/*
 * The hypervisor maintains a set of 8 associativity change counters in
 * the VPA of each cpu that correspond to the associativity levels in the
 * ibm,associativity-reference-points property. When an associativity
 * level changes, the corresponding counter is incremented.
 *
 * Set a bit in cpu_associativity_changes_mask for each cpu whose home
 * node associativity levels have changed.
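 *
 * Illustrative note (derived from the code below, not part of the
 * original comment): because the mask is built with cpu_sibling_mask(),
 * every hardware thread of an affected core is flagged together, even
 * when only one thread's counters have changed.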
 *
 * Returns the number of cpus with unhandled associativity changes.
 */
static int update_cpu_associativity_changes_mask(void)
{
	int cpu;
	cpumask_t *changes = &cpu_associativity_changes_mask;

	for_each_possible_cpu(cpu) {
		int i, changed = 0;
		u8 *counts = vphn_cpu_change_counts[cpu];
		volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts;

		for (i = 0; i < distance_ref_points_depth; i++) {
			if (hypervisor_counts[i] != counts[i]) {
				counts[i] = hypervisor_counts[i];
				changed = 1;
			}
		}
		if (changed) {
			cpumask_or(changes, changes, cpu_sibling_mask(cpu));
			cpu = cpu_last_thread_sibling(cpu);
		}
	}

	return cpumask_weight(changes);
}

/*
 * Retrieve the new associativity information for a virtual processor's
 * home node.
 */
static long hcall_vphn(unsigned long cpu, __be32 *associativity)
{
	long rc;
	long retbuf[PLPAR_HCALL9_BUFSIZE] = {0};
	u64 flags = 1;
	int hwcpu = get_hard_smp_processor_id(cpu);

	rc = plpar_hcall9(H_HOME_NODE_ASSOCIATIVITY, retbuf, flags, hwcpu);
	vphn_unpack_associativity(retbuf, associativity);

	return rc;
}

static long vphn_get_associativity(unsigned long cpu,
				   __be32 *associativity)
{
	long rc;

	rc = hcall_vphn(cpu, associativity);

	switch (rc) {
	case H_FUNCTION:
		printk(KERN_INFO
			"VPHN is not supported. Disabling polling...\n");
		stop_topology_update();
		break;
	case H_HARDWARE:
		printk(KERN_ERR
			"hcall_vphn() experienced a hardware fault "
			"preventing VPHN. Disabling polling...\n");
		stop_topology_update();
		break;
	case H_SUCCESS:
		dbg("VPHN hcall succeeded. Reset polling...\n");
		timed_topology_update(0);
		break;
	}

	return rc;
}

int find_and_online_cpu_nid(int cpu)
{
	__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
	int new_nid;

	/* Use associativity from first thread for all siblings */
	vphn_get_associativity(cpu, associativity);
	new_nid = associativity_to_nid(associativity);
	if (new_nid < 0 || !node_possible(new_nid))
		new_nid = first_online_node;

	if (NODE_DATA(new_nid) == NULL) {
#ifdef CONFIG_MEMORY_HOTPLUG
		/*
		 * Need to ensure that NODE_DATA is initialized for a node from
		 * available memory (see memblock_alloc_try_nid). If unable to
		 * init the node, then default to nearest node that has memory
		 * installed.
		 */
		if (try_online_node(new_nid))
			new_nid = first_online_node;
#else
		/*
		 * Default to using the nearest node that has memory installed.
		 * Otherwise, it would be necessary to patch the kernel MM code
		 * to deal with more memoryless-node error conditions.
		 */
		new_nid = first_online_node;
#endif
	}

	pr_debug("%s:%d cpu %d nid %d\n", __func__, __LINE__,
		 cpu, new_nid);
	return new_nid;
}

/*
 * Update the CPU maps and sysfs entries for a single CPU when its NUMA
 * characteristics change. This function doesn't perform any locking and is
 * only safe to call from stop_machine().
 */
static int update_cpu_topology(void *data)
{
	struct topology_update_data *update;
	unsigned long cpu;

	if (!data)
		return -EINVAL;

	cpu = smp_processor_id();

	for (update = data; update; update = update->next) {
		int new_nid = update->new_nid;
		if (cpu != update->cpu)
			continue;

		unmap_cpu_from_node(cpu);
		map_cpu_to_node(cpu, new_nid);
		set_cpu_numa_node(cpu, new_nid);
		set_cpu_numa_mem(cpu, local_memory_node(new_nid));
		vdso_getcpu_init();
	}

	return 0;
}

static int update_lookup_table(void *data)
{
	struct topology_update_data *update;

	if (!data)
		return -EINVAL;

	/*
	 * Upon topology update, the numa-cpu lookup table needs to be updated
	 * for all threads in the core, including offline CPUs, to ensure that
	 * future hotplug operations respect the cpu-to-node associativity
	 * properly.
	 */
	for (update = data; update; update = update->next) {
		int nid, base, j;

		nid = update->new_nid;
		base = cpu_first_thread_sibling(update->cpu);

		for (j = 0; j < threads_per_core; j++) {
			update_numa_cpu_lookup_table(base + j, nid);
		}
	}

	return 0;
}

/*
 * Update the node maps and sysfs entries for each cpu whose home node
 * has changed. Returns 1 when the topology has changed, and 0 otherwise.
 *
 * cpus_locked says whether we already hold cpu_hotplug_lock.
 */
int numa_update_cpu_topology(bool cpus_locked)
{
	unsigned int cpu, sibling, changed = 0;
	struct topology_update_data *updates, *ud;
	cpumask_t updated_cpus;
	struct device *dev;
	int weight, new_nid, i = 0;

	if (!prrn_enabled && !vphn_enabled) {
		if (!topology_inited)
			topology_update_needed = 1;
		return 0;
	}

	weight = cpumask_weight(&cpu_associativity_changes_mask);
	if (!weight)
		return 0;

	updates = kzalloc(weight * (sizeof(*updates)), GFP_KERNEL);
	if (!updates)
		return 0;

	cpumask_clear(&updated_cpus);

	for_each_cpu(cpu, &cpu_associativity_changes_mask) {
		/*
		 * If siblings aren't flagged for changes, the updates list
		 * will be too short. Skip this update and flag the siblings
		 * so that they are picked up on the next pass.
		 */
		if (!cpumask_subset(cpu_sibling_mask(cpu),
					&cpu_associativity_changes_mask)) {
			pr_info("Sibling bits not set for associativity "
					"change, cpu%d\n", cpu);
			cpumask_or(&cpu_associativity_changes_mask,
					&cpu_associativity_changes_mask,
					cpu_sibling_mask(cpu));
			cpu = cpu_last_thread_sibling(cpu);
			continue;
		}

		new_nid = find_and_online_cpu_nid(cpu);

		if (new_nid == numa_cpu_lookup_table[cpu]) {
			cpumask_andnot(&cpu_associativity_changes_mask,
					&cpu_associativity_changes_mask,
					cpu_sibling_mask(cpu));
			dbg("Assoc chg gives same node %d for cpu%d\n",
					new_nid, cpu);
			cpu = cpu_last_thread_sibling(cpu);
			continue;
		}

		for_each_cpu(sibling, cpu_sibling_mask(cpu)) {
			ud = &updates[i++];
			ud->next = &updates[i];
			ud->cpu = sibling;
			ud->new_nid = new_nid;
			ud->old_nid = numa_cpu_lookup_table[sibling];
			cpumask_set_cpu(sibling, &updated_cpus);
		}
		cpu = cpu_last_thread_sibling(cpu);
	}

	/*
	 * Prevent the walk over 'updates' from overflowing the array: the
	 * last entry filled in above has its 'next' pointer pointing one
	 * past the end, so terminate the list here.
	 */
	if (i)
		updates[i-1].next = NULL;

	pr_debug("Topology update for the following CPUs:\n");
	if (cpumask_weight(&updated_cpus)) {
		for (ud = &updates[0]; ud; ud = ud->next) {
			pr_debug("cpu %d moving from node %d "
					  "to %d\n", ud->cpu,
					  ud->old_nid, ud->new_nid);
		}
	}

	/*
	 * In cases where we have nothing to update (because the updates list
	 * is too short or because the new topology is the same as the old
	 * one), skip invoking update_cpu_topology() via stop_machine().
	 * This is necessary (and not just a fast-path optimization) since
	 * stop_machine can end up electing a random CPU to run
	 * update_cpu_topology(), and thus trick us into setting up incorrect
	 * cpu-node mappings (since 'updates' is kzalloc()'ed).
	 *
	 * For the same reason, skip the rest of the update below as well.
	 */
	if (!cpumask_weight(&updated_cpus))
		goto out;

	if (cpus_locked)
		stop_machine_cpuslocked(update_cpu_topology, &updates[0],
					&updated_cpus);
	else
		stop_machine(update_cpu_topology, &updates[0], &updated_cpus);

	/*
	 * Update the numa-cpu lookup table with the new mappings, even for
	 * offline CPUs. It is best to perform this update from the stop-
	 * machine context.
	 */
	if (cpus_locked)
		stop_machine_cpuslocked(update_lookup_table, &updates[0],
					cpumask_of(raw_smp_processor_id()));
	else
		stop_machine(update_lookup_table, &updates[0],
			     cpumask_of(raw_smp_processor_id()));

	for (ud = &updates[0]; ud; ud = ud->next) {
		unregister_cpu_under_node(ud->cpu, ud->old_nid);
		register_cpu_under_node(ud->cpu, ud->new_nid);

		dev = get_cpu_device(ud->cpu);
		if (dev)
			kobject_uevent(&dev->kobj, KOBJ_CHANGE);
		cpumask_clear_cpu(ud->cpu, &cpu_associativity_changes_mask);
		changed = 1;
	}

out:
	kfree(updates);
	topology_update_needed = 0;
	return changed;
}

int arch_update_cpu_topology(void)
{
	return numa_update_cpu_topology(true);
}

static void topology_work_fn(struct work_struct *work)
{
	rebuild_sched_domains();
}
static DECLARE_WORK(topology_work, topology_work_fn);

static void topology_schedule_update(void)
{
	schedule_work(&topology_work);
}

static void topology_timer_fn(struct timer_list *unused)
{
	if (prrn_enabled && cpumask_weight(&cpu_associativity_changes_mask))
		topology_schedule_update();
	else if (vphn_enabled) {
		if (update_cpu_associativity_changes_mask() > 0)
			topology_schedule_update();
		reset_topology_timer();
	}
}
static struct timer_list topology_timer;

static void reset_topology_timer(void)
{
	mod_timer(&topology_timer, jiffies + topology_timer_secs * HZ);
}

#ifdef CONFIG_SMP

static void stage_topology_update(int core_id)
{
	cpumask_or(&cpu_associativity_changes_mask,
		&cpu_associativity_changes_mask, cpu_sibling_mask(core_id));
	reset_topology_timer();
}

static int dt_update_callback(struct notifier_block *nb,
			      unsigned long action, void *data)
{
	struct of_reconfig_data *update = data;
	int rc = NOTIFY_DONE;

	switch (action) {
	case OF_RECONFIG_UPDATE_PROPERTY:
		if (!of_prop_cmp(update->dn->type, "cpu") &&
		    !of_prop_cmp(update->prop->name, "ibm,associativity")) {
			u32 core_id;

			of_property_read_u32(update->dn, "reg", &core_id);
			stage_topology_update(core_id);
			rc = NOTIFY_OK;
		}
		break;
	}

	return rc;
}

static struct notifier_block dt_update_nb = {
	.notifier_call = dt_update_callback,
};

#endif

/*
 * Start polling for associativity changes.
 */
int start_topology_update(void)
{
	int rc = 0;

	if (firmware_has_feature(FW_FEATURE_PRRN)) {
		if (!prrn_enabled) {
			prrn_enabled = 1;
#ifdef CONFIG_SMP
			rc = of_reconfig_notifier_register(&dt_update_nb);
#endif
		}
	}
	if (firmware_has_feature(FW_FEATURE_VPHN) &&
	    lppaca_shared_proc(get_lppaca())) {
		if (!vphn_enabled) {
			vphn_enabled = 1;
			setup_cpu_associativity_change_counters();
			timer_setup(&topology_timer, topology_timer_fn,
				    TIMER_DEFERRABLE);
			reset_topology_timer();
		}
	}

	return rc;
}

/*
 * Disable polling and notification for associativity changes.
 */
int stop_topology_update(void)
{
	int rc = 0;

	if (prrn_enabled) {
		prrn_enabled = 0;
#ifdef CONFIG_SMP
		rc = of_reconfig_notifier_unregister(&dt_update_nb);
#endif
	}
	if (vphn_enabled) {
		vphn_enabled = 0;
		rc = del_timer_sync(&topology_timer);
	}

	return rc;
}

int prrn_is_enabled(void)
{
	return prrn_enabled;
}

static int topology_read(struct seq_file *file, void *v)
{
	if (vphn_enabled || prrn_enabled)
		seq_puts(file, "on\n");
	else
		seq_puts(file, "off\n");

	return 0;
}

static int topology_open(struct inode *inode, struct file *file)
{
	return single_open(file, topology_read, NULL);
}

static ssize_t topology_write(struct file *file, const char __user *buf,
			      size_t count, loff_t *off)
{
	char kbuf[4]; /* "on" or "off" plus null. */
	int read_len;

	read_len = count < 3 ? count : 3;
	if (copy_from_user(kbuf, buf, read_len))
		return -EINVAL;

	kbuf[read_len] = '\0';

	if (!strncmp(kbuf, "on", 2))
		start_topology_update();
	else if (!strncmp(kbuf, "off", 3))
		stop_topology_update();
	else
		return -EINVAL;

	return count;
}

static const struct file_operations topology_ops = {
	.read = seq_read,
	.write = topology_write,
	.open = topology_open,
	.release = single_release
};

static int topology_update_init(void)
{
	/* Do not poll for changes if disabled at boot */
	if (topology_updates_enabled)
		start_topology_update();

	if (vphn_enabled)
		topology_schedule_update();

	if (!proc_create("powerpc/topology_updates", 0644, NULL, &topology_ops))
		return -ENOMEM;

	topology_inited = 1;
	if (topology_update_needed)
		bitmap_fill(cpumask_bits(&cpu_associativity_changes_mask),
			    nr_cpumask_bits);

	return 0;
}
device_initcall(topology_update_init);
#endif /* CONFIG_PPC_SPLPAR */