xref: /openbmc/linux/arch/powerpc/mm/numa.c (revision 545e4006)
1 /*
2  * pSeries NUMA support
3  *
4  * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version
9  * 2 of the License, or (at your option) any later version.
10  */
11 #include <linux/threads.h>
12 #include <linux/bootmem.h>
13 #include <linux/init.h>
14 #include <linux/mm.h>
15 #include <linux/mmzone.h>
16 #include <linux/module.h>
17 #include <linux/nodemask.h>
18 #include <linux/cpu.h>
19 #include <linux/notifier.h>
20 #include <linux/lmb.h>
21 #include <linux/of.h>
22 #include <asm/sparsemem.h>
23 #include <asm/prom.h>
24 #include <asm/system.h>
25 #include <asm/smp.h>
26 
27 static int numa_enabled = 1;
28 
29 static char *cmdline __initdata;
30 
31 static int numa_debug;
32 #define dbg(args...) do { if (numa_debug) printk(KERN_INFO args); } while (0)
33 
34 int numa_cpu_lookup_table[NR_CPUS];
35 cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES];
36 struct pglist_data *node_data[MAX_NUMNODES];
37 
38 EXPORT_SYMBOL(numa_cpu_lookup_table);
39 EXPORT_SYMBOL(numa_cpumask_lookup_table);
40 EXPORT_SYMBOL(node_data);
41 
42 static int min_common_depth;
43 static int n_mem_addr_cells, n_mem_size_cells;
44 
45 static int __cpuinit fake_numa_create_new_node(unsigned long end_pfn,
46 						unsigned int *nid)
47 {
48 	unsigned long long mem;
49 	char *p = cmdline;
50 	static unsigned int fake_nid;
51 	static unsigned long long curr_boundary;
52 
53 	/*
54 	 * Modify node id iff we have already started creating NUMA nodes.
55 	 * We want to continue from where we left off the last time.
56 	 */
57 	if (fake_nid)
58 		*nid = fake_nid;
59 	/*
60 	 * In case there are no more arguments to parse, the
61 	 * node_id should be the same as the last fake node id
62 	 * (we've handled this above).
63 	 */
64 	if (!p)
65 		return 0;
66 
67 	mem = memparse(p, &p);
68 	if (!mem)
69 		return 0;
70 
71 	if (mem < curr_boundary)
72 		return 0;
73 
74 	curr_boundary = mem;
75 
76 	if ((end_pfn << PAGE_SHIFT) > mem) {
77 		/*
78 		 * Skip commas and spaces
79 		 */
80 		while (*p == ',' || *p == ' ' || *p == '\t')
81 			p++;
82 
83 		cmdline = p;
84 		fake_nid++;
85 		*nid = fake_nid;
86 		dbg("created new fake_node with id %d\n", fake_nid);
87 		return 1;
88 	}
89 	return 0;
90 }
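/*
 * Editor's illustrative sketch of the loop above (hypothetical command
 * line): with "numa=fake=1G,4G", cmdline initially points at "1G,4G".
 * Memory regions ending below 1G stay in the current fake node; the
 * first region whose end crosses 1G advances cmdline past the comma,
 * bumps fake_nid to 1 and reports it via *nid, and crossing 4G later
 * bumps it to 2.  Each value is a cumulative physical-address boundary
 * parsed with memparse(), so K/M/G suffixes are accepted.
 */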
91 
92 static void __cpuinit map_cpu_to_node(int cpu, int node)
93 {
94 	numa_cpu_lookup_table[cpu] = node;
95 
96 	dbg("adding cpu %d to node %d\n", cpu, node);
97 
98 	if (!(cpu_isset(cpu, numa_cpumask_lookup_table[node])))
99 		cpu_set(cpu, numa_cpumask_lookup_table[node]);
100 }
101 
102 #ifdef CONFIG_HOTPLUG_CPU
103 static void unmap_cpu_from_node(unsigned long cpu)
104 {
105 	int node = numa_cpu_lookup_table[cpu];
106 
107 	dbg("removing cpu %lu from node %d\n", cpu, node);
108 
109 	if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
110 		cpu_clear(cpu, numa_cpumask_lookup_table[node]);
111 	} else {
112 		printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
113 		       cpu, node);
114 	}
115 }
116 #endif /* CONFIG_HOTPLUG_CPU */
117 
118 static struct device_node * __cpuinit find_cpu_node(unsigned int cpu)
119 {
120 	unsigned int hw_cpuid = get_hard_smp_processor_id(cpu);
121 	struct device_node *cpu_node = NULL;
122 	const unsigned int *interrupt_server, *reg;
123 	int len;
124 
125 	while ((cpu_node = of_find_node_by_type(cpu_node, "cpu")) != NULL) {
126 		/* Try interrupt server first */
127 		interrupt_server = of_get_property(cpu_node,
128 					"ibm,ppc-interrupt-server#s", &len);
129 
130 		len = len / sizeof(u32);
131 
132 		if (interrupt_server && (len > 0)) {
133 			while (len--) {
134 				if (interrupt_server[len] == hw_cpuid)
135 					return cpu_node;
136 			}
137 		} else {
138 			reg = of_get_property(cpu_node, "reg", &len);
139 			if (reg && (len > 0) && (reg[0] == hw_cpuid))
140 				return cpu_node;
141 		}
142 	}
143 
144 	return NULL;
145 }
146 
147 /* must hold reference to node during call */
148 static const int *of_get_associativity(struct device_node *dev)
149 {
150 	return of_get_property(dev, "ibm,associativity", NULL);
151 }
152 
153 /* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
154  * info is found.
155  */
156 static int of_node_to_nid_single(struct device_node *device)
157 {
158 	int nid = -1;
159 	const unsigned int *tmp;
160 
161 	if (min_common_depth == -1)
162 		goto out;
163 
164 	tmp = of_get_associativity(device);
165 	if (!tmp)
166 		goto out;
167 
168 	if (tmp[0] >= min_common_depth)
169 		nid = tmp[min_common_depth];
170 
171 	/* POWER4 LPAR uses 0xffff as invalid node */
172 	if (nid == 0xffff || nid >= MAX_NUMNODES)
173 		nid = -1;
174 out:
175 	return nid;
176 }
177 
178 /* Walk the device tree upwards, looking for an associativity id */
179 int of_node_to_nid(struct device_node *device)
180 {
181 	struct device_node *tmp;
182 	int nid = -1;
183 
184 	of_node_get(device);
185 	while (device) {
186 		nid = of_node_to_nid_single(device);
187 		if (nid != -1)
188 			break;
189 
190 		tmp = device;
191 		device = of_get_parent(tmp);
192 		of_node_put(tmp);
193 	}
194 	of_node_put(device);
195 
196 	return nid;
197 }
198 EXPORT_SYMBOL_GPL(of_node_to_nid);
199 
200 /*
201  * In theory, the "ibm,associativity" property may contain multiple
202  * associativity lists because a resource may be multiply connected
203  * into the machine.  This resource then has different associativity
204  * characteristics relative to its multiple connections.  We ignore
205  * this for now.  We also assume that all cpu and memory sets have
206  * their distances represented at a common level.  This won't be
207  * true for hierarchical NUMA.
208  *
209  * In any case the ibm,associativity-reference-points should give
210  * the correct depth for a normal NUMA system.
211  *
212  * - Dave Hansen <haveblue@us.ibm.com>
213  */
214 static int __init find_min_common_depth(void)
215 {
216 	int depth;
217 	const unsigned int *ref_points;
218 	struct device_node *rtas_root;
219 	unsigned int len;
220 
221 	rtas_root = of_find_node_by_path("/rtas");
222 
223 	if (!rtas_root)
224 		return -1;
225 
226 	/*
227 	 * this property is 2 32-bit integers, each representing a level of
228 	 * depth in the associativity nodes.  The first is for an SMP
229 	 * configuration (should be all 0's) and the second is for a normal
230 	 * NUMA configuration.
231 	 */
232 	ref_points = of_get_property(rtas_root,
233 			"ibm,associativity-reference-points", &len);
234 
235 	if (ref_points && (len >= 2 * sizeof(unsigned int))) {
236 		depth = ref_points[1];
237 	} else {
238 		dbg("NUMA: ibm,associativity-reference-points not found.\n");
239 		depth = -1;
240 	}
241 	of_node_put(rtas_root);
242 
243 	return depth;
244 }
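/*
 * Editor's worked example with hypothetical property values: if
 * ibm,associativity-reference-points were <2 2>, find_min_common_depth()
 * above would return ref_points[1] == 2.  of_node_to_nid_single() would
 * then take a device whose ibm,associativity property is <3 0 1 4>
 * (tmp[0] == 3 levels follow) and, because tmp[0] >= 2, report
 * nid = tmp[2] == 1.
 */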
245 
246 static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
247 {
248 	struct device_node *memory = NULL;
249 
250 	memory = of_find_node_by_type(memory, "memory");
251 	if (!memory)
252 		panic("numa.c: No memory nodes found!");
253 
254 	*n_addr_cells = of_n_addr_cells(memory);
255 	*n_size_cells = of_n_size_cells(memory);
256 	of_node_put(memory);
257 }
258 
259 static unsigned long __devinit read_n_cells(int n, const unsigned int **buf)
260 {
261 	unsigned long result = 0;
262 
263 	while (n--) {
264 		result = (result << 32) | **buf;
265 		(*buf)++;
266 	}
267 	return result;
268 }
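/*
 * Editor's example: with n == 2 and cells { 0x1, 0x80000000 }, the loop
 * above folds two 32-bit device-tree cells into the 64-bit value
 * (0x1ULL << 32) | 0x80000000 == 0x180000000, leaving *buf pointing at
 * the next unread cell.
 */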
269 
270 struct of_drconf_cell {
271 	u64	base_addr;
272 	u32	drc_index;
273 	u32	reserved;
274 	u32	aa_index;
275 	u32	flags;
276 };
277 
278 #define DRCONF_MEM_ASSIGNED	0x00000008
279 #define DRCONF_MEM_AI_INVALID	0x00000040
280 #define DRCONF_MEM_RESERVED	0x00000080
281 
282 /*
283  * Read the next lmb list entry from the ibm,dynamic-memory property
284  * and return the information in the provided of_drconf_cell structure.
285  */
286 static void read_drconf_cell(struct of_drconf_cell *drmem, const u32 **cellp)
287 {
288 	const u32 *cp;
289 
290 	drmem->base_addr = read_n_cells(n_mem_addr_cells, cellp);
291 
292 	cp = *cellp;
293 	drmem->drc_index = cp[0];
294 	drmem->reserved = cp[1];
295 	drmem->aa_index = cp[2];
296 	drmem->flags = cp[3];
297 
298 	*cellp = cp + 4;
299 }
300 
301 /*
302  * Retrieve and validate the ibm,dynamic-memory property of the device tree.
303  *
304  * The layout of the ibm,dynamic-memory property is a count N of lmb
305  * list entries followed by the N lmb list entries themselves.  Each lmb
306  * list entry contains the information laid out in struct of_drconf_cell above.
307  */
308 static int of_get_drconf_memory(struct device_node *memory, const u32 **dm)
309 {
310 	const u32 *prop;
311 	u32 len, entries;
312 
313 	prop = of_get_property(memory, "ibm,dynamic-memory", &len);
314 	if (!prop || len < sizeof(unsigned int))
315 		return 0;
316 
317 	entries = *prop++;
318 
319 	/* Now that we know the number of entries, revalidate the size
320 	 * of the property read in to ensure we have everything
321 	 */
322 	if (len < (entries * (n_mem_addr_cells + 4) + 1) * sizeof(unsigned int))
323 		return 0;
324 
325 	*dm = prop;
326 	return entries;
327 }
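/*
 * Editor's sketch of the length check above, assuming n_mem_addr_cells == 2:
 * each lmb list entry then spans 2 + 4 == 6 cells (base_addr plus the
 * drc_index, reserved, aa_index and flags words of struct of_drconf_cell),
 * so a property announcing 32 entries must be at least
 * (32 * 6 + 1) * sizeof(unsigned int) == 772 bytes, the extra cell being
 * the leading entry count itself.
 */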
328 
329 /*
330  * Retrieve and validate the ibm,lmb-size property for drconf memory
331  * from the device tree.
332  */
333 static u64 of_get_lmb_size(struct device_node *memory)
334 {
335 	const u32 *prop;
336 	u32 len;
337 
338 	prop = of_get_property(memory, "ibm,lmb-size", &len);
339 	if (!prop || len < sizeof(unsigned int))
340 		return 0;
341 
342 	return read_n_cells(n_mem_size_cells, &prop);
343 }
344 
345 struct assoc_arrays {
346 	u32	n_arrays;
347 	u32	array_sz;
348 	const u32 *arrays;
349 };
350 
351 /*
352  * Retrieve and validate the list of associativity arrays for drconf
353  * memory from the ibm,associativity-lookup-arrays property of the
354  * device tree.
355  *
356  * The layout of the ibm,associativity-lookup-arrays property is a number N
357  * indicating the number of associativity arrays, followed by a number M
358  * indicating the size of each associativity array, followed by a list
359  * of N associativity arrays.
360  */
361 static int of_get_assoc_arrays(struct device_node *memory,
362 			       struct assoc_arrays *aa)
363 {
364 	const u32 *prop;
365 	u32 len;
366 
367 	prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len);
368 	if (!prop || len < 2 * sizeof(unsigned int))
369 		return -1;
370 
371 	aa->n_arrays = *prop++;
372 	aa->array_sz = *prop++;
373 
374 	/* Now that we know the number of arrays and size of each array,
375 	 * revalidate the size of the property read in.
376 	 */
377 	if (len < (aa->n_arrays * aa->array_sz + 2) * sizeof(unsigned int))
378 		return -1;
379 
380 	aa->arrays = prop;
381 	return 0;
382 }
383 
384 /*
385  * This is like of_node_to_nid_single() for memory represented in the
386  * ibm,dynamic-reconfiguration-memory node.
387  */
388 static int of_drconf_to_nid_single(struct of_drconf_cell *drmem,
389 				   struct assoc_arrays *aa)
390 {
391 	int default_nid = 0;
392 	int nid = default_nid;
393 	int index;
394 
395 	if (min_common_depth > 0 && min_common_depth <= aa->array_sz &&
396 	    !(drmem->flags & DRCONF_MEM_AI_INVALID) &&
397 	    drmem->aa_index < aa->n_arrays) {
398 		index = drmem->aa_index * aa->array_sz + min_common_depth - 1;
399 		nid = aa->arrays[index];
400 
401 		if (nid == 0xffff || nid >= MAX_NUMNODES)
402 			nid = default_nid;
403 	}
404 
405 	return nid;
406 }
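/*
 * Editor's worked example with hypothetical values: given
 * min_common_depth == 3, aa->array_sz == 4, aa->n_arrays == 2 and
 * aa->arrays == { 0, 0, 0, 0,  0, 0, 5, 0 }, an lmb whose aa_index is 1
 * (and whose DRCONF_MEM_AI_INVALID flag is clear) resolves to
 * index = 1 * 4 + 3 - 1 == 6, hence nid = aa->arrays[6] == 5.
 */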
407 
408 /*
409  * Figure out to which domain a cpu belongs and stick it there.
410  * Return the id of the domain used.
411  */
412 static int __cpuinit numa_setup_cpu(unsigned long lcpu)
413 {
414 	int nid = 0;
415 	struct device_node *cpu = find_cpu_node(lcpu);
416 
417 	if (!cpu) {
418 		WARN_ON(1);
419 		goto out;
420 	}
421 
422 	nid = of_node_to_nid_single(cpu);
423 
424 	if (nid < 0 || !node_online(nid))
425 		nid = any_online_node(NODE_MASK_ALL);
426 out:
427 	map_cpu_to_node(lcpu, nid);
428 
429 	of_node_put(cpu);
430 
431 	return nid;
432 }
433 
434 static int __cpuinit cpu_numa_callback(struct notifier_block *nfb,
435 			     unsigned long action,
436 			     void *hcpu)
437 {
438 	unsigned long lcpu = (unsigned long)hcpu;
439 	int ret = NOTIFY_DONE;
440 
441 	switch (action) {
442 	case CPU_UP_PREPARE:
443 	case CPU_UP_PREPARE_FROZEN:
444 		numa_setup_cpu(lcpu);
445 		ret = NOTIFY_OK;
446 		break;
447 #ifdef CONFIG_HOTPLUG_CPU
448 	case CPU_DEAD:
449 	case CPU_DEAD_FROZEN:
450 	case CPU_UP_CANCELED:
451 	case CPU_UP_CANCELED_FROZEN:
452 		unmap_cpu_from_node(lcpu);
453 		ret = NOTIFY_OK;
454 		break;
455 #endif
456 	}
457 	return ret;
458 }
459 
460 /*
461  * Check and possibly modify a memory region to enforce the memory limit.
462  *
463  * Returns the size the region should have to enforce the memory limit.
464  * This will either be the original value of size, a truncated value,
465  * or zero. If the returned value of size is 0 the region should be
466  * discarded as it lies wholly above the memory limit.
467  */
468 static unsigned long __init numa_enforce_memory_limit(unsigned long start,
469 						      unsigned long size)
470 {
471 	/*
472 	 * We use lmb_end_of_DRAM() in here instead of memory_limit because
473 	 * we've already adjusted it for the limit and it takes care of
474 	 * having memory holes below the limit.
475 	 */
476 
477 	if (!memory_limit)
478 		return size;
479 
480 	if (start + size <= lmb_end_of_DRAM())
481 		return size;
482 
483 	if (start >= lmb_end_of_DRAM())
484 		return 0;
485 
486 	return lmb_end_of_DRAM() - start;
487 }
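/*
 * Editor's example: if a mem= limit left lmb_end_of_DRAM() at 0x40000000,
 * a region starting at 0x30000000 with size 0x20000000 is trimmed above
 * to 0x40000000 - 0x30000000 == 0x10000000, while a region starting at
 * or beyond 0x40000000 returns size 0 and is discarded by the callers.
 */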
488 
489 /*
490  * Extract NUMA information from the ibm,dynamic-reconfiguration-memory
491  * node.  This assumes n_mem_{addr,size}_cells have been set.
492  */
493 static void __init parse_drconf_memory(struct device_node *memory)
494 {
495 	const u32 *dm;
496 	unsigned int n, rc;
497 	unsigned long lmb_size, size;
498 	int nid;
499 	struct assoc_arrays aa;
500 
501 	n = of_get_drconf_memory(memory, &dm);
502 	if (!n)
503 		return;
504 
505 	lmb_size = of_get_lmb_size(memory);
506 	if (!lmb_size)
507 		return;
508 
509 	rc = of_get_assoc_arrays(memory, &aa);
510 	if (rc)
511 		return;
512 
513 	for (; n != 0; --n) {
514 		struct of_drconf_cell drmem;
515 
516 		read_drconf_cell(&drmem, &dm);
517 
518 		/* skip this block if the reserved bit is set in flags (0x80)
519 		   or if the block is not assigned to this partition (0x8) */
520 		if ((drmem.flags & DRCONF_MEM_RESERVED)
521 		    || !(drmem.flags & DRCONF_MEM_ASSIGNED))
522 			continue;
523 
524 		nid = of_drconf_to_nid_single(&drmem, &aa);
525 
526 		fake_numa_create_new_node(
527 				((drmem.base_addr + lmb_size) >> PAGE_SHIFT),
528 					   &nid);
529 
530 		node_set_online(nid);
531 
532 		size = numa_enforce_memory_limit(drmem.base_addr, lmb_size);
533 		if (!size)
534 			continue;
535 
536 		add_active_range(nid, drmem.base_addr >> PAGE_SHIFT,
537 				 (drmem.base_addr >> PAGE_SHIFT)
538 				 + (size >> PAGE_SHIFT));
539 	}
540 }
541 
542 static int __init parse_numa_properties(void)
543 {
544 	struct device_node *cpu = NULL;
545 	struct device_node *memory = NULL;
546 	int default_nid = 0;
547 	unsigned long i;
548 
549 	if (numa_enabled == 0) {
550 		printk(KERN_WARNING "NUMA disabled by user\n");
551 		return -1;
552 	}
553 
554 	min_common_depth = find_min_common_depth();
555 
556 	if (min_common_depth < 0)
557 		return min_common_depth;
558 
559 	dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
560 
561 	/*
562 	 * Even though we connect cpus to numa domains later in SMP
563 	 * init, we need to know the node ids now. This is because
564 	 * each node to be onlined must have NODE_DATA etc backing it.
565 	 */
566 	for_each_present_cpu(i) {
567 		int nid;
568 
569 		cpu = find_cpu_node(i);
570 		BUG_ON(!cpu);
571 		nid = of_node_to_nid_single(cpu);
572 		of_node_put(cpu);
573 
574 		/*
575 		 * Don't fall back to default_nid yet -- we will plug
576 		 * cpus into nodes once the memory scan has discovered
577 		 * the topology.
578 		 */
579 		if (nid < 0)
580 			continue;
581 		node_set_online(nid);
582 	}
583 
584 	get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells);
585 	memory = NULL;
586 	while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
587 		unsigned long start;
588 		unsigned long size;
589 		int nid;
590 		int ranges;
591 		const unsigned int *memcell_buf;
592 		unsigned int len;
593 
594 		memcell_buf = of_get_property(memory,
595 			"linux,usable-memory", &len);
596 		if (!memcell_buf || len <= 0)
597 			memcell_buf = of_get_property(memory, "reg", &len);
598 		if (!memcell_buf || len <= 0)
599 			continue;
600 
601 		/* ranges in cell */
602 		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
603 new_range:
604 		/* these are order-sensitive, and modify the buffer pointer */
605 		start = read_n_cells(n_mem_addr_cells, &memcell_buf);
606 		size = read_n_cells(n_mem_size_cells, &memcell_buf);
607 
608 		/*
609 		 * Assumption: either all memory nodes or none will
610 		 * have associativity properties.  If none, then
611 		 * everything goes to default_nid.
612 		 */
613 		nid = of_node_to_nid_single(memory);
614 		if (nid < 0)
615 			nid = default_nid;
616 
617 		fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
618 		node_set_online(nid);
619 
620 		if (!(size = numa_enforce_memory_limit(start, size))) {
621 			if (--ranges)
622 				goto new_range;
623 			else
624 				continue;
625 		}
626 
627 		add_active_range(nid, start >> PAGE_SHIFT,
628 				(start >> PAGE_SHIFT) + (size >> PAGE_SHIFT));
629 
630 		if (--ranges)
631 			goto new_range;
632 	}
633 
634 	/*
635 	 * Now do the same thing for each LMB listed in the ibm,dynamic-memory
636 	 * property in the ibm,dynamic-reconfiguration-memory node.
637 	 */
638 	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
639 	if (memory)
640 		parse_drconf_memory(memory);
641 
642 	return 0;
643 }
644 
645 static void __init setup_nonnuma(void)
646 {
647 	unsigned long top_of_ram = lmb_end_of_DRAM();
648 	unsigned long total_ram = lmb_phys_mem_size();
649 	unsigned long start_pfn, end_pfn;
650 	unsigned int i, nid = 0;
651 
652 	printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
653 	       top_of_ram, total_ram);
654 	printk(KERN_DEBUG "Memory hole size: %ldMB\n",
655 	       (top_of_ram - total_ram) >> 20);
656 
657 	for (i = 0; i < lmb.memory.cnt; ++i) {
658 		start_pfn = lmb.memory.region[i].base >> PAGE_SHIFT;
659 		end_pfn = start_pfn + lmb_size_pages(&lmb.memory, i);
660 
661 		fake_numa_create_new_node(end_pfn, &nid);
662 		add_active_range(nid, start_pfn, end_pfn);
663 		node_set_online(nid);
664 	}
665 }
666 
667 void __init dump_numa_cpu_topology(void)
668 {
669 	unsigned int node;
670 	unsigned int cpu, count;
671 
672 	if (min_common_depth == -1 || !numa_enabled)
673 		return;
674 
675 	for_each_online_node(node) {
676 		printk(KERN_DEBUG "Node %d CPUs:", node);
677 
678 		count = 0;
679 		/*
680 		 * If we used a CPU iterator here we would miss printing
681 		 * the holes in the cpumap.
682 		 */
683 		for (cpu = 0; cpu < NR_CPUS; cpu++) {
684 			if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
685 				if (count == 0)
686 					printk(" %u", cpu);
687 				++count;
688 			} else {
689 				if (count > 1)
690 					printk("-%u", cpu - 1);
691 				count = 0;
692 			}
693 		}
694 
695 		if (count > 1)
696 			printk("-%u", NR_CPUS - 1);
697 		printk("\n");
698 	}
699 }
700 
701 static void __init dump_numa_memory_topology(void)
702 {
703 	unsigned int node;
704 	unsigned int count;
705 
706 	if (min_common_depth == -1 || !numa_enabled)
707 		return;
708 
709 	for_each_online_node(node) {
710 		unsigned long i;
711 
712 		printk(KERN_DEBUG "Node %d Memory:", node);
713 
714 		count = 0;
715 
716 		for (i = 0; i < lmb_end_of_DRAM();
717 		     i += (1 << SECTION_SIZE_BITS)) {
718 			if (early_pfn_to_nid(i >> PAGE_SHIFT) == node) {
719 				if (count == 0)
720 					printk(" 0x%lx", i);
721 				++count;
722 			} else {
723 				if (count > 0)
724 					printk("-0x%lx", i);
725 				count = 0;
726 			}
727 		}
728 
729 		if (count > 0)
730 			printk("-0x%lx", i);
731 		printk("\n");
732 	}
733 }
734 
735 /*
736  * Allocate some memory, using the lmb or bootmem allocator as
737  * required. nid is the preferred node and end_pfn is the page frame
738  * number of the highest address in the node.
739  *
740  * Returns the physical address of the memory.
741  */
742 static void __init *careful_allocation(int nid, unsigned long size,
743 				       unsigned long align,
744 				       unsigned long end_pfn)
745 {
746 	int new_nid;
747 	unsigned long ret = __lmb_alloc_base(size, align, end_pfn << PAGE_SHIFT);
748 
749 	/* retry over all memory */
750 	if (!ret)
751 		ret = __lmb_alloc_base(size, align, lmb_end_of_DRAM());
752 
753 	if (!ret)
754 		panic("numa.c: cannot allocate %lu bytes on node %d",
755 		      size, nid);
756 
757 	/*
758 	 * If the memory came from a previously allocated node, we must
759 	 * retry with the bootmem allocator.
760 	 */
761 	new_nid = early_pfn_to_nid(ret >> PAGE_SHIFT);
762 	if (new_nid < nid) {
763 		ret = (unsigned long)__alloc_bootmem_node(NODE_DATA(new_nid),
764 				size, align, 0);
765 
766 		if (!ret)
767 			panic("numa.c: cannot allocate %lu bytes on node %d",
768 			      size, new_nid);
769 
770 		ret = __pa(ret);
771 
772 		dbg("alloc_bootmem %lx %lx\n", ret, size);
773 	}
774 
775 	return (void *)ret;
776 }
777 
778 static struct notifier_block __cpuinitdata ppc64_numa_nb = {
779 	.notifier_call = cpu_numa_callback,
780 	.priority = 1 /* Must run before sched domains notifier. */
781 };
782 
783 void __init do_init_bootmem(void)
784 {
785 	int nid;
786 	unsigned int i;
787 
788 	min_low_pfn = 0;
789 	max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
790 	max_pfn = max_low_pfn;
791 
792 	if (parse_numa_properties())
793 		setup_nonnuma();
794 	else
795 		dump_numa_memory_topology();
796 
797 	register_cpu_notifier(&ppc64_numa_nb);
798 	cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE,
799 			  (void *)(unsigned long)boot_cpuid);
800 
801 	for_each_online_node(nid) {
802 		unsigned long start_pfn, end_pfn;
803 		unsigned long bootmem_paddr;
804 		unsigned long bootmap_pages;
805 
806 		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
807 
808 		/* Allocate the node structure node local if possible */
809 		NODE_DATA(nid) = careful_allocation(nid,
810 					sizeof(struct pglist_data),
811 					SMP_CACHE_BYTES, end_pfn);
812 		NODE_DATA(nid) = __va(NODE_DATA(nid));
813 		memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
814 
815 		dbg("node %d\n", nid);
816 		dbg("NODE_DATA() = %p\n", NODE_DATA(nid));
817 
818 		NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
819 		NODE_DATA(nid)->node_start_pfn = start_pfn;
820 		NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;
821 
822 		if (NODE_DATA(nid)->node_spanned_pages == 0)
823 			continue;
824 
825 		dbg("start_paddr = %lx\n", start_pfn << PAGE_SHIFT);
826 		dbg("end_paddr = %lx\n", end_pfn << PAGE_SHIFT);
827 
828 		bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
829 		bootmem_paddr = (unsigned long)careful_allocation(nid,
830 					bootmap_pages << PAGE_SHIFT,
831 					PAGE_SIZE, end_pfn);
832 		memset(__va(bootmem_paddr), 0, bootmap_pages << PAGE_SHIFT);
833 
834 		dbg("bootmap_paddr = %lx\n", bootmem_paddr);
835 
836 		init_bootmem_node(NODE_DATA(nid), bootmem_paddr >> PAGE_SHIFT,
837 				  start_pfn, end_pfn);
838 
839 		free_bootmem_with_active_regions(nid, end_pfn);
840 
841 		/* Mark reserved regions on this node */
842 		for (i = 0; i < lmb.reserved.cnt; i++) {
843 			unsigned long physbase = lmb.reserved.region[i].base;
844 			unsigned long size = lmb.reserved.region[i].size;
845 			unsigned long start_paddr = start_pfn << PAGE_SHIFT;
846 			unsigned long end_paddr = end_pfn << PAGE_SHIFT;
847 
848 			if (early_pfn_to_nid(physbase >> PAGE_SHIFT) != nid &&
849 			    early_pfn_to_nid((physbase+size-1) >> PAGE_SHIFT) != nid)
850 				continue;
851 
852 			if (physbase < end_paddr &&
853 			    (physbase+size) > start_paddr) {
854 				/* overlaps */
855 				if (physbase < start_paddr) {
856 					size -= start_paddr - physbase;
857 					physbase = start_paddr;
858 				}
859 
860 				if (size > end_paddr - physbase)
861 					size = end_paddr - physbase;
862 
863 				dbg("reserve_bootmem %lx %lx\n", physbase,
864 				    size);
865 				reserve_bootmem_node(NODE_DATA(nid), physbase,
866 						     size, BOOTMEM_DEFAULT);
867 			}
868 		}
869 
870 		sparse_memory_present_with_active_regions(nid);
871 	}
872 }
873 
874 void __init paging_init(void)
875 {
876 	unsigned long max_zone_pfns[MAX_NR_ZONES];
877 	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
878 	max_zone_pfns[ZONE_DMA] = lmb_end_of_DRAM() >> PAGE_SHIFT;
879 	free_area_init_nodes(max_zone_pfns);
880 }
881 
882 static int __init early_numa(char *p)
883 {
884 	if (!p)
885 		return 0;
886 
887 	if (strstr(p, "off"))
888 		numa_enabled = 0;
889 
890 	if (strstr(p, "debug"))
891 		numa_debug = 1;
892 
893 	p = strstr(p, "fake=");
894 	if (p)
895 		cmdline = p + strlen("fake=");
896 
897 	return 0;
898 }
899 early_param("numa", early_numa);
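/*
 * Editor's note on the parser above: the options are matched with strstr(),
 * so they may be combined in a single argument, e.g. "numa=debug,fake=1G,4G"
 * enables debug output and hands "1G,4G" to fake_numa_create_new_node(),
 * while "numa=off" disables NUMA setup entirely.
 */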
900 
901 #ifdef CONFIG_MEMORY_HOTPLUG
902 /*
903  * Validate the node associated with the memory section we are
904  * trying to add.
905  */
906 int valid_hot_add_scn(int *nid, unsigned long start, u32 lmb_size,
907 		      unsigned long scn_addr)
908 {
909 	nodemask_t nodes;
910 
911 	if (*nid < 0 || !node_online(*nid))
912 		*nid = any_online_node(NODE_MASK_ALL);
913 
914 	if ((scn_addr >= start) && (scn_addr < (start + lmb_size))) {
915 		nodes_setall(nodes);
916 		while (NODE_DATA(*nid)->node_spanned_pages == 0) {
917 			node_clear(*nid, nodes);
918 			*nid = any_online_node(nodes);
919 		}
920 
921 		return 1;
922 	}
923 
924 	return 0;
925 }
926 
927 /*
928  * Find the node associated with a hot added memory section represented
929  * by the ibm,dynamic-reconfiguration-memory node.
930  */
931 static int hot_add_drconf_scn_to_nid(struct device_node *memory,
932 				     unsigned long scn_addr)
933 {
934 	const u32 *dm;
935 	unsigned int n, rc;
936 	unsigned long lmb_size;
937 	int default_nid = any_online_node(NODE_MASK_ALL);
938 	int nid;
939 	struct assoc_arrays aa;
940 
941 	n = of_get_drconf_memory(memory, &dm);
942 	if (!n)
943 		return default_nid;
944 
945 	lmb_size = of_get_lmb_size(memory);
946 	if (!lmb_size)
947 		return default_nid;
948 
949 	rc = of_get_assoc_arrays(memory, &aa);
950 	if (rc)
951 		return default_nid;
952 
953 	for (; n != 0; --n) {
954 		struct of_drconf_cell drmem;
955 
956 		read_drconf_cell(&drmem, &dm);
957 
958 		/* skip this block if it is reserved or not assigned to
959 		 * this partition */
960 		if ((drmem.flags & DRCONF_MEM_RESERVED)
961 		    || !(drmem.flags & DRCONF_MEM_ASSIGNED))
962 			continue;
963 
964 		nid = of_drconf_to_nid_single(&drmem, &aa);
965 
966 		if (valid_hot_add_scn(&nid, drmem.base_addr, lmb_size,
967 				      scn_addr))
968 			return nid;
969 	}
970 
971 	BUG();	/* section address should be found above */
972 	return 0;
973 }
974 
975 /*
976  * Find the node associated with a hot added memory section.  Section
977  * corresponds to a SPARSEMEM section, not an LMB.  It is assumed that
978  * sections are fully contained within a single LMB.
979  */
980 int hot_add_scn_to_nid(unsigned long scn_addr)
981 {
982 	struct device_node *memory = NULL;
983 	int nid;
984 
985 	if (!numa_enabled || (min_common_depth < 0))
986 		return any_online_node(NODE_MASK_ALL);
987 
988 	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
989 	if (memory) {
990 		nid = hot_add_drconf_scn_to_nid(memory, scn_addr);
991 		of_node_put(memory);
992 		return nid;
993 	}
994 
995 	while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
996 		unsigned long start, size;
997 		int ranges;
998 		const unsigned int *memcell_buf;
999 		unsigned int len;
1000 
1001 		memcell_buf = of_get_property(memory, "reg", &len);
1002 		if (!memcell_buf || len <= 0)
1003 			continue;
1004 
1005 		/* ranges in cell */
1006 		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
1007 ha_new_range:
1008 		start = read_n_cells(n_mem_addr_cells, &memcell_buf);
1009 		size = read_n_cells(n_mem_size_cells, &memcell_buf);
1010 		nid = of_node_to_nid_single(memory);
1011 
1012 		if (valid_hot_add_scn(&nid, start, size, scn_addr)) {
1013 			of_node_put(memory);
1014 			return nid;
1015 		}
1016 
1017 		if (--ranges)		/* process all ranges in cell */
1018 			goto ha_new_range;
1019 	}
1020 	BUG();	/* section address should be found above */
1021 	return 0;
1022 }
1023 #endif /* CONFIG_MEMORY_HOTPLUG */
1024