xref: /openbmc/linux/arch/powerpc/mm/numa.c (revision f42b3800)
1 /*
2  * pSeries NUMA support
3  *
4  * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version
9  * 2 of the License, or (at your option) any later version.
10  */
11 #include <linux/threads.h>
12 #include <linux/bootmem.h>
13 #include <linux/init.h>
14 #include <linux/mm.h>
15 #include <linux/mmzone.h>
16 #include <linux/module.h>
17 #include <linux/nodemask.h>
18 #include <linux/cpu.h>
19 #include <linux/notifier.h>
20 #include <asm/sparsemem.h>
21 #include <asm/lmb.h>
22 #include <asm/system.h>
23 #include <asm/smp.h>
24 
25 static int numa_enabled = 1;
26 
27 static char *cmdline __initdata;
28 
29 static int numa_debug;
30 #define dbg(args...) do { if (numa_debug) printk(KERN_INFO args); } while (0)
31 
32 int numa_cpu_lookup_table[NR_CPUS];
33 cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES];
34 struct pglist_data *node_data[MAX_NUMNODES];
35 
36 EXPORT_SYMBOL(numa_cpu_lookup_table);
37 EXPORT_SYMBOL(numa_cpumask_lookup_table);
38 EXPORT_SYMBOL(node_data);
39 
40 static bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES];
41 static int min_common_depth;
42 static int n_mem_addr_cells, n_mem_size_cells;
43 
44 static int __cpuinit fake_numa_create_new_node(unsigned long end_pfn,
45 						unsigned int *nid)
46 {
47 	unsigned long long mem;
48 	char *p = cmdline;
49 	static unsigned int fake_nid;
50 	static unsigned long long curr_boundary;
51 
52 	/*
53 	 * Modify node id, iff we started creating NUMA nodes
54 	 * We want to continue from where we left off the last time
55 	 */
56 	if (fake_nid)
57 		*nid = fake_nid;
58 	/*
59 	 * In case there are no more arguments to parse, the
60 	 * node_id should be the same as the last fake node id
61 	 * (we've handled this above).
62 	 */
63 	if (!p)
64 		return 0;
65 
66 	mem = memparse(p, &p);
67 	if (!mem)
68 		return 0;
69 
70 	if (mem < curr_boundary)
71 		return 0;
72 
73 	curr_boundary = mem;
74 
75 	if ((end_pfn << PAGE_SHIFT) > mem) {
76 		/*
77 		 * Skip commas and spaces
78 		 */
79 		while (*p == ',' || *p == ' ' || *p == '\t')
80 			p++;
81 
82 		cmdline = p;
83 		fake_nid++;
84 		*nid = fake_nid;
85 		dbg("created new fake_node with id %d\n", fake_nid);
86 		return 1;
87 	}
88 	return 0;
89 }
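
/*
 * A standalone sketch of how a "numa=fake=" boundary list (picked up by
 * early_numa() at the bottom of this file) carves memory into fake nodes.
 * parse_size() is a simplified stand-in for the kernel's memparse(), and the
 * boundary string is illustrative; memory above the last boundary stays in
 * the last fake node, just as the function above leaves *nid untouched once
 * the command line is exhausted.
 */
#include <stdio.h>
#include <stdlib.h>

/* Parse "512M"-style sizes; only K/M/G suffixes are handled here. */
static unsigned long long parse_size(const char *s, const char **next)
{
	char *end;
	unsigned long long v = strtoull(s, &end, 0);

	switch (*end) {
	case 'G': v <<= 30; end++; break;
	case 'M': v <<= 20; end++; break;
	case 'K': v <<= 10; end++; break;
	}
	*next = end;
	return v;
}

int main(void)
{
	const char *p = "512M,1G,2G";	/* as given via numa=fake=512M,1G,2G */
	unsigned long long prev = 0;
	unsigned int fake_nid = 0;

	while (*p) {
		unsigned long long boundary = parse_size(p, &p);

		/* skip commas and spaces, as the kernel parser does */
		while (*p == ',' || *p == ' ' || *p == '\t')
			p++;

		printf("fake node %u: [0x%llx, 0x%llx)\n",
		       fake_nid++, prev, boundary);
		prev = boundary;
	}
	return 0;
}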
90 
91 static void __cpuinit map_cpu_to_node(int cpu, int node)
92 {
93 	numa_cpu_lookup_table[cpu] = node;
94 
95 	dbg("adding cpu %d to node %d\n", cpu, node);
96 
97 	if (!(cpu_isset(cpu, numa_cpumask_lookup_table[node])))
98 		cpu_set(cpu, numa_cpumask_lookup_table[node]);
99 }
100 
101 #ifdef CONFIG_HOTPLUG_CPU
102 static void unmap_cpu_from_node(unsigned long cpu)
103 {
104 	int node = numa_cpu_lookup_table[cpu];
105 
106 	dbg("removing cpu %lu from node %d\n", cpu, node);
107 
108 	if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
109 		cpu_clear(cpu, numa_cpumask_lookup_table[node]);
110 	} else {
111 		printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
112 		       cpu, node);
113 	}
114 }
115 #endif /* CONFIG_HOTPLUG_CPU */
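
/*
 * The two lookup tables declared above are kept consistent by
 * map_cpu_to_node() and unmap_cpu_from_node(). A standalone userspace
 * sketch with plain arrays standing in for cpumask_t; the CPU and node
 * counts are made up.
 */
#include <stdio.h>

#define EX_NR_CPUS	4
#define EX_MAX_NODES	2

static int ex_cpu_to_node[EX_NR_CPUS];
static unsigned long ex_node_cpumask[EX_MAX_NODES];	/* bit n == cpu n */

static void ex_map_cpu_to_node(int cpu, int node)
{
	ex_cpu_to_node[cpu] = node;
	ex_node_cpumask[node] |= 1UL << cpu;
}

static void ex_unmap_cpu_from_node(int cpu)
{
	ex_node_cpumask[ex_cpu_to_node[cpu]] &= ~(1UL << cpu);
}

int main(void)
{
	ex_map_cpu_to_node(0, 0);
	ex_map_cpu_to_node(1, 0);
	ex_map_cpu_to_node(2, 1);
	ex_map_cpu_to_node(3, 1);
	ex_unmap_cpu_from_node(1);	/* e.g. CPU 1 is hot-unplugged */

	/* prints: node 0 mask = 0x1, node 1 mask = 0xc */
	printf("node 0 mask = 0x%lx, node 1 mask = 0x%lx\n",
	       ex_node_cpumask[0], ex_node_cpumask[1]);
	return 0;
}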
116 
117 static struct device_node * __cpuinit find_cpu_node(unsigned int cpu)
118 {
119 	unsigned int hw_cpuid = get_hard_smp_processor_id(cpu);
120 	struct device_node *cpu_node = NULL;
121 	const unsigned int *interrupt_server, *reg;
122 	int len;
123 
124 	while ((cpu_node = of_find_node_by_type(cpu_node, "cpu")) != NULL) {
125 		/* Try interrupt server first */
126 		interrupt_server = of_get_property(cpu_node,
127 					"ibm,ppc-interrupt-server#s", &len);
128 
129 		len = len / sizeof(u32);
130 
131 		if (interrupt_server && (len > 0)) {
132 			while (len--) {
133 				if (interrupt_server[len] == hw_cpuid)
134 					return cpu_node;
135 			}
136 		} else {
137 			reg = of_get_property(cpu_node, "reg", &len);
138 			if (reg && (len > 0) && (reg[0] == hw_cpuid))
139 				return cpu_node;
140 		}
141 	}
142 
143 	return NULL;
144 }
145 
146 /* must hold reference to node during call */
147 static const int *of_get_associativity(struct device_node *dev)
148 {
149 	return of_get_property(dev, "ibm,associativity", NULL);
150 }
151 
152 /* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
153  * info is found.
154  */
155 static int of_node_to_nid_single(struct device_node *device)
156 {
157 	int nid = -1;
158 	const unsigned int *tmp;
159 
160 	if (min_common_depth == -1)
161 		goto out;
162 
163 	tmp = of_get_associativity(device);
164 	if (!tmp)
165 		goto out;
166 
167 	if (tmp[0] >= min_common_depth)
168 		nid = tmp[min_common_depth];
169 
170 	/* POWER4 LPAR uses 0xffff as invalid node */
171 	if (nid == 0xffff || nid >= MAX_NUMNODES)
172 		nid = -1;
173 out:
174 	return nid;
175 }
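
/*
 * A worked example of the lookup above. The first cell of "ibm,associativity"
 * holds the number of levels that follow (which is what the tmp[0] check
 * relies on), and the cell at index min_common_depth is the node id. The
 * property contents and node limit below are hypothetical.
 */
#include <stdio.h>

#define EX_MAX_NUMNODES 16

static int ex_assoc_to_nid(const unsigned int *assoc, int min_common_depth)
{
	int nid = -1;

	if (min_common_depth > 0 && assoc[0] >= (unsigned int)min_common_depth)
		nid = assoc[min_common_depth];

	/* 0xffff marks an invalid node, exactly as in of_node_to_nid_single() */
	if (nid == 0xffff || nid >= EX_MAX_NUMNODES)
		nid = -1;
	return nid;
}

int main(void)
{
	unsigned int assoc[] = { 4, 0, 0, 1, 2 };	/* 4 levels, nid 2 at depth 4 */

	printf("depth 4 -> nid %d\n", ex_assoc_to_nid(assoc, 4));	/* 2 */
	printf("depth 6 -> nid %d\n", ex_assoc_to_nid(assoc, 6));	/* -1: too shallow */
	return 0;
}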
176 
177 /* Walk the device tree upwards, looking for an associativity id */
178 int of_node_to_nid(struct device_node *device)
179 {
180 	struct device_node *tmp;
181 	int nid = -1;
182 
183 	of_node_get(device);
184 	while (device) {
185 		nid = of_node_to_nid_single(device);
186 		if (nid != -1)
187 			break;
188 
189 		tmp = device;
190 		device = of_get_parent(tmp);
191 		of_node_put(tmp);
192 	}
193 	of_node_put(device);
194 
195 	return nid;
196 }
197 EXPORT_SYMBOL_GPL(of_node_to_nid);
198 
199 /*
200  * In theory, the "ibm,associativity" property may contain multiple
201  * associativity lists because a resource may be multiply connected
202  * into the machine.  This resource then has different associativity
203  * characteristics relative to its multiple connections.  We ignore
204  * this for now.  We also assume that all cpu and memory sets have
205  * their distances represented at a common level.  This won't be
206  * true for hierarchical NUMA.
207  *
208  * In any case the ibm,associativity-reference-points should give
209  * the correct depth for a normal NUMA system.
210  *
211  * - Dave Hansen <haveblue@us.ibm.com>
212  */
213 static int __init find_min_common_depth(void)
214 {
215 	int depth;
216 	const unsigned int *ref_points;
217 	struct device_node *rtas_root;
218 	unsigned int len;
219 
220 	rtas_root = of_find_node_by_path("/rtas");
221 
222 	if (!rtas_root)
223 		return -1;
224 
225 	/*
226 	 * This property is 2 32-bit integers, each representing a level of
227 	 * depth in the associativity nodes.  The first is for an SMP
228 	 * configuration (should be all 0's) and the second is for a normal
229 	 * NUMA configuration.
230 	 */
231 	ref_points = of_get_property(rtas_root,
232 			"ibm,associativity-reference-points", &len);
233 
234 	if (ref_points && (len >= 2 * sizeof(unsigned int))) {
235 		depth = ref_points[1];
236 	} else {
237 		dbg("NUMA: ibm,associativity-reference-points not found.\n");
238 		depth = -1;
239 	}
240 	of_node_put(rtas_root);
241 
242 	return depth;
243 }
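
/*
 * With hypothetical "ibm,associativity-reference-points" contents, the depth
 * comes out of the second cell exactly as read above; that depth is then the
 * index used in the associativity example after of_node_to_nid_single().
 */
#include <stdio.h>

int main(void)
{
	unsigned int ref_points[] = { 0x4, 0x4 };	/* hypothetical contents */
	unsigned int len = sizeof(ref_points);		/* property length in bytes */
	int depth = -1;

	if (len >= 2 * sizeof(unsigned int))
		depth = ref_points[1];			/* the NUMA entry */

	printf("min_common_depth = %d\n", depth);	/* 4 */
	return 0;
}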
244 
245 static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
246 {
247 	struct device_node *memory = NULL;
248 
249 	memory = of_find_node_by_type(memory, "memory");
250 	if (!memory)
251 		panic("numa.c: No memory nodes found!");
252 
253 	*n_addr_cells = of_n_addr_cells(memory);
254 	*n_size_cells = of_n_size_cells(memory);
255 	of_node_put(memory);
256 }
257 
258 static unsigned long __devinit read_n_cells(int n, const unsigned int **buf)
259 {
260 	unsigned long result = 0;
261 
262 	while (n--) {
263 		result = (result << 32) | **buf;
264 		(*buf)++;
265 	}
266 	return result;
267 }
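
/*
 * A quick standalone check of the cell-combining logic above: two 32-bit
 * cells { 0x00000001, 0x23456789 } form the 64-bit value 0x123456789.
 * unsigned long long is used here for portability; the kernel version
 * returns unsigned long, which is 64-bit on ppc64.
 */
#include <stdio.h>

static unsigned long long ex_read_n_cells(int n, const unsigned int **buf)
{
	unsigned long long result = 0;

	while (n--) {
		result = (result << 32) | **buf;
		(*buf)++;
	}
	return result;
}

int main(void)
{
	unsigned int cells[] = { 0x00000001, 0x23456789 };
	const unsigned int *p = cells;

	printf("0x%llx\n", ex_read_n_cells(2, &p));	/* 0x123456789 */
	return 0;
}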
268 
269 /*
270  * Figure out to which domain a cpu belongs and stick it there.
271  * Return the id of the domain used.
272  */
273 static int __cpuinit numa_setup_cpu(unsigned long lcpu)
274 {
275 	int nid = 0;
276 	struct device_node *cpu = find_cpu_node(lcpu);
277 
278 	if (!cpu) {
279 		WARN_ON(1);
280 		goto out;
281 	}
282 
283 	nid = of_node_to_nid_single(cpu);
284 
285 	if (nid < 0 || !node_online(nid))
286 		nid = any_online_node(NODE_MASK_ALL);
287 out:
288 	map_cpu_to_node(lcpu, nid);
289 
290 	of_node_put(cpu);
291 
292 	return nid;
293 }
294 
295 static int __cpuinit cpu_numa_callback(struct notifier_block *nfb,
296 			     unsigned long action,
297 			     void *hcpu)
298 {
299 	unsigned long lcpu = (unsigned long)hcpu;
300 	int ret = NOTIFY_DONE;
301 
302 	switch (action) {
303 	case CPU_UP_PREPARE:
304 	case CPU_UP_PREPARE_FROZEN:
305 		numa_setup_cpu(lcpu);
306 		ret = NOTIFY_OK;
307 		break;
308 #ifdef CONFIG_HOTPLUG_CPU
309 	case CPU_DEAD:
310 	case CPU_DEAD_FROZEN:
311 	case CPU_UP_CANCELED:
312 	case CPU_UP_CANCELED_FROZEN:
313 		unmap_cpu_from_node(lcpu);
314 		ret = NOTIFY_OK;
315 		break;
316 #endif
317 	}
318 	return ret;
319 }
320 
321 /*
322  * Check and possibly modify a memory region to enforce the memory limit.
323  *
324  * Returns the size the region should have to enforce the memory limit.
325  * This will either be the original value of size, a truncated value,
326  * or zero. If the returned value of size is 0 the region should be
327  * discarded as it lies wholly above the memory limit.
328  */
329 static unsigned long __init numa_enforce_memory_limit(unsigned long start,
330 						      unsigned long size)
331 {
332 	/*
333 	 * We use lmb_end_of_DRAM() in here instead of memory_limit because
334 	 * we've already adjusted it for the limit and it takes care of
335 	 * having memory holes below the limit.
336 	 */
337 
338 	if (!memory_limit)
339 		return size;
340 
341 	if (start + size <= lmb_end_of_DRAM())
342 		return size;
343 
344 	if (start >= lmb_end_of_DRAM())
345 		return 0;
346 
347 	return lmb_end_of_DRAM() - start;
348 }
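
/*
 * Worked example of the truncation above, assuming lmb_end_of_DRAM() has
 * already been clamped to a 2GB mem= limit (all addresses are illustrative):
 * a region wholly below the limit is kept, one straddling it is truncated,
 * and one wholly above it is discarded.
 */
#include <stdio.h>

#define EX_END_OF_DRAM	0x80000000UL	/* 2GB, stand-in for lmb_end_of_DRAM() */

static unsigned long ex_enforce_limit(unsigned long start, unsigned long size)
{
	if (start + size <= EX_END_OF_DRAM)
		return size;			/* fully below the limit */
	if (start >= EX_END_OF_DRAM)
		return 0;			/* fully above: discard */
	return EX_END_OF_DRAM - start;		/* straddles: truncate */
}

int main(void)
{
	printf("0x%lx\n", ex_enforce_limit(0x40000000UL, 0x20000000UL)); /* 0x20000000 */
	printf("0x%lx\n", ex_enforce_limit(0x60000000UL, 0x40000000UL)); /* 0x20000000 */
	printf("0x%lx\n", ex_enforce_limit(0xc0000000UL, 0x10000000UL)); /* 0x0 */
	return 0;
}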
349 
350 /*
351  * Extract NUMA information from the ibm,dynamic-reconfiguration-memory
352  * node.  This assumes n_mem_{addr,size}_cells have been set.
353  */
354 static void __init parse_drconf_memory(struct device_node *memory)
355 {
356 	const unsigned int *lm, *dm, *aa;
357 	unsigned int ls, ld, la;
358 	unsigned int n, aam, aalen;
359 	unsigned long lmb_size, size, start;
360 	int nid, default_nid = 0;
361 	unsigned int ai, flags;
362 
363 	lm = of_get_property(memory, "ibm,lmb-size", &ls);
364 	dm = of_get_property(memory, "ibm,dynamic-memory", &ld);
365 	aa = of_get_property(memory, "ibm,associativity-lookup-arrays", &la);
366 	if (!lm || !dm || !aa ||
367 	    ls < sizeof(unsigned int) || ld < sizeof(unsigned int) ||
368 	    la < 2 * sizeof(unsigned int))
369 		return;
370 
371 	lmb_size = read_n_cells(n_mem_size_cells, &lm);
372 	n = *dm++;		/* number of LMBs */
373 	aam = *aa++;		/* number of associativity lists */
374 	aalen = *aa++;		/* length of each associativity list */
375 	if (ld < (n * (n_mem_addr_cells + 4) + 1) * sizeof(unsigned int) ||
376 	    la < (aam * aalen + 2) * sizeof(unsigned int))
377 		return;
378 
379 	for (; n != 0; --n) {
380 		start = read_n_cells(n_mem_addr_cells, &dm);
381 		ai = dm[2];
382 		flags = dm[3];
383 		dm += 4;
384 		/* 0x80 == reserved, 0x08 == assigned to us */
385 		if ((flags & 0x80) || !(flags & 0x8))
386 			continue;
387 		nid = default_nid;
388 		/* flags & 0x40 means associativity index is invalid */
389 		if (min_common_depth > 0 && min_common_depth <= aalen &&
390 		    (flags & 0x40) == 0 && ai < aam) {
391 			/* this is like of_node_to_nid_single */
392 			nid = aa[ai * aalen + min_common_depth - 1];
393 			if (nid == 0xffff || nid >= MAX_NUMNODES)
394 				nid = default_nid;
395 		}
396 
397 		fake_numa_create_new_node(((start + lmb_size) >> PAGE_SHIFT),
398 						&nid);
399 		node_set_online(nid);
400 
401 		size = numa_enforce_memory_limit(start, lmb_size);
402 		if (!size)
403 			continue;
404 
405 		add_active_range(nid, start >> PAGE_SHIFT,
406 				 (start >> PAGE_SHIFT) + (size >> PAGE_SHIFT));
407 	}
408 }
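
/*
 * A standalone sketch of the ibm,dynamic-memory walk above, with a fabricated
 * two-entry property: two address cells per entry followed by drc-index,
 * reserved, associativity-index and flags words, which is the layout the loop
 * above steps over (the lmb-size and associativity-lookup-arrays parts are
 * left out for brevity).
 */
#include <stdio.h>

static unsigned long long ex_read_cells(int n, const unsigned int **buf)
{
	unsigned long long v = 0;

	while (n--)
		v = (v << 32) | *(*buf)++;
	return v;
}

int main(void)
{
	/* count, then per LMB: addr-hi, addr-lo, drc-index, reserved, assoc-index, flags */
	const unsigned int dyn_mem[] = {
		2,
		0x0, 0x00000000, 0x10000000, 0, 0, 0x08,	/* assigned to us  */
		0x0, 0x10000000, 0x10000001, 0, 1, 0x80,	/* reserved: skip  */
	};
	const unsigned int *dm = dyn_mem;
	unsigned int n = *dm++;

	while (n--) {
		unsigned long long start = ex_read_cells(2, &dm);
		unsigned int ai = dm[2], flags = dm[3];

		dm += 4;
		/* 0x80 == reserved, 0x08 == assigned to this partition */
		if ((flags & 0x80) || !(flags & 0x8))
			continue;
		printf("LMB at 0x%llx, associativity index %u\n", start, ai);
	}
	return 0;
}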
409 
410 static int __init parse_numa_properties(void)
411 {
412 	struct device_node *cpu = NULL;
413 	struct device_node *memory = NULL;
414 	int default_nid = 0;
415 	unsigned long i;
416 
417 	if (numa_enabled == 0) {
418 		printk(KERN_WARNING "NUMA disabled by user\n");
419 		return -1;
420 	}
421 
422 	min_common_depth = find_min_common_depth();
423 
424 	if (min_common_depth < 0)
425 		return min_common_depth;
426 
427 	dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
428 
429 	/*
430 	 * Even though we connect cpus to numa domains later in SMP
431 	 * init, we need to know the node ids now. This is because
432 	 * each node to be onlined must have NODE_DATA etc backing it.
433 	 */
434 	for_each_present_cpu(i) {
435 		int nid;
436 
437 		cpu = find_cpu_node(i);
438 		BUG_ON(!cpu);
439 		nid = of_node_to_nid_single(cpu);
440 		of_node_put(cpu);
441 
442 		/*
443 		 * Don't fall back to default_nid yet -- we will plug
444 		 * cpus into nodes once the memory scan has discovered
445 		 * the topology.
446 		 */
447 		if (nid < 0)
448 			continue;
449 		node_set_online(nid);
450 	}
451 
452 	get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells);
453 	memory = NULL;
454 	while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
455 		unsigned long start;
456 		unsigned long size;
457 		int nid;
458 		int ranges;
459 		const unsigned int *memcell_buf;
460 		unsigned int len;
461 
462 		memcell_buf = of_get_property(memory,
463 			"linux,usable-memory", &len);
464 		if (!memcell_buf || len <= 0)
465 			memcell_buf = of_get_property(memory, "reg", &len);
466 		if (!memcell_buf || len <= 0)
467 			continue;
468 
469 		/* ranges in cell */
470 		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
471 new_range:
472 		/* these are order-sensitive, and modify the buffer pointer */
473 		start = read_n_cells(n_mem_addr_cells, &memcell_buf);
474 		size = read_n_cells(n_mem_size_cells, &memcell_buf);
475 
476 		/*
477 		 * Assumption: either all memory nodes or none will
478 		 * have associativity properties.  If none, then
479 		 * everything goes to default_nid.
480 		 */
481 		nid = of_node_to_nid_single(memory);
482 		if (nid < 0)
483 			nid = default_nid;
484 
485 		fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
486 		node_set_online(nid);
487 
488 		if (!(size = numa_enforce_memory_limit(start, size))) {
489 			if (--ranges)
490 				goto new_range;
491 			else
492 				continue;
493 		}
494 
495 		add_active_range(nid, start >> PAGE_SHIFT,
496 				(start >> PAGE_SHIFT) + (size >> PAGE_SHIFT));
497 
498 		if (--ranges)
499 			goto new_range;
500 	}
501 
502 	/*
503 	 * Now do the same thing for each LMB listed in the ibm,dynamic-memory
504 	 * property in the ibm,dynamic-reconfiguration-memory node.
505 	 */
506 	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
507 	if (memory)
508 		parse_drconf_memory(memory);
509 
510 	return 0;
511 }
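
/*
 * The "ranges in cell" arithmetic used twice in this file, spelled out for a
 * hypothetical memory node with #address-cells = 2 and #size-cells = 2 whose
 * "reg" property is 32 bytes long: 32 bytes is 8 cells, i.e. two
 * (start, size) pairs.
 */
#include <stdio.h>

int main(void)
{
	int n_mem_addr_cells = 2, n_mem_size_cells = 2;
	unsigned int len = 32;	/* property length in bytes */

	int ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);

	printf("ranges = %d\n", ranges);	/* 2 */
	return 0;
}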
512 
513 static void __init setup_nonnuma(void)
514 {
515 	unsigned long top_of_ram = lmb_end_of_DRAM();
516 	unsigned long total_ram = lmb_phys_mem_size();
517 	unsigned long start_pfn, end_pfn;
518 	unsigned int i, nid = 0;
519 
520 	printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
521 	       top_of_ram, total_ram);
522 	printk(KERN_DEBUG "Memory hole size: %ldMB\n",
523 	       (top_of_ram - total_ram) >> 20);
524 
525 	for (i = 0; i < lmb.memory.cnt; ++i) {
526 		start_pfn = lmb.memory.region[i].base >> PAGE_SHIFT;
527 		end_pfn = start_pfn + lmb_size_pages(&lmb.memory, i);
528 
529 		fake_numa_create_new_node(end_pfn, &nid);
530 		add_active_range(nid, start_pfn, end_pfn);
531 		node_set_online(nid);
532 	}
533 }
534 
535 void __init dump_numa_cpu_topology(void)
536 {
537 	unsigned int node;
538 	unsigned int cpu, count;
539 
540 	if (min_common_depth == -1 || !numa_enabled)
541 		return;
542 
543 	for_each_online_node(node) {
544 		printk(KERN_DEBUG "Node %d CPUs:", node);
545 
546 		count = 0;
547 		/*
548 		 * If we used a CPU iterator here we would miss printing
549 		 * the holes in the cpumap.
550 		 */
551 		for (cpu = 0; cpu < NR_CPUS; cpu++) {
552 			if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
553 				if (count == 0)
554 					printk(" %u", cpu);
555 				++count;
556 			} else {
557 				if (count > 1)
558 					printk("-%u", cpu - 1);
559 				count = 0;
560 			}
561 		}
562 
563 		if (count > 1)
564 			printk("-%u", NR_CPUS - 1);
565 		printk("\n");
566 	}
567 }
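
/*
 * A standalone reproduction of the range-compressed CPU list printed above,
 * with a plain membership array standing in for numa_cpumask_lookup_table;
 * the CPU count and membership are made up. Prints "Node 0 CPUs: 0-3 6".
 */
#include <stdio.h>

#define EX_NR_CPUS 8

int main(void)
{
	int in_node[EX_NR_CPUS] = { 1, 1, 1, 1, 0, 0, 1, 0 };
	unsigned int cpu, count = 0;

	printf("Node 0 CPUs:");
	for (cpu = 0; cpu < EX_NR_CPUS; cpu++) {
		if (in_node[cpu]) {
			if (count == 0)
				printf(" %u", cpu);	/* start of a run */
			++count;
		} else {
			if (count > 1)
				printf("-%u", cpu - 1);	/* close the run */
			count = 0;
		}
	}
	if (count > 1)
		printf("-%u", EX_NR_CPUS - 1);
	printf("\n");
	return 0;
}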
568 
569 static void __init dump_numa_memory_topology(void)
570 {
571 	unsigned int node;
572 	unsigned int count;
573 
574 	if (min_common_depth == -1 || !numa_enabled)
575 		return;
576 
577 	for_each_online_node(node) {
578 		unsigned long i;
579 
580 		printk(KERN_DEBUG "Node %d Memory:", node);
581 
582 		count = 0;
583 
584 		for (i = 0; i < lmb_end_of_DRAM();
585 		     i += (1 << SECTION_SIZE_BITS)) {
586 			if (early_pfn_to_nid(i >> PAGE_SHIFT) == node) {
587 				if (count == 0)
588 					printk(" 0x%lx", i);
589 				++count;
590 			} else {
591 				if (count > 0)
592 					printk("-0x%lx", i);
593 				count = 0;
594 			}
595 		}
596 
597 		if (count > 0)
598 			printk("-0x%lx", i);
599 		printk("\n");
600 	}
601 }
602 
603 /*
604  * Allocate some memory, using the lmb allocator first and falling back to
605  * the bootmem allocator where required. nid is the preferred node and
606  * end_pfn is the highest page frame number in that node.
607  *
608  * Returns the physical address of the memory.
609  */
610 static void __init *careful_allocation(int nid, unsigned long size,
611 				       unsigned long align,
612 				       unsigned long end_pfn)
613 {
614 	int new_nid;
615 	unsigned long ret = __lmb_alloc_base(size, align, end_pfn << PAGE_SHIFT);
616 
617 	/* retry over all memory */
618 	if (!ret)
619 		ret = __lmb_alloc_base(size, align, lmb_end_of_DRAM());
620 
621 	if (!ret)
622 		panic("numa.c: cannot allocate %lu bytes on node %d",
623 		      size, nid);
624 
625 	/*
626 	 * If the memory came from a previously allocated node, we must
627 	 * retry with the bootmem allocator.
628 	 */
629 	new_nid = early_pfn_to_nid(ret >> PAGE_SHIFT);
630 	if (new_nid < nid) {
631 		ret = (unsigned long)__alloc_bootmem_node(NODE_DATA(new_nid),
632 				size, align, 0);
633 
634 		if (!ret)
635 			panic("numa.c: cannot allocate %lu bytes on node %d",
636 			      size, new_nid);
637 
638 		ret = __pa(ret);
639 
640 		dbg("alloc_bootmem %lx %lx\n", ret, size);
641 	}
642 
643 	return (void *)ret;
644 }
645 
646 static struct notifier_block __cpuinitdata ppc64_numa_nb = {
647 	.notifier_call = cpu_numa_callback,
648 	.priority = 1 /* Must run before sched domains notifier. */
649 };
650 
651 void __init do_init_bootmem(void)
652 {
653 	int nid;
654 	unsigned int i;
655 
656 	min_low_pfn = 0;
657 	max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
658 	max_pfn = max_low_pfn;
659 
660 	if (parse_numa_properties())
661 		setup_nonnuma();
662 	else
663 		dump_numa_memory_topology();
664 
665 	register_cpu_notifier(&ppc64_numa_nb);
666 	cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE,
667 			  (void *)(unsigned long)boot_cpuid);
668 
669 	for_each_online_node(nid) {
670 		unsigned long start_pfn, end_pfn;
671 		unsigned long bootmem_paddr;
672 		unsigned long bootmap_pages;
673 
674 		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
675 
676 		/* Allocate the node structure node local if possible */
677 		NODE_DATA(nid) = careful_allocation(nid,
678 					sizeof(struct pglist_data),
679 					SMP_CACHE_BYTES, end_pfn);
680 		NODE_DATA(nid) = __va(NODE_DATA(nid));
681 		memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
682 
683 		dbg("node %d\n", nid);
684 		dbg("NODE_DATA() = %p\n", NODE_DATA(nid));
685 
686 		NODE_DATA(nid)->bdata = &plat_node_bdata[nid];
687 		NODE_DATA(nid)->node_start_pfn = start_pfn;
688 		NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;
689 
690 		if (NODE_DATA(nid)->node_spanned_pages == 0)
691 			continue;
692 
693 		dbg("start_paddr = %lx\n", start_pfn << PAGE_SHIFT);
694 		dbg("end_paddr = %lx\n", end_pfn << PAGE_SHIFT);
695 
696 		bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
697 		bootmem_paddr = (unsigned long)careful_allocation(nid,
698 					bootmap_pages << PAGE_SHIFT,
699 					PAGE_SIZE, end_pfn);
700 		memset(__va(bootmem_paddr), 0, bootmap_pages << PAGE_SHIFT);
701 
702 		dbg("bootmap_paddr = %lx\n", bootmem_paddr);
703 
704 		init_bootmem_node(NODE_DATA(nid), bootmem_paddr >> PAGE_SHIFT,
705 				  start_pfn, end_pfn);
706 
707 		free_bootmem_with_active_regions(nid, end_pfn);
708 
709 		/* Mark reserved regions on this node */
710 		for (i = 0; i < lmb.reserved.cnt; i++) {
711 			unsigned long physbase = lmb.reserved.region[i].base;
712 			unsigned long size = lmb.reserved.region[i].size;
713 			unsigned long start_paddr = start_pfn << PAGE_SHIFT;
714 			unsigned long end_paddr = end_pfn << PAGE_SHIFT;
715 
716 			if (early_pfn_to_nid(physbase >> PAGE_SHIFT) != nid &&
717 			    early_pfn_to_nid((physbase+size-1) >> PAGE_SHIFT) != nid)
718 				continue;
719 
720 			if (physbase < end_paddr &&
721 			    (physbase+size) > start_paddr) {
722 				/* overlaps */
723 				if (physbase < start_paddr) {
724 					size -= start_paddr - physbase;
725 					physbase = start_paddr;
726 				}
727 
728 				if (size > end_paddr - physbase)
729 					size = end_paddr - physbase;
730 
731 				dbg("reserve_bootmem %lx %lx\n", physbase,
732 				    size);
733 				reserve_bootmem_node(NODE_DATA(nid), physbase,
734 						     size, BOOTMEM_DEFAULT);
735 			}
736 		}
737 
738 		sparse_memory_present_with_active_regions(nid);
739 	}
740 }
741 
742 void __init paging_init(void)
743 {
744 	unsigned long max_zone_pfns[MAX_NR_ZONES];
745 	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
746 	max_zone_pfns[ZONE_DMA] = lmb_end_of_DRAM() >> PAGE_SHIFT;
747 	free_area_init_nodes(max_zone_pfns);
748 }
749 
750 static int __init early_numa(char *p)
751 {
752 	if (!p)
753 		return 0;
754 
755 	if (strstr(p, "off"))
756 		numa_enabled = 0;
757 
758 	if (strstr(p, "debug"))
759 		numa_debug = 1;
760 
761 	p = strstr(p, "fake=");
762 	if (p)
763 		cmdline = p + strlen("fake=");
764 
765 	return 0;
766 }
767 early_param("numa", early_numa);
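
/*
 * What the parser above does with a sample boot argument: the value after
 * "numa=" is scanned with strstr(), and anything after "fake=" becomes the
 * boundary list consumed by fake_numa_create_new_node(). The argument string
 * below is illustrative.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* value part of e.g. numa=debug,fake=1G,2G,4G on the kernel command line */
	const char *p = "debug,fake=1G,2G,4G";
	const char *cmdline = NULL;
	int numa_enabled = 1, numa_debug = 0;

	if (strstr(p, "off"))
		numa_enabled = 0;
	if (strstr(p, "debug"))
		numa_debug = 1;
	p = strstr(p, "fake=");
	if (p)
		cmdline = p + strlen("fake=");

	/* prints: enabled=1 debug=1 fake boundaries="1G,2G,4G" */
	printf("enabled=%d debug=%d fake boundaries=\"%s\"\n",
	       numa_enabled, numa_debug, cmdline ? cmdline : "");
	return 0;
}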
768 
769 #ifdef CONFIG_MEMORY_HOTPLUG
770 /*
771  * Find the node associated with a hot added memory section.  Section
772  * corresponds to a SPARSEMEM section, not an LMB.  It is assumed that
773  * sections are fully contained within a single LMB.
774  */
775 int hot_add_scn_to_nid(unsigned long scn_addr)
776 {
777 	struct device_node *memory = NULL;
778 	nodemask_t nodes;
779 	int default_nid = any_online_node(NODE_MASK_ALL);
780 	int nid;
781 
782 	if (!numa_enabled || (min_common_depth < 0))
783 		return default_nid;
784 
785 	while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
786 		unsigned long start, size;
787 		int ranges;
788 		const unsigned int *memcell_buf;
789 		unsigned int len;
790 
791 		memcell_buf = of_get_property(memory, "reg", &len);
792 		if (!memcell_buf || len <= 0)
793 			continue;
794 
795 		/* ranges in cell */
796 		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
797 ha_new_range:
798 		start = read_n_cells(n_mem_addr_cells, &memcell_buf);
799 		size = read_n_cells(n_mem_size_cells, &memcell_buf);
800 		nid = of_node_to_nid_single(memory);
801 
802 		/* Domains not present at boot fall back to the default node */
803 		if (nid < 0 || !node_online(nid))
804 			nid = default_nid;
805 
806 		if ((scn_addr >= start) && (scn_addr < (start + size))) {
807 			of_node_put(memory);
808 			goto got_nid;
809 		}
810 
811 		if (--ranges)		/* process all ranges in cell */
812 			goto ha_new_range;
813 	}
814 	BUG();	/* section address should be found above */
815 	return 0;
816 
817 	/* Temporary code to ensure that returned node is not empty */
818 got_nid:
819 	nodes_setall(nodes);
820 	while (NODE_DATA(nid)->node_spanned_pages == 0) {
821 		node_clear(nid, nodes);
822 		nid = any_online_node(nodes);
823 	}
824 	return nid;
825 }
826 #endif /* CONFIG_MEMORY_HOTPLUG */
827