xref: /openbmc/linux/arch/powerpc/mm/numa.c (revision 3b621ee5df437d3f332a635ab6421aaa61a7dc2b)
/*
 * pSeries NUMA support
 *
 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <linux/threads.h>
#include <linux/bootmem.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/module.h>
#include <linux/nodemask.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <asm/lmb.h>
#include <asm/machdep.h>
#include <asm/abs_addr.h>
#include <asm/system.h>
#include <asm/smp.h>

static int numa_enabled = 1;

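/* Set by the "numa=debug" command line option; see early_numa() below. */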
static int numa_debug;
#define dbg(args...) do { if (numa_debug) printk(KERN_INFO args); } while (0)

#ifdef DEBUG_NUMA
#define ARRAY_INITIALISER -1
#else
#define ARRAY_INITIALISER 0
#endif

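/*
 * Lookup tables, kept in sync by map_cpu_to_node()/unmap_cpu_from_node()
 * and by the memory scans below:
 *
 *   numa_cpu_lookup_table[cpu]      - node id of a logical cpu
 *   numa_memory_lookup_table[i]     - node id of the i-th MEMORY_INCREMENT
 *                                     sized chunk of physical memory
 *   numa_cpumask_lookup_table[node] - mask of the cpus mapped to a node
 *   nr_cpus_in_node[node]           - number of cpus mapped to a node
 */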
int numa_cpu_lookup_table[NR_CPUS] = { [0 ... (NR_CPUS - 1)] =
	ARRAY_INITIALISER};
char *numa_memory_lookup_table;
cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES];
int nr_cpus_in_node[MAX_NUMNODES] = { [0 ... (MAX_NUMNODES - 1)] = 0};

struct pglist_data *node_data[MAX_NUMNODES];
bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES];
static int min_common_depth;

/*
 * We need somewhere to store start/span for each node until we have
 * allocated the real node_data structures.
 */
static struct {
	unsigned long node_start_pfn;
	unsigned long node_end_pfn;
	unsigned long node_present_pages;
} init_node_data[MAX_NUMNODES] __initdata;

EXPORT_SYMBOL(node_data);
EXPORT_SYMBOL(numa_cpu_lookup_table);
EXPORT_SYMBOL(numa_memory_lookup_table);
EXPORT_SYMBOL(numa_cpumask_lookup_table);
EXPORT_SYMBOL(nr_cpus_in_node);

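/*
 * Record that a cpu belongs to a node, updating the per-node cpumask
 * and cpu count. The update is guarded, so calling this again for a
 * cpu already in the node does not double-count it.
 */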
static inline void map_cpu_to_node(int cpu, int node)
{
	numa_cpu_lookup_table[cpu] = node;
	if (!cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
		cpu_set(cpu, numa_cpumask_lookup_table[node]);
		nr_cpus_in_node[node]++;
	}
}

#ifdef CONFIG_HOTPLUG_CPU
static void unmap_cpu_from_node(unsigned long cpu)
{
	int node = numa_cpu_lookup_table[cpu];

	dbg("removing cpu %lu from node %d\n", cpu, node);

	if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
		cpu_clear(cpu, numa_cpumask_lookup_table[node]);
		nr_cpus_in_node[node]--;
	} else {
		printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
		       cpu, node);
	}
}
#endif /* CONFIG_HOTPLUG_CPU */

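/*
 * Find the device tree node for a logical cpu by matching its hardware
 * id against the "ibm,ppc-interrupt-server#s" property or, failing
 * that, the "reg" property. Returns the node with a reference held
 * (the caller must of_node_put() it), or NULL if no cpu node matches.
 */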
static struct device_node * __devinit find_cpu_node(unsigned int cpu)
{
	unsigned int hw_cpuid = get_hard_smp_processor_id(cpu);
	struct device_node *cpu_node = NULL;
	unsigned int *interrupt_server, *reg;
	int len;

	while ((cpu_node = of_find_node_by_type(cpu_node, "cpu")) != NULL) {
		/* Try interrupt server first */
		interrupt_server = (unsigned int *)get_property(cpu_node,
					"ibm,ppc-interrupt-server#s", &len);

		if (interrupt_server && (len > 0)) {
			len = len / sizeof(u32);
			while (len--) {
				if (interrupt_server[len] == hw_cpuid)
					return cpu_node;
			}
		} else {
			reg = (unsigned int *)get_property(cpu_node,
							   "reg", &len);
			if (reg && (len > 0) && (reg[0] == hw_cpuid))
				return cpu_node;
		}
	}

	return NULL;
}

/* must hold reference to node during call */
static unsigned int *of_get_associativity(struct device_node *dev)
{
	return (unsigned int *)get_property(dev, "ibm,associativity", NULL);
}

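/*
 * Return the NUMA domain a device belongs to, read from its
 * "ibm,associativity" property at the depth established by
 * find_min_common_depth(), or 0 when no NUMA information is available.
 */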
static int of_node_numa_domain(struct device_node *device)
{
	int numa_domain;
	unsigned int *tmp;

	if (min_common_depth == -1)
		return 0;

	tmp = of_get_associativity(device);
	if (tmp && (tmp[0] >= min_common_depth)) {
		numa_domain = tmp[min_common_depth];
	} else {
		dbg("WARNING: no NUMA information for %s\n",
		    device->full_name);
		numa_domain = 0;
	}
	return numa_domain;
}

/*
 * In theory, the "ibm,associativity" property may contain multiple
 * associativity lists because a resource may be multiply connected
 * into the machine.  This resource then has different associativity
 * characteristics relative to its multiple connections.  We ignore
 * this for now.  We also assume that all cpu and memory sets have
 * their distances represented at a common level.  This won't be
 * true for hierarchical NUMA.
 *
 * In any case the ibm,associativity-reference-points should give
 * the correct depth for a normal NUMA system.
 *
 * - Dave Hansen <haveblue@us.ibm.com>
 */
static int __init find_min_common_depth(void)
{
	int depth;
	unsigned int *ref_points;
	struct device_node *rtas_root;
	unsigned int len;

	rtas_root = of_find_node_by_path("/rtas");

	if (!rtas_root)
		return -1;

	/*
	 * This property is 2 32-bit integers, each representing a level of
	 * depth in the associativity nodes.  The first is for an SMP
	 * configuration (should be all 0's) and the second is for a normal
	 * NUMA configuration.
	 */
	ref_points = (unsigned int *)get_property(rtas_root,
			"ibm,associativity-reference-points", &len);

	if (ref_points && (len >= 2 * sizeof(unsigned int))) {
		depth = ref_points[1];
	} else {
		dbg("WARNING: could not find NUMA "
		    "associativity reference point\n");
		depth = -1;
	}
	of_node_put(rtas_root);

	return depth;
}

static int __init get_mem_addr_cells(void)
{
	struct device_node *memory = NULL;
	int rc;

	memory = of_find_node_by_type(memory, "memory");
	if (!memory)
		return 0; /* it won't matter */

	rc = prom_n_addr_cells(memory);
	return rc;
}

static int __init get_mem_size_cells(void)
{
	struct device_node *memory = NULL;
	int rc;

	memory = of_find_node_by_type(memory, "memory");
	if (!memory)
		return 0; /* it won't matter */
	rc = prom_n_size_cells(memory);
	return rc;
}

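/*
 * Read an n-cell big-endian value from a property buffer and advance
 * the buffer pointer past it. For example, with n = 2 and *buf
 * pointing at { 0x00000001, 0x00000000 }, this returns 0x100000000
 * and leaves *buf pointing at the cell that follows.
 */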
static unsigned long read_n_cells(int n, unsigned int **buf)
{
	unsigned long result = 0;

	while (n--) {
		result = (result << 32) | **buf;
		(*buf)++;
	}
	return result;
}

/*
 * Figure out to which domain a cpu belongs and stick it there.
 * Return the id of the domain used.
 */
static int numa_setup_cpu(unsigned long lcpu)
{
	int numa_domain = 0;
	struct device_node *cpu = find_cpu_node(lcpu);

	if (!cpu) {
		WARN_ON(1);
		goto out;
	}

	numa_domain = of_node_numa_domain(cpu);

	if (numa_domain >= num_online_nodes()) {
		/*
		 * POWER4 LPAR uses 0xffff as an invalid node,
		 * don't warn in this case.
		 */
		if (numa_domain != 0xffff)
			printk(KERN_ERR "WARNING: cpu %lu "
			       "maps to invalid NUMA node %d\n",
			       lcpu, numa_domain);
		numa_domain = 0;
	}
out:
	node_set_online(numa_domain);

	map_cpu_to_node(lcpu, numa_domain);

	of_node_put(cpu);

	return numa_domain;
}

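/*
 * Hotplug notifier: map a cpu to its node as it comes up, and (with
 * CONFIG_HOTPLUG_CPU) unmap it again when it dies or its bring-up is
 * cancelled. Registered from do_init_bootmem() below.
 */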
static int cpu_numa_callback(struct notifier_block *nfb,
			     unsigned long action,
			     void *hcpu)
{
	unsigned long lcpu = (unsigned long)hcpu;
	int ret = NOTIFY_DONE;

	switch (action) {
	case CPU_UP_PREPARE:
		if (min_common_depth == -1 || !numa_enabled)
			map_cpu_to_node(lcpu, 0);
		else
			numa_setup_cpu(lcpu);
		ret = NOTIFY_OK;
		break;
#ifdef CONFIG_HOTPLUG_CPU
	case CPU_DEAD:
	case CPU_UP_CANCELED:
		unmap_cpu_from_node(lcpu);
		ret = NOTIFY_OK;
		break;
#endif
	}
	return ret;
}


/*
 * Check and possibly modify a memory region to enforce the memory limit.
 *
 * Returns the size the region should have to enforce the memory limit.
 * This will either be the original value of size, a truncated value,
 * or zero. If the returned value of size is 0 the region should be
 * discarded as it lies wholly above the memory limit.
 */
static unsigned long __init numa_enforce_memory_limit(unsigned long start, unsigned long size)
{
	/*
	 * We use lmb_end_of_DRAM() in here instead of memory_limit because
	 * we've already adjusted it for the limit and it takes care of
	 * having memory holes below the limit.
	 */

	if (!memory_limit)
		return size;

	if (start + size <= lmb_end_of_DRAM())
		return size;

	if (start >= lmb_end_of_DRAM())
		return 0;

	return lmb_end_of_DRAM() - start;
}

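/*
 * Walk the device tree, deriving the node id of every cpu and memory
 * range from its associativity information, and record the results in
 * the lookup tables and init_node_data[]. Returns 0 on success, or -1
 * when the firmware provides no usable NUMA information, in which
 * case the caller falls back to setup_nonnuma().
 */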
static int __init parse_numa_properties(void)
{
	struct device_node *cpu = NULL;
	struct device_node *memory = NULL;
	int addr_cells, size_cells;
	int max_domain = 0;
	long entries = lmb_end_of_DRAM() >> MEMORY_INCREMENT_SHIFT;
	unsigned long i;

	if (numa_enabled == 0) {
		printk(KERN_WARNING "NUMA disabled by user\n");
		return -1;
	}

	numa_memory_lookup_table =
		(char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1));
	memset(numa_memory_lookup_table, 0, entries * sizeof(char));

	for (i = 0; i < entries; i++)
		numa_memory_lookup_table[i] = ARRAY_INITIALISER;

	min_common_depth = find_min_common_depth();

	dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
	if (min_common_depth < 0)
		return min_common_depth;

	max_domain = numa_setup_cpu(boot_cpuid);

	/*
	 * Even though we connect cpus to numa domains later in SMP init,
	 * we need to know the maximum node id now. This is because each
	 * node id must have NODE_DATA etc backing it.
	 * As a result of hotplug we could still have cpus appear later on
	 * with larger node ids. In that case we force the cpu into node 0.
	 */
	for_each_cpu(i) {
		int numa_domain;

		cpu = find_cpu_node(i);

		if (cpu) {
			numa_domain = of_node_numa_domain(cpu);
			of_node_put(cpu);

			if (numa_domain < MAX_NUMNODES &&
			    max_domain < numa_domain)
				max_domain = numa_domain;
		}
	}

	addr_cells = get_mem_addr_cells();
	size_cells = get_mem_size_cells();
	memory = NULL;
	while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
		unsigned long start;
		unsigned long size;
		int numa_domain;
		int ranges;
		unsigned int *memcell_buf;
		unsigned int len;

		memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
		if (!memcell_buf || len <= 0)
			continue;

		ranges = memory->n_addrs;
new_range:
		/* these are order-sensitive, and modify the buffer pointer */
		start = read_n_cells(addr_cells, &memcell_buf);
		size = read_n_cells(size_cells, &memcell_buf);

		start = _ALIGN_DOWN(start, MEMORY_INCREMENT);
		size = _ALIGN_UP(size, MEMORY_INCREMENT);

		numa_domain = of_node_numa_domain(memory);

		if (numa_domain >= MAX_NUMNODES) {
			if (numa_domain != 0xffff)
				printk(KERN_ERR "WARNING: memory at %lx maps "
				       "to invalid NUMA node %d\n", start,
				       numa_domain);
			numa_domain = 0;
		}

		if (max_domain < numa_domain)
			max_domain = numa_domain;

		if (!(size = numa_enforce_memory_limit(start, size))) {
			if (--ranges)
				goto new_range;
			else
				continue;
		}

		/*
		 * Initialize new node struct, or add to an existing one.
		 */
		if (init_node_data[numa_domain].node_end_pfn) {
			if ((start / PAGE_SIZE) <
			    init_node_data[numa_domain].node_start_pfn)
				init_node_data[numa_domain].node_start_pfn =
					start / PAGE_SIZE;
			if (((start / PAGE_SIZE) + (size / PAGE_SIZE)) >
			    init_node_data[numa_domain].node_end_pfn)
				init_node_data[numa_domain].node_end_pfn =
					(start / PAGE_SIZE) +
					(size / PAGE_SIZE);

			init_node_data[numa_domain].node_present_pages +=
				size / PAGE_SIZE;
		} else {
			node_set_online(numa_domain);

			init_node_data[numa_domain].node_start_pfn =
				start / PAGE_SIZE;
			init_node_data[numa_domain].node_end_pfn =
				init_node_data[numa_domain].node_start_pfn +
				size / PAGE_SIZE;
			init_node_data[numa_domain].node_present_pages =
				size / PAGE_SIZE;
		}

		for (i = start; i < (start + size); i += MEMORY_INCREMENT)
			numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] =
				numa_domain;

		if (--ranges)
			goto new_range;
	}

	for (i = 0; i <= max_domain; i++)
		node_set_online(i);

	return 0;
}

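/*
 * No usable NUMA information: fake a single node 0 spanning all of
 * memory and map the boot cpu to it.
 */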
static void __init setup_nonnuma(void)
{
	unsigned long top_of_ram = lmb_end_of_DRAM();
	unsigned long total_ram = lmb_phys_mem_size();
	unsigned long i;

	printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
	       top_of_ram, total_ram);
	printk(KERN_INFO "Memory hole size: %ldMB\n",
	       (top_of_ram - total_ram) >> 20);

	if (!numa_memory_lookup_table) {
		long entries = top_of_ram >> MEMORY_INCREMENT_SHIFT;
		numa_memory_lookup_table =
			(char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1));
		memset(numa_memory_lookup_table, 0, entries * sizeof(char));
		for (i = 0; i < entries; i++)
			numa_memory_lookup_table[i] = ARRAY_INITIALISER;
	}

	map_cpu_to_node(boot_cpuid, 0);

	node_set_online(0);

	init_node_data[0].node_start_pfn = 0;
	init_node_data[0].node_end_pfn = lmb_end_of_DRAM() / PAGE_SIZE;
	init_node_data[0].node_present_pages = total_ram / PAGE_SIZE;

	for (i = 0; i < top_of_ram; i += MEMORY_INCREMENT)
		numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] = 0;
}

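/*
 * Print the physical memory ranges owned by each online node. The
 * ranges are in MEMORY_INCREMENT granularity with an exclusive end,
 * e.g. a node owning the first 512MB of RAM would print as:
 *
 *   Node 0 Memory: 0x0-0x20000000
 */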
static void __init dump_numa_topology(void)
{
	unsigned int node;
	unsigned int count;

	if (min_common_depth == -1 || !numa_enabled)
		return;

	for_each_online_node(node) {
		unsigned long i;

		printk(KERN_INFO "Node %d Memory:", node);

		count = 0;

		for (i = 0; i < lmb_end_of_DRAM(); i += MEMORY_INCREMENT) {
			if (numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] == node) {
				if (count == 0)
					printk(" 0x%lx", i);
				++count;
			} else {
				if (count > 0)
					printk("-0x%lx", i);
				count = 0;
			}
		}

		if (count > 0)
			printk("-0x%lx", i);
		printk("\n");
	}
}

/*
 * Allocate some memory, using the lmb allocator and falling back to
 * the bootmem allocator where required. nid is the preferred node and
 * end is the physical address of the highest address in the node.
 *
 * Returns the physical address of the memory.
 */
static unsigned long careful_allocation(int nid, unsigned long size,
					unsigned long align, unsigned long end)
{
	unsigned long ret = lmb_alloc_base(size, align, end);

	/* retry over all memory */
	if (!ret)
		ret = lmb_alloc_base(size, align, lmb_end_of_DRAM());

	if (!ret)
		panic("numa.c: cannot allocate %lu bytes on node %d",
		      size, nid);

	/*
	 * If the memory came from a previously allocated node, we must
	 * retry with the bootmem allocator.
	 */
	if (pa_to_nid(ret) < nid) {
		nid = pa_to_nid(ret);
		ret = (unsigned long)__alloc_bootmem_node(NODE_DATA(nid),
				size, align, 0);

		if (!ret)
			panic("numa.c: cannot allocate %lu bytes on node %d",
			      size, nid);

		ret = virt_to_abs(ret);

		dbg("alloc_bootmem %lx %lx\n", ret, size);
	}

	return ret;
}

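/*
 * Set up a bootmem allocator for every online node: allocate each
 * node's pglist_data and bootmap (node-local where possible, via
 * careful_allocation()), free the node's memory ranges into it,
 * re-reserve the lmb reserved regions that fall within the node, and
 * finally mark the node's memory present for sparsemem.
 */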
void __init do_init_bootmem(void)
{
	int nid;
	int addr_cells, size_cells;
	struct device_node *memory = NULL;
	static struct notifier_block ppc64_numa_nb = {
		.notifier_call = cpu_numa_callback,
		.priority = 1 /* Must run before sched domains notifier. */
	};

	min_low_pfn = 0;
	max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
	max_pfn = max_low_pfn;

	if (parse_numa_properties())
		setup_nonnuma();
	else
		dump_numa_topology();

	register_cpu_notifier(&ppc64_numa_nb);

	for_each_online_node(nid) {
		unsigned long start_paddr, end_paddr;
		int i;
		unsigned long bootmem_paddr;
		unsigned long bootmap_pages;

		start_paddr = init_node_data[nid].node_start_pfn * PAGE_SIZE;
		end_paddr = init_node_data[nid].node_end_pfn * PAGE_SIZE;

		/* Allocate the node structure node local if possible */
		NODE_DATA(nid) = (struct pglist_data *)careful_allocation(nid,
					sizeof(struct pglist_data),
					SMP_CACHE_BYTES, end_paddr);
		NODE_DATA(nid) = abs_to_virt(NODE_DATA(nid));
		memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));

		dbg("node %d\n", nid);
		dbg("NODE_DATA() = %p\n", NODE_DATA(nid));

		NODE_DATA(nid)->bdata = &plat_node_bdata[nid];
		NODE_DATA(nid)->node_start_pfn =
			init_node_data[nid].node_start_pfn;
		NODE_DATA(nid)->node_spanned_pages =
			(end_paddr - start_paddr) >> PAGE_SHIFT;

		if (NODE_DATA(nid)->node_spanned_pages == 0)
			continue;

		dbg("start_paddr = %lx\n", start_paddr);
		dbg("end_paddr = %lx\n", end_paddr);

		bootmap_pages = bootmem_bootmap_pages((end_paddr - start_paddr) >> PAGE_SHIFT);

		bootmem_paddr = careful_allocation(nid,
				bootmap_pages << PAGE_SHIFT,
				PAGE_SIZE, end_paddr);
		memset(abs_to_virt(bootmem_paddr), 0,
		       bootmap_pages << PAGE_SHIFT);
		dbg("bootmap_paddr = %lx\n", bootmem_paddr);

		init_bootmem_node(NODE_DATA(nid), bootmem_paddr >> PAGE_SHIFT,
				  start_paddr >> PAGE_SHIFT,
				  end_paddr >> PAGE_SHIFT);

		/*
		 * We need to do another scan of all memory sections to
		 * associate memory with the correct node.
		 */
		addr_cells = get_mem_addr_cells();
		size_cells = get_mem_size_cells();
		memory = NULL;
		while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
			unsigned long mem_start, mem_size;
			int numa_domain, ranges;
			unsigned int *memcell_buf;
			unsigned int len;

			memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
			if (!memcell_buf || len <= 0)
				continue;

			ranges = memory->n_addrs;	/* ranges in cell */
new_range:
			mem_start = read_n_cells(addr_cells, &memcell_buf);
			mem_size = read_n_cells(size_cells, &memcell_buf);
			if (numa_enabled) {
				numa_domain = of_node_numa_domain(memory);
				if (numa_domain >= MAX_NUMNODES)
					numa_domain = 0;
			} else
				numa_domain = 0;

			if (numa_domain == nid) {
				mem_size = numa_enforce_memory_limit(mem_start, mem_size);
				if (mem_size) {
					dbg("free_bootmem %lx %lx\n", mem_start, mem_size);
					free_bootmem_node(NODE_DATA(nid), mem_start, mem_size);
				}
			}

			if (--ranges)		/* process all ranges in cell */
				goto new_range;
		}

		/*
		 * Mark reserved regions on this node
		 */
		for (i = 0; i < lmb.reserved.cnt; i++) {
			unsigned long physbase = lmb.reserved.region[i].base;
			unsigned long size = lmb.reserved.region[i].size;

			if (pa_to_nid(physbase) != nid &&
			    pa_to_nid(physbase + size - 1) != nid)
				continue;

			if (physbase < end_paddr &&
			    (physbase + size) > start_paddr) {
				/* overlaps */
				if (physbase < start_paddr) {
					size -= start_paddr - physbase;
					physbase = start_paddr;
				}

				if (size > end_paddr - physbase)
					size = end_paddr - physbase;

				dbg("reserve_bootmem %lx %lx\n", physbase,
				    size);
				reserve_bootmem_node(NODE_DATA(nid), physbase,
						     size);
			}
		}
		/*
		 * This loop may look familiar, but we have to do it again
		 * after marking our reserved memory to mark memory present
		 * for sparsemem.
		 */
		addr_cells = get_mem_addr_cells();
		size_cells = get_mem_size_cells();
		memory = NULL;
		while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
			unsigned long mem_start, mem_size;
			int numa_domain, ranges;
			unsigned int *memcell_buf;
			unsigned int len;

			memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
			if (!memcell_buf || len <= 0)
				continue;

			ranges = memory->n_addrs;	/* ranges in cell */
new_range2:
			mem_start = read_n_cells(addr_cells, &memcell_buf);
			mem_size = read_n_cells(size_cells, &memcell_buf);
			if (numa_enabled) {
				numa_domain = of_node_numa_domain(memory);
				if (numa_domain >= MAX_NUMNODES)
					numa_domain = 0;
			} else
				numa_domain = 0;

			if (numa_domain == nid) {
				mem_size = numa_enforce_memory_limit(mem_start, mem_size);
				memory_present(numa_domain, mem_start >> PAGE_SHIFT,
					       (mem_start + mem_size) >> PAGE_SHIFT);
			}

			if (--ranges)		/* process all ranges in cell */
				goto new_range2;
		}
	}
}

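/*
 * Initialise the zone sizes: each node gets a single ZONE_DMA covering
 * its entire pfn range, with the pages lost to memory holes reported
 * to free_area_init_node() via zholes_size.
 */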
void __init paging_init(void)
{
	unsigned long zones_size[MAX_NR_ZONES];
	unsigned long zholes_size[MAX_NR_ZONES];
	int nid;

	memset(zones_size, 0, sizeof(zones_size));
	memset(zholes_size, 0, sizeof(zholes_size));

	for_each_online_node(nid) {
		unsigned long start_pfn;
		unsigned long end_pfn;

		start_pfn = init_node_data[nid].node_start_pfn;
		end_pfn = init_node_data[nid].node_end_pfn;

		zones_size[ZONE_DMA] = end_pfn - start_pfn;
		zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] -
			init_node_data[nid].node_present_pages;

		dbg("free_area_init node %d %lx %lx (hole: %lx)\n", nid,
		    zones_size[ZONE_DMA], start_pfn, zholes_size[ZONE_DMA]);

		free_area_init_node(nid, NODE_DATA(nid), zones_size,
				    start_pfn, zholes_size);
	}
}

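/*
 * Parse the "numa=" kernel command line option: "numa=off" disables
 * NUMA handling altogether, and "numa=debug" enables the dbg()
 * messages above.
 */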
static int __init early_numa(char *p)
{
	if (!p)
		return 0;

	if (strstr(p, "off"))
		numa_enabled = 0;

	if (strstr(p, "debug"))
		numa_debug = 1;

	return 0;
}
early_param("numa", early_numa);
781