xref: /openbmc/linux/arch/powerpc/mm/numa.c (revision ab1f9dac6eea25ee59e4c8e1cf0b7476afbbfe07)
1 /*
2  * pSeries NUMA support
3  *
4  * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version
9  * 2 of the License, or (at your option) any later version.
10  */
11 #include <linux/threads.h>
12 #include <linux/bootmem.h>
13 #include <linux/init.h>
14 #include <linux/mm.h>
15 #include <linux/mmzone.h>
16 #include <linux/module.h>
17 #include <linux/nodemask.h>
18 #include <linux/cpu.h>
19 #include <linux/notifier.h>
20 #include <asm/lmb.h>
21 #include <asm/machdep.h>
22 #include <asm/abs_addr.h>
23 
24 static int numa_enabled = 1;
25 
26 static int numa_debug;
27 #define dbg(args...) if (numa_debug) { printk(KERN_INFO args); }
28 
29 #ifdef DEBUG_NUMA
30 #define ARRAY_INITIALISER -1
31 #else
32 #define ARRAY_INITIALISER 0
33 #endif
34 
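/*
 * Node lookup tables kept by this file:
 *   numa_cpu_lookup_table[]     - logical cpu number -> node id
 *   numa_memory_lookup_table[]  - one byte per MEMORY_INCREMENT of
 *                                 physical memory -> node id
 *   numa_cpumask_lookup_table[] - node id -> mask of cpus on that node
 */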
35 int numa_cpu_lookup_table[NR_CPUS] = { [ 0 ... (NR_CPUS - 1)] =
36 	ARRAY_INITIALISER};
37 char *numa_memory_lookup_table;
38 cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES];
39 int nr_cpus_in_node[MAX_NUMNODES] = { [0 ... (MAX_NUMNODES -1)] = 0};
40 
41 struct pglist_data *node_data[MAX_NUMNODES];
42 bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES];
43 static int min_common_depth;
44 
45 /*
46  * We need somewhere to store start/span for each node until we have
47  * allocated the real node_data structures.
48  */
49 static struct {
50 	unsigned long node_start_pfn;
51 	unsigned long node_end_pfn;
52 	unsigned long node_present_pages;
53 } init_node_data[MAX_NUMNODES] __initdata;
54 
55 EXPORT_SYMBOL(node_data);
56 EXPORT_SYMBOL(numa_cpu_lookup_table);
57 EXPORT_SYMBOL(numa_memory_lookup_table);
58 EXPORT_SYMBOL(numa_cpumask_lookup_table);
59 EXPORT_SYMBOL(nr_cpus_in_node);
60 
61 static inline void map_cpu_to_node(int cpu, int node)
62 {
63 	numa_cpu_lookup_table[cpu] = node;
64 	if (!(cpu_isset(cpu, numa_cpumask_lookup_table[node]))) {
65 		cpu_set(cpu, numa_cpumask_lookup_table[node]);
66 		nr_cpus_in_node[node]++;
67 	}
68 }
69 
70 #ifdef CONFIG_HOTPLUG_CPU
71 static void unmap_cpu_from_node(unsigned long cpu)
72 {
73 	int node = numa_cpu_lookup_table[cpu];
74 
75 	dbg("removing cpu %lu from node %d\n", cpu, node);
76 
77 	if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
78 		cpu_clear(cpu, numa_cpumask_lookup_table[node]);
79 		nr_cpus_in_node[node]--;
80 	} else {
81 		printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
82 		       cpu, node);
83 	}
84 }
85 #endif /* CONFIG_HOTPLUG_CPU */
86 
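/*
 * Walk the device tree "cpu" nodes and return the one whose
 * "ibm,ppc-interrupt-server#s" list (or, failing that, whose "reg"
 * property) matches the hardware id of the given logical cpu.
 */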
87 static struct device_node * __devinit find_cpu_node(unsigned int cpu)
88 {
89 	unsigned int hw_cpuid = get_hard_smp_processor_id(cpu);
90 	struct device_node *cpu_node = NULL;
91 	unsigned int *interrupt_server, *reg;
92 	int len;
93 
94 	while ((cpu_node = of_find_node_by_type(cpu_node, "cpu")) != NULL) {
95 		/* Try interrupt server first */
96 		interrupt_server = (unsigned int *)get_property(cpu_node,
97 					"ibm,ppc-interrupt-server#s", &len);
98 
99 		len = len / sizeof(u32);
100 
101 		if (interrupt_server && (len > 0)) {
102 			while (len--) {
103 				if (interrupt_server[len] == hw_cpuid)
104 					return cpu_node;
105 			}
106 		} else {
107 			reg = (unsigned int *)get_property(cpu_node,
108 							   "reg", &len);
109 			if (reg && (len > 0) && (reg[0] == hw_cpuid))
110 				return cpu_node;
111 		}
112 	}
113 
114 	return NULL;
115 }
116 
117 /* must hold reference to node during call */
118 static int *of_get_associativity(struct device_node *dev)
119 {
120 	return (unsigned int *)get_property(dev, "ibm,associativity", NULL);
121 }
122 
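/*
 * The numa domain of a device is the entry of its "ibm,associativity"
 * list at min_common_depth.  Devices without usable associativity
 * information are placed in domain 0.
 */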
123 static int of_node_numa_domain(struct device_node *device)
124 {
125 	int numa_domain;
126 	unsigned int *tmp;
127 
128 	if (min_common_depth == -1)
129 		return 0;
130 
131 	tmp = of_get_associativity(device);
132 	if (tmp && (tmp[0] >= min_common_depth)) {
133 		numa_domain = tmp[min_common_depth];
134 	} else {
135 		dbg("WARNING: no NUMA information for %s\n",
136 		    device->full_name);
137 		numa_domain = 0;
138 	}
139 	return numa_domain;
140 }
141 
142 /*
143  * In theory, the "ibm,associativity" property may contain multiple
144  * associativity lists because a resource may be multiply connected
145  * into the machine.  This resource then has different associativity
146  * characteristics relative to its multiple connections.  We ignore
147  * this for now.  We also assume that all cpu and memory sets have
148  * their distances represented at a common level.  This won't be
149  * true for hierarchical NUMA.

150  *
151  * In any case the ibm,associativity-reference-points should give
152  * the correct depth for a normal NUMA system.
153  *
154  * - Dave Hansen <haveblue@us.ibm.com>
155  */
156 static int __init find_min_common_depth(void)
157 {
158 	int depth;
159 	unsigned int *ref_points;
160 	struct device_node *rtas_root;
161 	unsigned int len;
162 
163 	rtas_root = of_find_node_by_path("/rtas");
164 
165 	if (!rtas_root)
166 		return -1;
167 
168 	/*
169 	 * this property is 2 32-bit integers, each representing a level of
170 	 * depth in the associativity nodes.  The first is for an SMP
171 	 * configuration (should be all 0's) and the second is for a normal
172 	 * NUMA configuration.
173 	 */
174 	ref_points = (unsigned int *)get_property(rtas_root,
175 			"ibm,associativity-reference-points", &len);
176 
177 	if (ref_points && (len >= 2 * sizeof(unsigned int))) {
178 		depth = ref_points[1];
179 	} else {
180 		dbg("WARNING: could not find NUMA "
181 		    "associativity reference point\n");
182 		depth = -1;
183 	}
184 	of_node_put(rtas_root);
185 
186 	return depth;
187 }
188 
189 static int __init get_mem_addr_cells(void)
190 {
191 	struct device_node *memory = NULL;
192 	int rc;
193 
194 	memory = of_find_node_by_type(memory, "memory");
195 	if (!memory)
196 		return 0; /* it won't matter */
197 
198 	rc = prom_n_addr_cells(memory);
199 	return rc;
200 }
201 
202 static int __init get_mem_size_cells(void)
203 {
204 	struct device_node *memory = NULL;
205 	int rc;
206 
207 	memory = of_find_node_by_type(memory, "memory");
208 	if (!memory)
209 		return 0; /* it won't matter */
210 	rc = prom_n_size_cells(memory);
211 	return rc;
212 }
213 
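/*
 * Concatenate n big-endian 32-bit cells from the device tree into a
 * single value and advance the buffer past them.  For example, the two
 * cells 0x00000001 0x00000000 combine to 0x100000000.
 */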
214 static unsigned long read_n_cells(int n, unsigned int **buf)
215 {
216 	unsigned long result = 0;
217 
218 	while (n--) {
219 		result = (result << 32) | **buf;
220 		(*buf)++;
221 	}
222 	return result;
223 }
224 
225 /*
226  * Figure out to which domain a cpu belongs and stick it there.
227  * Return the id of the domain used.
228  */
229 static int numa_setup_cpu(unsigned long lcpu)
230 {
231 	int numa_domain = 0;
232 	struct device_node *cpu = find_cpu_node(lcpu);
233 
234 	if (!cpu) {
235 		WARN_ON(1);
236 		goto out;
237 	}
238 
239 	numa_domain = of_node_numa_domain(cpu);
240 
241 	if (numa_domain >= num_online_nodes()) {
242 		/*
243 		 * POWER4 LPAR uses 0xffff as invalid node,
244 		 * don't warn in this case.
245 		 */
246 		if (numa_domain != 0xffff)
247 			printk(KERN_ERR "WARNING: cpu %ld "
248 			       "maps to invalid NUMA node %d\n",
249 			       lcpu, numa_domain);
250 		numa_domain = 0;
251 	}
252 out:
253 	node_set_online(numa_domain);
254 
255 	map_cpu_to_node(lcpu, numa_domain);
256 
257 	of_node_put(cpu);
258 
259 	return numa_domain;
260 }
261 
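/*
 * Hotplug notifier: map a cpu to its node as it comes up and, with
 * CONFIG_HOTPLUG_CPU, unmap it again when it goes away.
 */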
262 static int cpu_numa_callback(struct notifier_block *nfb,
263 			     unsigned long action,
264 			     void *hcpu)
265 {
266 	unsigned long lcpu = (unsigned long)hcpu;
267 	int ret = NOTIFY_DONE;
268 
269 	switch (action) {
270 	case CPU_UP_PREPARE:
271 		if (min_common_depth == -1 || !numa_enabled)
272 			map_cpu_to_node(lcpu, 0);
273 		else
274 			numa_setup_cpu(lcpu);
275 		ret = NOTIFY_OK;
276 		break;
277 #ifdef CONFIG_HOTPLUG_CPU
278 	case CPU_DEAD:
279 	case CPU_UP_CANCELED:
280 		unmap_cpu_from_node(lcpu);
281 		ret = NOTIFY_OK;
282 		break;
283 #endif
284 	}
285 	return ret;
286 }
287 
288 /*
289  * Check and possibly modify a memory region to enforce the memory limit.
290  *
291  * Returns the size the region should have to enforce the memory limit.
292  * This will either be the original value of size, a truncated value,
293  * or zero. If the returned value of size is 0 the region should be
294  * discarded as it lies wholly above the memory limit.
295  */
296 static unsigned long __init numa_enforce_memory_limit(unsigned long start, unsigned long size)
297 {
298 	/*
299 	 * We use lmb_end_of_DRAM() in here instead of memory_limit because
300 	 * we've already adjusted it for the limit and it takes care of
301 	 * having memory holes below the limit.
302 	 */
303 	extern unsigned long memory_limit;
304 
305 	if (! memory_limit)
306 		return size;
307 
308 	if (start + size <= lmb_end_of_DRAM())
309 		return size;
310 
311 	if (start >= lmb_end_of_DRAM())
312 		return 0;
313 
314 	return lmb_end_of_DRAM() - start;
315 }
316 
317 static int __init parse_numa_properties(void)
318 {
319 	struct device_node *cpu = NULL;
320 	struct device_node *memory = NULL;
321 	int addr_cells, size_cells;
322 	int max_domain = 0;
323 	long entries = lmb_end_of_DRAM() >> MEMORY_INCREMENT_SHIFT;
324 	unsigned long i;
325 
326 	if (numa_enabled == 0) {
327 		printk(KERN_WARNING "NUMA disabled by user\n");
328 		return -1;
329 	}
330 
331 	numa_memory_lookup_table =
332 		(char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1));
333 	memset(numa_memory_lookup_table, 0, entries * sizeof(char));
334 
335 	for (i = 0; i < entries ; i++)
336 		numa_memory_lookup_table[i] = ARRAY_INITIALISER;
337 
338 	min_common_depth = find_min_common_depth();
339 
340 	dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
341 	if (min_common_depth < 0)
342 		return min_common_depth;
343 
344 	max_domain = numa_setup_cpu(boot_cpuid);
345 
346 	/*
347 	 * Even though we connect cpus to numa domains later in SMP init,
348 	 * we need to know the maximum node id now. This is because each
349 	 * node id must have NODE_DATA etc backing it.
350 	 * As a result of hotplug we could still have cpus appear later on
351 	 * with larger node ids. In that case we force the cpu into node 0.
352 	 */
353 	for_each_cpu(i) {
354 		int numa_domain;
355 
356 		cpu = find_cpu_node(i);
357 
358 		if (cpu) {
359 			numa_domain = of_node_numa_domain(cpu);
360 			of_node_put(cpu);
361 
362 			if (numa_domain < MAX_NUMNODES &&
363 			    max_domain < numa_domain)
364 				max_domain = numa_domain;
365 		}
366 	}
367 
368 	addr_cells = get_mem_addr_cells();
369 	size_cells = get_mem_size_cells();
370 	memory = NULL;
371 	while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
372 		unsigned long start;
373 		unsigned long size;
374 		int numa_domain;
375 		int ranges;
376 		unsigned int *memcell_buf;
377 		unsigned int len;
378 
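		/*
		 * The "reg" property holds n_addrs (address, size) pairs,
		 * each encoded as addr_cells + size_cells 32-bit cells.
		 */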
379 		memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
380 		if (!memcell_buf || len <= 0)
381 			continue;
382 
383 		ranges = memory->n_addrs;
384 new_range:
385 		/* these are order-sensitive, and modify the buffer pointer */
386 		start = read_n_cells(addr_cells, &memcell_buf);
387 		size = read_n_cells(size_cells, &memcell_buf);
388 
389 		start = _ALIGN_DOWN(start, MEMORY_INCREMENT);
390 		size = _ALIGN_UP(size, MEMORY_INCREMENT);
391 
392 		numa_domain = of_node_numa_domain(memory);
393 
394 		if (numa_domain >= MAX_NUMNODES) {
395 			if (numa_domain != 0xffff)
396 				printk(KERN_ERR "WARNING: memory at %lx maps "
397 				       "to invalid NUMA node %d\n", start,
398 				       numa_domain);
399 			numa_domain = 0;
400 		}
401 
402 		if (max_domain < numa_domain)
403 			max_domain = numa_domain;
404 
405 		if (! (size = numa_enforce_memory_limit(start, size))) {
406 			if (--ranges)
407 				goto new_range;
408 			else
409 				continue;
410 		}
411 
412 		/*
413 		 * Initialize new node struct, or add to an existing one.
414 		 */
415 		if (init_node_data[numa_domain].node_end_pfn) {
416 			if ((start / PAGE_SIZE) <
417 			    init_node_data[numa_domain].node_start_pfn)
418 				init_node_data[numa_domain].node_start_pfn =
419 					start / PAGE_SIZE;
420 			if (((start / PAGE_SIZE) + (size / PAGE_SIZE)) >
421 			    init_node_data[numa_domain].node_end_pfn)
422 				init_node_data[numa_domain].node_end_pfn =
423 					(start / PAGE_SIZE) +
424 					(size / PAGE_SIZE);
425 
426 			init_node_data[numa_domain].node_present_pages +=
427 				size / PAGE_SIZE;
428 		} else {
429 			node_set_online(numa_domain);
430 
431 			init_node_data[numa_domain].node_start_pfn =
432 				start / PAGE_SIZE;
433 			init_node_data[numa_domain].node_end_pfn =
434 				init_node_data[numa_domain].node_start_pfn +
435 				size / PAGE_SIZE;
436 			init_node_data[numa_domain].node_present_pages =
437 				size / PAGE_SIZE;
438 		}
439 
440 		for (i = start ; i < (start+size); i += MEMORY_INCREMENT)
441 			numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] =
442 				numa_domain;
443 
444 		if (--ranges)
445 			goto new_range;
446 	}
447 
448 	for (i = 0; i <= max_domain; i++)
449 		node_set_online(i);
450 
451 	return 0;
452 }
453 
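/*
 * Fallback when no usable NUMA information is found: put all memory
 * and the boot cpu in node 0.
 */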
454 static void __init setup_nonnuma(void)
455 {
456 	unsigned long top_of_ram = lmb_end_of_DRAM();
457 	unsigned long total_ram = lmb_phys_mem_size();
458 	unsigned long i;
459 
460 	printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
461 	       top_of_ram, total_ram);
462 	printk(KERN_INFO "Memory hole size: %ldMB\n",
463 	       (top_of_ram - total_ram) >> 20);
464 
465 	if (!numa_memory_lookup_table) {
466 		long entries = top_of_ram >> MEMORY_INCREMENT_SHIFT;
467 		numa_memory_lookup_table =
468 			(char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1));
469 		memset(numa_memory_lookup_table, 0, entries * sizeof(char));
470 		for (i = 0; i < entries ; i++)
471 			numa_memory_lookup_table[i] = ARRAY_INITIALISER;
472 	}
473 
474 	map_cpu_to_node(boot_cpuid, 0);
475 
476 	node_set_online(0);
477 
478 	init_node_data[0].node_start_pfn = 0;
479 	init_node_data[0].node_end_pfn = lmb_end_of_DRAM() / PAGE_SIZE;
480 	init_node_data[0].node_present_pages = total_ram / PAGE_SIZE;
481 
482 	for (i = 0 ; i < top_of_ram; i += MEMORY_INCREMENT)
483 		numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] = 0;
484 }
485 
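/*
 * Print the physical memory ranges owned by each online node, as
 * recorded in numa_memory_lookup_table.
 */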
486 static void __init dump_numa_topology(void)
487 {
488 	unsigned int node;
489 	unsigned int count;
490 
491 	if (min_common_depth == -1 || !numa_enabled)
492 		return;
493 
494 	for_each_online_node(node) {
495 		unsigned long i;
496 
497 		printk(KERN_INFO "Node %d Memory:", node);
498 
499 		count = 0;
500 
501 		for (i = 0; i < lmb_end_of_DRAM(); i += MEMORY_INCREMENT) {
502 			if (numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] == node) {
503 				if (count == 0)
504 					printk(" 0x%lx", i);
505 				++count;
506 			} else {
507 				if (count > 0)
508 					printk("-0x%lx", i);
509 				count = 0;
510 			}
511 		}
512 
513 		if (count > 0)
514 			printk("-0x%lx", i);
515 		printk("\n");
516 	}
517 	return;
518 }
519 
520 /*
521  * Allocate some memory, using the lmb allocator and falling back to the
522  * bootmem allocator where required. nid is the preferred node and end is
523  * the physical address of the highest address in the node.
524  *
525  * Returns the physical address of the memory.
526  */
527 static unsigned long careful_allocation(int nid, unsigned long size,
528 					unsigned long align, unsigned long end)
529 {
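	/* First try to allocate below 'end', i.e. within the target node. */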
530 	unsigned long ret = lmb_alloc_base(size, align, end);
531 
532 	/* retry over all memory */
533 	if (!ret)
534 		ret = lmb_alloc_base(size, align, lmb_end_of_DRAM());
535 
536 	if (!ret)
537 		panic("numa.c: cannot allocate %lu bytes on node %d",
538 		      size, nid);
539 
540 	/*
541 	 * If the memory came from a previously allocated node, we must
542 	 * retry with the bootmem allocator.
543 	 */
544 	if (pa_to_nid(ret) < nid) {
545 		nid = pa_to_nid(ret);
546 		ret = (unsigned long)__alloc_bootmem_node(NODE_DATA(nid),
547 				size, align, 0);
548 
549 		if (!ret)
550 			panic("numa.c: cannot allocate %lu bytes on node %d",
551 			      size, nid);
552 
553 		ret = virt_to_abs(ret);
554 
555 		dbg("alloc_bootmem %lx %lx\n", ret, size);
556 	}
557 
558 	return ret;
559 }
560 
561 void __init do_init_bootmem(void)
562 {
563 	int nid;
564 	int addr_cells, size_cells;
565 	struct device_node *memory = NULL;
566 	static struct notifier_block ppc64_numa_nb = {
567 		.notifier_call = cpu_numa_callback,
568 		.priority = 1 /* Must run before sched domains notifier. */
569 	};
570 
571 	min_low_pfn = 0;
572 	max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
573 	max_pfn = max_low_pfn;
574 
575 	if (parse_numa_properties())
576 		setup_nonnuma();
577 	else
578 		dump_numa_topology();
579 
580 	register_cpu_notifier(&ppc64_numa_nb);
581 
582 	for_each_online_node(nid) {
583 		unsigned long start_paddr, end_paddr;
584 		int i;
585 		unsigned long bootmem_paddr;
586 		unsigned long bootmap_pages;
587 
588 		start_paddr = init_node_data[nid].node_start_pfn * PAGE_SIZE;
589 		end_paddr = init_node_data[nid].node_end_pfn * PAGE_SIZE;
590 
591 		/* Allocate the node structure node-local if possible */
592 		NODE_DATA(nid) = (struct pglist_data *)careful_allocation(nid,
593 					sizeof(struct pglist_data),
594 					SMP_CACHE_BYTES, end_paddr);
595 		NODE_DATA(nid) = abs_to_virt(NODE_DATA(nid));
596 		memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
597 
598   		dbg("node %d\n", nid);
599 		dbg("NODE_DATA() = %p\n", NODE_DATA(nid));
600 
601 		NODE_DATA(nid)->bdata = &plat_node_bdata[nid];
602 		NODE_DATA(nid)->node_start_pfn =
603 			init_node_data[nid].node_start_pfn;
604 		NODE_DATA(nid)->node_spanned_pages =
605 			end_paddr - start_paddr;
606 
607 		if (NODE_DATA(nid)->node_spanned_pages == 0)
608   			continue;
609 
610   		dbg("start_paddr = %lx\n", start_paddr);
611   		dbg("end_paddr = %lx\n", end_paddr);
612 
613 		bootmap_pages = bootmem_bootmap_pages((end_paddr - start_paddr) >> PAGE_SHIFT);
614 
615 		bootmem_paddr = careful_allocation(nid,
616 				bootmap_pages << PAGE_SHIFT,
617 				PAGE_SIZE, end_paddr);
618 		memset(abs_to_virt(bootmem_paddr), 0,
619 		       bootmap_pages << PAGE_SHIFT);
620 		dbg("bootmap_paddr = %lx\n", bootmem_paddr);
621 
622 		init_bootmem_node(NODE_DATA(nid), bootmem_paddr >> PAGE_SHIFT,
623 				  start_paddr >> PAGE_SHIFT,
624 				  end_paddr >> PAGE_SHIFT);
625 
626 		/*
627 		 * We need to do another scan of all memory sections to
628 		 * associate memory with the correct node.
629 		 */
630 		addr_cells = get_mem_addr_cells();
631 		size_cells = get_mem_size_cells();
632 		memory = NULL;
633 		while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
634 			unsigned long mem_start, mem_size;
635 			int numa_domain, ranges;
636 			unsigned int *memcell_buf;
637 			unsigned int len;
638 
639 			memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
640 			if (!memcell_buf || len <= 0)
641 				continue;
642 
643 			ranges = memory->n_addrs;	/* ranges in cell */
644 new_range:
645 			mem_start = read_n_cells(addr_cells, &memcell_buf);
646 			mem_size = read_n_cells(size_cells, &memcell_buf);
647 			if (numa_enabled) {
648 				numa_domain = of_node_numa_domain(memory);
649 				if (numa_domain  >= MAX_NUMNODES)
650 					numa_domain = 0;
651 			} else
652 				numa_domain =  0;
653 
654 			if (numa_domain != nid)
655 				continue;
656 
657 			mem_size = numa_enforce_memory_limit(mem_start, mem_size);
658   			if (mem_size) {
659   				dbg("free_bootmem %lx %lx\n", mem_start, mem_size);
660   				free_bootmem_node(NODE_DATA(nid), mem_start, mem_size);
661 			}
662 
663 			if (--ranges)		/* process all ranges in cell */
664 				goto new_range;
665 		}
666 
667 		/*
668 		 * Mark reserved regions on this node
669 		 */
670 		for (i = 0; i < lmb.reserved.cnt; i++) {
671 			unsigned long physbase = lmb.reserved.region[i].base;
672 			unsigned long size = lmb.reserved.region[i].size;
673 
674 			if (pa_to_nid(physbase) != nid &&
675 			    pa_to_nid(physbase+size-1) != nid)
676 				continue;
677 
678 			if (physbase < end_paddr &&
679 			    (physbase+size) > start_paddr) {
680 				/* overlaps */
681 				if (physbase < start_paddr) {
682 					size -= start_paddr - physbase;
683 					physbase = start_paddr;
684 				}
685 
686 				if (size > end_paddr - physbase)
687 					size = end_paddr - physbase;
688 
689 				dbg("reserve_bootmem %lx %lx\n", physbase,
690 				    size);
691 				reserve_bootmem_node(NODE_DATA(nid), physbase,
692 						     size);
693 			}
694 		}
695 		/*
696 		 * This loop may look familiar, but we have to do it again
697 		 * after marking our reserved memory to mark memory present
698 		 * for sparsemem.
699 		 */
700 		addr_cells = get_mem_addr_cells();
701 		size_cells = get_mem_size_cells();
702 		memory = NULL;
703 		while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
704 			unsigned long mem_start, mem_size;
705 			int numa_domain, ranges;
706 			unsigned int *memcell_buf;
707 			unsigned int len;
708 
709 			memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
710 			if (!memcell_buf || len <= 0)
711 				continue;
712 
713 			ranges = memory->n_addrs;	/* ranges in cell */
714 new_range2:
715 			mem_start = read_n_cells(addr_cells, &memcell_buf);
716 			mem_size = read_n_cells(size_cells, &memcell_buf);
717 			if (numa_enabled) {
718 				numa_domain = of_node_numa_domain(memory);
719 				if (numa_domain  >= MAX_NUMNODES)
720 					numa_domain = 0;
721 			} else
722 				numa_domain =  0;
723 
724 			if (numa_domain != nid)
725 				continue;
726 
727 			mem_size = numa_enforce_memory_limit(mem_start, mem_size);
728 			memory_present(numa_domain, mem_start >> PAGE_SHIFT,
729 				       (mem_start + mem_size) >> PAGE_SHIFT);
730 
731 			if (--ranges)		/* process all ranges in cell */
732 				goto new_range2;
733 		}
734 
735 	}
736 }
737 
738 void __init paging_init(void)
739 {
740 	unsigned long zones_size[MAX_NR_ZONES];
741 	unsigned long zholes_size[MAX_NR_ZONES];
742 	int nid;
743 
744 	memset(zones_size, 0, sizeof(zones_size));
745 	memset(zholes_size, 0, sizeof(zholes_size));
746 
747 	for_each_online_node(nid) {
748 		unsigned long start_pfn;
749 		unsigned long end_pfn;
750 
751 		start_pfn = init_node_data[nid].node_start_pfn;
752 		end_pfn = init_node_data[nid].node_end_pfn;
753 
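		/*
		 * Only ZONE_DMA is populated here; the holes figure is the
		 * difference between the pages a node spans and the pages
		 * actually present.
		 */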
754 		zones_size[ZONE_DMA] = end_pfn - start_pfn;
755 		zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] -
756 			init_node_data[nid].node_present_pages;
757 
758 		dbg("free_area_init node %d %lx %lx (hole: %lx)\n", nid,
759 		    zones_size[ZONE_DMA], start_pfn, zholes_size[ZONE_DMA]);
760 
761 		free_area_init_node(nid, NODE_DATA(nid), zones_size,
762 							start_pfn, zholes_size);
763 	}
764 }
765 
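/*
 * Handle the "numa=" command line option: "numa=off" disables NUMA
 * setup, "numa=debug" enables the dbg() messages above.
 */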
766 static int __init early_numa(char *p)
767 {
768 	if (!p)
769 		return 0;
770 
771 	if (strstr(p, "off"))
772 		numa_enabled = 0;
773 
774 	if (strstr(p, "debug"))
775 		numa_debug = 1;
776 
777 	return 0;
778 }
779 early_param("numa", early_numa);
780