xref: /openbmc/linux/arch/powerpc/mm/numa.c (revision 8cedcfd43a0b00741fff43d6a4c1a8b7748db3b0)
1 /*
2  * pSeries NUMA support
3  *
4  * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version
9  * 2 of the License, or (at your option) any later version.
10  */
11 #include <linux/threads.h>
12 #include <linux/bootmem.h>
13 #include <linux/init.h>
14 #include <linux/mm.h>
15 #include <linux/mmzone.h>
16 #include <linux/module.h>
17 #include <linux/nodemask.h>
18 #include <linux/cpu.h>
19 #include <linux/notifier.h>
20 #include <asm/lmb.h>
21 #include <asm/machdep.h>
22 #include <asm/abs_addr.h>
23 #include <asm/system.h>
24 
25 static int numa_enabled = 1;
26 
27 static int numa_debug;
28 #define dbg(args...) do { if (numa_debug) printk(KERN_INFO args); } while (0)
29 
30 #ifdef DEBUG_NUMA
31 #define ARRAY_INITIALISER -1
32 #else
33 #define ARRAY_INITIALISER 0
34 #endif
35 
36 int numa_cpu_lookup_table[NR_CPUS] = { [ 0 ... (NR_CPUS - 1)] =
37 	ARRAY_INITIALISER};
38 char *numa_memory_lookup_table;
39 cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES];
40 int nr_cpus_in_node[MAX_NUMNODES] = { [0 ... (MAX_NUMNODES -1)] = 0};
41 
42 struct pglist_data *node_data[MAX_NUMNODES];
43 bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES];
44 static int min_common_depth;
45 
46 /*
47  * We need somewhere to store start/span for each node until we have
48  * allocated the real node_data structures.
49  */
50 static struct {
51 	unsigned long node_start_pfn;
52 	unsigned long node_end_pfn;
53 	unsigned long node_present_pages;
54 } init_node_data[MAX_NUMNODES] __initdata;
55 
56 EXPORT_SYMBOL(node_data);
57 EXPORT_SYMBOL(numa_cpu_lookup_table);
58 EXPORT_SYMBOL(numa_memory_lookup_table);
59 EXPORT_SYMBOL(numa_cpumask_lookup_table);
60 EXPORT_SYMBOL(nr_cpus_in_node);
61 
62 static inline void map_cpu_to_node(int cpu, int node)
63 {
64 	numa_cpu_lookup_table[cpu] = node;
65 	if (!(cpu_isset(cpu, numa_cpumask_lookup_table[node]))) {
66 		cpu_set(cpu, numa_cpumask_lookup_table[node]);
67 		nr_cpus_in_node[node]++;
68 	}
69 }
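
/*
 * Illustrative note (hypothetical values): the three lookup structures are
 * kept in step by map_cpu_to_node().  For example, map_cpu_to_node(2, 1)
 * sets numa_cpu_lookup_table[2] = 1, sets bit 2 in
 * numa_cpumask_lookup_table[1], and bumps nr_cpus_in_node[1] the first
 * time cpu 2 joins node 1; repeating the call is harmless because of the
 * cpu_isset() check.
 */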
70 
71 #ifdef CONFIG_HOTPLUG_CPU
72 static void unmap_cpu_from_node(unsigned long cpu)
73 {
74 	int node = numa_cpu_lookup_table[cpu];
75 
76 	dbg("removing cpu %lu from node %d\n", cpu, node);
77 
78 	if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
79 		cpu_clear(cpu, numa_cpumask_lookup_table[node]);
80 		nr_cpus_in_node[node]--;
81 	} else {
82 		printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
83 		       cpu, node);
84 	}
85 }
86 #endif /* CONFIG_HOTPLUG_CPU */
87 
88 static struct device_node * __devinit find_cpu_node(unsigned int cpu)
89 {
90 	unsigned int hw_cpuid = get_hard_smp_processor_id(cpu);
91 	struct device_node *cpu_node = NULL;
92 	unsigned int *interrupt_server, *reg;
93 	int len;
94 
95 	while ((cpu_node = of_find_node_by_type(cpu_node, "cpu")) != NULL) {
96 		/* Try interrupt server first */
97 		interrupt_server = (unsigned int *)get_property(cpu_node,
98 					"ibm,ppc-interrupt-server#s", &len);
99 
100 		len = len / sizeof(u32);
101 
102 		if (interrupt_server && (len > 0)) {
103 			while (len--) {
104 				if (interrupt_server[len] == hw_cpuid)
105 					return cpu_node;
106 			}
107 		} else {
108 			reg = (unsigned int *)get_property(cpu_node,
109 							   "reg", &len);
110 			if (reg && (len > 0) && (reg[0] == hw_cpuid))
111 				return cpu_node;
112 		}
113 	}
114 
115 	return NULL;
116 }
117 
118 /* must hold reference to node during call */
119 static int *of_get_associativity(struct device_node *dev)
120 {
121 	return (unsigned int *)get_property(dev, "ibm,associativity", NULL);
122 }
123 
124 static int of_node_numa_domain(struct device_node *device)
125 {
126 	int numa_domain;
127 	unsigned int *tmp;
128 
129 	if (min_common_depth == -1)
130 		return 0;
131 
132 	tmp = of_get_associativity(device);
133 	if (tmp && (tmp[0] >= min_common_depth)) {
134 		numa_domain = tmp[min_common_depth];
135 	} else {
136 		dbg("WARNING: no NUMA information for %s\n",
137 		    device->full_name);
138 		numa_domain = 0;
139 	}
140 	return numa_domain;
141 }
142 
143 /*
144  * In theory, the "ibm,associativity" property may contain multiple
145  * associativity lists because a resource may be multiply connected
146  * into the machine.  This resource then has different associativity
147  * characteristics relative to its multiple connections.  We ignore
148  * this for now.  We also assume that all cpu and memory sets have
149  * their distances represented at a common level.  This won't be
150  * true for hierarchical NUMA.
151  *
152  * In any case the ibm,associativity-reference-points should give
153  * the correct depth for a normal NUMA system.
154  *
155  * - Dave Hansen <haveblue@us.ibm.com>
156  */
157 static int __init find_min_common_depth(void)
158 {
159 	int depth;
160 	unsigned int *ref_points;
161 	struct device_node *rtas_root;
162 	unsigned int len;
163 
164 	rtas_root = of_find_node_by_path("/rtas");
165 
166 	if (!rtas_root)
167 		return -1;
168 
169 	/*
170 	 * This property contains two 32-bit integers, each representing a
171 	 * level of depth in the associativity lists.  The first is for an SMP
172 	 * configuration (should be all 0's) and the second is for a normal
173 	 * NUMA configuration.
174 	 */
175 	ref_points = (unsigned int *)get_property(rtas_root,
176 			"ibm,associativity-reference-points", &len);
177 
178 	if ((len >= 1) && ref_points) {
179 		depth = ref_points[1];
180 	} else {
181 		dbg("WARNING: could not find NUMA "
182 		    "associativity reference point\n");
183 		depth = -1;
184 	}
185 	of_node_put(rtas_root);
186 
187 	return depth;
188 }
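
/*
 * Worked example with made-up firmware values: if the rtas node carries
 * ibm,associativity-reference-points = <0x4 0x3>, find_min_common_depth()
 * returns ref_points[1] == 3.  A cpu or memory node whose
 * ibm,associativity property reads <4 0 0 1 1> (the first cell being the
 * number of entries that follow) is then placed in domain tmp[3] == 1 by
 * of_node_numa_domain().  Real firmware supplies its own lists; these
 * values are illustrative only.
 */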
189 
190 static int __init get_mem_addr_cells(void)
191 {
192 	struct device_node *memory = NULL;
193 	int rc;
194 
195 	memory = of_find_node_by_type(memory, "memory");
196 	if (!memory)
197 		return 0; /* it won't matter */
198 
199 	rc = prom_n_addr_cells(memory);
200 	return rc;
201 }
202 
203 static int __init get_mem_size_cells(void)
204 {
205 	struct device_node *memory = NULL;
206 	int rc;
207 
208 	memory = of_find_node_by_type(memory, "memory");
209 	if (!memory)
210 		return 0; /* it won't matter */
211 	rc = prom_n_size_cells(memory);
212 	return rc;
213 }
214 
215 static unsigned long read_n_cells(int n, unsigned int **buf)
216 {
217 	unsigned long result = 0;
218 
219 	while (n--) {
220 		result = (result << 32) | **buf;
221 		(*buf)++;
222 	}
223 	return result;
224 }
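
/*
 * Worked example (hypothetical buffer): with n = 2 and *buf pointing at
 * the cells { 0x00000001, 0x80000000 }, read_n_cells() returns
 * (0x1UL << 32) | 0x80000000 = 0x180000000 and leaves *buf advanced by
 * two cells, ready for the size cells that follow in a "reg" property.
 */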
225 
226 /*
227  * Figure out to which domain a cpu belongs and stick it there.
228  * Return the id of the domain used.
229  */
230 static int numa_setup_cpu(unsigned long lcpu)
231 {
232 	int numa_domain = 0;
233 	struct device_node *cpu = find_cpu_node(lcpu);
234 
235 	if (!cpu) {
236 		WARN_ON(1);
237 		goto out;
238 	}
239 
240 	numa_domain = of_node_numa_domain(cpu);
241 
242 	if (numa_domain >= num_online_nodes()) {
243 		/*
244 		 * POWER4 LPAR uses 0xffff as invalid node,
245 		 * don't warn in this case.
246 		 */
247 		if (numa_domain != 0xffff)
248 			printk(KERN_ERR "WARNING: cpu %ld "
249 			       "maps to invalid NUMA node %d\n",
250 			       lcpu, numa_domain);
251 		numa_domain = 0;
252 	}
253 out:
254 	node_set_online(numa_domain);
255 
256 	map_cpu_to_node(lcpu, numa_domain);
257 
258 	of_node_put(cpu);
259 
260 	return numa_domain;
261 }
262 
263 static int cpu_numa_callback(struct notifier_block *nfb,
264 			     unsigned long action,
265 			     void *hcpu)
266 {
267 	unsigned long lcpu = (unsigned long)hcpu;
268 	int ret = NOTIFY_DONE;
269 
270 	switch (action) {
271 	case CPU_UP_PREPARE:
272 		if (min_common_depth == -1 || !numa_enabled)
273 			map_cpu_to_node(lcpu, 0);
274 		else
275 			numa_setup_cpu(lcpu);
276 		ret = NOTIFY_OK;
277 		break;
278 #ifdef CONFIG_HOTPLUG_CPU
279 	case CPU_DEAD:
280 	case CPU_UP_CANCELED:
281 		unmap_cpu_from_node(lcpu);
282 		ret = NOTIFY_OK;
283 		break;
284 #endif
285 	}
286 	return ret;
287 }
288 
289 /*
290  * Check and possibly modify a memory region to enforce the memory limit.
291  *
292  * Returns the size the region should have to enforce the memory limit.
293  * This will either be the original value of size, a truncated value,
294  * or zero. If the returned value of size is 0 the region should be
295  * discarded as it lies wholly above the memory limit.
296  */
297 static unsigned long __init numa_enforce_memory_limit(unsigned long start, unsigned long size)
298 {
299 	/*
300 	 * We use lmb_end_of_DRAM() in here instead of memory_limit because
301 	 * we've already adjusted it for the limit and it takes care of
302 	 * having memory holes below the limit.
303 	 */
304 
305 	if (! memory_limit)
306 		return size;
307 
308 	if (start + size <= lmb_end_of_DRAM())
309 		return size;
310 
311 	if (start >= lmb_end_of_DRAM())
312 		return 0;
313 
314 	return lmb_end_of_DRAM() - start;
315 }
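
/*
 * Worked example (hypothetical addresses): with a memory limit that leaves
 * lmb_end_of_DRAM() at 0x40000000, a region starting at 0x30000000 with a
 * size of 0x20000000 straddles the limit and is truncated to
 * 0x40000000 - 0x30000000 = 0x10000000 bytes, while a region starting at
 * or above 0x40000000 is dropped entirely (size 0 is returned).
 */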
316 
317 static int __init parse_numa_properties(void)
318 {
319 	struct device_node *cpu = NULL;
320 	struct device_node *memory = NULL;
321 	int addr_cells, size_cells;
322 	int max_domain = 0;
323 	long entries = lmb_end_of_DRAM() >> MEMORY_INCREMENT_SHIFT;
324 	unsigned long i;
325 
326 	if (numa_enabled == 0) {
327 		printk(KERN_WARNING "NUMA disabled by user\n");
328 		return -1;
329 	}
330 
331 	numa_memory_lookup_table =
332 		(char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1));
333 	memset(numa_memory_lookup_table, 0, entries * sizeof(char));
334 
335 	for (i = 0; i < entries ; i++)
336 		numa_memory_lookup_table[i] = ARRAY_INITIALISER;
337 
338 	min_common_depth = find_min_common_depth();
339 
340 	dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
341 	if (min_common_depth < 0)
342 		return min_common_depth;
343 
344 	max_domain = numa_setup_cpu(boot_cpuid);
345 
346 	/*
347 	 * Even though we connect cpus to numa domains later in SMP init,
348 	 * we need to know the maximum node id now. This is because each
349 	 * node id must have NODE_DATA etc backing it.
350 	 * As a result of hotplug we could still have cpus appear later on
351 	 * with larger node ids. In that case we force the cpu into node 0.
352 	 */
353 	for_each_cpu(i) {
354 		int numa_domain;
355 
356 		cpu = find_cpu_node(i);
357 
358 		if (cpu) {
359 			numa_domain = of_node_numa_domain(cpu);
360 			of_node_put(cpu);
361 
362 			if (numa_domain < MAX_NUMNODES &&
363 			    max_domain < numa_domain)
364 				max_domain = numa_domain;
365 		}
366 	}
367 
368 	addr_cells = get_mem_addr_cells();
369 	size_cells = get_mem_size_cells();
370 	memory = NULL;
371 	while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
372 		unsigned long start;
373 		unsigned long size;
374 		int numa_domain;
375 		int ranges;
376 		unsigned int *memcell_buf;
377 		unsigned int len;
378 
379 		memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
380 		if (!memcell_buf || len <= 0)
381 			continue;
382 
383 		ranges = memory->n_addrs;
384 new_range:
385 		/* these are order-sensitive, and modify the buffer pointer */
386 		start = read_n_cells(addr_cells, &memcell_buf);
387 		size = read_n_cells(size_cells, &memcell_buf);
388 
389 		start = _ALIGN_DOWN(start, MEMORY_INCREMENT);
390 		size = _ALIGN_UP(size, MEMORY_INCREMENT);
391 
392 		numa_domain = of_node_numa_domain(memory);
393 
394 		if (numa_domain >= MAX_NUMNODES) {
395 			if (numa_domain != 0xffff)
396 				printk(KERN_ERR "WARNING: memory at %lx maps "
397 				       "to invalid NUMA node %d\n", start,
398 				       numa_domain);
399 			numa_domain = 0;
400 		}
401 
402 		if (max_domain < numa_domain)
403 			max_domain = numa_domain;
404 
405 		if (! (size = numa_enforce_memory_limit(start, size))) {
406 			if (--ranges)
407 				goto new_range;
408 			else
409 				continue;
410 		}
411 
412 		/*
413 		 * Initialize new node struct, or add to an existing one.
414 		 */
415 		if (init_node_data[numa_domain].node_end_pfn) {
416 			if ((start / PAGE_SIZE) <
417 			    init_node_data[numa_domain].node_start_pfn)
418 				init_node_data[numa_domain].node_start_pfn =
419 					start / PAGE_SIZE;
420 			if (((start / PAGE_SIZE) + (size / PAGE_SIZE)) >
421 			    init_node_data[numa_domain].node_end_pfn)
422 				init_node_data[numa_domain].node_end_pfn =
423 					(start / PAGE_SIZE) +
424 					(size / PAGE_SIZE);
425 
426 			init_node_data[numa_domain].node_present_pages +=
427 				size / PAGE_SIZE;
428 		} else {
429 			node_set_online(numa_domain);
430 
431 			init_node_data[numa_domain].node_start_pfn =
432 				start / PAGE_SIZE;
433 			init_node_data[numa_domain].node_end_pfn =
434 				init_node_data[numa_domain].node_start_pfn +
435 				size / PAGE_SIZE;
436 			init_node_data[numa_domain].node_present_pages =
437 				size / PAGE_SIZE;
438 		}
439 
440 		for (i = start ; i < (start+size); i += MEMORY_INCREMENT)
441 			numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] =
442 				numa_domain;
443 
444 		if (--ranges)
445 			goto new_range;
446 	}
447 
448 	for (i = 0; i <= max_domain; i++)
449 		node_set_online(i);
450 
451 	return 0;
452 }
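
/*
 * Illustrative walk-through (hypothetical device tree): a memory node with
 * #address-cells = 2, #size-cells = 2 and reg = <0x0 0x0 0x0 0x20000000>
 * describes one range starting at 0 and spanning 512MB.  After aligning to
 * MEMORY_INCREMENT, the range's start and end pfns are recorded in
 * init_node_data[] for its domain, and every MEMORY_INCREMENT-sized slot
 * of numa_memory_lookup_table[] covering the range is stamped with that
 * domain; that per-increment table is what later physical-address-to-node
 * lookups key off.
 */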
453 
454 static void __init setup_nonnuma(void)
455 {
456 	unsigned long top_of_ram = lmb_end_of_DRAM();
457 	unsigned long total_ram = lmb_phys_mem_size();
458 	unsigned long i;
459 
460 	printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
461 	       top_of_ram, total_ram);
462 	printk(KERN_INFO "Memory hole size: %ldMB\n",
463 	       (top_of_ram - total_ram) >> 20);
464 
465 	if (!numa_memory_lookup_table) {
466 		long entries = top_of_ram >> MEMORY_INCREMENT_SHIFT;
467 		numa_memory_lookup_table =
468 			(char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1));
469 		memset(numa_memory_lookup_table, 0, entries * sizeof(char));
470 		for (i = 0; i < entries ; i++)
471 			numa_memory_lookup_table[i] = ARRAY_INITIALISER;
472 	}
473 
474 	map_cpu_to_node(boot_cpuid, 0);
475 
476 	node_set_online(0);
477 
478 	init_node_data[0].node_start_pfn = 0;
479 	init_node_data[0].node_end_pfn = lmb_end_of_DRAM() / PAGE_SIZE;
480 	init_node_data[0].node_present_pages = total_ram / PAGE_SIZE;
481 
482 	for (i = 0 ; i < top_of_ram; i += MEMORY_INCREMENT)
483 		numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] = 0;
484 }
485 
486 static void __init dump_numa_topology(void)
487 {
488 	unsigned int node;
489 	unsigned int count;
490 
491 	if (min_common_depth == -1 || !numa_enabled)
492 		return;
493 
494 	for_each_online_node(node) {
495 		unsigned long i;
496 
497 		printk(KERN_INFO "Node %d Memory:", node);
498 
499 		count = 0;
500 
501 		for (i = 0; i < lmb_end_of_DRAM(); i += MEMORY_INCREMENT) {
502 			if (numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] == node) {
503 				if (count == 0)
504 					printk(" 0x%lx", i);
505 				++count;
506 			} else {
507 				if (count > 0)
508 					printk("-0x%lx", i);
509 				count = 0;
510 			}
511 		}
512 
513 		if (count > 0)
514 			printk("-0x%lx", i);
515 		printk("\n");
516 	}
517 	return;
518 }
519 
520 /*
521  * Allocate some memory, using the lmb or bootmem allocator as
522  * required. nid is the preferred node and end is the physical address of
523  * the highest address in the node.
524  *
525  * Returns the physical address of the memory.
526  */
527 static unsigned long careful_allocation(int nid, unsigned long size,
528 					unsigned long align, unsigned long end)
529 {
530 	unsigned long ret = lmb_alloc_base(size, align, end);
531 
532 	/* retry over all memory */
533 	if (!ret)
534 		ret = lmb_alloc_base(size, align, lmb_end_of_DRAM());
535 
536 	if (!ret)
537 		panic("numa.c: cannot allocate %lu bytes on node %d",
538 		      size, nid);
539 
540 	/*
541 	 * If the memory came from a previously allocated node, we must
542 	 * retry with the bootmem allocator.
543 	 */
544 	if (pa_to_nid(ret) < nid) {
545 		nid = pa_to_nid(ret);
546 		ret = (unsigned long)__alloc_bootmem_node(NODE_DATA(nid),
547 				size, align, 0);
548 
549 		if (!ret)
550 			panic("numa.c: cannot allocate %lu bytes on node %d",
551 			      size, nid);
552 
553 		ret = virt_to_abs(ret);
554 
555 		dbg("alloc_bootmem %lx %lx\n", ret, size);
556 	}
557 
558 	return ret;
559 }
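
/*
 * Illustrative decision sequence (hypothetical layout): asked for memory
 * for node 2 with end at 0x80000000, careful_allocation() first tries
 * lmb_alloc_base() below that address, then retries anywhere below
 * lmb_end_of_DRAM().  If the block it gets back lands on a lower-numbered
 * node whose bootmem is already initialised (pa_to_nid(ret) < nid), the
 * allocation is redone through that node's bootmem allocator so the
 * bootmem bitmap stays consistent.
 */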
560 
561 void __init do_init_bootmem(void)
562 {
563 	int nid;
564 	int addr_cells, size_cells;
565 	struct device_node *memory = NULL;
566 	static struct notifier_block ppc64_numa_nb = {
567 		.notifier_call = cpu_numa_callback,
568 		.priority = 1 /* Must run before sched domains notifier. */
569 	};
570 
571 	min_low_pfn = 0;
572 	max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
573 	max_pfn = max_low_pfn;
574 
575 	if (parse_numa_properties())
576 		setup_nonnuma();
577 	else
578 		dump_numa_topology();
579 
580 	register_cpu_notifier(&ppc64_numa_nb);
581 
582 	for_each_online_node(nid) {
583 		unsigned long start_paddr, end_paddr;
584 		int i;
585 		unsigned long bootmem_paddr;
586 		unsigned long bootmap_pages;
587 
588 		start_paddr = init_node_data[nid].node_start_pfn * PAGE_SIZE;
589 		end_paddr = init_node_data[nid].node_end_pfn * PAGE_SIZE;
590 
591 		/* Allocate the node structure node local if possible */
592 		NODE_DATA(nid) = (struct pglist_data *)careful_allocation(nid,
593 					sizeof(struct pglist_data),
594 					SMP_CACHE_BYTES, end_paddr);
595 		NODE_DATA(nid) = abs_to_virt(NODE_DATA(nid));
596 		memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
597 
598   		dbg("node %d\n", nid);
599 		dbg("NODE_DATA() = %p\n", NODE_DATA(nid));
600 
601 		NODE_DATA(nid)->bdata = &plat_node_bdata[nid];
602 		NODE_DATA(nid)->node_start_pfn =
603 			init_node_data[nid].node_start_pfn;
604 		NODE_DATA(nid)->node_spanned_pages =
605 			end_paddr - start_paddr;
606 
607 		if (NODE_DATA(nid)->node_spanned_pages == 0)
608   			continue;
609 
610   		dbg("start_paddr = %lx\n", start_paddr);
611   		dbg("end_paddr = %lx\n", end_paddr);
612 
613 		bootmap_pages = bootmem_bootmap_pages((end_paddr - start_paddr) >> PAGE_SHIFT);
614 
615 		bootmem_paddr = careful_allocation(nid,
616 				bootmap_pages << PAGE_SHIFT,
617 				PAGE_SIZE, end_paddr);
618 		memset(abs_to_virt(bootmem_paddr), 0,
619 		       bootmap_pages << PAGE_SHIFT);
620 		dbg("bootmap_paddr = %lx\n", bootmem_paddr);
621 
622 		init_bootmem_node(NODE_DATA(nid), bootmem_paddr >> PAGE_SHIFT,
623 				  start_paddr >> PAGE_SHIFT,
624 				  end_paddr >> PAGE_SHIFT);
625 
626 		/*
627 		 * We need to do another scan of all memory sections to
628 		 * associate memory with the correct node.
629 		 */
630 		addr_cells = get_mem_addr_cells();
631 		size_cells = get_mem_size_cells();
632 		memory = NULL;
633 		while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
634 			unsigned long mem_start, mem_size;
635 			int numa_domain, ranges;
636 			unsigned int *memcell_buf;
637 			unsigned int len;
638 
639 			memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
640 			if (!memcell_buf || len <= 0)
641 				continue;
642 
643 			ranges = memory->n_addrs;	/* ranges in cell */
644 new_range:
645 			mem_start = read_n_cells(addr_cells, &memcell_buf);
646 			mem_size = read_n_cells(size_cells, &memcell_buf);
647 			if (numa_enabled) {
648 				numa_domain = of_node_numa_domain(memory);
649 				if (numa_domain  >= MAX_NUMNODES)
650 					numa_domain = 0;
651 			} else
652 				numa_domain =  0;
653 
654 			if (numa_domain != nid)
655 				continue;
656 
657 			mem_size = numa_enforce_memory_limit(mem_start, mem_size);
658   			if (mem_size) {
659   				dbg("free_bootmem %lx %lx\n", mem_start, mem_size);
660   				free_bootmem_node(NODE_DATA(nid), mem_start, mem_size);
661 			}
662 
663 			if (--ranges)		/* process all ranges in cell */
664 				goto new_range;
665 		}
666 
667 		/*
668 		 * Mark reserved regions on this node
669 		 */
670 		for (i = 0; i < lmb.reserved.cnt; i++) {
671 			unsigned long physbase = lmb.reserved.region[i].base;
672 			unsigned long size = lmb.reserved.region[i].size;
673 
674 			if (pa_to_nid(physbase) != nid &&
675 			    pa_to_nid(physbase+size-1) != nid)
676 				continue;
677 
678 			if (physbase < end_paddr &&
679 			    (physbase+size) > start_paddr) {
680 				/* overlaps */
681 				if (physbase < start_paddr) {
682 					size -= start_paddr - physbase;
683 					physbase = start_paddr;
684 				}
685 
686 				if (size > end_paddr - physbase)
687 					size = end_paddr - physbase;
688 
689 				dbg("reserve_bootmem %lx %lx\n", physbase,
690 				    size);
691 				reserve_bootmem_node(NODE_DATA(nid), physbase,
692 						     size);
693 			}
694 		}
695 		/*
696 		 * This loop may look familiar, but we have to do it again
697 		 * after marking our reserved memory to mark memory present
698 		 * for sparsemem.
699 		 */
700 		addr_cells = get_mem_addr_cells();
701 		size_cells = get_mem_size_cells();
702 		memory = NULL;
703 		while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
704 			unsigned long mem_start, mem_size;
705 			int numa_domain, ranges;
706 			unsigned int *memcell_buf;
707 			unsigned int len;
708 
709 			memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
710 			if (!memcell_buf || len <= 0)
711 				continue;
712 
713 			ranges = memory->n_addrs;	/* ranges in cell */
714 new_range2:
715 			mem_start = read_n_cells(addr_cells, &memcell_buf);
716 			mem_size = read_n_cells(size_cells, &memcell_buf);
717 			if (numa_enabled) {
718 				numa_domain = of_node_numa_domain(memory);
719 				if (numa_domain  >= MAX_NUMNODES)
720 					numa_domain = 0;
721 			} else
722 				numa_domain =  0;
723 
724 			if (numa_domain != nid)
725 				continue;
726 
727 			mem_size = numa_enforce_memory_limit(mem_start, mem_size);
728 			memory_present(numa_domain, mem_start >> PAGE_SHIFT,
729 				       (mem_start + mem_size) >> PAGE_SHIFT);
730 
731 			if (--ranges)		/* process all ranges in cell */
732 				goto new_range2;
733 		}
734 
735 	}
736 }
737 
738 void __init paging_init(void)
739 {
740 	unsigned long zones_size[MAX_NR_ZONES];
741 	unsigned long zholes_size[MAX_NR_ZONES];
742 	int nid;
743 
744 	memset(zones_size, 0, sizeof(zones_size));
745 	memset(zholes_size, 0, sizeof(zholes_size));
746 
747 	for_each_online_node(nid) {
748 		unsigned long start_pfn;
749 		unsigned long end_pfn;
750 
751 		start_pfn = init_node_data[nid].node_start_pfn;
752 		end_pfn = init_node_data[nid].node_end_pfn;
753 
754 		zones_size[ZONE_DMA] = end_pfn - start_pfn;
755 		zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] -
756 			init_node_data[nid].node_present_pages;
757 
758 		dbg("free_area_init node %d %lx %lx (hole: %lx)\n", nid,
759 		    zones_size[ZONE_DMA], start_pfn, zholes_size[ZONE_DMA]);
760 
761 		free_area_init_node(nid, NODE_DATA(nid), zones_size,
762 							start_pfn, zholes_size);
763 	}
764 }
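
/*
 * Worked example (hypothetical node): a node spanning pfns 0x0-0x8000
 * with 0x7000 present pages gets zones_size[ZONE_DMA] = 0x8000 and
 * zholes_size[ZONE_DMA] = 0x8000 - 0x7000 = 0x1000, so free_area_init_node()
 * sees 0x1000 of the spanned pages as memory holes.  Note that this code
 * accounts all of a node's memory under ZONE_DMA.
 */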
765 
766 static int __init early_numa(char *p)
767 {
768 	if (!p)
769 		return 0;
770 
771 	if (strstr(p, "off"))
772 		numa_enabled = 0;
773 
774 	if (strstr(p, "debug"))
775 		numa_debug = 1;
776 
777 	return 0;
778 }
779 early_param("numa", early_numa);
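
/*
 * Usage sketch: the "numa=" early parameter is matched with strstr(), so
 * booting with "numa=off" sets numa_enabled = 0 (parse_numa_properties()
 * then bails out and setup_nonnuma() is used instead), and "numa=debug"
 * turns on the dbg() messages.  A spelling such as "numa=off,debug" would
 * match both substrings under this parsing; the exact form shown here is
 * only an illustration.
 */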
780