xref: /openbmc/linux/arch/powerpc/mm/numa.c (revision 92ed1a76)
1 /*
2  * pSeries NUMA support
3  *
4  * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version
9  * 2 of the License, or (at your option) any later version.
10  */
11 #include <linux/threads.h>
12 #include <linux/bootmem.h>
13 #include <linux/init.h>
14 #include <linux/mm.h>
15 #include <linux/mmzone.h>
16 #include <linux/module.h>
17 #include <linux/nodemask.h>
18 #include <linux/cpu.h>
19 #include <linux/notifier.h>
20 #include <linux/memblock.h>
21 #include <linux/of.h>
22 #include <linux/pfn.h>
23 #include <asm/sparsemem.h>
24 #include <asm/prom.h>
25 #include <asm/system.h>
26 #include <asm/smp.h>
27 
28 static int numa_enabled = 1;
29 
30 static char *cmdline __initdata;
31 
32 static int numa_debug;
33 #define dbg(args...) if (numa_debug) { printk(KERN_INFO args); }
34 
35 int numa_cpu_lookup_table[NR_CPUS];
36 cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
37 struct pglist_data *node_data[MAX_NUMNODES];
38 
39 EXPORT_SYMBOL(numa_cpu_lookup_table);
40 EXPORT_SYMBOL(node_to_cpumask_map);
41 EXPORT_SYMBOL(node_data);
42 
43 static int min_common_depth;
44 static int n_mem_addr_cells, n_mem_size_cells;
45 static int form1_affinity;
46 
47 #define MAX_DISTANCE_REF_POINTS 4
48 static int distance_ref_points_depth;
49 static const unsigned int *distance_ref_points;
50 static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
51 
52 /*
53  * Allocate node_to_cpumask_map based on number of available nodes
54  * Requires node_possible_map to be valid.
55  *
56  * Note: cpumask_of_node() is not valid until after this is done.
57  */
58 static void __init setup_node_to_cpumask_map(void)
59 {
60 	unsigned int node, num = 0;
61 
62 	/* setup nr_node_ids if not done yet */
63 	if (nr_node_ids == MAX_NUMNODES) {
64 		for_each_node_mask(node, node_possible_map)
65 			num = node;
66 		nr_node_ids = num + 1;
67 	}
68 
69 	/* allocate the map */
70 	for (node = 0; node < nr_node_ids; node++)
71 		alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
72 
73 	/* cpumask_of_node() will now work */
74 	dbg("Node to cpumask map for %d nodes\n", nr_node_ids);
75 }
76 
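/*
 * numa=fake= support: "cmdline" holds a comma separated list of
 * memory boundaries.  Whenever the memory being registered extends
 * past the next boundary, hand back a new fake node id through *nid.
 * Returns 1 if a new fake node was created, 0 otherwise.
 */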
77 static int __cpuinit fake_numa_create_new_node(unsigned long end_pfn,
78 						unsigned int *nid)
79 {
80 	unsigned long long mem;
81 	char *p = cmdline;
82 	static unsigned int fake_nid;
83 	static unsigned long long curr_boundary;
84 
85 	/*
86 	 * Only modify the node id if we have already started creating NUMA nodes
87 	 * We want to continue from where we left off the last time
88 	 */
89 	if (fake_nid)
90 		*nid = fake_nid;
91 	/*
92 	 * In case there are no more arguments to parse, the
93 	 * node_id should be the same as the last fake node id
94 	 * (we've handled this above).
95 	 */
96 	if (!p)
97 		return 0;
98 
99 	mem = memparse(p, &p);
100 	if (!mem)
101 		return 0;
102 
103 	if (mem < curr_boundary)
104 		return 0;
105 
106 	curr_boundary = mem;
107 
108 	if ((end_pfn << PAGE_SHIFT) > mem) {
109 		/*
110 		 * Skip commas and spaces
111 		 */
112 		while (*p == ',' || *p == ' ' || *p == '\t')
113 			p++;
114 
115 		cmdline = p;
116 		fake_nid++;
117 		*nid = fake_nid;
118 		dbg("created new fake_node with id %d\n", fake_nid);
119 		return 1;
120 	}
121 	return 0;
122 }
123 
124 /*
125  * get_active_region_work_fn - A helper function for get_node_active_region
126  *	Sets datax->start_pfn and ->end_pfn to this region's bounds if the
127  *	region contains the initial value of datax->start_pfn
128  * @start_pfn: start page (inclusive) of region to check
129  * @end_pfn: end page (exclusive) of region to check
130  * @datax: comes in with ->start_pfn set to the value to search for and
131  *	goes out with the active range if it contains that value
132  * Returns 1 if the search value is in the range, else 0
134 static int __init get_active_region_work_fn(unsigned long start_pfn,
135 					unsigned long end_pfn, void *datax)
136 {
137 	struct node_active_region *data;
138 	data = (struct node_active_region *)datax;
139 
140 	if (start_pfn <= data->start_pfn && end_pfn > data->start_pfn) {
141 		data->start_pfn = start_pfn;
142 		data->end_pfn = end_pfn;
143 		return 1;
144 	}
145 	return 0;
146 
147 }
148 
149 /*
150  * get_node_active_region - Return active region containing start_pfn
151  * Active range returned is empty if none found.
152  * @start_pfn: The page to return the region for.
153  * @node_ar: Returned set to the active region containing start_pfn
154  */
155 static void __init get_node_active_region(unsigned long start_pfn,
156 		       struct node_active_region *node_ar)
157 {
158 	int nid = early_pfn_to_nid(start_pfn);
159 
160 	node_ar->nid = nid;
161 	node_ar->start_pfn = start_pfn;
162 	node_ar->end_pfn = start_pfn;
163 	work_with_active_regions(nid, get_active_region_work_fn, node_ar);
164 }
165 
166 static void __cpuinit map_cpu_to_node(int cpu, int node)
167 {
168 	numa_cpu_lookup_table[cpu] = node;
169 
170 	dbg("adding cpu %d to node %d\n", cpu, node);
171 
172 	if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node])))
173 		cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
174 }
175 
176 #ifdef CONFIG_HOTPLUG_CPU
177 static void unmap_cpu_from_node(unsigned long cpu)
178 {
179 	int node = numa_cpu_lookup_table[cpu];
180 
181 	dbg("removing cpu %lu from node %d\n", cpu, node);
182 
183 	if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) {
184 		cpumask_clear_cpu(cpu, node_to_cpumask_map[node]);
185 	} else {
186 		printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
187 		       cpu, node);
188 	}
189 }
190 #endif /* CONFIG_HOTPLUG_CPU */
191 
192 /* must hold reference to node during call */
193 static const int *of_get_associativity(struct device_node *dev)
194 {
195 	return of_get_property(dev, "ibm,associativity", NULL);
196 }
197 
198 /*
199  * Returns the property linux,drconf-usable-memory if
200  * it exists (the property exists only in kexec/kdump kernels,
201  * added by kexec-tools)
202  */
203 static const u32 *of_get_usable_memory(struct device_node *memory)
204 {
205 	const u32 *prop;
206 	u32 len;
207 	prop = of_get_property(memory, "linux,drconf-usable-memory", &len);
208 	if (!prop || len < sizeof(unsigned int))
209 		return NULL;
210 	return prop;
211 }
212 
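/*
 * Return the distance between two nodes.  Without form 1 affinity
 * every pair of nodes is LOCAL_DISTANCE apart; with it, the distance
 * doubles for each reference-point level (most significant first) at
 * which the two nodes do not share an associativity domain.
 */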
213 int __node_distance(int a, int b)
214 {
215 	int i;
216 	int distance = LOCAL_DISTANCE;
217 
218 	if (!form1_affinity)
219 		return distance;
220 
221 	for (i = 0; i < distance_ref_points_depth; i++) {
222 		if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
223 			break;
224 
225 		/* Double the distance for each NUMA level */
226 		distance *= 2;
227 	}
228 
229 	return distance;
230 }
231 
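/*
 * Cache the associativity domain ids of this node at each
 * reference-point level so that __node_distance() can compare
 * nodes later.  Only used with form 1 affinity.
 */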
232 static void initialize_distance_lookup_table(int nid,
233 		const unsigned int *associativity)
234 {
235 	int i;
236 
237 	if (!form1_affinity)
238 		return;
239 
240 	for (i = 0; i < distance_ref_points_depth; i++) {
241 		distance_lookup_table[nid][i] =
242 			associativity[distance_ref_points[i]];
243 	}
244 }
245 
246 /* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
247  * info is found.
248  */
249 static int of_node_to_nid_single(struct device_node *device)
250 {
251 	int nid = -1;
252 	const unsigned int *tmp;
253 
254 	if (min_common_depth == -1)
255 		goto out;
256 
257 	tmp = of_get_associativity(device);
258 	if (!tmp)
259 		goto out;
260 
261 	if (tmp[0] >= min_common_depth)
262 		nid = tmp[min_common_depth];
263 
264 	/* POWER4 LPAR uses 0xffff as invalid node */
265 	if (nid == 0xffff || nid >= MAX_NUMNODES)
266 		nid = -1;
267 
268 	if (nid > 0 && tmp[0] >= distance_ref_points_depth)
269 		initialize_distance_lookup_table(nid, tmp);
270 
271 out:
272 	return nid;
273 }
274 
275 /* Walk the device tree upwards, looking for an associativity id */
276 int of_node_to_nid(struct device_node *device)
277 {
278 	struct device_node *tmp;
279 	int nid = -1;
280 
281 	of_node_get(device);
282 	while (device) {
283 		nid = of_node_to_nid_single(device);
284 		if (nid != -1)
285 			break;
286 
287 		tmp = device;
288 		device = of_get_parent(tmp);
289 		of_node_put(tmp);
290 	}
291 	of_node_put(device);
292 
293 	return nid;
294 }
295 EXPORT_SYMBOL_GPL(of_node_to_nid);
296 
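/*
 * Work out which level of the ibm,associativity arrays identifies the
 * NUMA domain: with form 1 affinity it is the first entry of
 * ibm,associativity-reference-points, with form 0 the second.  Also
 * detects form 1 affinity from ibm,architecture-vec-5.  Returns the
 * depth, or -1 if no usable NUMA information is present.
 */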
297 static int __init find_min_common_depth(void)
298 {
299 	int depth;
300 	struct device_node *rtas_root;
301 	struct device_node *chosen;
302 	const char *vec5;
303 
304 	rtas_root = of_find_node_by_path("/rtas");
305 
306 	if (!rtas_root)
307 		return -1;
308 
309 	/*
310 	 * This property is a set of 32-bit integers, each representing
311 	 * an index into the ibm,associativity nodes.
312 	 *
313 	 * With form 0 affinity the first integer is for an SMP configuration
314 	 * (should be all 0's) and the second is for a normal NUMA
315 	 * configuration. We have only one level of NUMA.
316 	 *
317 	 * With form 1 affinity the first integer is the most significant
318 	 * NUMA boundary and the following are progressively less significant
319 	 * boundaries. There can be more than one level of NUMA.
320 	 */
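	/*
	 * For example (hypothetical values): with form 1 affinity a
	 * property of <4 2> would mean ibm,associativity entry 4 marks
	 * the node-level boundary and entry 2 a less significant one.
	 */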
321 	distance_ref_points = of_get_property(rtas_root,
322 					"ibm,associativity-reference-points",
323 					&distance_ref_points_depth);
324 
325 	if (!distance_ref_points) {
326 		dbg("NUMA: ibm,associativity-reference-points not found.\n");
327 		goto err;
328 	}
329 
330 	distance_ref_points_depth /= sizeof(int);
331 
332 #define VEC5_AFFINITY_BYTE	5
333 #define VEC5_AFFINITY		0x80
334 	chosen = of_find_node_by_path("/chosen");
335 	if (chosen) {
336 		vec5 = of_get_property(chosen, "ibm,architecture-vec-5", NULL);
337 		if (vec5 && (vec5[VEC5_AFFINITY_BYTE] & VEC5_AFFINITY)) {
338 			dbg("Using form 1 affinity\n");
339 			form1_affinity = 1;
340 		}
341 	}
342 
343 	if (form1_affinity) {
344 		depth = distance_ref_points[0];
345 	} else {
346 		if (distance_ref_points_depth < 2) {
347 			printk(KERN_WARNING "NUMA: "
348 				"short ibm,associativity-reference-points\n");
349 			goto err;
350 		}
351 
352 		depth = distance_ref_points[1];
353 	}
354 
355 	/*
356 	 * Warn and cap if the hardware supports more than
357 	 * MAX_DISTANCE_REF_POINTS domains.
358 	 */
359 	if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) {
360 		printk(KERN_WARNING "NUMA: distance array capped at "
361 			"%d entries\n", MAX_DISTANCE_REF_POINTS);
362 		distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
363 	}
364 
365 	of_node_put(rtas_root);
366 	return depth;
367 
368 err:
369 	of_node_put(rtas_root);
370 	return -1;
371 }
372 
373 static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
374 {
375 	struct device_node *memory = NULL;
376 
377 	memory = of_find_node_by_type(memory, "memory");
378 	if (!memory)
379 		panic("numa.c: No memory nodes found!");
380 
381 	*n_addr_cells = of_n_addr_cells(memory);
382 	*n_size_cells = of_n_size_cells(memory);
383 	of_node_put(memory);
384 }
385 
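/* Concatenate n 32-bit cells from *buf into one value and advance *buf. */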
386 static unsigned long __devinit read_n_cells(int n, const unsigned int **buf)
387 {
388 	unsigned long result = 0;
389 
390 	while (n--) {
391 		result = (result << 32) | **buf;
392 		(*buf)++;
393 	}
394 	return result;
395 }
396 
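/*
 * One list entry of the ibm,dynamic-memory property: the LMB base
 * address followed by its DRC index, a reserved word, an index into
 * ibm,associativity-lookup-arrays and a flags word.
 */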
397 struct of_drconf_cell {
398 	u64	base_addr;
399 	u32	drc_index;
400 	u32	reserved;
401 	u32	aa_index;
402 	u32	flags;
403 };
404 
405 #define DRCONF_MEM_ASSIGNED	0x00000008
406 #define DRCONF_MEM_AI_INVALID	0x00000040
407 #define DRCONF_MEM_RESERVED	0x00000080
408 
409 /*
410  * Read the next memblock list entry from the ibm,dynamic-memory property
411  * and return the information in the provided of_drconf_cell structure.
412  */
413 static void read_drconf_cell(struct of_drconf_cell *drmem, const u32 **cellp)
414 {
415 	const u32 *cp;
416 
417 	drmem->base_addr = read_n_cells(n_mem_addr_cells, cellp);
418 
419 	cp = *cellp;
420 	drmem->drc_index = cp[0];
421 	drmem->reserved = cp[1];
422 	drmem->aa_index = cp[2];
423 	drmem->flags = cp[3];
424 
425 	*cellp = cp + 4;
426 }
427 
428 /*
429  * Retrieve and validate the ibm,dynamic-memory property of the device tree.
430  *
431  * The layout of the ibm,dynamic-memory property is a count N followed by
432  * N memblock list entries.  Each memblock list entry contains information
433  * as laid out in the of_drconf_cell struct above.
434  */
435 static int of_get_drconf_memory(struct device_node *memory, const u32 **dm)
436 {
437 	const u32 *prop;
438 	u32 len, entries;
439 
440 	prop = of_get_property(memory, "ibm,dynamic-memory", &len);
441 	if (!prop || len < sizeof(unsigned int))
442 		return 0;
443 
444 	entries = *prop++;
445 
446 	/* Now that we know the number of entries, revalidate the size
447 	 * of the property read in to ensure we have everything
448 	 */
449 	if (len < (entries * (n_mem_addr_cells + 4) + 1) * sizeof(unsigned int))
450 		return 0;
451 
452 	*dm = prop;
453 	return entries;
454 }
455 
456 /*
457  * Retrieve and validate the ibm,lmb-size property for drconf memory
458  * from the device tree.
459  */
460 static u64 of_get_lmb_size(struct device_node *memory)
461 {
462 	const u32 *prop;
463 	u32 len;
464 
465 	prop = of_get_property(memory, "ibm,lmb-size", &len);
466 	if (!prop || len < sizeof(unsigned int))
467 		return 0;
468 
469 	return read_n_cells(n_mem_size_cells, &prop);
470 }
471 
472 struct assoc_arrays {
473 	u32	n_arrays;
474 	u32	array_sz;
475 	const u32 *arrays;
476 };
477 
478 /*
479  * Retrieve and validate the list of associativity arrays for drconf
480  * memory from the ibm,associativity-lookup-arrays property of the
481  * device tree.
482  *
483  * The layout of the ibm,associativity-lookup-arrays property is a number N
484  * indicating the number of associativity arrays, followed by a number M
485  * indicating the size of each associativity array, followed by a list
486  * of N associativity arrays.
487  */
488 static int of_get_assoc_arrays(struct device_node *memory,
489 			       struct assoc_arrays *aa)
490 {
491 	const u32 *prop;
492 	u32 len;
493 
494 	prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len);
495 	if (!prop || len < 2 * sizeof(unsigned int))
496 		return -1;
497 
498 	aa->n_arrays = *prop++;
499 	aa->array_sz = *prop++;
500 
501 	/* Now that we know the number of arrays and the size of each array,
502 	 * revalidate the size of the property read in.
503 	 */
504 	if (len < (aa->n_arrays * aa->array_sz + 2) * sizeof(unsigned int))
505 		return -1;
506 
507 	aa->arrays = prop;
508 	return 0;
509 }
510 
511 /*
512  * This is like of_node_to_nid_single() for memory represented in the
513  * ibm,dynamic-reconfiguration-memory node.
514  */
515 static int of_drconf_to_nid_single(struct of_drconf_cell *drmem,
516 				   struct assoc_arrays *aa)
517 {
518 	int default_nid = 0;
519 	int nid = default_nid;
520 	int index;
521 
522 	if (min_common_depth > 0 && min_common_depth <= aa->array_sz &&
523 	    !(drmem->flags & DRCONF_MEM_AI_INVALID) &&
524 	    drmem->aa_index < aa->n_arrays) {
525 		index = drmem->aa_index * aa->array_sz + min_common_depth - 1;
526 		nid = aa->arrays[index];
527 
528 		if (nid == 0xffff || nid >= MAX_NUMNODES)
529 			nid = default_nid;
530 	}
531 
532 	return nid;
533 }
534 
535 /*
536  * Figure out to which domain a cpu belongs and stick it there.
537  * Return the id of the domain used.
538  */
539 static int __cpuinit numa_setup_cpu(unsigned long lcpu)
540 {
541 	int nid = 0;
542 	struct device_node *cpu = of_get_cpu_node(lcpu, NULL);
543 
544 	if (!cpu) {
545 		WARN_ON(1);
546 		goto out;
547 	}
548 
549 	nid = of_node_to_nid_single(cpu);
550 
551 	if (nid < 0 || !node_online(nid))
552 		nid = first_online_node;
553 out:
554 	map_cpu_to_node(lcpu, nid);
555 
556 	of_node_put(cpu);
557 
558 	return nid;
559 }
560 
561 static int __cpuinit cpu_numa_callback(struct notifier_block *nfb,
562 			     unsigned long action,
563 			     void *hcpu)
564 {
565 	unsigned long lcpu = (unsigned long)hcpu;
566 	int ret = NOTIFY_DONE;
567 
568 	switch (action) {
569 	case CPU_UP_PREPARE:
570 	case CPU_UP_PREPARE_FROZEN:
571 		numa_setup_cpu(lcpu);
572 		ret = NOTIFY_OK;
573 		break;
574 #ifdef CONFIG_HOTPLUG_CPU
575 	case CPU_DEAD:
576 	case CPU_DEAD_FROZEN:
577 	case CPU_UP_CANCELED:
578 	case CPU_UP_CANCELED_FROZEN:
579 		unmap_cpu_from_node(lcpu);
580 		ret = NOTIFY_OK;
581 		break;
582 #endif
583 	}
584 	return ret;
585 }
586 
587 /*
588  * Check and possibly modify a memory region to enforce the memory limit.
589  *
590  * Returns the size the region should have to enforce the memory limit.
591  * This will either be the original value of size, a truncated value,
592  * or zero. If the returned value of size is 0 the region should be
593  * discarded as it lies wholly above the memory limit.
594  */
595 static unsigned long __init numa_enforce_memory_limit(unsigned long start,
596 						      unsigned long size)
597 {
598 	/*
599 	 * We use memblock_end_of_DRAM() in here instead of memory_limit because
600 	 * we've already adjusted it for the limit and it takes care of
601 	 * having memory holes below the limit.  Also, in the case of
602 	 * iommu_is_off, memory_limit is not set but is implicitly enforced.
603 	 */
604 
605 	if (start + size <= memblock_end_of_DRAM())
606 		return size;
607 
608 	if (start >= memblock_end_of_DRAM())
609 		return 0;
610 
611 	return memblock_end_of_DRAM() - start;
612 }
613 
614 /*
615  * Reads the counter for a given entry in
616  * linux,drconf-usable-memory property
617  */
618 static inline int __init read_usm_ranges(const u32 **usm)
619 {
620 	/*
621 	 * For each LMB in ibm,dynamic-memory, the corresponding
622 	 * entry in the linux,drconf-usable-memory property contains
623 	 * a counter followed by that many (base, size) pairs.
624 	 * Read the counter from linux,drconf-usable-memory.
625 	 */
626 	return read_n_cells(n_mem_size_cells, usm);
627 }
628 
629 /*
630  * Extract NUMA information from the ibm,dynamic-reconfiguration-memory
631  * node.  This assumes n_mem_{addr,size}_cells have been set.
632  */
633 static void __init parse_drconf_memory(struct device_node *memory)
634 {
635 	const u32 *dm, *usm;
636 	unsigned int n, rc, ranges, is_kexec_kdump = 0;
637 	unsigned long lmb_size, base, size, sz;
638 	int nid;
639 	struct assoc_arrays aa;
640 
641 	n = of_get_drconf_memory(memory, &dm);
642 	if (!n)
643 		return;
644 
645 	lmb_size = of_get_lmb_size(memory);
646 	if (!lmb_size)
647 		return;
648 
649 	rc = of_get_assoc_arrays(memory, &aa);
650 	if (rc)
651 		return;
652 
653 	/* check if this is a kexec/kdump kernel */
654 	usm = of_get_usable_memory(memory);
655 	if (usm != NULL)
656 		is_kexec_kdump = 1;
657 
658 	for (; n != 0; --n) {
659 		struct of_drconf_cell drmem;
660 
661 		read_drconf_cell(&drmem, &dm);
662 
663 		/* skip this block if the reserved bit is set in flags (0x80)
664 		   or if the block is not assigned to this partition (0x8) */
665 		if ((drmem.flags & DRCONF_MEM_RESERVED)
666 		    || !(drmem.flags & DRCONF_MEM_ASSIGNED))
667 			continue;
668 
669 		base = drmem.base_addr;
670 		size = lmb_size;
671 		ranges = 1;
672 
673 		if (is_kexec_kdump) {
674 			ranges = read_usm_ranges(&usm);
675 			if (!ranges) /* there are no (base, size) pairs */
676 				continue;
677 		}
678 		do {
679 			if (is_kexec_kdump) {
680 				base = read_n_cells(n_mem_addr_cells, &usm);
681 				size = read_n_cells(n_mem_size_cells, &usm);
682 			}
683 			nid = of_drconf_to_nid_single(&drmem, &aa);
684 			fake_numa_create_new_node(
685 				((base + size) >> PAGE_SHIFT),
686 					   &nid);
687 			node_set_online(nid);
688 			sz = numa_enforce_memory_limit(base, size);
689 			if (sz)
690 				add_active_range(nid, base >> PAGE_SHIFT,
691 						 (base >> PAGE_SHIFT)
692 						 + (sz >> PAGE_SHIFT));
693 		} while (--ranges);
694 	}
695 }
696 
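/*
 * Walk the cpu and memory nodes of the device tree, bring the NUMA
 * nodes they reference online and register their memory ranges.
 * Returns 0 on success, or a negative value if NUMA is disabled or
 * no usable NUMA information was found.
 */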
697 static int __init parse_numa_properties(void)
698 {
699 	struct device_node *cpu = NULL;
700 	struct device_node *memory = NULL;
701 	int default_nid = 0;
702 	unsigned long i;
703 
704 	if (numa_enabled == 0) {
705 		printk(KERN_WARNING "NUMA disabled by user\n");
706 		return -1;
707 	}
708 
709 	min_common_depth = find_min_common_depth();
710 
711 	if (min_common_depth < 0)
712 		return min_common_depth;
713 
714 	dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
715 
716 	/*
717 	 * Even though we connect cpus to numa domains later in SMP
718 	 * init, we need to know the node ids now. This is because
719 	 * each node to be onlined must have NODE_DATA etc backing it.
720 	 */
721 	for_each_present_cpu(i) {
722 		int nid;
723 
724 		cpu = of_get_cpu_node(i, NULL);
725 		BUG_ON(!cpu);
726 		nid = of_node_to_nid_single(cpu);
727 		of_node_put(cpu);
728 
729 		/*
730 		 * Don't fall back to default_nid yet -- we will plug
731 		 * cpus into nodes once the memory scan has discovered
732 		 * the topology.
733 		 */
734 		if (nid < 0)
735 			continue;
736 		node_set_online(nid);
737 	}
738 
739 	get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells);
740 	memory = NULL;
741 	while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
742 		unsigned long start;
743 		unsigned long size;
744 		int nid;
745 		int ranges;
746 		const unsigned int *memcell_buf;
747 		unsigned int len;
748 
749 		memcell_buf = of_get_property(memory,
750 			"linux,usable-memory", &len);
751 		if (!memcell_buf || len <= 0)
752 			memcell_buf = of_get_property(memory, "reg", &len);
753 		if (!memcell_buf || len <= 0)
754 			continue;
755 
756 		/* ranges in cell */
757 		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
758 new_range:
759 		/* these are order-sensitive, and modify the buffer pointer */
760 		start = read_n_cells(n_mem_addr_cells, &memcell_buf);
761 		size = read_n_cells(n_mem_size_cells, &memcell_buf);
762 
763 		/*
764 		 * Assumption: either all memory nodes or none will
765 		 * have associativity properties.  If none, then
766 		 * everything goes to default_nid.
767 		 */
768 		nid = of_node_to_nid_single(memory);
769 		if (nid < 0)
770 			nid = default_nid;
771 
772 		fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
773 		node_set_online(nid);
774 
775 		if (!(size = numa_enforce_memory_limit(start, size))) {
776 			if (--ranges)
777 				goto new_range;
778 			else
779 				continue;
780 		}
781 
782 		add_active_range(nid, start >> PAGE_SHIFT,
783 				(start >> PAGE_SHIFT) + (size >> PAGE_SHIFT));
784 
785 		if (--ranges)
786 			goto new_range;
787 	}
788 
789 	/*
790 	 * Now do the same thing for each MEMBLOCK listed in the ibm,dynamic-memory
791 	 * property in the ibm,dynamic-reconfiguration-memory node.
792 	 */
793 	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
794 	if (memory)
795 		parse_drconf_memory(memory);
796 
797 	return 0;
798 }
799 
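/*
 * Fallback when no NUMA information is available: place all memory
 * in node 0 (or in the fake nodes requested via numa=fake=).
 */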
800 static void __init setup_nonnuma(void)
801 {
802 	unsigned long top_of_ram = memblock_end_of_DRAM();
803 	unsigned long total_ram = memblock_phys_mem_size();
804 	unsigned long start_pfn, end_pfn;
805 	unsigned int nid = 0;
806 	struct memblock_region *reg;
807 
808 	printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
809 	       top_of_ram, total_ram);
810 	printk(KERN_DEBUG "Memory hole size: %ldMB\n",
811 	       (top_of_ram - total_ram) >> 20);
812 
813 	for_each_memblock(memory, reg) {
814 		start_pfn = memblock_region_memory_base_pfn(reg);
815 		end_pfn = memblock_region_memory_end_pfn(reg);
816 
817 		fake_numa_create_new_node(end_pfn, &nid);
818 		add_active_range(nid, start_pfn, end_pfn);
819 		node_set_online(nid);
820 	}
821 }
822 
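/* Print the cpus of each online node, collapsing consecutive ids into ranges. */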
823 void __init dump_numa_cpu_topology(void)
824 {
825 	unsigned int node;
826 	unsigned int cpu, count;
827 
828 	if (min_common_depth == -1 || !numa_enabled)
829 		return;
830 
831 	for_each_online_node(node) {
832 		printk(KERN_DEBUG "Node %d CPUs:", node);
833 
834 		count = 0;
835 		/*
836 		 * If we used a CPU iterator here we would miss printing
837 		 * the holes in the cpumap.
838 		 */
839 		for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
840 			if (cpumask_test_cpu(cpu,
841 					node_to_cpumask_map[node])) {
842 				if (count == 0)
843 					printk(" %u", cpu);
844 				++count;
845 			} else {
846 				if (count > 1)
847 					printk("-%u", cpu - 1);
848 				count = 0;
849 			}
850 		}
851 
852 		if (count > 1)
853 			printk("-%u", nr_cpu_ids - 1);
854 		printk("\n");
855 	}
856 }
857 
858 static void __init dump_numa_memory_topology(void)
859 {
860 	unsigned int node;
861 	unsigned int count;
862 
863 	if (min_common_depth == -1 || !numa_enabled)
864 		return;
865 
866 	for_each_online_node(node) {
867 		unsigned long i;
868 
869 		printk(KERN_DEBUG "Node %d Memory:", node);
870 
871 		count = 0;
872 
873 		for (i = 0; i < memblock_end_of_DRAM();
874 		     i += (1 << SECTION_SIZE_BITS)) {
875 			if (early_pfn_to_nid(i >> PAGE_SHIFT) == node) {
876 				if (count == 0)
877 					printk(" 0x%lx", i);
878 				++count;
879 			} else {
880 				if (count > 0)
881 					printk("-0x%lx", i);
882 				count = 0;
883 			}
884 		}
885 
886 		if (count > 0)
887 			printk("-0x%lx", i);
888 		printk("\n");
889 	}
890 }
891 
892 /*
893  * Allocate and zero some memory, using the memblock or bootmem allocator
894  * as required. nid is the preferred node and end_pfn is the highest page
895  * frame number of the node.
896  *
897  * Returns the virtual address of the memory.
898  */
899 static void __init *careful_zallocation(int nid, unsigned long size,
900 				       unsigned long align,
901 				       unsigned long end_pfn)
902 {
903 	void *ret;
904 	int new_nid;
905 	unsigned long ret_paddr;
906 
907 	ret_paddr = __memblock_alloc_base(size, align, end_pfn << PAGE_SHIFT);
908 
909 	/* retry over all memory */
910 	if (!ret_paddr)
911 		ret_paddr = __memblock_alloc_base(size, align, memblock_end_of_DRAM());
912 
913 	if (!ret_paddr)
914 		panic("numa.c: cannot allocate %lu bytes for node %d",
915 		      size, nid);
916 
917 	ret = __va(ret_paddr);
918 
919 	/*
920 	 * We initialize the nodes in numeric order: 0, 1, 2...
921 	 * and hand over control from the MEMBLOCK allocator to the
922 	 * bootmem allocator.  If this function is called for
923 	 * node 5, then we know that all nodes <5 are using the
924 	 * bootmem allocator instead of the MEMBLOCK allocator.
925 	 *
926 	 * So, check the nid from which this allocation came
927 	 * and double check to see if we need to use bootmem
928 	 * instead of the MEMBLOCK.  We don't free the MEMBLOCK memory
929 	 * since it would be useless.
930 	 */
931 	new_nid = early_pfn_to_nid(ret_paddr >> PAGE_SHIFT);
932 	if (new_nid < nid) {
933 		ret = __alloc_bootmem_node(NODE_DATA(new_nid),
934 				size, align, 0);
935 
936 		dbg("alloc_bootmem %p %lx\n", ret, size);
937 	}
938 
939 	memset(ret, 0, size);
940 	return ret;
941 }
942 
943 static struct notifier_block __cpuinitdata ppc64_numa_nb = {
944 	.notifier_call = cpu_numa_callback,
945 	.priority = 1 /* Must run before sched domains notifier. */
946 };
947 
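/*
 * Reserve, in this node's bootmem allocator, every part of
 * memblock.reserved that falls within one of the node's active
 * regions, splitting reserved areas that span several regions.
 */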
948 static void mark_reserved_regions_for_nid(int nid)
949 {
950 	struct pglist_data *node = NODE_DATA(nid);
951 	struct memblock_region *reg;
952 
953 	for_each_memblock(reserved, reg) {
954 		unsigned long physbase = reg->base;
955 		unsigned long size = reg->size;
956 		unsigned long start_pfn = physbase >> PAGE_SHIFT;
957 		unsigned long end_pfn = PFN_UP(physbase + size);
958 		struct node_active_region node_ar;
959 		unsigned long node_end_pfn = node->node_start_pfn +
960 					     node->node_spanned_pages;
961 
962 		/*
963 		 * Check to make sure that this memblock.reserved area is
964 		 * within the bounds of the node that we care about.
965 		 * Checking the nid of the start and end points is not
966 		 * sufficient because the reserved area could span the
967 		 * entire node.
968 		 */
969 		if (end_pfn <= node->node_start_pfn ||
970 		    start_pfn >= node_end_pfn)
971 			continue;
972 
973 		get_node_active_region(start_pfn, &node_ar);
974 		while (start_pfn < end_pfn &&
975 			node_ar.start_pfn < node_ar.end_pfn) {
976 			unsigned long reserve_size = size;
977 			/*
978 			 * if reserved region extends past active region
979 			 * then trim size to active region
980 			 */
981 			if (end_pfn > node_ar.end_pfn)
982 				reserve_size = (node_ar.end_pfn << PAGE_SHIFT)
983 					- physbase;
984 			/*
985 			 * Only worry about *this* node, others may not
986 			 * yet have valid NODE_DATA().
987 			 */
988 			if (node_ar.nid == nid) {
989 				dbg("reserve_bootmem %lx %lx nid=%d\n",
990 					physbase, reserve_size, node_ar.nid);
991 				reserve_bootmem_node(NODE_DATA(node_ar.nid),
992 						physbase, reserve_size,
993 						BOOTMEM_DEFAULT);
994 			}
995 			/*
996 			 * if reserved region is contained in the active region
997 			 * then done.
998 			 */
999 			if (end_pfn <= node_ar.end_pfn)
1000 				break;
1001 
1002 			/*
1003 			 * reserved region extends past the active region
1004 			 *   get next active region that contains this
1005 			 *   reserved region
1006 			 */
1007 			start_pfn = node_ar.end_pfn;
1008 			physbase = start_pfn << PAGE_SHIFT;
1009 			size = size - reserve_size;
1010 			get_node_active_region(start_pfn, &node_ar);
1011 		}
1012 	}
1013 }
1014 
1015 
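/*
 * Parse the NUMA topology (falling back to a single node), then set up
 * NODE_DATA and a node-local bootmem allocator for each online node,
 * mark the reserved regions in each, and finally register the cpu
 * notifier that maps cpus to nodes as they come up.
 */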
1016 void __init do_init_bootmem(void)
1017 {
1018 	int nid;
1019 
1020 	min_low_pfn = 0;
1021 	max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
1022 	max_pfn = max_low_pfn;
1023 
1024 	if (parse_numa_properties())
1025 		setup_nonnuma();
1026 	else
1027 		dump_numa_memory_topology();
1028 
1029 	for_each_online_node(nid) {
1030 		unsigned long start_pfn, end_pfn;
1031 		void *bootmem_vaddr;
1032 		unsigned long bootmap_pages;
1033 
1034 		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
1035 
1036 		/*
1037 		 * Allocate the node structure node local if possible
1038 		 *
1039 		 * Be careful moving this around, as it relies on all
1040 		 * previous nodes' bootmem to be initialized and have
1041 		 * all reserved areas marked.
1042 		 */
1043 		NODE_DATA(nid) = careful_zallocation(nid,
1044 					sizeof(struct pglist_data),
1045 					SMP_CACHE_BYTES, end_pfn);
1046 
1047   		dbg("node %d\n", nid);
1048 		dbg("NODE_DATA() = %p\n", NODE_DATA(nid));
1049 
1050 		NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
1051 		NODE_DATA(nid)->node_start_pfn = start_pfn;
1052 		NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;
1053 
1054 		if (NODE_DATA(nid)->node_spanned_pages == 0)
1055   			continue;
1056 
1057   		dbg("start_paddr = %lx\n", start_pfn << PAGE_SHIFT);
1058   		dbg("end_paddr = %lx\n", end_pfn << PAGE_SHIFT);
1059 
1060 		bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
1061 		bootmem_vaddr = careful_zallocation(nid,
1062 					bootmap_pages << PAGE_SHIFT,
1063 					PAGE_SIZE, end_pfn);
1064 
1065 		dbg("bootmap_vaddr = %p\n", bootmem_vaddr);
1066 
1067 		init_bootmem_node(NODE_DATA(nid),
1068 				  __pa(bootmem_vaddr) >> PAGE_SHIFT,
1069 				  start_pfn, end_pfn);
1070 
1071 		free_bootmem_with_active_regions(nid, end_pfn);
1072 		/*
1073 		 * Be very careful about moving this around.  Future
1074 		 * calls to careful_zallocation() depend on this getting
1075 		 * done correctly.
1076 		 */
1077 		mark_reserved_regions_for_nid(nid);
1078 		sparse_memory_present_with_active_regions(nid);
1079 	}
1080 
1081 	init_bootmem_done = 1;
1082 
1083 	/*
1084 	 * Now bootmem is initialised we can create the node to cpumask
1085 	 * lookup tables and setup the cpu callback to populate them.
1086 	 */
1087 	setup_node_to_cpumask_map();
1088 
1089 	register_cpu_notifier(&ppc64_numa_nb);
1090 	cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE,
1091 			  (void *)(unsigned long)boot_cpuid);
1092 }
1093 
1094 void __init paging_init(void)
1095 {
1096 	unsigned long max_zone_pfns[MAX_NR_ZONES];
1097 	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
1098 	max_zone_pfns[ZONE_DMA] = memblock_end_of_DRAM() >> PAGE_SHIFT;
1099 	free_area_init_nodes(max_zone_pfns);
1100 }
1101 
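/*
 * Parse the "numa=" early parameter: "off" disables NUMA, "debug"
 * enables the dbg() messages and "fake=<boundaries>" feeds
 * fake_numa_create_new_node().
 */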
1102 static int __init early_numa(char *p)
1103 {
1104 	if (!p)
1105 		return 0;
1106 
1107 	if (strstr(p, "off"))
1108 		numa_enabled = 0;
1109 
1110 	if (strstr(p, "debug"))
1111 		numa_debug = 1;
1112 
1113 	p = strstr(p, "fake=");
1114 	if (p)
1115 		cmdline = p + strlen("fake=");
1116 
1117 	return 0;
1118 }
1119 early_param("numa", early_numa);
1120 
1121 #ifdef CONFIG_MEMORY_HOTPLUG
1122 /*
1123  * Find the node associated with a hot added memory section for
1124  * memory represented in the device tree by the property
1125  * ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory.
1126  */
1127 static int hot_add_drconf_scn_to_nid(struct device_node *memory,
1128 				     unsigned long scn_addr)
1129 {
1130 	const u32 *dm;
1131 	unsigned int drconf_cell_cnt, rc;
1132 	unsigned long lmb_size;
1133 	struct assoc_arrays aa;
1134 	int nid = -1;
1135 
1136 	drconf_cell_cnt = of_get_drconf_memory(memory, &dm);
1137 	if (!drconf_cell_cnt)
1138 		return -1;
1139 
1140 	lmb_size = of_get_lmb_size(memory);
1141 	if (!lmb_size)
1142 		return -1;
1143 
1144 	rc = of_get_assoc_arrays(memory, &aa);
1145 	if (rc)
1146 		return -1;
1147 
1148 	for (; drconf_cell_cnt != 0; --drconf_cell_cnt) {
1149 		struct of_drconf_cell drmem;
1150 
1151 		read_drconf_cell(&drmem, &dm);
1152 
1153 		/* skip this block if it is reserved or not assigned to
1154 		 * this partition */
1155 		if ((drmem.flags & DRCONF_MEM_RESERVED)
1156 		    || !(drmem.flags & DRCONF_MEM_ASSIGNED))
1157 			continue;
1158 
1159 		if ((scn_addr < drmem.base_addr)
1160 		    || (scn_addr >= (drmem.base_addr + lmb_size)))
1161 			continue;
1162 
1163 		nid = of_drconf_to_nid_single(&drmem, &aa);
1164 		break;
1165 	}
1166 
1167 	return nid;
1168 }
1169 
1170 /*
1171  * Find the node associated with a hot added memory section for memory
1172  * represented in the device tree as a node (i.e. memory@XXXX) for
1173  * each memblock.
1174  */
1175 int hot_add_node_scn_to_nid(unsigned long scn_addr)
1176 {
1177 	struct device_node *memory = NULL;
1178 	int nid = -1;
1179 
1180 	while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
1181 		unsigned long start, size;
1182 		int ranges;
1183 		const unsigned int *memcell_buf;
1184 		unsigned int len;
1185 
1186 		memcell_buf = of_get_property(memory, "reg", &len);
1187 		if (!memcell_buf || len <= 0)
1188 			continue;
1189 
1190 		/* ranges in cell */
1191 		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
1192 
1193 		while (ranges--) {
1194 			start = read_n_cells(n_mem_addr_cells, &memcell_buf);
1195 			size = read_n_cells(n_mem_size_cells, &memcell_buf);
1196 
1197 			if ((scn_addr < start) || (scn_addr >= (start + size)))
1198 				continue;
1199 
1200 			nid = of_node_to_nid_single(memory);
1201 			break;
1202 		}
1203 
1204 		of_node_put(memory);
1205 		if (nid >= 0)
1206 			break;
1207 	}
1208 
1209 	return nid;
1210 }
1211 
1212 /*
1213  * Find the node associated with a hot added memory section.  Section
1214  * corresponds to a SPARSEMEM section, not a MEMBLOCK.  It is assumed that
1215  * sections are fully contained within a single MEMBLOCK.
1216  */
1217 int hot_add_scn_to_nid(unsigned long scn_addr)
1218 {
1219 	struct device_node *memory = NULL;
1220 	int nid, found = 0;
1221 
1222 	if (!numa_enabled || (min_common_depth < 0))
1223 		return first_online_node;
1224 
1225 	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
1226 	if (memory) {
1227 		nid = hot_add_drconf_scn_to_nid(memory, scn_addr);
1228 		of_node_put(memory);
1229 	} else {
1230 		nid = hot_add_node_scn_to_nid(scn_addr);
1231 	}
1232 
1233 	if (nid < 0 || !node_online(nid))
1234 		nid = first_online_node;
1235 
1236 	if (NODE_DATA(nid)->node_spanned_pages)
1237 		return nid;
1238 
1239 	for_each_online_node(nid) {
1240 		if (NODE_DATA(nid)->node_spanned_pages) {
1241 			found = 1;
1242 			break;
1243 		}
1244 	}
1245 
1246 	BUG_ON(!found);
1247 	return nid;
1248 }
1249 
1250 #endif /* CONFIG_MEMORY_HOTPLUG */
1251