xref: /openbmc/linux/arch/x86/kernel/e820.c (revision 1d1997db)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Low level x86 E820 memory map handling functions.
4  *
5  * The firmware and bootloader passes us the "E820 table", which is the primary
6  * physical memory layout description available about x86 systems.
7  *
8  * The kernel takes the E820 memory layout and optionally modifies it with
9  * quirks and other tweaks, and feeds that into the generic Linux memory
10  * allocation code routines via a platform independent interface (memblock, etc.).
11  */
12 #include <linux/crash_dump.h>
13 #include <linux/memblock.h>
14 #include <linux/suspend.h>
15 #include <linux/acpi.h>
16 #include <linux/firmware-map.h>
17 #include <linux/sort.h>
18 #include <linux/memory_hotplug.h>
19 
20 #include <asm/e820/api.h>
21 #include <asm/setup.h>
22 
23 /*
24  * We organize the E820 table into three main data structures:
25  *
26  * - 'e820_table_firmware': the original firmware version passed to us by the
27  *   bootloader - not modified by the kernel. It is composed of two parts:
28  *   the first 128 E820 memory entries in boot_params.e820_table and the remaining
29  *   (if any) entries of the SETUP_E820_EXT nodes. We use this to:
30  *
31  *       - inform the user about the firmware's notion of memory layout
32  *         via /sys/firmware/memmap
33  *
34  *       - the hibernation code uses it to generate a kernel-independent MD5
35  *         fingerprint of the physical memory layout of a system.
36  *
37  * - 'e820_table_kexec': a slightly modified (by the kernel) firmware version
38  *   passed to us by the bootloader - the major difference between
39  *   e820_table_firmware[] and this one is that, the latter marks the setup_data
40  *   list created by the EFI boot stub as reserved, so that kexec can reuse the
41  *   setup_data information in the second kernel. Besides, e820_table_kexec[]
42  *   might also be modified by the kexec itself to fake a mptable.
43  *   We use this to:
44  *
45  *       - kexec, which is a bootloader in disguise, uses the original E820
46  *         layout to pass to the kexec-ed kernel. This way the original kernel
47  *         can have a restricted E820 map while the kexec()-ed kexec-kernel
48  *         can have access to full memory - etc.
49  *
50  * - 'e820_table': this is the main E820 table that is massaged by the
51  *   low level x86 platform code, or modified by boot parameters, before
52  *   passed on to higher level MM layers.
53  *
54  * Once the E820 map has been converted to the standard Linux memory layout
55  * information its role stops - modifying it has no effect and does not get
 * re-propagated. So its main role is a temporary bootstrap storage of firmware
57  * specific memory layout data during early bootup.
58  */
/*
 * Boot-time backing storage for the three tables. These objects live in
 * __initdata; e820__reallocate_tables() later repoints the pointers below
 * at dynamically allocated copies.
 */
static struct e820_table e820_table_init		__initdata;
static struct e820_table e820_table_kexec_init		__initdata;
static struct e820_table e820_table_firmware_init	__initdata;

/* The handles used throughout this file; initially aimed at the storage above: */
struct e820_table *e820_table __refdata			= &e820_table_init;
struct e820_table *e820_table_kexec __refdata		= &e820_table_kexec_init;
struct e820_table *e820_table_firmware __refdata	= &e820_table_firmware_init;
66 
67 /* For PCI or other memory-mapped resources */
/* Placeholder value until e820__setup_pci_gap() stores the start of the gap: */
unsigned long pci_mem_start = 0xaeedbabe;
#ifdef CONFIG_PCI
EXPORT_SYMBOL(pci_mem_start);
#endif
72 
73 /*
74  * This function checks if any part of the range <start,end> is mapped
75  * with type.
76  */
77 static bool _e820__mapped_any(struct e820_table *table,
78 			      u64 start, u64 end, enum e820_type type)
79 {
80 	int i;
81 
82 	for (i = 0; i < table->nr_entries; i++) {
83 		struct e820_entry *entry = &table->entries[i];
84 
85 		if (type && entry->type != type)
86 			continue;
87 		if (entry->addr >= end || entry->addr + entry->size <= start)
88 			continue;
89 		return true;
90 	}
91 	return false;
92 }
93 
94 bool e820__mapped_raw_any(u64 start, u64 end, enum e820_type type)
95 {
96 	return _e820__mapped_any(e820_table_firmware, start, end, type);
97 }
98 EXPORT_SYMBOL_GPL(e820__mapped_raw_any);
99 
100 bool e820__mapped_any(u64 start, u64 end, enum e820_type type)
101 {
102 	return _e820__mapped_any(e820_table, start, end, type);
103 }
104 EXPORT_SYMBOL_GPL(e820__mapped_any);
105 
106 /*
107  * This function checks if the entire <start,end> range is mapped with 'type'.
108  *
109  * Note: this function only works correctly once the E820 table is sorted and
110  * not-overlapping (at least for the range specified), which is the case normally.
111  */
112 static struct e820_entry *__e820__mapped_all(u64 start, u64 end,
113 					     enum e820_type type)
114 {
115 	int i;
116 
117 	for (i = 0; i < e820_table->nr_entries; i++) {
118 		struct e820_entry *entry = &e820_table->entries[i];
119 
120 		if (type && entry->type != type)
121 			continue;
122 
123 		/* Is the region (part) in overlap with the current region? */
124 		if (entry->addr >= end || entry->addr + entry->size <= start)
125 			continue;
126 
127 		/*
128 		 * If the region is at the beginning of <start,end> we move
129 		 * 'start' to the end of the region since it's ok until there
130 		 */
131 		if (entry->addr <= start)
132 			start = entry->addr + entry->size;
133 
134 		/*
135 		 * If 'start' is now at or beyond 'end', we're done, full
136 		 * coverage of the desired range exists:
137 		 */
138 		if (start >= end)
139 			return entry;
140 	}
141 
142 	return NULL;
143 }
144 
145 /*
146  * This function checks if the entire range <start,end> is mapped with type.
147  */
148 bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type)
149 {
150 	return __e820__mapped_all(start, end, type);
151 }
152 
153 /*
154  * This function returns the type associated with the range <start,end>.
155  */
156 int e820__get_entry_type(u64 start, u64 end)
157 {
158 	struct e820_entry *entry = __e820__mapped_all(start, end, 0);
159 
160 	return entry ? entry->type : -EINVAL;
161 }
162 
163 /*
164  * Add a memory region to the kernel E820 map.
165  */
166 static void __init __e820__range_add(struct e820_table *table, u64 start, u64 size, enum e820_type type)
167 {
168 	int x = table->nr_entries;
169 
170 	if (x >= ARRAY_SIZE(table->entries)) {
171 		pr_err("too many entries; ignoring [mem %#010llx-%#010llx]\n",
172 		       start, start + size - 1);
173 		return;
174 	}
175 
176 	table->entries[x].addr = start;
177 	table->entries[x].size = size;
178 	table->entries[x].type = type;
179 	table->nr_entries++;
180 }
181 
182 void __init e820__range_add(u64 start, u64 size, enum e820_type type)
183 {
184 	__e820__range_add(e820_table, start, size, type);
185 }
186 
187 static void __init e820_print_type(enum e820_type type)
188 {
189 	switch (type) {
190 	case E820_TYPE_RAM:		/* Fall through: */
191 	case E820_TYPE_RESERVED_KERN:	pr_cont("usable");			break;
192 	case E820_TYPE_RESERVED:	pr_cont("reserved");			break;
193 	case E820_TYPE_SOFT_RESERVED:	pr_cont("soft reserved");		break;
194 	case E820_TYPE_ACPI:		pr_cont("ACPI data");			break;
195 	case E820_TYPE_NVS:		pr_cont("ACPI NVS");			break;
196 	case E820_TYPE_UNUSABLE:	pr_cont("unusable");			break;
197 	case E820_TYPE_PMEM:		/* Fall through: */
198 	case E820_TYPE_PRAM:		pr_cont("persistent (type %u)", type);	break;
199 	default:			pr_cont("type %u", type);		break;
200 	}
201 }
202 
203 void __init e820__print_table(char *who)
204 {
205 	int i;
206 
207 	for (i = 0; i < e820_table->nr_entries; i++) {
208 		pr_info("%s: [mem %#018Lx-%#018Lx] ",
209 			who,
210 			e820_table->entries[i].addr,
211 			e820_table->entries[i].addr + e820_table->entries[i].size - 1);
212 
213 		e820_print_type(e820_table->entries[i].type);
214 		pr_cont("\n");
215 	}
216 }
217 
218 /*
219  * Sanitize an E820 map.
220  *
221  * Some E820 layouts include overlapping entries. The following
222  * replaces the original E820 map with a new one, removing overlaps,
223  * and resolving conflicting memory types in favor of highest
224  * numbered type.
225  *
226  * The input parameter 'entries' points to an array of 'struct
227  * e820_entry' which on entry has elements in the range [0, *nr_entries)
228  * valid, and which has space for up to max_nr_entries entries.
 * On return, the resulting sanitized E820 map entries will have been
 * written back to the same location, starting at 'entries'.
231  *
232  * The integer pointed to by nr_entries must be valid on entry (the
233  * current number of valid entries located at 'entries'). If the
234  * sanitizing succeeds the *nr_entries will be updated with the new
235  * number of valid entries (something no more than max_nr_entries).
236  *
237  * The return value from e820__update_table() is zero if it
238  * successfully 'sanitized' the map entries passed in, and is -1
239  * if it did nothing, which can happen if either of (1) it was
240  * only passed one map entry, or (2) any of the input map entries
241  * were invalid (start + size < start, meaning that the size was
242  * so big the described memory range wrapped around through zero.)
243  *
244  *	Visually we're performing the following
245  *	(1,2,3,4 = memory types)...
246  *
247  *	Sample memory map (w/overlaps):
248  *	   ____22__________________
249  *	   ______________________4_
250  *	   ____1111________________
251  *	   _44_____________________
252  *	   11111111________________
253  *	   ____________________33__
254  *	   ___________44___________
255  *	   __________33333_________
256  *	   ______________22________
257  *	   ___________________2222_
258  *	   _________111111111______
259  *	   _____________________11_
260  *	   _________________4______
261  *
262  *	Sanitized equivalent (no overlap):
263  *	   1_______________________
264  *	   _44_____________________
265  *	   ___1____________________
266  *	   ____22__________________
267  *	   ______11________________
268  *	   _________1______________
269  *	   __________3_____________
270  *	   ___________44___________
271  *	   _____________33_________
272  *	   _______________2________
273  *	   ________________1_______
274  *	   _________________4______
275  *	   ___________________2____
276  *	   ____________________33__
277  *	   ______________________4_
278  */
/* One boundary (start or end) of an input entry, used by e820__update_table(): */
struct change_member {
	/* Pointer to the original entry: */
	struct e820_entry	*entry;
	/* Address for this change point: */
	unsigned long long	addr;
};

/*
 * Scratch space for e820__update_table(). Every non-empty input entry
 * contributes two change-points (its start and its end), hence the
 * 2*E820_MAX_ENTRIES sizing of the first two arrays.
 */
static struct change_member	change_point_list[2*E820_MAX_ENTRIES]	__initdata;
/* Pointers into change_point_list[]; this is the array that gets sorted: */
static struct change_member	*change_point[2*E820_MAX_ENTRIES]	__initdata;
/* Entries that are "open" at the change-point currently being processed: */
static struct e820_entry	*overlap_list[E820_MAX_ENTRIES]		__initdata;
/* The sanitized map is assembled here before being copied back: */
static struct e820_entry	new_entries[E820_MAX_ENTRIES]		__initdata;
290 
291 static int __init cpcompare(const void *a, const void *b)
292 {
293 	struct change_member * const *app = a, * const *bpp = b;
294 	const struct change_member *ap = *app, *bp = *bpp;
295 
296 	/*
297 	 * Inputs are pointers to two elements of change_point[].  If their
298 	 * addresses are not equal, their difference dominates.  If the addresses
299 	 * are equal, then consider one that represents the end of its region
300 	 * to be greater than one that does not.
301 	 */
302 	if (ap->addr != bp->addr)
303 		return ap->addr > bp->addr ? 1 : -1;
304 
305 	return (ap->addr != ap->entry->addr) - (bp->addr != bp->entry->addr);
306 }
307 
/*
 * Sanitize @table in place.
 *
 * The start and end address of every non-empty entry is recorded as a
 * "change-point"; the change-points are sorted by address and then swept
 * from low to high while tracking which input entries cover the current
 * position. Where several entries overlap, the numerically largest type
 * wins. A new output entry is started whenever the effective type changes
 * (E820_TYPE_PRAM additionally forces a boundary at every change-point).
 *
 * Uses the file-scope scratch arrays change_point_list[], change_point[],
 * overlap_list[] and new_entries[].
 *
 * Returns 0 when the table was rewritten, or -1 (table left untouched) when
 * it holds fewer than two entries or some entry's addr + size wraps around.
 */
int __init e820__update_table(struct e820_table *table)
{
	struct e820_entry *entries = table->entries;
	u32 max_nr_entries = ARRAY_SIZE(table->entries);
	enum e820_type current_type, last_type;
	unsigned long long last_addr;
	u32 new_nr_entries, overlap_entries;
	u32 i, chg_idx, chg_nr;

	/* If there's only one memory region, don't bother: */
	if (table->nr_entries < 2)
		return -1;

	BUG_ON(table->nr_entries > max_nr_entries);

	/* Bail out if we find any unreasonable addresses in the map: */
	for (i = 0; i < table->nr_entries; i++) {
		if (entries[i].addr + entries[i].size < entries[i].addr)
			return -1;
	}

	/* Create pointers for initial change-point information (for sorting): */
	for (i = 0; i < 2 * table->nr_entries; i++)
		change_point[i] = &change_point_list[i];

	/*
	 * Record all known change-points (starting and ending addresses),
	 * omitting empty memory regions:
	 */
	chg_idx = 0;
	for (i = 0; i < table->nr_entries; i++)	{
		if (entries[i].size != 0) {
			/* First the start point of the entry ... */
			change_point[chg_idx]->addr	= entries[i].addr;
			change_point[chg_idx++]->entry	= &entries[i];
			/* ... then its (exclusive) end point: */
			change_point[chg_idx]->addr	= entries[i].addr + entries[i].size;
			change_point[chg_idx++]->entry	= &entries[i];
		}
	}
	chg_nr = chg_idx;

	/* Sort change-point list by memory addresses (low -> high): */
	sort(change_point, chg_nr, sizeof(*change_point), cpcompare, NULL);

	/* Create a new memory map, removing overlaps: */
	overlap_entries = 0;	 /* Number of entries in the overlap table */
	new_nr_entries = 0;	 /* Index for creating new map entries */
	last_type = 0;		 /* Start with undefined memory type */
	last_addr = 0;		 /* Start with 0 as last starting address */

	/* Loop through change-points, determining effect on the new map: */
	for (chg_idx = 0; chg_idx < chg_nr; chg_idx++) {
		/* Keep track of all overlapping entries */
		if (change_point[chg_idx]->addr == change_point[chg_idx]->entry->addr) {
			/* Add map entry to overlap list (> 1 entry implies an overlap) */
			overlap_list[overlap_entries++] = change_point[chg_idx]->entry;
		} else {
			/* Remove entry from list (order independent, so swap with last): */
			for (i = 0; i < overlap_entries; i++) {
				if (overlap_list[i] == change_point[chg_idx]->entry)
					overlap_list[i] = overlap_list[overlap_entries-1];
			}
			overlap_entries--;
		}
		/*
		 * If there are overlapping entries, decide which
		 * "type" to use (larger value takes precedence --
		 * 1=usable, 2,3,4,4+=unusable)
		 */
		current_type = 0;
		for (i = 0; i < overlap_entries; i++) {
			if (overlap_list[i]->type > current_type)
				current_type = overlap_list[i]->type;
		}

		/* Continue building up new map based on this information: */
		if (current_type != last_type || current_type == E820_TYPE_PRAM) {
			/* Close the output entry that was open up to this point: */
			if (last_type != 0)	 {
				new_entries[new_nr_entries].size = change_point[chg_idx]->addr - last_addr;
				/* Move forward only if the new size was non-zero: */
				if (new_entries[new_nr_entries].size != 0)
					/* No more space left for new entries? */
					if (++new_nr_entries >= max_nr_entries)
						break;
			}
			/* Open a new output entry; its size is filled in when it is closed: */
			if (current_type != 0)	{
				new_entries[new_nr_entries].addr = change_point[chg_idx]->addr;
				new_entries[new_nr_entries].type = current_type;
				last_addr = change_point[chg_idx]->addr;
			}
			last_type = current_type;
		}
	}

	/* Copy the new entries into the original location: */
	memcpy(entries, new_entries, new_nr_entries*sizeof(*entries));
	table->nr_entries = new_nr_entries;

	return 0;
}
407 
408 static int __init __append_e820_table(struct boot_e820_entry *entries, u32 nr_entries)
409 {
410 	struct boot_e820_entry *entry = entries;
411 
412 	while (nr_entries) {
413 		u64 start = entry->addr;
414 		u64 size = entry->size;
415 		u64 end = start + size - 1;
416 		u32 type = entry->type;
417 
418 		/* Ignore the entry on 64-bit overflow: */
419 		if (start > end && likely(size))
420 			return -1;
421 
422 		e820__range_add(start, size, type);
423 
424 		entry++;
425 		nr_entries--;
426 	}
427 	return 0;
428 }
429 
430 /*
431  * Copy the BIOS E820 map into a safe place.
432  *
433  * Sanity-check it while we're at it..
434  *
435  * If we're lucky and live on a modern system, the setup code
436  * will have given us a memory map that we can use to properly
437  * set up memory.  If we aren't, we'll fake a memory map.
438  */
439 static int __init append_e820_table(struct boot_e820_entry *entries, u32 nr_entries)
440 {
441 	/* Only one memory region (or negative)? Ignore it */
442 	if (nr_entries < 2)
443 		return -1;
444 
445 	return __append_e820_table(entries, nr_entries);
446 }
447 
/*
 * Change the type of the parts of @table that lie inside
 * [start, start + size) and currently have @old_type to @new_type.
 *
 * Entries that are only partly inside the range are split; the split-off
 * pieces are appended with __e820__range_add(), so the table is not kept
 * sorted by this function.
 *
 * @size is clamped so that start + size cannot exceed ULLONG_MAX.
 * BUGs if @old_type equals @new_type.
 *
 * Returns the number of bytes whose type was changed.
 */
static u64 __init
__e820__range_update(struct e820_table *table, u64 start, u64 size, enum e820_type old_type, enum e820_type new_type)
{
	u64 end;
	unsigned int i;
	u64 real_updated_size = 0;

	BUG_ON(old_type == new_type);

	/* Clamp the size so that 'end' below cannot wrap around: */
	if (size > (ULLONG_MAX - start))
		size = ULLONG_MAX - start;

	end = start + size;
	printk(KERN_DEBUG "e820: update [mem %#010Lx-%#010Lx] ", start, end - 1);
	e820_print_type(old_type);
	pr_cont(" ==> ");
	e820_print_type(new_type);
	pr_cont("\n");

	/*
	 * table->nr_entries is re-read on every iteration, so pieces appended
	 * inside the loop are visited by it as well.
	 */
	for (i = 0; i < table->nr_entries; i++) {
		struct e820_entry *entry = &table->entries[i];
		u64 final_start, final_end;
		u64 entry_end;

		if (entry->type != old_type)
			continue;

		entry_end = entry->addr + entry->size;

		/* Completely covered by new range? */
		if (entry->addr >= start && entry_end <= end) {
			entry->type = new_type;
			real_updated_size += entry->size;
			continue;
		}

		/* New range is completely covered? */
		if (entry->addr < start && entry_end > end) {
			/* Three-way split: [head | retyped middle | tail] */
			__e820__range_add(table, start, size, new_type);
			__e820__range_add(table, end, entry_end - end, entry->type);
			/* The existing entry shrinks to become the head: */
			entry->size = start - entry->addr;
			real_updated_size += size;
			continue;
		}

		/* Partially covered: */
		final_start = max(start, entry->addr);
		final_end = min(end, entry_end);
		/* Empty intersection - the entry does not touch the range: */
		if (final_start >= final_end)
			continue;

		__e820__range_add(table, final_start, final_end - final_start, new_type);

		real_updated_size += final_end - final_start;

		/*
		 * Left range could be head or tail, so need to update
		 * its size first:
		 */
		entry->size -= final_end - final_start;
		/* The remainder is the head: its start address stays as it is. */
		if (entry->addr < final_start)
			continue;

		/* The remainder is the tail: move its start past the retyped part. */
		entry->addr = final_end;
	}
	return real_updated_size;
}
515 
516 u64 __init e820__range_update(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type)
517 {
518 	return __e820__range_update(e820_table, start, size, old_type, new_type);
519 }
520 
521 static u64 __init e820__range_update_kexec(u64 start, u64 size, enum e820_type old_type, enum e820_type  new_type)
522 {
523 	return __e820__range_update(e820_table_kexec, start, size, old_type, new_type);
524 }
525 
526 /* Remove a range of memory from the E820 table: */
/*
 * Remove [start, start + size) from the main table.
 *
 * When @check_type is set only entries of @old_type are affected; otherwise
 * entries of every type are. Entries fully inside the range are zeroed (not
 * compacted away), entries partly inside are trimmed, and an entry that
 * strictly contains the range is split, with the tail appended through
 * e820__range_add().
 *
 * @size is clamped so that start + size cannot exceed ULLONG_MAX.
 *
 * Returns the number of bytes removed.
 */
u64 __init e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type)
{
	int i;
	u64 end;
	u64 real_removed_size = 0;

	/* Clamp the size so that 'end' below cannot wrap around: */
	if (size > (ULLONG_MAX - start))
		size = ULLONG_MAX - start;

	end = start + size;
	printk(KERN_DEBUG "e820: remove [mem %#010Lx-%#010Lx] ", start, end - 1);
	if (check_type)
		e820_print_type(old_type);
	pr_cont("\n");

	for (i = 0; i < e820_table->nr_entries; i++) {
		struct e820_entry *entry = &e820_table->entries[i];
		u64 final_start, final_end;
		u64 entry_end;

		if (check_type && entry->type != old_type)
			continue;

		entry_end = entry->addr + entry->size;

		/* Completely covered? */
		if (entry->addr >= start && entry_end <= end) {
			real_removed_size += entry->size;
			/* Leaves a zeroed (size 0, type 0) slot behind in the table: */
			memset(entry, 0, sizeof(*entry));
			continue;
		}

		/* Is the new range completely covered? */
		if (entry->addr < start && entry_end > end) {
			/* Keep the head in place and append the tail as a new entry: */
			e820__range_add(end, entry_end - end, entry->type);
			entry->size = start - entry->addr;
			real_removed_size += size;
			continue;
		}

		/* Partially covered: */
		final_start = max(start, entry->addr);
		final_end = min(end, entry_end);
		/* Empty intersection - the entry does not touch the range: */
		if (final_start >= final_end)
			continue;

		real_removed_size += final_end - final_start;

		/*
		 * Left range could be head or tail, so need to update
		 * the size first:
		 */
		entry->size -= final_end - final_start;
		/* The remainder is the head: its start address stays as it is. */
		if (entry->addr < final_start)
			continue;

		/* The remainder is the tail: move its start past the removed part. */
		entry->addr = final_end;
	}
	return real_removed_size;
}
587 
588 void __init e820__update_table_print(void)
589 {
590 	if (e820__update_table(e820_table))
591 		return;
592 
593 	pr_info("modified physical RAM map:\n");
594 	e820__print_table("modified");
595 }
596 
597 static void __init e820__update_table_kexec(void)
598 {
599 	e820__update_table(e820_table_kexec);
600 }
601 
602 #define MAX_GAP_END 0x100000000ull
603 
604 /*
605  * Search for a gap in the E820 memory space from 0 to MAX_GAP_END (4GB).
606  */
607 static int __init e820_search_gap(unsigned long *gapstart, unsigned long *gapsize)
608 {
609 	unsigned long long last = MAX_GAP_END;
610 	int i = e820_table->nr_entries;
611 	int found = 0;
612 
613 	while (--i >= 0) {
614 		unsigned long long start = e820_table->entries[i].addr;
615 		unsigned long long end = start + e820_table->entries[i].size;
616 
617 		/*
618 		 * Since "last" is at most 4GB, we know we'll
619 		 * fit in 32 bits if this condition is true:
620 		 */
621 		if (last > end) {
622 			unsigned long gap = last - end;
623 
624 			if (gap >= *gapsize) {
625 				*gapsize = gap;
626 				*gapstart = end;
627 				found = 1;
628 			}
629 		}
630 		if (start < last)
631 			last = start;
632 	}
633 	return found;
634 }
635 
636 /*
637  * Search for the biggest gap in the low 32 bits of the E820
638  * memory space. We pass this space to the PCI subsystem, so
639  * that it can assign MMIO resources for hotplug or
640  * unconfigured devices in.
641  *
 * Hopefully the BIOS left enough space.
643  */
644 __init void e820__setup_pci_gap(void)
645 {
646 	unsigned long gapstart, gapsize;
647 	int found;
648 
649 	gapsize = 0x400000;
650 	found  = e820_search_gap(&gapstart, &gapsize);
651 
652 	if (!found) {
653 #ifdef CONFIG_X86_64
654 		gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024;
655 		pr_err("Cannot find an available gap in the 32-bit address range\n");
656 		pr_err("PCI devices with unassigned 32-bit BARs may not work!\n");
657 #else
658 		gapstart = 0x10000000;
659 #endif
660 	}
661 
662 	/*
663 	 * e820__reserve_resources_late() protects stolen RAM already:
664 	 */
665 	pci_mem_start = gapstart;
666 
667 	pr_info("[mem %#010lx-%#010lx] available for PCI devices\n",
668 		gapstart, gapstart + gapsize - 1);
669 }
670 
671 /*
672  * Called late during init, in free_initmem().
673  *
674  * Initial e820_table and e820_table_kexec are largish __initdata arrays.
675  *
676  * Copy them to a (usually much smaller) dynamically allocated area that is
677  * sized precisely after the number of e820 entries.
678  *
679  * This is done after we've performed all the fixes and tweaks to the tables.
680  * All functions which modify them are __init functions, which won't exist
681  * after free_initmem().
682  */
683 __init void e820__reallocate_tables(void)
684 {
685 	struct e820_table *n;
686 	int size;
687 
688 	size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table->nr_entries;
689 	n = kmemdup(e820_table, size, GFP_KERNEL);
690 	BUG_ON(!n);
691 	e820_table = n;
692 
693 	size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_kexec->nr_entries;
694 	n = kmemdup(e820_table_kexec, size, GFP_KERNEL);
695 	BUG_ON(!n);
696 	e820_table_kexec = n;
697 
698 	size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_firmware->nr_entries;
699 	n = kmemdup(e820_table_firmware, size, GFP_KERNEL);
700 	BUG_ON(!n);
701 	e820_table_firmware = n;
702 }
703 
704 /*
705  * Because of the small fixed size of struct boot_params, only the first
706  * 128 E820 memory entries are passed to the kernel via boot_params.e820_table,
707  * the remaining (if any) entries are passed via the SETUP_E820_EXT node of
708  * struct setup_data, which is parsed here.
709  */
710 void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len)
711 {
712 	int entries;
713 	struct boot_e820_entry *extmap;
714 	struct setup_data *sdata;
715 
716 	sdata = early_memremap(phys_addr, data_len);
717 	entries = sdata->len / sizeof(*extmap);
718 	extmap = (struct boot_e820_entry *)(sdata->data);
719 
720 	__append_e820_table(extmap, entries);
721 	e820__update_table(e820_table);
722 
723 	memcpy(e820_table_kexec, e820_table, sizeof(*e820_table_kexec));
724 	memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware));
725 
726 	early_memunmap(sdata, data_len);
727 	pr_info("extended physical RAM map:\n");
728 	e820__print_table("extended");
729 }
730 
731 /*
732  * Find the ranges of physical addresses that do not correspond to
733  * E820 RAM areas and register the corresponding pages as 'nosave' for
734  * hibernation (32-bit) or software suspend and suspend to RAM (64-bit).
735  *
736  * This function requires the E820 map to be sorted and without any
737  * overlapping entries.
738  */
739 void __init e820__register_nosave_regions(unsigned long limit_pfn)
740 {
741 	int i;
742 	unsigned long pfn = 0;
743 
744 	for (i = 0; i < e820_table->nr_entries; i++) {
745 		struct e820_entry *entry = &e820_table->entries[i];
746 
747 		if (pfn < PFN_UP(entry->addr))
748 			register_nosave_region(pfn, PFN_UP(entry->addr));
749 
750 		pfn = PFN_DOWN(entry->addr + entry->size);
751 
752 		if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN)
753 			register_nosave_region(PFN_UP(entry->addr), pfn);
754 
755 		if (pfn >= limit_pfn)
756 			break;
757 	}
758 }
759 
#ifdef CONFIG_ACPI
/*
 * Hand every ACPI NVS entry of the main table to acpi_nvs_register(), so
 * that these regions can be saved and restored across hibernation.
 * Runs as a core_initcall and always returns 0.
 */
static int __init e820__register_nvs_regions(void)
{
	int idx;

	for (idx = 0; idx < e820_table->nr_entries; idx++) {
		struct e820_entry *region = &e820_table->entries[idx];

		if (region->type != E820_TYPE_NVS)
			continue;

		acpi_nvs_register(region->addr, region->size);
	}

	return 0;
}
core_initcall(e820__register_nvs_regions);
#endif
780 
781 /*
 * Allocate the requested number of bytes with the requested alignment
783  * and return (the physical address) to the caller. Also register this
784  * range in the 'kexec' E820 table as a reserved range.
785  *
786  * This allows kexec to fake a new mptable, as if it came from the real
787  * system.
788  */
789 u64 __init e820__memblock_alloc_reserved(u64 size, u64 align)
790 {
791 	u64 addr;
792 
793 	addr = memblock_phys_alloc(size, align);
794 	if (addr) {
795 		e820__range_update_kexec(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED);
796 		pr_info("update e820_table_kexec for e820__memblock_alloc_reserved()\n");
797 		e820__update_table_kexec();
798 	}
799 
800 	return addr;
801 }
802 
/*
 * MAX_ARCH_PFN: upper bound on page frame numbers for this configuration.
 *
 * Every variant is fully parenthesized so that the macro behaves as a single
 * operand wherever it is used (e.g. 'MAX_ARCH_PFN + 1').
 */
#ifdef CONFIG_X86_32
# ifdef CONFIG_X86_PAE
#  define MAX_ARCH_PFN		(1ULL<<(36-PAGE_SHIFT))
# else
#  define MAX_ARCH_PFN		(1ULL<<(32-PAGE_SHIFT))
# endif
#else /* CONFIG_X86_32 */
# define MAX_ARCH_PFN		(MAXMEM >> PAGE_SHIFT)
#endif
812 
813 /*
814  * Find the highest page frame number we have available
815  */
816 static unsigned long __init e820_end_pfn(unsigned long limit_pfn, enum e820_type type)
817 {
818 	int i;
819 	unsigned long last_pfn = 0;
820 	unsigned long max_arch_pfn = MAX_ARCH_PFN;
821 
822 	for (i = 0; i < e820_table->nr_entries; i++) {
823 		struct e820_entry *entry = &e820_table->entries[i];
824 		unsigned long start_pfn;
825 		unsigned long end_pfn;
826 
827 		if (entry->type != type)
828 			continue;
829 
830 		start_pfn = entry->addr >> PAGE_SHIFT;
831 		end_pfn = (entry->addr + entry->size) >> PAGE_SHIFT;
832 
833 		if (start_pfn >= limit_pfn)
834 			continue;
835 		if (end_pfn > limit_pfn) {
836 			last_pfn = limit_pfn;
837 			break;
838 		}
839 		if (end_pfn > last_pfn)
840 			last_pfn = end_pfn;
841 	}
842 
843 	if (last_pfn > max_arch_pfn)
844 		last_pfn = max_arch_pfn;
845 
846 	pr_info("last_pfn = %#lx max_arch_pfn = %#lx\n",
847 		last_pfn, max_arch_pfn);
848 	return last_pfn;
849 }
850 
851 unsigned long __init e820__end_of_ram_pfn(void)
852 {
853 	return e820_end_pfn(MAX_ARCH_PFN, E820_TYPE_RAM);
854 }
855 
856 unsigned long __init e820__end_of_low_ram_pfn(void)
857 {
858 	return e820_end_pfn(1UL << (32 - PAGE_SHIFT), E820_TYPE_RAM);
859 }
860 
861 static void __init early_panic(char *msg)
862 {
863 	early_printk(msg);
864 	panic(msg);
865 }
866 
/*
 * Set by the "mem=" and "memmap=" parsers in this file; when non-zero,
 * e820__finish_early_params() re-sanitizes and prints the main table.
 */
static int userdef __initdata;
868 
869 /* The "mem=nopentium" boot option disables 4MB page tables on 32-bit kernels: */
870 static int __init parse_memopt(char *p)
871 {
872 	u64 mem_size;
873 
874 	if (!p)
875 		return -EINVAL;
876 
877 	if (!strcmp(p, "nopentium")) {
878 #ifdef CONFIG_X86_32
879 		setup_clear_cpu_cap(X86_FEATURE_PSE);
880 		return 0;
881 #else
882 		pr_warn("mem=nopentium ignored! (only supported on x86_32)\n");
883 		return -EINVAL;
884 #endif
885 	}
886 
887 	userdef = 1;
888 	mem_size = memparse(p, &p);
889 
890 	/* Don't remove all memory when getting "mem={invalid}" parameter: */
891 	if (mem_size == 0)
892 		return -EINVAL;
893 
894 	e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1);
895 
896 #ifdef CONFIG_MEMORY_HOTPLUG
897 	max_mem_size = mem_size;
898 #endif
899 
900 	return 0;
901 }
902 early_param("mem", parse_memopt);
903 
904 static int __init parse_memmap_one(char *p)
905 {
906 	char *oldp;
907 	u64 start_at, mem_size;
908 
909 	if (!p)
910 		return -EINVAL;
911 
912 	if (!strncmp(p, "exactmap", 8)) {
913 #ifdef CONFIG_CRASH_DUMP
914 		/*
915 		 * If we are doing a crash dump, we still need to know
916 		 * the real memory size before the original memory map is
917 		 * reset.
918 		 */
919 		saved_max_pfn = e820__end_of_ram_pfn();
920 #endif
921 		e820_table->nr_entries = 0;
922 		userdef = 1;
923 		return 0;
924 	}
925 
926 	oldp = p;
927 	mem_size = memparse(p, &p);
928 	if (p == oldp)
929 		return -EINVAL;
930 
931 	userdef = 1;
932 	if (*p == '@') {
933 		start_at = memparse(p+1, &p);
934 		e820__range_add(start_at, mem_size, E820_TYPE_RAM);
935 	} else if (*p == '#') {
936 		start_at = memparse(p+1, &p);
937 		e820__range_add(start_at, mem_size, E820_TYPE_ACPI);
938 	} else if (*p == '$') {
939 		start_at = memparse(p+1, &p);
940 		e820__range_add(start_at, mem_size, E820_TYPE_RESERVED);
941 	} else if (*p == '!') {
942 		start_at = memparse(p+1, &p);
943 		e820__range_add(start_at, mem_size, E820_TYPE_PRAM);
944 	} else if (*p == '%') {
945 		enum e820_type from = 0, to = 0;
946 
947 		start_at = memparse(p + 1, &p);
948 		if (*p == '-')
949 			from = simple_strtoull(p + 1, &p, 0);
950 		if (*p == '+')
951 			to = simple_strtoull(p + 1, &p, 0);
952 		if (*p != '\0')
953 			return -EINVAL;
954 		if (from && to)
955 			e820__range_update(start_at, mem_size, from, to);
956 		else if (to)
957 			e820__range_add(start_at, mem_size, to);
958 		else if (from)
959 			e820__range_remove(start_at, mem_size, from, 1);
960 		else
961 			e820__range_remove(start_at, mem_size, 0, 0);
962 	} else {
963 		e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1);
964 	}
965 
966 	return *p == '\0' ? 0 : -EINVAL;
967 }
968 
969 static int __init parse_memmap_opt(char *str)
970 {
971 	while (str) {
972 		char *k = strchr(str, ',');
973 
974 		if (k)
975 			*k++ = 0;
976 
977 		parse_memmap_one(str);
978 		str = k;
979 	}
980 
981 	return 0;
982 }
983 early_param("memmap", parse_memmap_opt);
984 
985 /*
986  * Reserve all entries from the bootloader's extensible data nodes list,
987  * because if present we are going to use it later on to fetch e820
988  * entries from it:
989  */
990 void __init e820__reserve_setup_data(void)
991 {
992 	struct setup_data *data;
993 	u64 pa_data;
994 
995 	pa_data = boot_params.hdr.setup_data;
996 	if (!pa_data)
997 		return;
998 
999 	while (pa_data) {
1000 		data = early_memremap(pa_data, sizeof(*data));
1001 		e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
1002 		e820__range_update_kexec(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
1003 
1004 		if (data->type == SETUP_INDIRECT &&
1005 		    ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) {
1006 			e820__range_update(((struct setup_indirect *)data->data)->addr,
1007 					   ((struct setup_indirect *)data->data)->len,
1008 					   E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
1009 			e820__range_update_kexec(((struct setup_indirect *)data->data)->addr,
1010 						 ((struct setup_indirect *)data->data)->len,
1011 						 E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
1012 		}
1013 
1014 		pa_data = data->next;
1015 		early_memunmap(data, sizeof(*data));
1016 	}
1017 
1018 	e820__update_table(e820_table);
1019 	e820__update_table(e820_table_kexec);
1020 
1021 	pr_info("extended physical RAM map:\n");
1022 	e820__print_table("reserve setup_data");
1023 }
1024 
1025 /*
1026  * Called after parse_early_param(), after early parameters (such as mem=)
1027  * have been processed, in which case we already have an E820 table filled in
1028  * via the parameter callback function(s), but it's not sorted and printed yet:
1029  */
1030 void __init e820__finish_early_params(void)
1031 {
1032 	if (userdef) {
1033 		if (e820__update_table(e820_table) < 0)
1034 			early_panic("Invalid user supplied memory map");
1035 
1036 		pr_info("user-defined physical RAM map:\n");
1037 		e820__print_table("user");
1038 	}
1039 }
1040 
1041 static const char *__init e820_type_to_string(struct e820_entry *entry)
1042 {
1043 	switch (entry->type) {
1044 	case E820_TYPE_RESERVED_KERN:	/* Fall-through: */
1045 	case E820_TYPE_RAM:		return "System RAM";
1046 	case E820_TYPE_ACPI:		return "ACPI Tables";
1047 	case E820_TYPE_NVS:		return "ACPI Non-volatile Storage";
1048 	case E820_TYPE_UNUSABLE:	return "Unusable memory";
1049 	case E820_TYPE_PRAM:		return "Persistent Memory (legacy)";
1050 	case E820_TYPE_PMEM:		return "Persistent Memory";
1051 	case E820_TYPE_RESERVED:	return "Reserved";
1052 	case E820_TYPE_SOFT_RESERVED:	return "Soft Reserved";
1053 	default:			return "Unknown E820 type";
1054 	}
1055 }
1056 
1057 static unsigned long __init e820_type_to_iomem_type(struct e820_entry *entry)
1058 {
1059 	switch (entry->type) {
1060 	case E820_TYPE_RESERVED_KERN:	/* Fall-through: */
1061 	case E820_TYPE_RAM:		return IORESOURCE_SYSTEM_RAM;
1062 	case E820_TYPE_ACPI:		/* Fall-through: */
1063 	case E820_TYPE_NVS:		/* Fall-through: */
1064 	case E820_TYPE_UNUSABLE:	/* Fall-through: */
1065 	case E820_TYPE_PRAM:		/* Fall-through: */
1066 	case E820_TYPE_PMEM:		/* Fall-through: */
1067 	case E820_TYPE_RESERVED:	/* Fall-through: */
1068 	case E820_TYPE_SOFT_RESERVED:	/* Fall-through: */
1069 	default:			return IORESOURCE_MEM;
1070 	}
1071 }
1072 
1073 static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry)
1074 {
1075 	switch (entry->type) {
1076 	case E820_TYPE_ACPI:		return IORES_DESC_ACPI_TABLES;
1077 	case E820_TYPE_NVS:		return IORES_DESC_ACPI_NV_STORAGE;
1078 	case E820_TYPE_PMEM:		return IORES_DESC_PERSISTENT_MEMORY;
1079 	case E820_TYPE_PRAM:		return IORES_DESC_PERSISTENT_MEMORY_LEGACY;
1080 	case E820_TYPE_RESERVED:	return IORES_DESC_RESERVED;
1081 	case E820_TYPE_SOFT_RESERVED:	return IORES_DESC_SOFT_RESERVED;
1082 	case E820_TYPE_RESERVED_KERN:	/* Fall-through: */
1083 	case E820_TYPE_RAM:		/* Fall-through: */
1084 	case E820_TYPE_UNUSABLE:	/* Fall-through: */
1085 	default:			return IORES_DESC_NONE;
1086 	}
1087 }
1088 
1089 static bool __init do_mark_busy(enum e820_type type, struct resource *res)
1090 {
1091 	/* this is the legacy bios/dos rom-shadow + mmio region */
1092 	if (res->start < (1ULL<<20))
1093 		return true;
1094 
1095 	/*
1096 	 * Treat persistent memory and other special memory ranges like
1097 	 * device memory, i.e. reserve it for exclusive use of a driver
1098 	 */
1099 	switch (type) {
1100 	case E820_TYPE_RESERVED:
1101 	case E820_TYPE_SOFT_RESERVED:
1102 	case E820_TYPE_PRAM:
1103 	case E820_TYPE_PMEM:
1104 		return false;
1105 	case E820_TYPE_RESERVED_KERN:
1106 	case E820_TYPE_RAM:
1107 	case E820_TYPE_ACPI:
1108 	case E820_TYPE_NVS:
1109 	case E820_TYPE_UNUSABLE:
1110 	default:
1111 		return true;
1112 	}
1113 }
1114 
1115 /*
1116  * Mark E820 reserved areas as busy for the resource manager:
1117  */
1118 
1119 static struct resource __initdata *e820_res;
1120 
1121 void __init e820__reserve_resources(void)
1122 {
1123 	int i;
1124 	struct resource *res;
1125 	u64 end;
1126 
1127 	res = memblock_alloc(sizeof(*res) * e820_table->nr_entries,
1128 			     SMP_CACHE_BYTES);
1129 	if (!res)
1130 		panic("%s: Failed to allocate %zu bytes\n", __func__,
1131 		      sizeof(*res) * e820_table->nr_entries);
1132 	e820_res = res;
1133 
1134 	for (i = 0; i < e820_table->nr_entries; i++) {
1135 		struct e820_entry *entry = e820_table->entries + i;
1136 
1137 		end = entry->addr + entry->size - 1;
1138 		if (end != (resource_size_t)end) {
1139 			res++;
1140 			continue;
1141 		}
1142 		res->start = entry->addr;
1143 		res->end   = end;
1144 		res->name  = e820_type_to_string(entry);
1145 		res->flags = e820_type_to_iomem_type(entry);
1146 		res->desc  = e820_type_to_iores_desc(entry);
1147 
1148 		/*
1149 		 * Don't register the region that could be conflicted with
1150 		 * PCI device BAR resources and insert them later in
1151 		 * pcibios_resource_survey():
1152 		 */
1153 		if (do_mark_busy(entry->type, res)) {
1154 			res->flags |= IORESOURCE_BUSY;
1155 			insert_resource(&iomem_resource, res);
1156 		}
1157 		res++;
1158 	}
1159 
1160 	/* Expose the bootloader-provided memory layout to the sysfs. */
1161 	for (i = 0; i < e820_table_firmware->nr_entries; i++) {
1162 		struct e820_entry *entry = e820_table_firmware->entries + i;
1163 
1164 		firmware_map_add_early(entry->addr, entry->addr + entry->size, e820_type_to_string(entry));
1165 	}
1166 }
1167 
1168 /*
1169  * How much should we pad the end of RAM, depending on where it is?
1170  */
1171 static unsigned long __init ram_alignment(resource_size_t pos)
1172 {
1173 	unsigned long mb = pos >> 20;
1174 
1175 	/* To 64kB in the first megabyte */
1176 	if (!mb)
1177 		return 64*1024;
1178 
1179 	/* To 1MB in the first 16MB */
1180 	if (mb < 16)
1181 		return 1024*1024;
1182 
1183 	/* To 64MB for anything above that */
1184 	return 64*1024*1024;
1185 }
1186 
1187 #define MAX_RESOURCE_SIZE ((resource_size_t)-1)
1188 
1189 void __init e820__reserve_resources_late(void)
1190 {
1191 	int i;
1192 	struct resource *res;
1193 
1194 	res = e820_res;
1195 	for (i = 0; i < e820_table->nr_entries; i++) {
1196 		if (!res->parent && res->end)
1197 			insert_resource_expand_to_fit(&iomem_resource, res);
1198 		res++;
1199 	}
1200 
1201 	/*
1202 	 * Try to bump up RAM regions to reasonable boundaries, to
1203 	 * avoid stolen RAM:
1204 	 */
1205 	for (i = 0; i < e820_table->nr_entries; i++) {
1206 		struct e820_entry *entry = &e820_table->entries[i];
1207 		u64 start, end;
1208 
1209 		if (entry->type != E820_TYPE_RAM)
1210 			continue;
1211 
1212 		start = entry->addr + entry->size;
1213 		end = round_up(start, ram_alignment(start)) - 1;
1214 		if (end > MAX_RESOURCE_SIZE)
1215 			end = MAX_RESOURCE_SIZE;
1216 		if (start >= end)
1217 			continue;
1218 
1219 		printk(KERN_DEBUG "e820: reserve RAM buffer [mem %#010llx-%#010llx]\n", start, end);
1220 		reserve_region_with_split(&iomem_resource, start, end, "RAM buffer");
1221 	}
1222 }
1223 
1224 /*
1225  * Pass the firmware (bootloader) E820 map to the kernel and process it:
1226  */
1227 char *__init e820__memory_setup_default(void)
1228 {
1229 	char *who = "BIOS-e820";
1230 
1231 	/*
1232 	 * Try to copy the BIOS-supplied E820-map.
1233 	 *
1234 	 * Otherwise fake a memory map; one section from 0k->640k,
1235 	 * the next section from 1mb->appropriate_mem_k
1236 	 */
1237 	if (append_e820_table(boot_params.e820_table, boot_params.e820_entries) < 0) {
1238 		u64 mem_size;
1239 
1240 		/* Compare results from other methods and take the one that gives more RAM: */
1241 		if (boot_params.alt_mem_k < boot_params.screen_info.ext_mem_k) {
1242 			mem_size = boot_params.screen_info.ext_mem_k;
1243 			who = "BIOS-88";
1244 		} else {
1245 			mem_size = boot_params.alt_mem_k;
1246 			who = "BIOS-e801";
1247 		}
1248 
1249 		e820_table->nr_entries = 0;
1250 		e820__range_add(0, LOWMEMSIZE(), E820_TYPE_RAM);
1251 		e820__range_add(HIGH_MEMORY, mem_size << 10, E820_TYPE_RAM);
1252 	}
1253 
1254 	/* We just appended a lot of ranges, sanitize the table: */
1255 	e820__update_table(e820_table);
1256 
1257 	return who;
1258 }
1259 
1260 /*
1261  * Calls e820__memory_setup_default() in essence to pick up the firmware/bootloader
1262  * E820 map - with an optional platform quirk available for virtual platforms
1263  * to override this method of boot environment processing:
1264  */
1265 void __init e820__memory_setup(void)
1266 {
1267 	char *who;
1268 
1269 	/* This is a firmware interface ABI - make sure we don't break it: */
1270 	BUILD_BUG_ON(sizeof(struct boot_e820_entry) != 20);
1271 
1272 	who = x86_init.resources.memory_setup();
1273 
1274 	memcpy(e820_table_kexec, e820_table, sizeof(*e820_table_kexec));
1275 	memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware));
1276 
1277 	pr_info("BIOS-provided physical RAM map:\n");
1278 	e820__print_table(who);
1279 }
1280 
1281 void __init e820__memblock_setup(void)
1282 {
1283 	int i;
1284 	u64 end;
1285 
1286 	/*
1287 	 * The bootstrap memblock region count maximum is 128 entries
1288 	 * (INIT_MEMBLOCK_REGIONS), but EFI might pass us more E820 entries
1289 	 * than that - so allow memblock resizing.
1290 	 *
1291 	 * This is safe, because this call happens pretty late during x86 setup,
1292 	 * so we know about reserved memory regions already. (This is important
1293 	 * so that memblock resizing does no stomp over reserved areas.)
1294 	 */
1295 	memblock_allow_resize();
1296 
1297 	for (i = 0; i < e820_table->nr_entries; i++) {
1298 		struct e820_entry *entry = &e820_table->entries[i];
1299 
1300 		end = entry->addr + entry->size;
1301 		if (end != (resource_size_t)end)
1302 			continue;
1303 
1304 		if (entry->type == E820_TYPE_SOFT_RESERVED)
1305 			memblock_reserve(entry->addr, entry->size);
1306 
1307 		if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN)
1308 			continue;
1309 
1310 		memblock_add(entry->addr, entry->size);
1311 	}
1312 
1313 	/* Throw away partial pages: */
1314 	memblock_trim_memory(PAGE_SIZE);
1315 
1316 	memblock_dump_all();
1317 }
1318