xref: /openbmc/linux/arch/x86/kernel/e820.c (revision 81b3e090fa1f237d49c8feb2fa4afe2aabd3a4ff)
1 /*
2  * Low level x86 E820 memory map handling functions.
3  *
4  * The firmware and bootloader pass us the "E820 table", which is the primary
5  * physical memory layout description available about x86 systems.
6  *
7  * The kernel takes the E820 memory layout and optionally modifies it with
8  * quirks and other tweaks, and feeds that into the generic Linux memory
9  * allocation code routines via a platform independent interface (memblock, etc.).
10  */
11 #include <linux/kernel.h>
12 #include <linux/types.h>
13 #include <linux/init.h>
14 #include <linux/crash_dump.h>
15 #include <linux/export.h>
16 #include <linux/bootmem.h>
17 #include <linux/pfn.h>
18 #include <linux/suspend.h>
19 #include <linux/acpi.h>
20 #include <linux/firmware-map.h>
21 #include <linux/memblock.h>
22 #include <linux/sort.h>
23 
24 #include <asm/e820/api.h>
25 #include <asm/proto.h>
26 #include <asm/setup.h>
27 #include <asm/cpufeature.h>
28 
29 /*
30  * We organize the E820 table into two main data structures:
31  *
32  * - 'e820_table_firmware': the original firmware version passed to us by the
33  *   bootloader - not modified by the kernel. We use this to:
34  *
35  *       - inform the user about the firmware's notion of memory layout
36  *         via /sys/firmware/memmap
37  *
38  *       - the hibernation code uses it to generate a kernel-independent MD5
39  *         fingerprint of the physical memory layout of a system.
40  *
41  *       - kexec, which is a bootloader in disguise, uses the original E820
42  *         layout to pass to the kexec-ed kernel. This way the original kernel
43  *         can have a restricted E820 map while the kexec()-ed kexec-kernel
44  *         can have access to full memory - etc.
45  *
46  * - 'e820_table': this is the main E820 table that is massaged by the
47  *   low level x86 platform code, or modified by boot parameters, before
48  *   passed on to higher level MM layers.
49  *
50  * Once the E820 map has been converted to the standard Linux memory layout
51  * information its role stops - modifying it has no effect and does not get
52  * re-propagated. So its main role is a temporary bootstrap storage of firmware
53  * specific memory layout data during early bootup.
54  */
/*
 * Boot-time backing storage for the two tables. Both arrays are __initdata;
 * e820_reallocate_tables() later repoints the public pointers below at
 * kmalloc()-ed copies.
 */
55 static struct e820_table e820_table_init		__initdata;
56 static struct e820_table e820_table_firmware_init	__initdata;
57 
/* Public table pointers, initially aimed at the __initdata storage above: */
58 struct e820_table *e820_table __refdata			= &e820_table_init;
59 struct e820_table *e820_table_firmware __refdata	= &e820_table_firmware_init;
60 
61 /*
 * Start address for PCI or other memory-mapped resources.
 *
 * The initializer below is only the value the variable holds until
 * e820__setup_pci_gap() overwrites it with the start of the gap it picked.
 * The symbol is exported to modules only when CONFIG_PCI is enabled.
 */
62 unsigned long pci_mem_start = 0xaeedbabe;
63 #ifdef CONFIG_PCI
64 EXPORT_SYMBOL(pci_mem_start);
65 #endif
66 
67 /*
68  * This function checks if any part of the range <start,end> is mapped
69  * with type.
70  */
71 bool e820__mapped_any(u64 start, u64 end, enum e820_type type)
72 {
73 	int i;
74 
75 	for (i = 0; i < e820_table->nr_entries; i++) {
76 		struct e820_entry *entry = &e820_table->entries[i];
77 
78 		if (type && entry->type != type)
79 			continue;
80 		if (entry->addr >= end || entry->addr + entry->size <= start)
81 			continue;
82 		return 1;
83 	}
84 	return 0;
85 }
86 EXPORT_SYMBOL_GPL(e820__mapped_any);
87 
88 /*
89  * This function checks if the entire <start,end> range is mapped with 'type'.
90  *
91  * Note: this function only works correctly once the E820 table is sorted and
92  * not-overlapping (at least for the range specified), which is the case normally.
93  */
94 bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type)
95 {
96 	int i;
97 
98 	for (i = 0; i < e820_table->nr_entries; i++) {
99 		struct e820_entry *entry = &e820_table->entries[i];
100 
101 		if (type && entry->type != type)
102 			continue;
103 
104 		/* Is the region (part) in overlap with the current region? */
105 		if (entry->addr >= end || entry->addr + entry->size <= start)
106 			continue;
107 
108 		/*
109 		 * If the region is at the beginning of <start,end> we move
110 		 * 'start' to the end of the region since it's ok until there
111 		 */
112 		if (entry->addr <= start)
113 			start = entry->addr + entry->size;
114 
115 		/*
116 		 * If 'start' is now at or beyond 'end', we're done, full
117 		 * coverage of the desired range exists:
118 		 */
119 		if (start >= end)
120 			return 1;
121 	}
122 	return 0;
123 }
124 
125 /*
126  * Add a memory region to the kernel E820 map.
127  */
128 static void __init __e820__range_add(struct e820_table *table, u64 start, u64 size, enum e820_type type)
129 {
130 	int x = table->nr_entries;
131 
132 	if (x >= ARRAY_SIZE(table->entries)) {
133 		pr_err("e820: too many entries; ignoring [mem %#010llx-%#010llx]\n", start, start + size - 1);
134 		return;
135 	}
136 
137 	table->entries[x].addr = start;
138 	table->entries[x].size = size;
139 	table->entries[x].type = type;
140 	table->nr_entries++;
141 }
142 
143 void __init e820__range_add(u64 start, u64 size, enum e820_type type)
144 {
145 	__e820__range_add(e820_table, start, size, type);
146 }
147 
148 static void __init e820_print_type(enum e820_type type)
149 {
150 	switch (type) {
151 	case E820_TYPE_RAM:		/* Fall through: */
152 	case E820_TYPE_RESERVED_KERN:	pr_cont("usable");			break;
153 	case E820_TYPE_RESERVED:	pr_cont("reserved");			break;
154 	case E820_TYPE_ACPI:		pr_cont("ACPI data");			break;
155 	case E820_TYPE_NVS:		pr_cont("ACPI NVS");			break;
156 	case E820_TYPE_UNUSABLE:	pr_cont("unusable");			break;
157 	case E820_TYPE_PMEM:		/* Fall through: */
158 	case E820_TYPE_PRAM:		pr_cont("persistent (type %u)", type);	break;
159 	default:			pr_cont("type %u", type);		break;
160 	}
161 }
162 
163 void __init e820__print_table(char *who)
164 {
165 	int i;
166 
167 	for (i = 0; i < e820_table->nr_entries; i++) {
168 		pr_info("%s: [mem %#018Lx-%#018Lx] ", who,
169 		       e820_table->entries[i].addr,
170 		       e820_table->entries[i].addr + e820_table->entries[i].size - 1);
171 
172 		e820_print_type(e820_table->entries[i].type);
173 		pr_cont("\n");
174 	}
175 }
176 
177 /*
178  * Sanitize an E820 map.
179  *
180  * Some E820 layouts include overlapping entries. The following
181  * replaces the original E820 map with a new one, removing overlaps,
182  * and resolving conflicting memory types in favor of highest
183  * numbered type.
184  *
185  * The input parameter 'entries' points to an array of 'struct
186  * e820_entry' which on entry has elements in the range [0, *nr_entries)
187  * valid, and which has space for up to max_nr_entries entries.
188  * On return, the resulting sanitized E820 map entries will have been
189  * written over the same location, starting at 'entries'.
190  *
191  * The integer pointed to by nr_entries must be valid on entry (the
192  * current number of valid entries located at 'entries'). If the
193  * sanitizing succeeds the *nr_entries will be updated with the new
194  * number of valid entries (something no more than max_nr_entries).
195  *
196  * The return value from e820__update_table() is zero if it
197  * successfully 'sanitized' the map entries passed in, and is -1
198  * if it did nothing, which can happen if either of (1) it was
199  * only passed one map entry, or (2) any of the input map entries
200  * were invalid (start + size < start, meaning that the size was
201  * so big the described memory range wrapped around through zero.)
202  *
203  *	Visually we're performing the following
204  *	(1,2,3,4 = memory types)...
205  *
206  *	Sample memory map (w/overlaps):
207  *	   ____22__________________
208  *	   ______________________4_
209  *	   ____1111________________
210  *	   _44_____________________
211  *	   11111111________________
212  *	   ____________________33__
213  *	   ___________44___________
214  *	   __________33333_________
215  *	   ______________22________
216  *	   ___________________2222_
217  *	   _________111111111______
218  *	   _____________________11_
219  *	   _________________4______
220  *
221  *	Sanitized equivalent (no overlap):
222  *	   1_______________________
223  *	   _44_____________________
224  *	   ___1____________________
225  *	   ____22__________________
226  *	   ______11________________
227  *	   _________1______________
228  *	   __________3_____________
229  *	   ___________44___________
230  *	   _____________33_________
231  *	   _______________2________
232  *	   ________________1_______
233  *	   _________________4______
234  *	   ___________________2____
235  *	   ____________________33__
236  *	   ______________________4_
237  */
/*
 * One boundary of an E820 entry, as used by __e820__update_table(): every
 * non-empty entry contributes one change point for its start address and
 * one for its end address. A change point is a region start when
 * addr == entry->addr, and a region end otherwise.
 */
238 struct change_member {
239 	/* The E820 entry this boundary belongs to: */
240 	struct e820_entry	*entry;
241 	/* Address of this boundary (the entry's start, or its start + size): */
242 	unsigned long long	addr;
243 };
244 
245 static int __init cpcompare(const void *a, const void *b)
246 {
247 	struct change_member * const *app = a, * const *bpp = b;
248 	const struct change_member *ap = *app, *bp = *bpp;
249 
250 	/*
251 	 * Inputs are pointers to two elements of change_point[].  If their
252 	 * addresses are not equal, their difference dominates.  If the addresses
253 	 * are equal, then consider one that represents the end of its region
254 	 * to be greater than one that does not.
255 	 */
256 	if (ap->addr != bp->addr)
257 		return ap->addr > bp->addr ? 1 : -1;
258 
259 	return (ap->addr != ap->entry->addr) - (bp->addr != bp->entry->addr);
260 }
261 
262 static int __init __e820__update_table(struct e820_entry *entries, u32 max_nr_entries, u32 *nr_entries)
263 {
264 	static struct change_member change_point_list[2*E820_MAX_ENTRIES] __initdata;
265 	static struct change_member *change_point[2*E820_MAX_ENTRIES] __initdata;
266 	static struct e820_entry *overlap_list[E820_MAX_ENTRIES] __initdata;
267 	static struct e820_entry new_entries[E820_MAX_ENTRIES] __initdata;
268 	enum e820_type current_type, last_type;
269 	unsigned long long last_addr;
270 	u32 chgidx;
271 	u32 overlap_entries;
272 	u32 new_nr_entries;
273 	u32 old_nr, new_nr, chg_nr;
274 	u32 i;
275 
276 	/* If there's only one memory region, don't bother: */
277 	if (*nr_entries < 2)
278 		return -1;
279 
280 	old_nr = *nr_entries;
281 	BUG_ON(old_nr > max_nr_entries);
282 
283 	/* Bail out if we find any unreasonable addresses in the map: */
284 	for (i = 0; i < old_nr; i++) {
285 		if (entries[i].addr + entries[i].size < entries[i].addr)
286 			return -1;
287 	}
288 
289 	/* Create pointers for initial change-point information (for sorting): */
290 	for (i = 0; i < 2 * old_nr; i++)
291 		change_point[i] = &change_point_list[i];
292 
293 	/*
294 	 * Record all known change-points (starting and ending addresses),
295 	 * omitting empty memory regions:
296 	 */
297 	chgidx = 0;
298 	for (i = 0; i < old_nr; i++)	{
299 		if (entries[i].size != 0) {
300 			change_point[chgidx]->addr	= entries[i].addr;
301 			change_point[chgidx++]->entry	= &entries[i];
302 			change_point[chgidx]->addr	= entries[i].addr + entries[i].size;
303 			change_point[chgidx++]->entry	= &entries[i];
304 		}
305 	}
306 	chg_nr = chgidx;
307 
308 	/* Sort change-point list by memory addresses (low -> high): */
309 	sort(change_point, chg_nr, sizeof(*change_point), cpcompare, NULL);
310 
311 	/* Create a new memory map, removing overlaps: */
312 	overlap_entries = 0;	 /* Number of entries in the overlap table */
313 	new_nr_entries = 0;	 /* Index for creating new map entries */
314 	last_type = 0;		 /* Start with undefined memory type */
315 	last_addr = 0;		 /* Start with 0 as last starting address */
316 
317 	/* Loop through change-points, determining effect on the new map: */
318 	for (chgidx = 0; chgidx < chg_nr; chgidx++) {
319 		/* Keep track of all overlapping entries */
320 		if (change_point[chgidx]->addr == change_point[chgidx]->entry->addr) {
321 			/* Add map entry to overlap list (> 1 entry implies an overlap) */
322 			overlap_list[overlap_entries++] = change_point[chgidx]->entry;
323 		} else {
324 			/* Remove entry from list (order independent, so swap with last): */
325 			for (i = 0; i < overlap_entries; i++) {
326 				if (overlap_list[i] == change_point[chgidx]->entry)
327 					overlap_list[i] = overlap_list[overlap_entries-1];
328 			}
329 			overlap_entries--;
330 		}
331 		/*
332 		 * If there are overlapping entries, decide which
333 		 * "type" to use (larger value takes precedence --
334 		 * 1=usable, 2,3,4,4+=unusable)
335 		 */
336 		current_type = 0;
337 		for (i = 0; i < overlap_entries; i++) {
338 			if (overlap_list[i]->type > current_type)
339 				current_type = overlap_list[i]->type;
340 		}
341 
342 		/* Continue building up new map based on this information: */
343 		if (current_type != last_type || current_type == E820_TYPE_PRAM) {
344 			if (last_type != 0)	 {
345 				new_entries[new_nr_entries].size = change_point[chgidx]->addr - last_addr;
346 				/* Move forward only if the new size was non-zero: */
347 				if (new_entries[new_nr_entries].size != 0)
348 					/* No more space left for new entries? */
349 					if (++new_nr_entries >= max_nr_entries)
350 						break;
351 			}
352 			if (current_type != 0)	{
353 				new_entries[new_nr_entries].addr = change_point[chgidx]->addr;
354 				new_entries[new_nr_entries].type = current_type;
355 				last_addr = change_point[chgidx]->addr;
356 			}
357 			last_type = current_type;
358 		}
359 	}
360 
361 	/* Retain count for the new entries: */
362 	new_nr = new_nr_entries;
363 
364 	/* Copy the new entries into the original location: */
365 	memcpy(entries, new_entries, new_nr*sizeof(*entries));
366 	*nr_entries = new_nr;
367 
368 	return 0;
369 }
370 
371 int __init e820__update_table(struct e820_table *table)
372 {
373 	return __e820__update_table(table->entries, ARRAY_SIZE(table->entries), &table->nr_entries);
374 }
375 
376 static int __init __append_e820_table(struct e820_entry *entries, u32 nr_entries)
377 {
378 	struct e820_entry *entry = entries;
379 
380 	while (nr_entries) {
381 		u64 start = entry->addr;
382 		u64 size = entry->size;
383 		u64 end = start + size - 1;
384 		u32 type = entry->type;
385 
386 		/* Ignore the entry on 64-bit overflow: */
387 		if (start > end && likely(size))
388 			return -1;
389 
390 		e820__range_add(start, size, type);
391 
392 		entry++;
393 		nr_entries--;
394 	}
395 	return 0;
396 }
397 
398 /*
399  * Copy the BIOS E820 map into a safe place.
400  *
401  * Sanity-check it while we're at it..
402  *
403  * If we're lucky and live on a modern system, the setup code
404  * will have given us a memory map that we can use to properly
405  * set up memory.  If we aren't, we'll fake a memory map.
406  */
407 static int __init append_e820_table(struct e820_entry *entries, u32 nr_entries)
408 {
409 	/* Only one memory region (or negative)? Ignore it */
410 	if (nr_entries < 2)
411 		return -1;
412 
413 	return __append_e820_table(entries, nr_entries);
414 }
415 
416 static u64 __init
417 __e820__range_update(struct e820_table *table, u64 start, u64 size, enum e820_type old_type, enum e820_type new_type)
418 {
419 	u64 end;
420 	unsigned int i;
421 	u64 real_updated_size = 0;
422 
423 	BUG_ON(old_type == new_type);
424 
425 	if (size > (ULLONG_MAX - start))
426 		size = ULLONG_MAX - start;
427 
428 	end = start + size;
429 	pr_debug("e820: update [mem %#010Lx-%#010Lx] ", start, end - 1);
430 	e820_print_type(old_type);
431 	pr_cont(" ==> ");
432 	e820_print_type(new_type);
433 	pr_cont("\n");
434 
435 	for (i = 0; i < table->nr_entries; i++) {
436 		struct e820_entry *entry = &table->entries[i];
437 		u64 final_start, final_end;
438 		u64 entry_end;
439 
440 		if (entry->type != old_type)
441 			continue;
442 
443 		entry_end = entry->addr + entry->size;
444 
445 		/* Completely covered by new range? */
446 		if (entry->addr >= start && entry_end <= end) {
447 			entry->type = new_type;
448 			real_updated_size += entry->size;
449 			continue;
450 		}
451 
452 		/* New range is completely covered? */
453 		if (entry->addr < start && entry_end > end) {
454 			__e820__range_add(table, start, size, new_type);
455 			__e820__range_add(table, end, entry_end - end, entry->type);
456 			entry->size = start - entry->addr;
457 			real_updated_size += size;
458 			continue;
459 		}
460 
461 		/* Partially covered: */
462 		final_start = max(start, entry->addr);
463 		final_end = min(end, entry_end);
464 		if (final_start >= final_end)
465 			continue;
466 
467 		__e820__range_add(table, final_start, final_end - final_start, new_type);
468 
469 		real_updated_size += final_end - final_start;
470 
471 		/*
472 		 * Left range could be head or tail, so need to update
473 		 * its size first:
474 		 */
475 		entry->size -= final_end - final_start;
476 		if (entry->addr < final_start)
477 			continue;
478 
479 		entry->addr = final_end;
480 	}
481 	return real_updated_size;
482 }
483 
484 u64 __init e820__range_update(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type)
485 {
486 	return __e820__range_update(e820_table, start, size, old_type, new_type);
487 }
488 
489 static u64 __init e820__range_update_firmware(u64 start, u64 size, enum e820_type old_type, enum e820_type  new_type)
490 {
491 	return __e820__range_update(e820_table_firmware, start, size, old_type, new_type);
492 }
493 
494 /* Remove a range of memory from the E820 table: */
495 u64 __init e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type)
496 {
497 	int i;
498 	u64 end;
499 	u64 real_removed_size = 0;
500 
501 	if (size > (ULLONG_MAX - start))
502 		size = ULLONG_MAX - start;
503 
504 	end = start + size;
505 	pr_debug("e820: remove [mem %#010Lx-%#010Lx] ", start, end - 1);
506 	if (check_type)
507 		e820_print_type(old_type);
508 	pr_cont("\n");
509 
510 	for (i = 0; i < e820_table->nr_entries; i++) {
511 		struct e820_entry *entry = &e820_table->entries[i];
512 		u64 final_start, final_end;
513 		u64 entry_end;
514 
515 		if (check_type && entry->type != old_type)
516 			continue;
517 
518 		entry_end = entry->addr + entry->size;
519 
520 		/* Completely covered? */
521 		if (entry->addr >= start && entry_end <= end) {
522 			real_removed_size += entry->size;
523 			memset(entry, 0, sizeof(*entry));
524 			continue;
525 		}
526 
527 		/* Is the new range completely covered? */
528 		if (entry->addr < start && entry_end > end) {
529 			e820__range_add(end, entry_end - end, entry->type);
530 			entry->size = start - entry->addr;
531 			real_removed_size += size;
532 			continue;
533 		}
534 
535 		/* Partially covered: */
536 		final_start = max(start, entry->addr);
537 		final_end = min(end, entry_end);
538 		if (final_start >= final_end)
539 			continue;
540 
541 		real_removed_size += final_end - final_start;
542 
543 		/*
544 		 * Left range could be head or tail, so need to update
545 		 * the size first:
546 		 */
547 		entry->size -= final_end - final_start;
548 		if (entry->addr < final_start)
549 			continue;
550 
551 		entry->addr = final_end;
552 	}
553 	return real_removed_size;
554 }
555 
556 void __init e820__update_table_print(void)
557 {
558 	if (e820__update_table(e820_table))
559 		return;
560 
561 	pr_info("e820: modified physical RAM map:\n");
562 	e820__print_table("modified");
563 }
564 
565 static void __init e820__update_table_firmware(void)
566 {
567 	e820__update_table(e820_table_firmware);
568 }
569 
570 #define MAX_GAP_END 0x100000000ull
571 
572 /*
573  * Search for a gap in the E820 memory space from 0 to MAX_GAP_END (4GB).
574  */
575 static int __init e820_search_gap(unsigned long *gapstart, unsigned long *gapsize)
576 {
577 	unsigned long long last = MAX_GAP_END;
578 	int i = e820_table->nr_entries;
579 	int found = 0;
580 
581 	while (--i >= 0) {
582 		unsigned long long start = e820_table->entries[i].addr;
583 		unsigned long long end = start + e820_table->entries[i].size;
584 
585 		/*
586 		 * Since "last" is at most 4GB, we know we'll
587 		 * fit in 32 bits if this condition is true:
588 		 */
589 		if (last > end) {
590 			unsigned long gap = last - end;
591 
592 			if (gap >= *gapsize) {
593 				*gapsize = gap;
594 				*gapstart = end;
595 				found = 1;
596 			}
597 		}
598 		if (start < last)
599 			last = start;
600 	}
601 	return found;
602 }
603 
604 /*
605  * Search for the biggest gap in the low 32 bits of the E820
606  * memory space. We pass this space to the PCI subsystem, so
607  * that it can assign MMIO resources for hotplug or
608  * unconfigured devices in.
609  *
610  * Hopefully the BIOS let enough space left.
611  */
612 __init void e820__setup_pci_gap(void)
613 {
614 	unsigned long gapstart, gapsize;
615 	int found;
616 
617 	gapsize = 0x400000;
618 	found  = e820_search_gap(&gapstart, &gapsize);
619 
620 	if (!found) {
621 #ifdef CONFIG_X86_64
622 		gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024;
623 		pr_err(
624 			"e820: Cannot find an available gap in the 32-bit address range\n"
625 			"e820: PCI devices with unassigned 32-bit BARs may not work!\n");
626 #else
627 		gapstart = 0x10000000;
628 #endif
629 	}
630 
631 	/*
632 	 * e820_reserve_resources_late protect stolen RAM already
633 	 */
634 	pci_mem_start = gapstart;
635 
636 	pr_info("e820: [mem %#010lx-%#010lx] available for PCI devices\n", gapstart, gapstart + gapsize - 1);
637 }
638 
639 /*
640  * Called late during init, in free_initmem().
641  *
642  * Initial e820_table and e820_table_firmware are largish __initdata arrays.
643  *
644  * Copy them to a (usually much smaller) dynamically allocated area that is
645  * sized precisely after the number of e820 entries.
646  *
647  * This is done after we've performed all the fixes and tweaks to the tables.
648  * All functions which modify them are __init functions, which won't exist
649  * after free_initmem().
650  */
651 __init void e820_reallocate_tables(void)
652 {
653 	struct e820_table *n;
654 	int size;
655 
656 	size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table->nr_entries;
657 	n = kmalloc(size, GFP_KERNEL);
658 	BUG_ON(!n);
659 	memcpy(n, e820_table, size);
660 	e820_table = n;
661 
662 	size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_firmware->nr_entries;
663 	n = kmalloc(size, GFP_KERNEL);
664 	BUG_ON(!n);
665 	memcpy(n, e820_table_firmware, size);
666 	e820_table_firmware = n;
667 }
668 
669 /*
670  * Because of the small fixed size of struct boot_params, only the first
671  * 128 E820 memory entries are passed to the kernel via boot_params.e820_table,
672  * the remaining (if any) entries are passed via the SETUP_E820_EXT node of
673  * struct setup_data, which is parsed here.
674  */
675 void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len)
676 {
677 	int entries;
678 	struct e820_entry *extmap;
679 	struct setup_data *sdata;
680 
681 	sdata = early_memremap(phys_addr, data_len);
682 	entries = sdata->len / sizeof(*extmap);
683 	extmap = (struct e820_entry *)(sdata->data);
684 
685 	__append_e820_table(extmap, entries);
686 	e820__update_table(e820_table);
687 
688 	early_memunmap(sdata, data_len);
689 	pr_info("e820: extended physical RAM map:\n");
690 	e820__print_table("extended");
691 }
692 
693 /**
694  * Find the ranges of physical addresses that do not correspond to
695  * E820 RAM areas and mark the corresponding pages as 'nosave' for
696  * hibernation (32-bit) or software suspend and suspend to RAM (64-bit).
697  *
698  * This function requires the E820 map to be sorted and without any
699  * overlapping entries.
700  */
701 void __init e820_mark_nosave_regions(unsigned long limit_pfn)
702 {
703 	int i;
704 	unsigned long pfn = 0;
705 
706 	for (i = 0; i < e820_table->nr_entries; i++) {
707 		struct e820_entry *entry = &e820_table->entries[i];
708 
709 		if (pfn < PFN_UP(entry->addr))
710 			register_nosave_region(pfn, PFN_UP(entry->addr));
711 
712 		pfn = PFN_DOWN(entry->addr + entry->size);
713 
714 		if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN)
715 			register_nosave_region(PFN_UP(entry->addr), pfn);
716 
717 		if (pfn >= limit_pfn)
718 			break;
719 	}
720 }
721 
#ifdef CONFIG_ACPI
/*
 * Hand every ACPI NVS region of the main E820 table to acpi_nvs_register(),
 * so that these regions can be saved/restored during hibernation and the
 * subsequent resume. Runs as a core_initcall and always returns 0.
 */
static int __init e820_mark_nvs_memory(void)
{
	int idx;

	for (idx = 0; idx < e820_table->nr_entries; idx++) {
		struct e820_entry *entry = &e820_table->entries[idx];

		if (entry->type != E820_TYPE_NVS)
			continue;

		acpi_nvs_register(entry->addr, entry->size);
	}

	return 0;
}
core_initcall(e820_mark_nvs_memory);
#endif
742 
743 /*
744  * Allocate the requested number of bytes with the requsted alignment
745  * and return (the physical address) to the caller. Also register this
746  * range in the 'firmware' E820 table as a reserved range.
747  *
748  * This allows kexec to fake a new mptable, as if it came from the real
749  * system.
750  */
751 u64 __init e820__memblock_alloc_reserved(u64 size, u64 align)
752 {
753 	u64 addr;
754 
755 	addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
756 	if (addr) {
757 		e820__range_update_firmware(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED);
758 		pr_info("e820: update e820_table_firmware for e820__memblock_alloc_reserved()\n");
759 		e820__update_table_firmware();
760 	}
761 
762 	return addr;
763 }
764 
/*
 * MAX_ARCH_PFN: upper bound on page frame numbers, chosen per configuration.
 *
 * Every variant is fully parenthesized so that the macro behaves as a single
 * operand wherever it is used (e.g. 'MAX_ARCH_PFN + 1').
 */
#ifdef CONFIG_X86_32
# ifdef CONFIG_X86_PAE
#  define MAX_ARCH_PFN		(1ULL<<(36-PAGE_SHIFT))
# else
#  define MAX_ARCH_PFN		(1ULL<<(32-PAGE_SHIFT))
# endif
#else /* CONFIG_X86_32 */
# define MAX_ARCH_PFN		(MAXMEM >> PAGE_SHIFT)
#endif
774 
775 /*
776  * Find the highest page frame number we have available
777  */
778 static unsigned long __init e820_end_pfn(unsigned long limit_pfn, enum e820_type type)
779 {
780 	int i;
781 	unsigned long last_pfn = 0;
782 	unsigned long max_arch_pfn = MAX_ARCH_PFN;
783 
784 	for (i = 0; i < e820_table->nr_entries; i++) {
785 		struct e820_entry *entry = &e820_table->entries[i];
786 		unsigned long start_pfn;
787 		unsigned long end_pfn;
788 
789 		if (entry->type != type)
790 			continue;
791 
792 		start_pfn = entry->addr >> PAGE_SHIFT;
793 		end_pfn = (entry->addr + entry->size) >> PAGE_SHIFT;
794 
795 		if (start_pfn >= limit_pfn)
796 			continue;
797 		if (end_pfn > limit_pfn) {
798 			last_pfn = limit_pfn;
799 			break;
800 		}
801 		if (end_pfn > last_pfn)
802 			last_pfn = end_pfn;
803 	}
804 
805 	if (last_pfn > max_arch_pfn)
806 		last_pfn = max_arch_pfn;
807 
808 	pr_info("e820: last_pfn = %#lx max_arch_pfn = %#lx\n",
809 			 last_pfn, max_arch_pfn);
810 	return last_pfn;
811 }
812 
813 unsigned long __init e820_end_of_ram_pfn(void)
814 {
815 	return e820_end_pfn(MAX_ARCH_PFN, E820_TYPE_RAM);
816 }
817 
818 unsigned long __init e820_end_of_low_ram_pfn(void)
819 {
820 	return e820_end_pfn(1UL << (32 - PAGE_SHIFT), E820_TYPE_RAM);
821 }
822 
823 static void __init early_panic(char *msg)
824 {
825 	early_printk(msg);
826 	panic(msg);
827 }
828 
/*
 * Set to 1 by the "mem=" and "memmap=" parameter parsers below;
 * e820__finish_early_params() checks it to decide whether the table has to
 * be sanitized and printed as a user-defined map.
 */
829 static int userdef __initdata;
830 
831 /* The "mem=nopentium" boot option disables 4MB page tables on 32-bit kernels: */
832 static int __init parse_memopt(char *p)
833 {
834 	u64 mem_size;
835 
836 	if (!p)
837 		return -EINVAL;
838 
839 	if (!strcmp(p, "nopentium")) {
840 #ifdef CONFIG_X86_32
841 		setup_clear_cpu_cap(X86_FEATURE_PSE);
842 		return 0;
843 #else
844 		pr_warn("mem=nopentium ignored! (only supported on x86_32)\n");
845 		return -EINVAL;
846 #endif
847 	}
848 
849 	userdef = 1;
850 	mem_size = memparse(p, &p);
851 
852 	/* Don't remove all memory when getting "mem={invalid}" parameter: */
853 	if (mem_size == 0)
854 		return -EINVAL;
855 
856 	e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1);
857 
858 	return 0;
859 }
860 early_param("mem", parse_memopt);
861 
862 static int __init parse_memmap_one(char *p)
863 {
864 	char *oldp;
865 	u64 start_at, mem_size;
866 
867 	if (!p)
868 		return -EINVAL;
869 
870 	if (!strncmp(p, "exactmap", 8)) {
871 #ifdef CONFIG_CRASH_DUMP
872 		/*
873 		 * If we are doing a crash dump, we still need to know
874 		 * the real memory size before the original memory map is
875 		 * reset.
876 		 */
877 		saved_max_pfn = e820_end_of_ram_pfn();
878 #endif
879 		e820_table->nr_entries = 0;
880 		userdef = 1;
881 		return 0;
882 	}
883 
884 	oldp = p;
885 	mem_size = memparse(p, &p);
886 	if (p == oldp)
887 		return -EINVAL;
888 
889 	userdef = 1;
890 	if (*p == '@') {
891 		start_at = memparse(p+1, &p);
892 		e820__range_add(start_at, mem_size, E820_TYPE_RAM);
893 	} else if (*p == '#') {
894 		start_at = memparse(p+1, &p);
895 		e820__range_add(start_at, mem_size, E820_TYPE_ACPI);
896 	} else if (*p == '$') {
897 		start_at = memparse(p+1, &p);
898 		e820__range_add(start_at, mem_size, E820_TYPE_RESERVED);
899 	} else if (*p == '!') {
900 		start_at = memparse(p+1, &p);
901 		e820__range_add(start_at, mem_size, E820_TYPE_PRAM);
902 	} else {
903 		e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1);
904 	}
905 
906 	return *p == '\0' ? 0 : -EINVAL;
907 }
908 
909 static int __init parse_memmap_opt(char *str)
910 {
911 	while (str) {
912 		char *k = strchr(str, ',');
913 
914 		if (k)
915 			*k++ = 0;
916 
917 		parse_memmap_one(str);
918 		str = k;
919 	}
920 
921 	return 0;
922 }
923 early_param("memmap", parse_memmap_opt);
924 
925 /*
926  * Reserve all entries from the bootloader's extensible data nodes list,
927  * because if present we are going to use it later on to fetch e820
928  * entries from it:
929  */
930 void __init e820__reserve_setup_data(void)
931 {
932 	struct setup_data *data;
933 	u64 pa_data;
934 
935 	pa_data = boot_params.hdr.setup_data;
936 	if (!pa_data)
937 		return;
938 
939 	while (pa_data) {
940 		data = early_memremap(pa_data, sizeof(*data));
941 		e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
942 		pa_data = data->next;
943 		early_memunmap(data, sizeof(*data));
944 	}
945 
946 	e820__update_table(e820_table);
947 
948 	memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware));
949 
950 	pr_info("extended physical RAM map:\n");
951 	e820__print_table("reserve setup_data");
952 }
953 
954 /*
955  * Called after parse_early_param(), after early parameters (such as mem=)
956  * have been processed, in which case we already have an E820 table filled in
957  * via the parameter callback function(s), but it's not sorted and printed yet:
958  */
959 void __init e820__finish_early_params(void)
960 {
961 	if (userdef) {
962 		if (e820__update_table(e820_table) < 0)
963 			early_panic("Invalid user supplied memory map");
964 
965 		pr_info("e820: user-defined physical RAM map:\n");
966 		e820__print_table("user");
967 	}
968 }
969 
970 static const char *__init e820_type_to_string(struct e820_entry *entry)
971 {
972 	switch (entry->type) {
973 	case E820_TYPE_RESERVED_KERN:	/* Fall-through: */
974 	case E820_TYPE_RAM:		return "System RAM";
975 	case E820_TYPE_ACPI:		return "ACPI Tables";
976 	case E820_TYPE_NVS:		return "ACPI Non-volatile Storage";
977 	case E820_TYPE_UNUSABLE:	return "Unusable memory";
978 	case E820_TYPE_PRAM:		return "Persistent Memory (legacy)";
979 	case E820_TYPE_PMEM:		return "Persistent Memory";
980 	default:			return "Reserved";
981 	}
982 }
983 
984 static unsigned long __init e820_type_to_iomem_type(struct e820_entry *entry)
985 {
986 	switch (entry->type) {
987 	case E820_TYPE_RESERVED_KERN:	/* Fall-through: */
988 	case E820_TYPE_RAM:		return IORESOURCE_SYSTEM_RAM;
989 	case E820_TYPE_ACPI:		/* Fall-through: */
990 	case E820_TYPE_NVS:		/* Fall-through: */
991 	case E820_TYPE_UNUSABLE:	/* Fall-through: */
992 	case E820_TYPE_PRAM:		/* Fall-through: */
993 	case E820_TYPE_PMEM:		/* Fall-through: */
994 	default:			return IORESOURCE_MEM;
995 	}
996 }
997 
998 static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry)
999 {
1000 	switch (entry->type) {
1001 	case E820_TYPE_ACPI:		return IORES_DESC_ACPI_TABLES;
1002 	case E820_TYPE_NVS:		return IORES_DESC_ACPI_NV_STORAGE;
1003 	case E820_TYPE_PMEM:		return IORES_DESC_PERSISTENT_MEMORY;
1004 	case E820_TYPE_PRAM:		return IORES_DESC_PERSISTENT_MEMORY_LEGACY;
1005 	case E820_TYPE_RESERVED_KERN:	/* Fall-through: */
1006 	case E820_TYPE_RAM:		/* Fall-through: */
1007 	case E820_TYPE_UNUSABLE:	/* Fall-through: */
1008 	default:			return IORES_DESC_NONE;
1009 	}
1010 }
1011 
1012 static bool __init do_mark_busy(u32 type, struct resource *res)
1013 {
1014 	/* this is the legacy bios/dos rom-shadow + mmio region */
1015 	if (res->start < (1ULL<<20))
1016 		return true;
1017 
1018 	/*
1019 	 * Treat persistent memory like device memory, i.e. reserve it
1020 	 * for exclusive use of a driver
1021 	 */
1022 	switch (type) {
1023 	case E820_TYPE_RESERVED:
1024 	case E820_TYPE_PRAM:
1025 	case E820_TYPE_PMEM:
1026 		return false;
1027 	default:
1028 		return true;
1029 	}
1030 }
1031 
1032 /*
1033  * Mark E820 reserved areas as busy for the resource manager:
1034  */
1035 
1036 static struct resource __initdata *e820_res;
1037 
1038 void __init e820_reserve_resources(void)
1039 {
1040 	int i;
1041 	struct resource *res;
1042 	u64 end;
1043 
1044 	res = alloc_bootmem(sizeof(*res) * e820_table->nr_entries);
1045 	e820_res = res;
1046 
1047 	for (i = 0; i < e820_table->nr_entries; i++) {
1048 		struct e820_entry *entry = e820_table->entries + i;
1049 
1050 		end = entry->addr + entry->size - 1;
1051 		if (end != (resource_size_t)end) {
1052 			res++;
1053 			continue;
1054 		}
1055 		res->start = entry->addr;
1056 		res->end   = end;
1057 		res->name  = e820_type_to_string(entry);
1058 		res->flags = e820_type_to_iomem_type(entry);
1059 		res->desc  = e820_type_to_iores_desc(entry);
1060 
1061 		/*
1062 		 * don't register the region that could be conflicted with
1063 		 * pci device BAR resource and insert them later in
1064 		 * pcibios_resource_survey()
1065 		 */
1066 		if (do_mark_busy(entry->type, res)) {
1067 			res->flags |= IORESOURCE_BUSY;
1068 			insert_resource(&iomem_resource, res);
1069 		}
1070 		res++;
1071 	}
1072 
1073 	for (i = 0; i < e820_table_firmware->nr_entries; i++) {
1074 		struct e820_entry *entry = e820_table_firmware->entries + i;
1075 
1076 		firmware_map_add_early(entry->addr, entry->addr + entry->size, e820_type_to_string(entry));
1077 	}
1078 }
1079 
1080 /* How much should we pad RAM ending depending on where it is? */
1081 static unsigned long __init ram_alignment(resource_size_t pos)
1082 {
1083 	unsigned long mb = pos >> 20;
1084 
1085 	/* To 64kB in the first megabyte */
1086 	if (!mb)
1087 		return 64*1024;
1088 
1089 	/* To 1MB in the first 16MB */
1090 	if (mb < 16)
1091 		return 1024*1024;
1092 
1093 	/* To 64MB for anything above that */
1094 	return 64*1024*1024;
1095 }
1096 
1097 #define MAX_RESOURCE_SIZE ((resource_size_t)-1)
1098 
1099 void __init e820_reserve_resources_late(void)
1100 {
1101 	int i;
1102 	struct resource *res;
1103 
1104 	res = e820_res;
1105 	for (i = 0; i < e820_table->nr_entries; i++) {
1106 		if (!res->parent && res->end)
1107 			insert_resource_expand_to_fit(&iomem_resource, res);
1108 		res++;
1109 	}
1110 
1111 	/*
1112 	 * Try to bump up RAM regions to reasonable boundaries, to
1113 	 * avoid stolen RAM:
1114 	 */
1115 	for (i = 0; i < e820_table->nr_entries; i++) {
1116 		struct e820_entry *entry = &e820_table->entries[i];
1117 		u64 start, end;
1118 
1119 		if (entry->type != E820_TYPE_RAM)
1120 			continue;
1121 
1122 		start = entry->addr + entry->size;
1123 		end = round_up(start, ram_alignment(start)) - 1;
1124 		if (end > MAX_RESOURCE_SIZE)
1125 			end = MAX_RESOURCE_SIZE;
1126 		if (start >= end)
1127 			continue;
1128 
1129 		pr_debug("e820: reserve RAM buffer [mem %#010llx-%#010llx]\n", start, end);
1130 		reserve_region_with_split(&iomem_resource, start, end, "RAM buffer");
1131 	}
1132 }
1133 
1134 /*
1135  * Pass the firmware (bootloader) E820 map to the kernel and process it:
1136  */
1137 char *__init e820__memory_setup_default(void)
1138 {
1139 	char *who = "BIOS-e820";
1140 	u32 new_nr;
1141 
1142 	/*
1143 	 * Try to copy the BIOS-supplied E820-map.
1144 	 *
1145 	 * Otherwise fake a memory map; one section from 0k->640k,
1146 	 * the next section from 1mb->appropriate_mem_k
1147 	 */
1148 	new_nr = boot_params.e820_entries;
1149 	__e820__update_table(boot_params.e820_table, ARRAY_SIZE(boot_params.e820_table), &new_nr);
1150 	boot_params.e820_entries = new_nr;
1151 
1152 	if (append_e820_table(boot_params.e820_table, boot_params.e820_entries) < 0) {
1153 		u64 mem_size;
1154 
1155 		/* Compare results from other methods and take the one that gives more RAM: */
1156 		if (boot_params.alt_mem_k < boot_params.screen_info.ext_mem_k) {
1157 			mem_size = boot_params.screen_info.ext_mem_k;
1158 			who = "BIOS-88";
1159 		} else {
1160 			mem_size = boot_params.alt_mem_k;
1161 			who = "BIOS-e801";
1162 		}
1163 
1164 		e820_table->nr_entries = 0;
1165 		e820__range_add(0, LOWMEMSIZE(), E820_TYPE_RAM);
1166 		e820__range_add(HIGH_MEMORY, mem_size << 10, E820_TYPE_RAM);
1167 	}
1168 
1169 	return who;
1170 }
1171 
1172 /*
1173  * Calls e820__memory_setup_default() in essence to pick up the firmware/bootloader
1174  * E820 map - with an optional platform quirk available for virtual platforms
1175  * to override this method of boot environment processing:
1176  */
1177 void __init e820__memory_setup(void)
1178 {
1179 	char *who;
1180 
1181 	/* This is a firmware interface ABI - make sure we don't break it: */
1182 	BUILD_BUG_ON(sizeof(struct e820_entry) != 20);
1183 
1184 	who = x86_init.resources.memory_setup();
1185 
1186 	memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware));
1187 
1188 	pr_info("e820: BIOS-provided physical RAM map:\n");
1189 	e820__print_table(who);
1190 }
1191 
1192 void __init e820__memblock_setup(void)
1193 {
1194 	int i;
1195 	u64 end;
1196 
1197 	/*
1198 	 * The bootstrap memblock region count maximum is 128 entries
1199 	 * (INIT_MEMBLOCK_REGIONS), but EFI might pass us more E820 entries
1200 	 * than that - so allow memblock resizing.
1201 	 *
1202 	 * This is safe, because this call happens pretty late during x86 setup,
1203 	 * so we know about reserved memory regions already. (This is important
1204 	 * so that memblock resizing does no stomp over reserved areas.)
1205 	 */
1206 	memblock_allow_resize();
1207 
1208 	for (i = 0; i < e820_table->nr_entries; i++) {
1209 		struct e820_entry *entry = &e820_table->entries[i];
1210 
1211 		end = entry->addr + entry->size;
1212 		if (end != (resource_size_t)end)
1213 			continue;
1214 
1215 		if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN)
1216 			continue;
1217 
1218 		memblock_add(entry->addr, entry->size);
1219 	}
1220 
1221 	/* Throw away partial pages: */
1222 	memblock_trim_memory(PAGE_SIZE);
1223 
1224 	memblock_dump_all();
1225 }
1226