// SPDX-License-Identifier: GPL-2.0
/*
 * kaslr.c
 *
 * This contains the routines needed to generate a reasonable level of
 * entropy to choose a randomized kernel base address offset in support
 * of Kernel Address Space Layout Randomization (KASLR). Additionally
 * handles walking the physical memory maps (and tracking memory regions
 * to avoid) in order to select a physical memory location that can
 * contain the entire properly aligned running kernel image.
 *
 */
13d52e7d5aSBaoquan He
/*
 * isspace() in linux/ctype.h is expected by next_arg() to filter
 * out "space/lf/tab". However, boot/ctype.h conflicts with linux/ctype.h
 * because isdigit() is implemented in both of them, so disable
 * boot/ctype.h here.
 */
20d52e7d5aSBaoquan He #define BOOT_CTYPE_H
21d52e7d5aSBaoquan He
229b238748SKees Cook #include "misc.h"
23dc425a6eSKees Cook #include "error.h"
245b8b9cf7SArnd Bergmann #include "../string.h"
255dc91f2dSBorislav Petkov #include "efi.h"
269b238748SKees Cook
279b238748SKees Cook #include <generated/compile.h>
289b238748SKees Cook #include <linux/module.h>
299b238748SKees Cook #include <linux/uts.h>
309b238748SKees Cook #include <linux/utsname.h>
31d52e7d5aSBaoquan He #include <linux/ctype.h>
322df8220cSMasahiro Yamada #include <generated/utsversion.h>
339b238748SKees Cook #include <generated/utsrelease.h>
349b238748SKees Cook
3576167e5cSArvind Sankar #define _SETUP
3676167e5cSArvind Sankar #include <asm/setup.h> /* For COMMAND_LINE_SIZE */
3776167e5cSArvind Sankar #undef _SETUP
3876167e5cSArvind Sankar
39d52e7d5aSBaoquan He extern unsigned long get_cmd_line_ptr(void);
40d52e7d5aSBaoquan He
419b238748SKees Cook /* Simplified build-specific string for starting entropy. */
429b238748SKees Cook static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@"
439b238748SKees Cook LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION;
449b238748SKees Cook
/*
 * Fold the buffer at @area into @hash one unsigned long at a time: for
 * every whole word, the running hash is rotated right by 7 bits and then
 * XORed with that word. Trailing bytes that do not fill a complete
 * unsigned long are not consumed.
 *
 * @hash: starting hash value
 * @area: buffer to mix in (only read, never written)
 * @size: size of @area in bytes
 *
 * Returns the updated hash; @hash is returned unchanged when @size is
 * smaller than sizeof(unsigned long).
 */
static unsigned long rotate_xor(unsigned long hash, const void *area,
				size_t size)
{
	/* Keep the const qualifier: the buffer is read-only here. */
	const unsigned long *ptr = area;
	size_t i;

	for (i = 0; i < size / sizeof(hash); i++) {
		/* Rotate by odd number of bits and XOR. */
		hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7);
		hash ^= ptr[i];
	}

	return hash;
}
599b238748SKees Cook
609b238748SKees Cook /* Attempt to create a simple but unpredictable starting entropy. */
get_boot_seed(void)61d899a7d1SThomas Garnier static unsigned long get_boot_seed(void)
629b238748SKees Cook {
639b238748SKees Cook unsigned long hash = 0;
649b238748SKees Cook
659b238748SKees Cook hash = rotate_xor(hash, build_str, sizeof(build_str));
66*c59843e8SArd Biesheuvel hash = rotate_xor(hash, boot_params_ptr, sizeof(*boot_params_ptr));
679b238748SKees Cook
689b238748SKees Cook return hash;
699b238748SKees Cook }
709b238748SKees Cook
71d899a7d1SThomas Garnier #define KASLR_COMPRESSED_BOOT
72d899a7d1SThomas Garnier #include "../../lib/kaslr.c"
739b238748SKees Cook
749b238748SKees Cook
75f2844249SDave Jiang /* Only supporting at most 4 unusable memmap regions with kaslr */
76f2844249SDave Jiang #define MAX_MEMMAP_REGIONS 4
77f2844249SDave Jiang
78f2844249SDave Jiang static bool memmap_too_large;
79f2844249SDave Jiang
80d52e7d5aSBaoquan He
8145128694SArvind Sankar /*
8245128694SArvind Sankar * Store memory limit: MAXMEM on 64-bit and KERNEL_IMAGE_SIZE on 32-bit.
8345128694SArvind Sankar * It may be reduced by "mem=nn[KMG]" or "memmap=nn[KMG]" command line options.
8445128694SArvind Sankar */
853a066990SArvind Sankar static u64 mem_limit;
864cdba14fSBaoquan He
87690eaa53SChao Fan /* Number of immovable memory regions */
88690eaa53SChao Fan static int num_immovable_mem;
894cdba14fSBaoquan He
90ed09acdeSKees Cook enum mem_avoid_index {
91ed09acdeSKees Cook MEM_AVOID_ZO_RANGE = 0,
92ed09acdeSKees Cook MEM_AVOID_INITRD,
93ed09acdeSKees Cook MEM_AVOID_CMDLINE,
94ed09acdeSKees Cook MEM_AVOID_BOOTPARAMS,
95f2844249SDave Jiang MEM_AVOID_MEMMAP_BEGIN,
96f2844249SDave Jiang MEM_AVOID_MEMMAP_END = MEM_AVOID_MEMMAP_BEGIN + MAX_MEMMAP_REGIONS - 1,
97ed09acdeSKees Cook MEM_AVOID_MAX,
98ed09acdeSKees Cook };
99ed09acdeSKees Cook
1009b238748SKees Cook static struct mem_vector mem_avoid[MEM_AVOID_MAX];
1019b238748SKees Cook
mem_overlaps(struct mem_vector * one,struct mem_vector * two)1029b238748SKees Cook static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two)
1039b238748SKees Cook {
1049b238748SKees Cook /* Item one is entirely before item two. */
1059b238748SKees Cook if (one->start + one->size <= two->start)
1069b238748SKees Cook return false;
1079b238748SKees Cook /* Item one is entirely after item two. */
1089b238748SKees Cook if (one->start >= two->start + two->size)
1099b238748SKees Cook return false;
1109b238748SKees Cook return true;
1119b238748SKees Cook }
1129b238748SKees Cook
/*
 * Return a pointer to the first character of @str for which isspace() is
 * false (possibly the terminating NUL).
 */
char *skip_spaces(const char *str)
{
	const char *cur;

	for (cur = str; isspace(*cur); cur++)
		;

	return (char *)cur;
}
119d52e7d5aSBaoquan He #include "../../../../lib/ctype.c"
120d52e7d5aSBaoquan He #include "../../../../lib/cmdline.c"
121f2844249SDave Jiang
/* Selects which option syntax parse_memmap() is asked to interpret. */
enum parse_mode {
	PARSE_MEMMAP,	/* used for "memmap" */
	PARSE_EFI,	/* used for "efi_fake_mem" */
};
126199c8471SDan Williams
127f2844249SDave Jiang static int
parse_memmap(char * p,u64 * start,u64 * size,enum parse_mode mode)1283a066990SArvind Sankar parse_memmap(char *p, u64 *start, u64 *size, enum parse_mode mode)
129f2844249SDave Jiang {
130f2844249SDave Jiang char *oldp;
131f2844249SDave Jiang
132f2844249SDave Jiang if (!p)
133f2844249SDave Jiang return -EINVAL;
134f2844249SDave Jiang
135f2844249SDave Jiang /* We don't care about this option here */
136f2844249SDave Jiang if (!strncmp(p, "exactmap", 8))
137f2844249SDave Jiang return -EINVAL;
138f2844249SDave Jiang
139f2844249SDave Jiang oldp = p;
140d52e7d5aSBaoquan He *size = memparse(p, &p);
141f2844249SDave Jiang if (p == oldp)
142f2844249SDave Jiang return -EINVAL;
143f2844249SDave Jiang
144f2844249SDave Jiang switch (*p) {
145f2844249SDave Jiang case '#':
146f2844249SDave Jiang case '$':
147f2844249SDave Jiang case '!':
148d52e7d5aSBaoquan He *start = memparse(p + 1, &p);
149f2844249SDave Jiang return 0;
1504cdba14fSBaoquan He case '@':
151199c8471SDan Williams if (mode == PARSE_MEMMAP) {
152199c8471SDan Williams /*
153199c8471SDan Williams * memmap=nn@ss specifies usable region, should
154199c8471SDan Williams * be skipped
155199c8471SDan Williams */
1564cdba14fSBaoquan He *size = 0;
157199c8471SDan Williams } else {
1583a066990SArvind Sankar u64 flags;
159199c8471SDan Williams
160199c8471SDan Williams /*
161199c8471SDan Williams * efi_fake_mem=nn@ss:attr the attr specifies
162199c8471SDan Williams * flags that might imply a soft-reservation.
163199c8471SDan Williams */
164199c8471SDan Williams *start = memparse(p + 1, &p);
165199c8471SDan Williams if (p && *p == ':') {
166199c8471SDan Williams p++;
167199c8471SDan Williams if (kstrtoull(p, 0, &flags) < 0)
168199c8471SDan Williams *size = 0;
169199c8471SDan Williams else if (flags & EFI_MEMORY_SP)
170199c8471SDan Williams return 0;
171199c8471SDan Williams }
172199c8471SDan Williams *size = 0;
173199c8471SDan Williams }
174df561f66SGustavo A. R. Silva fallthrough;
1754cdba14fSBaoquan He default:
1764cdba14fSBaoquan He /*
1774cdba14fSBaoquan He * If w/o offset, only size specified, memmap=nn[KMG] has the
1784cdba14fSBaoquan He * same behaviour as mem=nn[KMG]. It limits the max address
1794cdba14fSBaoquan He * system can use. Region above the limit should be avoided.
1804cdba14fSBaoquan He */
1814cdba14fSBaoquan He *start = 0;
182f2844249SDave Jiang return 0;
183f2844249SDave Jiang }
184f2844249SDave Jiang
185f2844249SDave Jiang return -EINVAL;
186f2844249SDave Jiang }
187f2844249SDave Jiang
mem_avoid_memmap(enum parse_mode mode,char * str)188199c8471SDan Williams static void mem_avoid_memmap(enum parse_mode mode, char *str)
189f2844249SDave Jiang {
190d52e7d5aSBaoquan He static int i;
191f2844249SDave Jiang
192d52e7d5aSBaoquan He if (i >= MAX_MEMMAP_REGIONS)
193f2844249SDave Jiang return;
194f2844249SDave Jiang
195f2844249SDave Jiang while (str && (i < MAX_MEMMAP_REGIONS)) {
196f2844249SDave Jiang int rc;
1973a066990SArvind Sankar u64 start, size;
198f2844249SDave Jiang char *k = strchr(str, ',');
199f2844249SDave Jiang
200f2844249SDave Jiang if (k)
201f2844249SDave Jiang *k++ = 0;
202f2844249SDave Jiang
203199c8471SDan Williams rc = parse_memmap(str, &start, &size, mode);
204f2844249SDave Jiang if (rc < 0)
205f2844249SDave Jiang break;
206f2844249SDave Jiang str = k;
2074cdba14fSBaoquan He
2084cdba14fSBaoquan He if (start == 0) {
2094cdba14fSBaoquan He /* Store the specified memory limit if size > 0 */
21045128694SArvind Sankar if (size > 0 && size < mem_limit)
2114cdba14fSBaoquan He mem_limit = size;
2124cdba14fSBaoquan He
213f2844249SDave Jiang continue;
2144cdba14fSBaoquan He }
215f2844249SDave Jiang
216f2844249SDave Jiang mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].start = start;
217f2844249SDave Jiang mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].size = size;
218f2844249SDave Jiang i++;
219f2844249SDave Jiang }
220f2844249SDave Jiang
221f2844249SDave Jiang /* More than 4 memmaps, fail kaslr */
222f2844249SDave Jiang if ((i >= MAX_MEMMAP_REGIONS) && str)
223f2844249SDave Jiang memmap_too_large = true;
224f2844249SDave Jiang }
225f2844249SDave Jiang
2269b912485SBaoquan He /* Store the number of 1GB huge pages which users specified: */
2279b912485SBaoquan He static unsigned long max_gb_huge_pages;
2289b912485SBaoquan He
parse_gb_huge_pages(char * param,char * val)2299b912485SBaoquan He static void parse_gb_huge_pages(char *param, char *val)
2309b912485SBaoquan He {
2319b912485SBaoquan He static bool gbpage_sz;
2329b912485SBaoquan He char *p;
2339b912485SBaoquan He
2349b912485SBaoquan He if (!strcmp(param, "hugepagesz")) {
2359b912485SBaoquan He p = val;
2369b912485SBaoquan He if (memparse(p, &p) != PUD_SIZE) {
2379b912485SBaoquan He gbpage_sz = false;
2389b912485SBaoquan He return;
2399b912485SBaoquan He }
2409b912485SBaoquan He
2419b912485SBaoquan He if (gbpage_sz)
2429b912485SBaoquan He warn("Repeatedly set hugeTLB page size of 1G!\n");
2439b912485SBaoquan He gbpage_sz = true;
2449b912485SBaoquan He return;
2459b912485SBaoquan He }
2469b912485SBaoquan He
2479b912485SBaoquan He if (!strcmp(param, "hugepages") && gbpage_sz) {
2489b912485SBaoquan He p = val;
2499b912485SBaoquan He max_gb_huge_pages = simple_strtoull(p, &p, 0);
2509b912485SBaoquan He return;
2519b912485SBaoquan He }
2529b912485SBaoquan He }
2539b912485SBaoquan He
handle_mem_options(void)25444060e8aSChao Fan static void handle_mem_options(void)
255d52e7d5aSBaoquan He {
256d52e7d5aSBaoquan He char *args = (char *)get_cmd_line_ptr();
257709709acSArvind Sankar size_t len;
258d52e7d5aSBaoquan He char *tmp_cmdline;
259d52e7d5aSBaoquan He char *param, *val;
2604cdba14fSBaoquan He u64 mem_size;
261d52e7d5aSBaoquan He
262709709acSArvind Sankar if (!args)
26344060e8aSChao Fan return;
264d52e7d5aSBaoquan He
26576167e5cSArvind Sankar len = strnlen(args, COMMAND_LINE_SIZE-1);
266d52e7d5aSBaoquan He tmp_cmdline = malloc(len + 1);
267d52e7d5aSBaoquan He if (!tmp_cmdline)
268d52e7d5aSBaoquan He error("Failed to allocate space for tmp_cmdline");
269d52e7d5aSBaoquan He
270d52e7d5aSBaoquan He memcpy(tmp_cmdline, args, len);
271d52e7d5aSBaoquan He tmp_cmdline[len] = 0;
272d52e7d5aSBaoquan He args = tmp_cmdline;
273d52e7d5aSBaoquan He
274d52e7d5aSBaoquan He /* Chew leading spaces */
275d52e7d5aSBaoquan He args = skip_spaces(args);
276d52e7d5aSBaoquan He
277d52e7d5aSBaoquan He while (*args) {
278d52e7d5aSBaoquan He args = next_arg(args, ¶m, &val);
279d52e7d5aSBaoquan He /* Stop at -- */
280e2ee6173SArvind Sankar if (!val && strcmp(param, "--") == 0)
281e2ee6173SArvind Sankar break;
282d52e7d5aSBaoquan He
2834cdba14fSBaoquan He if (!strcmp(param, "memmap")) {
284199c8471SDan Williams mem_avoid_memmap(PARSE_MEMMAP, val);
28550def269SArvind Sankar } else if (IS_ENABLED(CONFIG_X86_64) && strstr(param, "hugepages")) {
286747ff626SBaoquan He parse_gb_huge_pages(param, val);
2874cdba14fSBaoquan He } else if (!strcmp(param, "mem")) {
2884cdba14fSBaoquan He char *p = val;
2894cdba14fSBaoquan He
2904cdba14fSBaoquan He if (!strcmp(p, "nopentium"))
2914cdba14fSBaoquan He continue;
2924cdba14fSBaoquan He mem_size = memparse(p, &p);
29344060e8aSChao Fan if (mem_size == 0)
294e2ee6173SArvind Sankar break;
29544060e8aSChao Fan
29645128694SArvind Sankar if (mem_size < mem_limit)
2974cdba14fSBaoquan He mem_limit = mem_size;
298199c8471SDan Williams } else if (!strcmp(param, "efi_fake_mem")) {
299199c8471SDan Williams mem_avoid_memmap(PARSE_EFI, val);
3004cdba14fSBaoquan He }
301d52e7d5aSBaoquan He }
302d52e7d5aSBaoquan He
303d52e7d5aSBaoquan He free(tmp_cmdline);
30444060e8aSChao Fan return;
305d52e7d5aSBaoquan He }
306d52e7d5aSBaoquan He
/*
 * In theory, KASLR can put the kernel anywhere in the range of [16M, MAXMEM)
 * on 64-bit, and [16M, KERNEL_IMAGE_SIZE) on 32-bit.
 *
 * The mem_avoid array is used to store the ranges that need to be avoided
 * when KASLR searches for an appropriate random address. We must avoid any
 * regions that are unsafe to overlap with during decompression, and other
 * things like the initrd, cmdline and boot_params. This comment seeks to
 * explain mem_avoid as clearly as possible since incorrect mem_avoid
 * memory ranges lead to really hard to debug boot failures.
 *
 * The initrd, cmdline, and boot_params are trivial to identify for
 * avoiding. They are MEM_AVOID_INITRD, MEM_AVOID_CMDLINE, and
 * MEM_AVOID_BOOTPARAMS respectively below.
 *
 * What is not obvious how to avoid is the range of memory that is used
 * during decompression (MEM_AVOID_ZO_RANGE below). This range must cover
 * the compressed kernel (ZO) and its run space, which is used to extract
 * the uncompressed kernel (VO) and relocs.
 *
 * ZO's full run size sits against the end of the decompression buffer, so
 * we can calculate where text, data, bss, etc of ZO are positioned more
 * easily.
 *
 * For additional background, the decompression calculations can be found
 * in header.S, and the memory diagram is based on the one found in misc.c.
 *
 * The following conditions are already enforced by the image layouts and
 * associated code:
 *  - input + input_size >= output + output_size
 *  - kernel_total_size <= init_size
 *  - kernel_total_size <= output_size (see Note below)
 *  - output + init_size >= output + output_size
 *
 * (Note that kernel_total_size and output_size have no fundamental
 * relationship, but output_size is passed to choose_random_location
 * as a maximum of the two. The diagram is showing a case where
 * kernel_total_size is larger than output_size, but this case is
 * handled by bumping output_size.)
 *
 * The above conditions can be illustrated by a diagram:
 *
 * 0     output            input             input+input_size  output+init_size
 * |     |                 |                             |             |
 * |     |                 |                             |             |
 * |-----|--------|--------|--------------|-----------|--|-------------|
 *                |                       |           |
 *                |                       |           |
 * output+init_size-ZO_INIT_SIZE  output+output_size  output+kernel_total_size
 *
 * [output, output+init_size) is the entire memory range used for
 * extracting the compressed image.
 *
 * [output, output+kernel_total_size) is the range needed for the
 * uncompressed kernel (VO) and its run size (bss, brk, etc).
 *
 * [output, output+output_size) is VO plus relocs (i.e. the entire
 * uncompressed payload contained by ZO). This is the area of the buffer
 * written to during decompression.
 *
 * [output+init_size-ZO_INIT_SIZE, output+init_size) is the worst-case
 * range of the copied ZO and decompression code. (i.e. the range
 * covered backwards of size ZO_INIT_SIZE, starting from output+init_size.)
 *
 * [input, input+input_size) is the original copied compressed image (ZO)
 * (i.e. it does not include its run size). This range must be avoided
 * because it contains the data used for decompression.
 *
 * [input+input_size, output+init_size) is [_text, _end) for ZO. This
 * range includes ZO's heap and stack, and must be avoided since it
 * performs the decompression.
 *
 * Since the above two ranges need to be avoided and they are adjacent,
 * they can be merged, resulting in: [input, output+init_size) which
 * becomes the MEM_AVOID_ZO_RANGE below.
 */
mem_avoid_init(unsigned long input,unsigned long input_size,unsigned long output)3839b238748SKees Cook static void mem_avoid_init(unsigned long input, unsigned long input_size,
3849dc1969cSYinghai Lu unsigned long output)
3859b238748SKees Cook {
386*c59843e8SArd Biesheuvel unsigned long init_size = boot_params_ptr->hdr.init_size;
3879b238748SKees Cook u64 initrd_start, initrd_size;
388709709acSArvind Sankar unsigned long cmd_line, cmd_line_size;
3899b238748SKees Cook
3909b238748SKees Cook /*
3919b238748SKees Cook * Avoid the region that is unsafe to overlap during
3929dc1969cSYinghai Lu * decompression.
3939b238748SKees Cook */
394ed09acdeSKees Cook mem_avoid[MEM_AVOID_ZO_RANGE].start = input;
395ed09acdeSKees Cook mem_avoid[MEM_AVOID_ZO_RANGE].size = (output + init_size) - input;
3969b238748SKees Cook
3979b238748SKees Cook /* Avoid initrd. */
398*c59843e8SArd Biesheuvel initrd_start = (u64)boot_params_ptr->ext_ramdisk_image << 32;
399*c59843e8SArd Biesheuvel initrd_start |= boot_params_ptr->hdr.ramdisk_image;
400*c59843e8SArd Biesheuvel initrd_size = (u64)boot_params_ptr->ext_ramdisk_size << 32;
401*c59843e8SArd Biesheuvel initrd_size |= boot_params_ptr->hdr.ramdisk_size;
402ed09acdeSKees Cook mem_avoid[MEM_AVOID_INITRD].start = initrd_start;
403ed09acdeSKees Cook mem_avoid[MEM_AVOID_INITRD].size = initrd_size;
4043a94707dSKees Cook /* No need to set mapping for initrd, it will be handled in VO. */
4059b238748SKees Cook
4069b238748SKees Cook /* Avoid kernel command line. */
407709709acSArvind Sankar cmd_line = get_cmd_line_ptr();
4089b238748SKees Cook /* Calculate size of cmd_line. */
409709709acSArvind Sankar if (cmd_line) {
41076167e5cSArvind Sankar cmd_line_size = strnlen((char *)cmd_line, COMMAND_LINE_SIZE-1) + 1;
411ed09acdeSKees Cook mem_avoid[MEM_AVOID_CMDLINE].start = cmd_line;
412ed09acdeSKees Cook mem_avoid[MEM_AVOID_CMDLINE].size = cmd_line_size;
413709709acSArvind Sankar }
4149b238748SKees Cook
415ed09acdeSKees Cook /* Avoid boot parameters. */
416*c59843e8SArd Biesheuvel mem_avoid[MEM_AVOID_BOOTPARAMS].start = (unsigned long)boot_params_ptr;
417*c59843e8SArd Biesheuvel mem_avoid[MEM_AVOID_BOOTPARAMS].size = sizeof(*boot_params_ptr);
4183a94707dSKees Cook
4193a94707dSKees Cook /* We don't need to set a mapping for setup_data. */
4203a94707dSKees Cook
421f2844249SDave Jiang /* Mark the memmap regions we need to avoid */
422747ff626SBaoquan He handle_mem_options();
423f2844249SDave Jiang
424690eaa53SChao Fan /* Enumerate the immovable memory regions */
425690eaa53SChao Fan num_immovable_mem = count_immovable_mem_regions();
4269b238748SKees Cook }
4279b238748SKees Cook
42806486d6cSKees Cook /*
42906486d6cSKees Cook * Does this memory vector overlap a known avoided area? If so, record the
43006486d6cSKees Cook * overlap region with the lowest address.
43106486d6cSKees Cook */
mem_avoid_overlap(struct mem_vector * img,struct mem_vector * overlap)43206486d6cSKees Cook static bool mem_avoid_overlap(struct mem_vector *img,
43306486d6cSKees Cook struct mem_vector *overlap)
4349b238748SKees Cook {
4359b238748SKees Cook int i;
4369b238748SKees Cook struct setup_data *ptr;
4370eb1a8afSArvind Sankar u64 earliest = img->start + img->size;
43806486d6cSKees Cook bool is_overlapping = false;
4399b238748SKees Cook
4409b238748SKees Cook for (i = 0; i < MEM_AVOID_MAX; i++) {
44106486d6cSKees Cook if (mem_overlaps(img, &mem_avoid[i]) &&
44206486d6cSKees Cook mem_avoid[i].start < earliest) {
44306486d6cSKees Cook *overlap = mem_avoid[i];
4446daa2ec0SBaoquan He earliest = overlap->start;
44506486d6cSKees Cook is_overlapping = true;
44606486d6cSKees Cook }
4479b238748SKees Cook }
4489b238748SKees Cook
4499b238748SKees Cook /* Avoid all entries in the setup_data linked list. */
450*c59843e8SArd Biesheuvel ptr = (struct setup_data *)(unsigned long)boot_params_ptr->hdr.setup_data;
4519b238748SKees Cook while (ptr) {
4529b238748SKees Cook struct mem_vector avoid;
4539b238748SKees Cook
4549b238748SKees Cook avoid.start = (unsigned long)ptr;
4559b238748SKees Cook avoid.size = sizeof(*ptr) + ptr->len;
4569b238748SKees Cook
45706486d6cSKees Cook if (mem_overlaps(img, &avoid) && (avoid.start < earliest)) {
45806486d6cSKees Cook *overlap = avoid;
4596daa2ec0SBaoquan He earliest = overlap->start;
46006486d6cSKees Cook is_overlapping = true;
46106486d6cSKees Cook }
4629b238748SKees Cook
463b3c72fc9SDaniel Kiper if (ptr->type == SETUP_INDIRECT &&
464b3c72fc9SDaniel Kiper ((struct setup_indirect *)ptr->data)->type != SETUP_INDIRECT) {
465b3c72fc9SDaniel Kiper avoid.start = ((struct setup_indirect *)ptr->data)->addr;
466b3c72fc9SDaniel Kiper avoid.size = ((struct setup_indirect *)ptr->data)->len;
467b3c72fc9SDaniel Kiper
468b3c72fc9SDaniel Kiper if (mem_overlaps(img, &avoid) && (avoid.start < earliest)) {
469b3c72fc9SDaniel Kiper *overlap = avoid;
470b3c72fc9SDaniel Kiper earliest = overlap->start;
471b3c72fc9SDaniel Kiper is_overlapping = true;
472b3c72fc9SDaniel Kiper }
473b3c72fc9SDaniel Kiper }
474b3c72fc9SDaniel Kiper
4759b238748SKees Cook ptr = (struct setup_data *)(unsigned long)ptr->next;
4769b238748SKees Cook }
4779b238748SKees Cook
47806486d6cSKees Cook return is_overlapping;
4799b238748SKees Cook }
4809b238748SKees Cook
481c401cf15SBaoquan He struct slot_area {
4820eb1a8afSArvind Sankar u64 addr;
483d6d0f36cSArvind Sankar unsigned long num;
484c401cf15SBaoquan He };
485c401cf15SBaoquan He
486c401cf15SBaoquan He #define MAX_SLOT_AREA 100
487c401cf15SBaoquan He
488c401cf15SBaoquan He static struct slot_area slot_areas[MAX_SLOT_AREA];
489d6d0f36cSArvind Sankar static unsigned int slot_area_index;
4909b238748SKees Cook static unsigned long slot_max;
4919b238748SKees Cook
store_slot_info(struct mem_vector * region,unsigned long image_size)492c401cf15SBaoquan He static void store_slot_info(struct mem_vector *region, unsigned long image_size)
493c401cf15SBaoquan He {
494c401cf15SBaoquan He struct slot_area slot_area;
495c401cf15SBaoquan He
496c401cf15SBaoquan He if (slot_area_index == MAX_SLOT_AREA)
497c401cf15SBaoquan He return;
498c401cf15SBaoquan He
499c401cf15SBaoquan He slot_area.addr = region->start;
50046a5b29aSArvind Sankar slot_area.num = 1 + (region->size - image_size) / CONFIG_PHYSICAL_ALIGN;
501c401cf15SBaoquan He
502c401cf15SBaoquan He slot_areas[slot_area_index++] = slot_area;
503c401cf15SBaoquan He slot_max += slot_area.num;
504c401cf15SBaoquan He }
505c401cf15SBaoquan He
5069b912485SBaoquan He /*
5079b912485SBaoquan He * Skip as many 1GB huge pages as possible in the passed region
5089b912485SBaoquan He * according to the number which users specified:
5099b912485SBaoquan He */
5109b912485SBaoquan He static void
process_gb_huge_pages(struct mem_vector * region,unsigned long image_size)5119b912485SBaoquan He process_gb_huge_pages(struct mem_vector *region, unsigned long image_size)
5129b912485SBaoquan He {
5130eb1a8afSArvind Sankar u64 pud_start, pud_end;
5140eb1a8afSArvind Sankar unsigned long gb_huge_pages;
5159b912485SBaoquan He struct mem_vector tmp;
5169b912485SBaoquan He
51750def269SArvind Sankar if (!IS_ENABLED(CONFIG_X86_64) || !max_gb_huge_pages) {
5189b912485SBaoquan He store_slot_info(region, image_size);
5199b912485SBaoquan He return;
5209b912485SBaoquan He }
5219b912485SBaoquan He
522be9e8d95SArvind Sankar /* Are there any 1GB pages in the region? */
523be9e8d95SArvind Sankar pud_start = ALIGN(region->start, PUD_SIZE);
524be9e8d95SArvind Sankar pud_end = ALIGN_DOWN(region->start + region->size, PUD_SIZE);
5259b912485SBaoquan He
5269b912485SBaoquan He /* No good 1GB huge pages found: */
527be9e8d95SArvind Sankar if (pud_start >= pud_end) {
5289b912485SBaoquan He store_slot_info(region, image_size);
5299b912485SBaoquan He return;
5309b912485SBaoquan He }
5319b912485SBaoquan He
532be9e8d95SArvind Sankar /* Check if the head part of the region is usable. */
533be9e8d95SArvind Sankar if (pud_start >= region->start + image_size) {
5349b912485SBaoquan He tmp.start = region->start;
535be9e8d95SArvind Sankar tmp.size = pud_start - region->start;
5369b912485SBaoquan He store_slot_info(&tmp, image_size);
5379b912485SBaoquan He }
5389b912485SBaoquan He
539be9e8d95SArvind Sankar /* Skip the good 1GB pages. */
540be9e8d95SArvind Sankar gb_huge_pages = (pud_end - pud_start) >> PUD_SHIFT;
541be9e8d95SArvind Sankar if (gb_huge_pages > max_gb_huge_pages) {
542be9e8d95SArvind Sankar pud_end = pud_start + (max_gb_huge_pages << PUD_SHIFT);
543be9e8d95SArvind Sankar max_gb_huge_pages = 0;
544be9e8d95SArvind Sankar } else {
545be9e8d95SArvind Sankar max_gb_huge_pages -= gb_huge_pages;
546be9e8d95SArvind Sankar }
547be9e8d95SArvind Sankar
548be9e8d95SArvind Sankar /* Check if the tail part of the region is usable. */
549be9e8d95SArvind Sankar if (region->start + region->size >= pud_end + image_size) {
550be9e8d95SArvind Sankar tmp.start = pud_end;
551be9e8d95SArvind Sankar tmp.size = region->start + region->size - pud_end;
5529b912485SBaoquan He store_slot_info(&tmp, image_size);
5539b912485SBaoquan He }
5549b912485SBaoquan He }
5559b912485SBaoquan He
slots_fetch_random(void)5560eb1a8afSArvind Sankar static u64 slots_fetch_random(void)
5579b238748SKees Cook {
558ed9f007eSKees Cook unsigned long slot;
559d6d0f36cSArvind Sankar unsigned int i;
560ed9f007eSKees Cook
5619b238748SKees Cook /* Handle case of no slots stored. */
5629b238748SKees Cook if (slot_max == 0)
5639b238748SKees Cook return 0;
5649b238748SKees Cook
565d899a7d1SThomas Garnier slot = kaslr_get_random_long("Physical") % slot_max;
566ed9f007eSKees Cook
567ed9f007eSKees Cook for (i = 0; i < slot_area_index; i++) {
568ed9f007eSKees Cook if (slot >= slot_areas[i].num) {
569ed9f007eSKees Cook slot -= slot_areas[i].num;
570ed9f007eSKees Cook continue;
571ed9f007eSKees Cook }
5720eb1a8afSArvind Sankar return slot_areas[i].addr + ((u64)slot * CONFIG_PHYSICAL_ALIGN);
573ed9f007eSKees Cook }
574ed9f007eSKees Cook
575ed9f007eSKees Cook if (i == slot_area_index)
576ed9f007eSKees Cook debug_putstr("slots_fetch_random() failed!?\n");
577ed9f007eSKees Cook return 0;
5789b238748SKees Cook }
5799b238748SKees Cook
__process_mem_region(struct mem_vector * entry,unsigned long minimum,unsigned long image_size)580690eaa53SChao Fan static void __process_mem_region(struct mem_vector *entry,
5819b238748SKees Cook unsigned long minimum,
5829b238748SKees Cook unsigned long image_size)
5839b238748SKees Cook {
584ed9f007eSKees Cook struct mem_vector region, overlap;
5850eb1a8afSArvind Sankar u64 region_end;
5869b238748SKees Cook
587bf457be1SArvind Sankar /* Enforce minimum and memory limit. */
5883a066990SArvind Sankar region.start = max_t(u64, entry->start, minimum);
589bf457be1SArvind Sankar region_end = min(entry->start + entry->size, mem_limit);
5909b238748SKees Cook
591ed9f007eSKees Cook /* Give up if slot area array is full. */
592ed9f007eSKees Cook while (slot_area_index < MAX_SLOT_AREA) {
593ed9f007eSKees Cook /* Potentially raise address to meet alignment needs. */
5949b238748SKees Cook region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN);
5959b238748SKees Cook
59627aac205SBaoquan He /* Did we raise the address above the passed in memory entry? */
597bf457be1SArvind Sankar if (region.start > region_end)
5989b238748SKees Cook return;
5999b238748SKees Cook
6009b238748SKees Cook /* Reduce size by any delta from the original address. */
601bf457be1SArvind Sankar region.size = region_end - region.start;
6029b238748SKees Cook
603ed9f007eSKees Cook /* Return if region can't contain decompressed kernel */
604ed9f007eSKees Cook if (region.size < image_size)
605ed9f007eSKees Cook return;
606ed9f007eSKees Cook
607ed9f007eSKees Cook /* If nothing overlaps, store the region and return. */
608ed9f007eSKees Cook if (!mem_avoid_overlap(®ion, &overlap)) {
609747ff626SBaoquan He process_gb_huge_pages(®ion, image_size);
610ed9f007eSKees Cook return;
611ed9f007eSKees Cook }
612ed9f007eSKees Cook
613ed9f007eSKees Cook /* Store beginning of region if holds at least image_size. */
6148d1cf859SArvind Sankar if (overlap.start >= region.start + image_size) {
615ef7b07d5SArvind Sankar region.size = overlap.start - region.start;
616ef7b07d5SArvind Sankar process_gb_huge_pages(®ion, image_size);
617ed9f007eSKees Cook }
618ed9f007eSKees Cook
619ed9f007eSKees Cook /* Clip off the overlapping region and start over. */
620ed9f007eSKees Cook region.start = overlap.start + overlap.size;
6219b238748SKees Cook }
6229b238748SKees Cook }
6239b238748SKees Cook
process_mem_region(struct mem_vector * region,unsigned long minimum,unsigned long image_size)624690eaa53SChao Fan static bool process_mem_region(struct mem_vector *region,
625e4cb955bSArvind Sankar unsigned long minimum,
626e4cb955bSArvind Sankar unsigned long image_size)
627690eaa53SChao Fan {
628690eaa53SChao Fan int i;
629690eaa53SChao Fan /*
630690eaa53SChao Fan * If no immovable memory found, or MEMORY_HOTREMOVE disabled,
631690eaa53SChao Fan * use @region directly.
632690eaa53SChao Fan */
633690eaa53SChao Fan if (!num_immovable_mem) {
634690eaa53SChao Fan __process_mem_region(region, minimum, image_size);
635690eaa53SChao Fan
636690eaa53SChao Fan if (slot_area_index == MAX_SLOT_AREA) {
637690eaa53SChao Fan debug_putstr("Aborted e820/efi memmap scan (slot_areas full)!\n");
63821d6a7dcSJiapeng Chong return true;
639690eaa53SChao Fan }
64021d6a7dcSJiapeng Chong return false;
641690eaa53SChao Fan }
642690eaa53SChao Fan
64382df8261SBorislav Petkov #if defined(CONFIG_MEMORY_HOTREMOVE) && defined(CONFIG_ACPI)
644690eaa53SChao Fan /*
645690eaa53SChao Fan * If immovable memory found, filter the intersection between
646690eaa53SChao Fan * immovable memory and @region.
647690eaa53SChao Fan */
648690eaa53SChao Fan for (i = 0; i < num_immovable_mem; i++) {
6493a066990SArvind Sankar u64 start, end, entry_end, region_end;
650690eaa53SChao Fan struct mem_vector entry;
651690eaa53SChao Fan
652690eaa53SChao Fan if (!mem_overlaps(region, &immovable_mem[i]))
653690eaa53SChao Fan continue;
654690eaa53SChao Fan
655690eaa53SChao Fan start = immovable_mem[i].start;
656690eaa53SChao Fan end = start + immovable_mem[i].size;
657690eaa53SChao Fan region_end = region->start + region->size;
658690eaa53SChao Fan
659690eaa53SChao Fan entry.start = clamp(region->start, start, end);
660690eaa53SChao Fan entry_end = clamp(region_end, start, end);
661690eaa53SChao Fan entry.size = entry_end - entry.start;
662690eaa53SChao Fan
663690eaa53SChao Fan __process_mem_region(&entry, minimum, image_size);
664690eaa53SChao Fan
665690eaa53SChao Fan if (slot_area_index == MAX_SLOT_AREA) {
666690eaa53SChao Fan debug_putstr("Aborted e820/efi memmap scan when walking immovable regions(slot_areas full)!\n");
6675b3fd8aaSJing Yangyang return true;
668690eaa53SChao Fan }
669690eaa53SChao Fan }
670690eaa53SChao Fan #endif
671ee92fa03SJiapeng Chong return false;
672690eaa53SChao Fan }
673690eaa53SChao Fan
674c05cd797SBaoquan He #ifdef CONFIG_EFI
6753fd1239aSKirill A. Shutemov
6763fd1239aSKirill A. Shutemov /*
6773fd1239aSKirill A. Shutemov * Only EFI_CONVENTIONAL_MEMORY and EFI_UNACCEPTED_MEMORY (if supported) are
6783fd1239aSKirill A. Shutemov * guaranteed to be free.
6793fd1239aSKirill A. Shutemov *
6803fd1239aSKirill A. Shutemov * Pick free memory more conservatively than the EFI spec allows: according to
6813fd1239aSKirill A. Shutemov * the spec, EFI_BOOT_SERVICES_{CODE|DATA} are also free memory and thus
6823fd1239aSKirill A. Shutemov * available to place the kernel image into, but in practice there's firmware
6833fd1239aSKirill A. Shutemov * where using that memory leads to crashes. Buggy vendor EFI code registers
6843fd1239aSKirill A. Shutemov * for an event that triggers on SetVirtualAddressMap(). The handler assumes
6853fd1239aSKirill A. Shutemov * that EFI_BOOT_SERVICES_DATA memory has not been touched by loader yet, which
6863fd1239aSKirill A. Shutemov * is probably true for Windows.
6873fd1239aSKirill A. Shutemov *
6883fd1239aSKirill A. Shutemov * Preserve EFI_BOOT_SERVICES_* regions until after SetVirtualAddressMap().
6893fd1239aSKirill A. Shutemov */
memory_type_is_free(efi_memory_desc_t * md)6903fd1239aSKirill A. Shutemov static inline bool memory_type_is_free(efi_memory_desc_t *md)
6913fd1239aSKirill A. Shutemov {
6923fd1239aSKirill A. Shutemov if (md->type == EFI_CONVENTIONAL_MEMORY)
6933fd1239aSKirill A. Shutemov return true;
6943fd1239aSKirill A. Shutemov
6953fd1239aSKirill A. Shutemov if (IS_ENABLED(CONFIG_UNACCEPTED_MEMORY) &&
6963fd1239aSKirill A. Shutemov md->type == EFI_UNACCEPTED_MEMORY)
6973fd1239aSKirill A. Shutemov return true;
6983fd1239aSKirill A. Shutemov
6993fd1239aSKirill A. Shutemov return false;
7003fd1239aSKirill A. Shutemov }
7013fd1239aSKirill A. Shutemov
702c05cd797SBaoquan He /*
70308705365SArvind Sankar * Returns true if we processed the EFI memmap, which we prefer over the E820
70408705365SArvind Sankar * table if it is available.
705c05cd797SBaoquan He */
706c05cd797SBaoquan He static bool
process_efi_entries(unsigned long minimum,unsigned long image_size)707c05cd797SBaoquan He process_efi_entries(unsigned long minimum, unsigned long image_size)
708c05cd797SBaoquan He {
709*c59843e8SArd Biesheuvel struct efi_info *e = &boot_params_ptr->efi_info;
710c05cd797SBaoquan He bool efi_mirror_found = false;
711c05cd797SBaoquan He struct mem_vector region;
712c05cd797SBaoquan He efi_memory_desc_t *md;
713c05cd797SBaoquan He unsigned long pmap;
714c05cd797SBaoquan He char *signature;
715c05cd797SBaoquan He u32 nr_desc;
716c05cd797SBaoquan He int i;
717c05cd797SBaoquan He
718c05cd797SBaoquan He signature = (char *)&e->efi_loader_signature;
719c05cd797SBaoquan He if (strncmp(signature, EFI32_LOADER_SIGNATURE, 4) &&
720c05cd797SBaoquan He strncmp(signature, EFI64_LOADER_SIGNATURE, 4))
721c05cd797SBaoquan He return false;
722c05cd797SBaoquan He
723c05cd797SBaoquan He #ifdef CONFIG_X86_32
724c05cd797SBaoquan He /* Can't handle data above 4GB at this time */
725c05cd797SBaoquan He if (e->efi_memmap_hi) {
726c05cd797SBaoquan He warn("EFI memmap is above 4GB, can't be handled now on x86_32. EFI should be disabled.\n");
727c05cd797SBaoquan He return false;
728c05cd797SBaoquan He }
729c05cd797SBaoquan He pmap = e->efi_memmap;
730c05cd797SBaoquan He #else
731c05cd797SBaoquan He pmap = (e->efi_memmap | ((__u64)e->efi_memmap_hi << 32));
732c05cd797SBaoquan He #endif
733c05cd797SBaoquan He
734c05cd797SBaoquan He nr_desc = e->efi_memmap_size / e->efi_memdesc_size;
735c05cd797SBaoquan He for (i = 0; i < nr_desc; i++) {
736c05cd797SBaoquan He md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i);
737c05cd797SBaoquan He if (md->attribute & EFI_MEMORY_MORE_RELIABLE) {
7380982adc7SNaoya Horiguchi efi_mirror_found = true;
7390982adc7SNaoya Horiguchi break;
7400982adc7SNaoya Horiguchi }
7410982adc7SNaoya Horiguchi }
7420982adc7SNaoya Horiguchi
7430982adc7SNaoya Horiguchi for (i = 0; i < nr_desc; i++) {
7440982adc7SNaoya Horiguchi md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i);
7450982adc7SNaoya Horiguchi
7463fd1239aSKirill A. Shutemov if (!memory_type_is_free(md))
7470982adc7SNaoya Horiguchi continue;
7480982adc7SNaoya Horiguchi
749262b45aeSDan Williams if (efi_soft_reserve_enabled() &&
750262b45aeSDan Williams (md->attribute & EFI_MEMORY_SP))
751262b45aeSDan Williams continue;
752262b45aeSDan Williams
7530982adc7SNaoya Horiguchi if (efi_mirror_found &&
7540982adc7SNaoya Horiguchi !(md->attribute & EFI_MEMORY_MORE_RELIABLE))
7550982adc7SNaoya Horiguchi continue;
7560982adc7SNaoya Horiguchi
757c05cd797SBaoquan He region.start = md->phys_addr;
758c05cd797SBaoquan He region.size = md->num_pages << EFI_PAGE_SHIFT;
759690eaa53SChao Fan if (process_mem_region(®ion, minimum, image_size))
760c05cd797SBaoquan He break;
761c05cd797SBaoquan He }
7620982adc7SNaoya Horiguchi return true;
763c05cd797SBaoquan He }
764c05cd797SBaoquan He #else
/* Without CONFIG_EFI there is no EFI memmap to process: always false. */
static inline bool
process_efi_entries(unsigned long minimum, unsigned long image_size)
{
	return false;
}
770c05cd797SBaoquan He #endif
771c05cd797SBaoquan He
process_e820_entries(unsigned long minimum,unsigned long image_size)772f62995c9SBaoquan He static void process_e820_entries(unsigned long minimum,
773071a7493SBaoquan He unsigned long image_size)
7749b238748SKees Cook {
7759b238748SKees Cook int i;
77687891b01SBaoquan He struct mem_vector region;
777f62995c9SBaoquan He struct boot_e820_entry *entry;
7789b238748SKees Cook
779f62995c9SBaoquan He /* Verify potential e820 positions, appending to slots list. */
780*c59843e8SArd Biesheuvel for (i = 0; i < boot_params_ptr->e820_entries; i++) {
781*c59843e8SArd Biesheuvel entry = &boot_params_ptr->e820_table[i];
782f62995c9SBaoquan He /* Skip non-RAM entries. */
783f62995c9SBaoquan He if (entry->type != E820_TYPE_RAM)
784f62995c9SBaoquan He continue;
78587891b01SBaoquan He region.start = entry->addr;
78687891b01SBaoquan He region.size = entry->size;
787690eaa53SChao Fan if (process_mem_region(®ion, minimum, image_size))
788f62995c9SBaoquan He break;
789f62995c9SBaoquan He }
790f62995c9SBaoquan He }
791f62995c9SBaoquan He
find_random_phys_addr(unsigned long minimum,unsigned long image_size)792f62995c9SBaoquan He static unsigned long find_random_phys_addr(unsigned long minimum,
793f62995c9SBaoquan He unsigned long image_size)
794f62995c9SBaoquan He {
795f49236aeSArvind Sankar u64 phys_addr;
796f49236aeSArvind Sankar
79745128694SArvind Sankar /* Bail out early if it's impossible to succeed. */
79845128694SArvind Sankar if (minimum + image_size > mem_limit)
79945128694SArvind Sankar return 0;
80045128694SArvind Sankar
801f2844249SDave Jiang /* Check if we had too many memmaps. */
802f2844249SDave Jiang if (memmap_too_large) {
803c05cd797SBaoquan He debug_putstr("Aborted memory entries scan (more than 4 memmap= args)!\n");
804f2844249SDave Jiang return 0;
805f2844249SDave Jiang }
806f2844249SDave Jiang
8074268b4daSArvind Sankar if (!process_efi_entries(minimum, image_size))
808f62995c9SBaoquan He process_e820_entries(minimum, image_size);
8094268b4daSArvind Sankar
810f49236aeSArvind Sankar phys_addr = slots_fetch_random();
811f49236aeSArvind Sankar
812f49236aeSArvind Sankar /* Perform a final check to make sure the address is in range. */
813f49236aeSArvind Sankar if (phys_addr < minimum || phys_addr + image_size > mem_limit) {
814f49236aeSArvind Sankar warn("Invalid physical address chosen!\n");
815f49236aeSArvind Sankar return 0;
816f49236aeSArvind Sankar }
817f49236aeSArvind Sankar
818f49236aeSArvind Sankar return (unsigned long)phys_addr;
8199b238748SKees Cook }
8209b238748SKees Cook
find_random_virt_addr(unsigned long minimum,unsigned long image_size)821071a7493SBaoquan He static unsigned long find_random_virt_addr(unsigned long minimum,
822071a7493SBaoquan He unsigned long image_size)
823071a7493SBaoquan He {
824071a7493SBaoquan He unsigned long slots, random_addr;
825071a7493SBaoquan He
826071a7493SBaoquan He /*
827071a7493SBaoquan He * There are how many CONFIG_PHYSICAL_ALIGN-sized slots
828071a7493SBaoquan He * that can hold image_size within the range of minimum to
829071a7493SBaoquan He * KERNEL_IMAGE_SIZE?
830071a7493SBaoquan He */
831eb38be6dSArvind Sankar slots = 1 + (KERNEL_IMAGE_SIZE - minimum - image_size) / CONFIG_PHYSICAL_ALIGN;
832071a7493SBaoquan He
833d899a7d1SThomas Garnier random_addr = kaslr_get_random_long("Virtual") % slots;
834071a7493SBaoquan He
835071a7493SBaoquan He return random_addr * CONFIG_PHYSICAL_ALIGN + minimum;
836071a7493SBaoquan He }
837071a7493SBaoquan He
838549f90dbSBorislav Petkov /*
839549f90dbSBorislav Petkov * Since this function examines addresses much more numerically,
840549f90dbSBorislav Petkov * it takes the input and output pointers as 'unsigned long'.
841549f90dbSBorislav Petkov */
choose_random_location(unsigned long input,unsigned long input_size,unsigned long * output,unsigned long output_size,unsigned long * virt_addr)8428391c73cSBaoquan He void choose_random_location(unsigned long input,
8439b238748SKees Cook unsigned long input_size,
8448391c73cSBaoquan He unsigned long *output,
8458391c73cSBaoquan He unsigned long output_size,
8468391c73cSBaoquan He unsigned long *virt_addr)
8479b238748SKees Cook {
848e066cc47SYinghai Lu unsigned long random_addr, min_addr;
8499b238748SKees Cook
8509b238748SKees Cook if (cmdline_find_option_bool("nokaslr")) {
8510f8ede1bSKees Cook warn("KASLR disabled: 'nokaslr' on cmdline.");
8528391c73cSBaoquan He return;
8539b238748SKees Cook }
8549b238748SKees Cook
855*c59843e8SArd Biesheuvel boot_params_ptr->hdr.loadflags |= KASLR_FLAG;
8569b238748SKees Cook
85745128694SArvind Sankar if (IS_ENABLED(CONFIG_X86_32))
85845128694SArvind Sankar mem_limit = KERNEL_IMAGE_SIZE;
85945128694SArvind Sankar else
86045128694SArvind Sankar mem_limit = MAXMEM;
86145128694SArvind Sankar
8629b238748SKees Cook /* Record the various known unsafe memory ranges. */
8638391c73cSBaoquan He mem_avoid_init(input, input_size, *output);
8649b238748SKees Cook
865e066cc47SYinghai Lu /*
866e066cc47SYinghai Lu * Low end of the randomization range should be the
867e066cc47SYinghai Lu * smaller of 512M or the initial kernel image
868e066cc47SYinghai Lu * location:
869e066cc47SYinghai Lu */
870e066cc47SYinghai Lu min_addr = min(*output, 512UL << 20);
87145128694SArvind Sankar /* Make sure minimum is aligned. */
87245128694SArvind Sankar min_addr = ALIGN(min_addr, CONFIG_PHYSICAL_ALIGN);
873e066cc47SYinghai Lu
874c05cd797SBaoquan He /* Walk available memory entries to find a random address. */
875e066cc47SYinghai Lu random_addr = find_random_phys_addr(min_addr, output_size);
8769016875dSKees Cook if (!random_addr) {
877f2844249SDave Jiang warn("Physical KASLR disabled: no suitable memory region!");
8788391c73cSBaoquan He } else {
8798391c73cSBaoquan He /* Update the new physical address location. */
8808570978eSJoerg Roedel if (*output != random_addr)
8818391c73cSBaoquan He *output = random_addr;
8829b238748SKees Cook }
883da63b6b2SBaoquan He
8848391c73cSBaoquan He
8858391c73cSBaoquan He /* Pick random virtual address starting from LOAD_PHYSICAL_ADDR. */
8868391c73cSBaoquan He if (IS_ENABLED(CONFIG_X86_64))
8878391c73cSBaoquan He random_addr = find_random_virt_addr(LOAD_PHYSICAL_ADDR, output_size);
8888391c73cSBaoquan He *virt_addr = random_addr;
8899b238748SKees Cook }
890