// SPDX-License-Identifier: GPL-2.0
/*
 * kaslr.c
 *
 * This contains the routines needed to generate a reasonable level of
 * entropy to choose a randomized kernel base address offset in support
 * of Kernel Address Space Layout Randomization (KASLR). Additionally
 * handles walking the physical memory maps (and tracking memory regions
 * to avoid) in order to select a physical memory location that can
 * contain the entire properly aligned running kernel image.
 *
 */

/*
 * next_arg() expects isspace() from linux/ctype.h to filter out
 * space/LF/tab characters. However, boot/ctype.h conflicts with
 * linux/ctype.h because isdigit() is implemented in both, so disable
 * boot/ctype.h here.
 */
#define BOOT_CTYPE_H

/*
 * _ctype[] in lib/ctype.c is needed by isspace() in linux/ctype.h.
 * However, both lib/ctype.c and lib/cmdline.c would pull in
 * EXPORT_SYMBOL, which is meaningless here and can break the build in
 * some configurations, so disable exports.
 */
#define __DISABLE_EXPORTS

#include "misc.h"
#include "error.h"
#include "../string.h"

#include <generated/compile.h>
#include <linux/module.h>
#include <linux/uts.h>
#include <linux/utsname.h>
#include <linux/ctype.h>
#include <linux/efi.h>
#include <generated/utsrelease.h>
#include <asm/efi.h>

/* Macros used by the included decompressor code below. */
#define STATIC
#include <linux/decompress/mm.h>

#ifdef CONFIG_X86_5LEVEL
unsigned int __pgtable_l5_enabled;
unsigned int pgdir_shift __ro_after_init = 39;
unsigned int ptrs_per_p4d __ro_after_init = 1;
#endif

extern unsigned long get_cmd_line_ptr(void);

/* Used by PAGE_KERN* macros: */
pteval_t __default_kernel_pte_mask __read_mostly = ~0;

/* Simplified build-specific string for starting entropy. */
static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@"
		LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION;

static unsigned long rotate_xor(unsigned long hash, const void *area,
				size_t size)
{
	size_t i;
	unsigned long *ptr = (unsigned long *)area;

	for (i = 0; i < size / sizeof(hash); i++) {
		/* Rotate by odd number of bits and XOR. */
		hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7);
		hash ^= ptr[i];
	}

	return hash;
}

/* Attempt to create a simple but unpredictable starting entropy. */
static unsigned long get_boot_seed(void)
{
	unsigned long hash = 0;

	hash = rotate_xor(hash, build_str, sizeof(build_str));
	hash = rotate_xor(hash, boot_params, sizeof(*boot_params));

	return hash;
}

#define KASLR_COMPRESSED_BOOT
#include "../../lib/kaslr.c"


/* Only support at most 4 unusable memmap regions with KASLR */
#define MAX_MEMMAP_REGIONS	4

static bool memmap_too_large;


/* Store memory limit specified by "mem=nn[KMG]" or "memmap=nn[KMG]" */
static unsigned long long mem_limit = ULLONG_MAX;

/* Number of immovable memory regions */
static int num_immovable_mem;

enum mem_avoid_index {
	MEM_AVOID_ZO_RANGE = 0,
	MEM_AVOID_INITRD,
	MEM_AVOID_CMDLINE,
	MEM_AVOID_BOOTPARAMS,
	MEM_AVOID_MEMMAP_BEGIN,
	MEM_AVOID_MEMMAP_END = MEM_AVOID_MEMMAP_BEGIN + MAX_MEMMAP_REGIONS - 1,
	MEM_AVOID_MAX,
};

static struct mem_vector mem_avoid[MEM_AVOID_MAX];

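/*
 * Overlap test for two [start, start + size) ranges; used both for the
 * mem_avoid[] entries above and for the memory region walking below.
 */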
static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two)
{
	/* Item one is entirely before item two. */
	if (one->start + one->size <= two->start)
		return false;
	/* Item one is entirely after item two. */
	if (one->start >= two->start + two->size)
		return false;
	return true;
}

char *skip_spaces(const char *str)
{
	while (isspace(*str))
		++str;
	return (char *)str;
}
#include "../../../../lib/ctype.c"
#include "../../../../lib/cmdline.c"

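/*
 * Parse one "memmap=" argument. The '#', '$' and '!' separators all
 * introduce a region that must be avoided; '@' marks a usable region
 * and is skipped. A bare size acts as a memory limit, like "mem=".
 */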
static int
parse_memmap(char *p, unsigned long long *start, unsigned long long *size)
{
	char *oldp;

	if (!p)
		return -EINVAL;

	/* We don't care about this option here */
	if (!strncmp(p, "exactmap", 8))
		return -EINVAL;

	oldp = p;
	*size = memparse(p, &p);
	if (p == oldp)
		return -EINVAL;

	switch (*p) {
	case '#':
	case '$':
	case '!':
		*start = memparse(p + 1, &p);
		return 0;
	case '@':
		/* memmap=nn@ss specifies usable region, should be skipped */
		*size = 0;
		/* Fall through */
	default:
		/*
		 * If no offset is given, only a size is specified and
		 * memmap=nn[KMG] behaves like mem=nn[KMG]: it limits the
		 * maximum address the system can use. Regions above the
		 * limit must be avoided.
		 */
		*start = 0;
		return 0;
	}

	return -EINVAL;
}

static void mem_avoid_memmap(char *str)
{
	static int i;

	if (i >= MAX_MEMMAP_REGIONS)
		return;

	while (str && (i < MAX_MEMMAP_REGIONS)) {
		int rc;
		unsigned long long start, size;
		char *k = strchr(str, ',');

		if (k)
			*k++ = 0;

		rc = parse_memmap(str, &start, &size);
		if (rc < 0)
			break;
		str = k;

		if (start == 0) {
			/* Store the specified memory limit if size > 0 */
			if (size > 0)
				mem_limit = size;

			continue;
		}

		mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].start = start;
		mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].size = size;
		i++;
	}

	/* More than 4 memmaps, fail KASLR */
	if ((i >= MAX_MEMMAP_REGIONS) && str)
		memmap_too_large = true;
}

/* Store the number of 1GB huge pages the user specified: */
static unsigned long max_gb_huge_pages;

static void parse_gb_huge_pages(char *param, char *val)
{
	static bool gbpage_sz;
	char *p;

	if (!strcmp(param, "hugepagesz")) {
		p = val;
		if (memparse(p, &p) != PUD_SIZE) {
			gbpage_sz = false;
			return;
		}

		if (gbpage_sz)
			warn("Repeatedly set hugeTLB page size of 1G!\n");
		gbpage_sz = true;
		return;
	}

	if (!strcmp(param, "hugepages") && gbpage_sz) {
		p = val;
		max_gb_huge_pages = simple_strtoull(p, &p, 0);
		return;
	}
}


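/*
 * Scan a scratch copy of the kernel command line for "mem=", "memmap="
 * and "hugepages"/"hugepagesz" options that influence where the kernel
 * may be placed.
 */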
static void handle_mem_options(void)
{
	char *args = (char *)get_cmd_line_ptr();
	size_t len = strlen((char *)args);
	char *tmp_cmdline;
	char *param, *val;
	u64 mem_size;

	if (!strstr(args, "memmap=") && !strstr(args, "mem=") &&
	    !strstr(args, "hugepages"))
		return;

	tmp_cmdline = malloc(len + 1);
	if (!tmp_cmdline)
		error("Failed to allocate space for tmp_cmdline");

	memcpy(tmp_cmdline, args, len);
	tmp_cmdline[len] = 0;
	args = tmp_cmdline;

	/* Chew leading spaces */
	args = skip_spaces(args);

	while (*args) {
		args = next_arg(args, &param, &val);
		/* Stop at -- */
		if (!val && strcmp(param, "--") == 0) {
			warn("Only '--' specified in cmdline");
			goto out;
		}

		if (!strcmp(param, "memmap")) {
			mem_avoid_memmap(val);
		} else if (strstr(param, "hugepages")) {
			parse_gb_huge_pages(param, val);
		} else if (!strcmp(param, "mem")) {
			char *p = val;

			if (!strcmp(p, "nopentium"))
				continue;
			mem_size = memparse(p, &p);
			if (mem_size == 0)
				goto out;

			mem_limit = mem_size;
		}
	}

out:
	free(tmp_cmdline);
	return;
}

/*
 * In theory, KASLR can put the kernel anywhere in the range of [16M, 64T).
 * The mem_avoid array is used to store the ranges that need to be avoided
 * when KASLR searches for an appropriate random address. We must avoid any
 * regions that are unsafe to overlap with during decompression, and other
 * things like the initrd, cmdline and boot_params. This comment seeks to
 * explain mem_avoid as clearly as possible since incorrect mem_avoid
 * memory ranges lead to really hard-to-debug boot failures.
 *
 * The initrd, cmdline, and boot_params are trivial to identify for
 * avoiding. They are MEM_AVOID_INITRD, MEM_AVOID_CMDLINE, and
 * MEM_AVOID_BOOTPARAMS respectively below.
 *
 * What is less obvious is how to avoid the range of memory that is used
 * during decompression (MEM_AVOID_ZO_RANGE below). This range must cover
 * the compressed kernel (ZO) and its run space, which is used to extract
 * the uncompressed kernel (VO) and relocs.
 *
 * ZO's full run size sits against the end of the decompression buffer, so
 * we can calculate where text, data, bss, etc. of ZO are positioned more
 * easily.
 *
 * For additional background, the decompression calculations can be found
 * in header.S, and the memory diagram is based on the one found in misc.c.
 *
 * The following conditions are already enforced by the image layouts and
 * associated code:
 *  - input + input_size >= output + output_size
 *  - kernel_total_size <= init_size
 *  - kernel_total_size <= output_size (see Note below)
 *  - output + init_size >= output + output_size
 *
 * (Note that kernel_total_size and output_size have no fundamental
 * relationship, but output_size is passed to choose_random_location
 * as a maximum of the two. The diagram shows a case where
 * kernel_total_size is larger than output_size, but this case is
 * handled by bumping output_size.)
 *
 * The above conditions can be illustrated by a diagram:
 *
 * 0   output            input         input+input_size output+init_size
 * |     |                 |                          |                |
 * |     |                 |                          |                |
 * |-----|--------|--------|--------------|-----------|--|-------------|
 *                |                       |              |
 *                |                       |              |
 *  output+init_size-ZO_INIT_SIZE output+output_size output+kernel_total_size
 *
 * [output, output+init_size) is the entire memory range used for
 * extracting the compressed image.
 *
 * [output, output+kernel_total_size) is the range needed for the
 * uncompressed kernel (VO) and its run size (bss, brk, etc).
 *
 * [output, output+output_size) is VO plus relocs (i.e. the entire
 * uncompressed payload contained by ZO). This is the area of the buffer
 * written to during decompression.
 *
 * [output+init_size-ZO_INIT_SIZE, output+init_size) is the worst-case
 * range of the copied ZO and decompression code. (i.e. the range of size
 * ZO_INIT_SIZE counted backwards from output+init_size.)
 *
 * [input, input+input_size) is the original copied compressed image (ZO)
 * (i.e. it does not include its run size). This range must be avoided
 * because it contains the data used for decompression.
 *
 * [input+input_size, output+init_size) is [_text, _end) for ZO. This
 * range includes ZO's heap and stack, and must be avoided since it
 * performs the decompression.
 *
 * Since the above two ranges need to be avoided and they are adjacent,
 * they can be merged, resulting in: [input, output+init_size) which
 * becomes the MEM_AVOID_ZO_RANGE below.
 */
static void mem_avoid_init(unsigned long input, unsigned long input_size,
			   unsigned long output)
{
	unsigned long init_size = boot_params->hdr.init_size;
	u64 initrd_start, initrd_size;
	u64 cmd_line, cmd_line_size;
	char *ptr;

	/*
	 * Avoid the region that is unsafe to overlap during
	 * decompression.
	 */
	mem_avoid[MEM_AVOID_ZO_RANGE].start = input;
	mem_avoid[MEM_AVOID_ZO_RANGE].size = (output + init_size) - input;
	add_identity_map(mem_avoid[MEM_AVOID_ZO_RANGE].start,
			 mem_avoid[MEM_AVOID_ZO_RANGE].size);

	/* Avoid initrd. */
	initrd_start = (u64)boot_params->ext_ramdisk_image << 32;
	initrd_start |= boot_params->hdr.ramdisk_image;
	initrd_size = (u64)boot_params->ext_ramdisk_size << 32;
	initrd_size |= boot_params->hdr.ramdisk_size;
	mem_avoid[MEM_AVOID_INITRD].start = initrd_start;
	mem_avoid[MEM_AVOID_INITRD].size = initrd_size;
	/* No need to set mapping for initrd, it will be handled in VO. */

	/* Avoid kernel command line. */
	cmd_line = (u64)boot_params->ext_cmd_line_ptr << 32;
	cmd_line |= boot_params->hdr.cmd_line_ptr;
	/* Calculate size of cmd_line (including the trailing NUL). */
	ptr = (char *)(unsigned long)cmd_line;
	for (cmd_line_size = 0; ptr[cmd_line_size++];)
		;
	mem_avoid[MEM_AVOID_CMDLINE].start = cmd_line;
	mem_avoid[MEM_AVOID_CMDLINE].size = cmd_line_size;
	add_identity_map(mem_avoid[MEM_AVOID_CMDLINE].start,
			 mem_avoid[MEM_AVOID_CMDLINE].size);

	/* Avoid boot parameters. */
	mem_avoid[MEM_AVOID_BOOTPARAMS].start = (unsigned long)boot_params;
	mem_avoid[MEM_AVOID_BOOTPARAMS].size = sizeof(*boot_params);
	add_identity_map(mem_avoid[MEM_AVOID_BOOTPARAMS].start,
			 mem_avoid[MEM_AVOID_BOOTPARAMS].size);

	/* We don't need to set a mapping for setup_data. */

	/* Mark the memmap regions we need to avoid */
	handle_mem_options();

	/* Enumerate the immovable memory regions */
	num_immovable_mem = count_immovable_mem_regions();

#ifdef CONFIG_X86_VERBOSE_BOOTUP
	/* Make sure video RAM can be used. */
	add_identity_map(0, PMD_SIZE);
#endif
}

/*
 * Does this memory vector overlap a known avoided area? If so, record the
 * overlap region with the lowest address.
 */
static bool mem_avoid_overlap(struct mem_vector *img,
			      struct mem_vector *overlap)
{
	int i;
	struct setup_data *ptr;
	unsigned long earliest = img->start + img->size;
	bool is_overlapping = false;

	for (i = 0; i < MEM_AVOID_MAX; i++) {
		if (mem_overlaps(img, &mem_avoid[i]) &&
		    mem_avoid[i].start < earliest) {
			*overlap = mem_avoid[i];
			earliest = overlap->start;
			is_overlapping = true;
		}
	}

	/* Avoid all entries in the setup_data linked list. */
	ptr = (struct setup_data *)(unsigned long)boot_params->hdr.setup_data;
	while (ptr) {
		struct mem_vector avoid;

		avoid.start = (unsigned long)ptr;
		avoid.size = sizeof(*ptr) + ptr->len;

		if (mem_overlaps(img, &avoid) && (avoid.start < earliest)) {
			*overlap = avoid;
			earliest = overlap->start;
			is_overlapping = true;
		}

		ptr = (struct setup_data *)(unsigned long)ptr->next;
	}

	return is_overlapping;
}

struct slot_area {
	unsigned long addr;
	int num;
};

#define MAX_SLOT_AREA 100

static struct slot_area slot_areas[MAX_SLOT_AREA];

static unsigned long slot_max;

static unsigned long slot_area_index;

static void store_slot_info(struct mem_vector *region, unsigned long image_size)
{
	struct slot_area slot_area;

	if (slot_area_index == MAX_SLOT_AREA)
		return;

	slot_area.addr = region->start;
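	/*
	 * One slot per CONFIG_PHYSICAL_ALIGN step at which image_size
	 * still fits inside the region.
	 */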
	slot_area.num = (region->size - image_size) /
			CONFIG_PHYSICAL_ALIGN + 1;

	if (slot_area.num > 0) {
		slot_areas[slot_area_index++] = slot_area;
		slot_max += slot_area.num;
	}
}

/*
 * Skip as many 1GB huge pages as possible in the passed region
 * according to the number the user specified:
 */
static void
process_gb_huge_pages(struct mem_vector *region, unsigned long image_size)
{
	unsigned long addr, size = 0;
	struct mem_vector tmp;
	int i = 0;

	if (!max_gb_huge_pages) {
		store_slot_info(region, image_size);
		return;
	}

	addr = ALIGN(region->start, PUD_SIZE);
	/* Has the aligned address stayed within the passed-in memory entry? */
	if (addr < region->start + region->size)
		size = region->size - (addr - region->start);

	/* Check how many 1GB huge pages can be filtered out: */
	while (size > PUD_SIZE && max_gb_huge_pages) {
		size -= PUD_SIZE;
		max_gb_huge_pages--;
		i++;
	}

	/* No good 1GB huge pages found: */
	if (!i) {
		store_slot_info(region, image_size);
		return;
	}

	/*
	 * Skip those 'i'*1GB good huge pages, and continue checking and
	 * processing the remaining head or tail part of the passed region
	 * if available.
	 */

	if (addr >= region->start + image_size) {
		tmp.start = region->start;
		tmp.size = addr - region->start;
		store_slot_info(&tmp, image_size);
	}

	size = region->size - (addr - region->start) - i * PUD_SIZE;
	if (size >= image_size) {
		tmp.start = addr + i * PUD_SIZE;
		tmp.size = size;
		store_slot_info(&tmp, image_size);
	}
}

static unsigned long slots_fetch_random(void)
{
	unsigned long slot;
	int i;

	/* Handle case of no slots stored. */
	if (slot_max == 0)
		return 0;

	slot = kaslr_get_random_long("Physical") % slot_max;

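	/* Map the chosen slot index back to the slot area that contains it. */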
	for (i = 0; i < slot_area_index; i++) {
		if (slot >= slot_areas[i].num) {
			slot -= slot_areas[i].num;
			continue;
		}
		return slot_areas[i].addr + slot * CONFIG_PHYSICAL_ALIGN;
	}

	if (i == slot_area_index)
		debug_putstr("slots_fetch_random() failed!?\n");
	return 0;
}

static void __process_mem_region(struct mem_vector *entry,
				 unsigned long minimum,
				 unsigned long image_size)
{
	struct mem_vector region, overlap;
	unsigned long start_orig, end;
	struct mem_vector cur_entry;

	/* On 32-bit, ignore entries entirely above our maximum. */
	if (IS_ENABLED(CONFIG_X86_32) && entry->start >= KERNEL_IMAGE_SIZE)
		return;

	/* Ignore entries entirely below our minimum. */
	if (entry->start + entry->size < minimum)
		return;

	/* Ignore entries above memory limit */
	end = min(entry->size + entry->start, mem_limit);
	if (entry->start >= end)
		return;
	cur_entry.start = entry->start;
	cur_entry.size = end - entry->start;

	region.start = cur_entry.start;
	region.size = cur_entry.size;

	/* Give up if slot area array is full. */
	while (slot_area_index < MAX_SLOT_AREA) {
		start_orig = region.start;

		/* Potentially raise address to minimum location. */
		if (region.start < minimum)
			region.start = minimum;

		/* Potentially raise address to meet alignment needs. */
		region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN);

		/* Did we raise the address above the passed in memory entry? */
		if (region.start > cur_entry.start + cur_entry.size)
			return;

		/* Reduce size by any delta from the original address. */
		region.size -= region.start - start_orig;

		/* On 32-bit, reduce region size to fit within max size. */
		if (IS_ENABLED(CONFIG_X86_32) &&
		    region.start + region.size > KERNEL_IMAGE_SIZE)
			region.size = KERNEL_IMAGE_SIZE - region.start;

		/* Return if region can't contain decompressed kernel */
		if (region.size < image_size)
			return;

		/* If nothing overlaps, store the region and return. */
		if (!mem_avoid_overlap(&region, &overlap)) {
			process_gb_huge_pages(&region, image_size);
			return;
		}

		/* Store beginning of region if it holds at least image_size. */
		if (overlap.start > region.start + image_size) {
			struct mem_vector beginning;

			beginning.start = region.start;
			beginning.size = overlap.start - region.start;
			process_gb_huge_pages(&beginning, image_size);
		}

		/* Return if overlap extends to or past end of region. */
		if (overlap.start + overlap.size >= region.start + region.size)
			return;

		/* Clip off the overlapping region and start over. */
		region.size -= overlap.start - region.start + overlap.size;
		region.start = overlap.start + overlap.size;
	}
}

static bool process_mem_region(struct mem_vector *region,
			       unsigned long long minimum,
			       unsigned long long image_size)
{
	int i;
	/*
	 * If no immovable memory is found (or MEMORY_HOTREMOVE is
	 * disabled), use @region directly.
	 */
	if (!num_immovable_mem) {
		__process_mem_region(region, minimum, image_size);

		if (slot_area_index == MAX_SLOT_AREA) {
			debug_putstr("Aborted e820/efi memmap scan (slot_areas full)!\n");
			return 1;
		}
		return 0;
	}

#if defined(CONFIG_MEMORY_HOTREMOVE) && defined(CONFIG_ACPI)
	/*
	 * If immovable memory is found, process only the intersection of
	 * immovable memory and @region.
	 */
	for (i = 0; i < num_immovable_mem; i++) {
		unsigned long long start, end, entry_end, region_end;
		struct mem_vector entry;

		if (!mem_overlaps(region, &immovable_mem[i]))
			continue;

		start = immovable_mem[i].start;
		end = start + immovable_mem[i].size;
		region_end = region->start + region->size;

		entry.start = clamp(region->start, start, end);
		entry_end = clamp(region_end, start, end);
		entry.size = entry_end - entry.start;

		__process_mem_region(&entry, minimum, image_size);

		if (slot_area_index == MAX_SLOT_AREA) {
			debug_putstr("Aborted e820/efi memmap scan when walking immovable regions (slot_areas full)!\n");
			return 1;
		}
	}
#endif
	return 0;
}

#ifdef CONFIG_EFI
/*
 * Returns true if a mirror region was found (and has been processed for
 * slot additions).
 */
static bool
process_efi_entries(unsigned long minimum, unsigned long image_size)
{
	struct efi_info *e = &boot_params->efi_info;
	bool efi_mirror_found = false;
	struct mem_vector region;
	efi_memory_desc_t *md;
	unsigned long pmap;
	char *signature;
	u32 nr_desc;
	int i;

	signature = (char *)&e->efi_loader_signature;
	if (strncmp(signature, EFI32_LOADER_SIGNATURE, 4) &&
	    strncmp(signature, EFI64_LOADER_SIGNATURE, 4))
		return false;

#ifdef CONFIG_X86_32
	/* Can't handle data above 4GB at this time */
	if (e->efi_memmap_hi) {
		warn("EFI memmap is above 4GB, can't be handled now on x86_32. EFI should be disabled.\n");
		return false;
	}
	pmap = e->efi_memmap;
#else
	pmap = (e->efi_memmap | ((__u64)e->efi_memmap_hi << 32));
#endif

	nr_desc = e->efi_memmap_size / e->efi_memdesc_size;
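	/* First pass: see whether any usable memory is marked as mirrored. */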
	for (i = 0; i < nr_desc; i++) {
		md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i);
		if (md->attribute & EFI_MEMORY_MORE_RELIABLE) {
			efi_mirror_found = true;
			break;
		}
	}

	for (i = 0; i < nr_desc; i++) {
		md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i);

		/*
		 * Here we are more conservative in picking free memory than
		 * the EFI spec allows:
		 *
		 * According to the spec, EFI_BOOT_SERVICES_{CODE|DATA} are also
		 * free memory and thus available to place the kernel image into,
		 * but in practice there's firmware where using that memory leads
		 * to crashes.
		 *
		 * Only EFI_CONVENTIONAL_MEMORY is guaranteed to be free.
		 */
		if (md->type != EFI_CONVENTIONAL_MEMORY)
			continue;

		if (efi_mirror_found &&
		    !(md->attribute & EFI_MEMORY_MORE_RELIABLE))
			continue;

		region.start = md->phys_addr;
		region.size = md->num_pages << EFI_PAGE_SHIFT;
		if (process_mem_region(&region, minimum, image_size))
			break;
	}
	return true;
}
#else
static inline bool
process_efi_entries(unsigned long minimum, unsigned long image_size)
{
	return false;
}
#endif

static void process_e820_entries(unsigned long minimum,
				 unsigned long image_size)
{
	int i;
	struct mem_vector region;
	struct boot_e820_entry *entry;

	/* Verify potential e820 positions, appending to slots list. */
	for (i = 0; i < boot_params->e820_entries; i++) {
		entry = &boot_params->e820_table[i];
		/* Skip non-RAM entries. */
		if (entry->type != E820_TYPE_RAM)
			continue;
		region.start = entry->addr;
		region.size = entry->size;
		if (process_mem_region(&region, minimum, image_size))
			break;
	}
}

static unsigned long find_random_phys_addr(unsigned long minimum,
					   unsigned long image_size)
{
	/* Check if we had too many memmaps. */
	if (memmap_too_large) {
		debug_putstr("Aborted memory entries scan (more than 4 memmap= args)!\n");
		return 0;
	}

	/* Make sure minimum is aligned. */
	minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN);

	if (process_efi_entries(minimum, image_size))
		return slots_fetch_random();

	process_e820_entries(minimum, image_size);
	return slots_fetch_random();
}

static unsigned long find_random_virt_addr(unsigned long minimum,
					   unsigned long image_size)
{
	unsigned long slots, random_addr;

	/* Make sure minimum is aligned. */
	minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN);
	/* Align image_size for easy slot calculations. */
	image_size = ALIGN(image_size, CONFIG_PHYSICAL_ALIGN);

	/*
	 * How many CONFIG_PHYSICAL_ALIGN-sized slots can hold image_size
	 * within the range from minimum to KERNEL_IMAGE_SIZE?
	 */
	slots = (KERNEL_IMAGE_SIZE - minimum - image_size) /
		 CONFIG_PHYSICAL_ALIGN + 1;

	random_addr = kaslr_get_random_long("Virtual") % slots;

	return random_addr * CONFIG_PHYSICAL_ALIGN + minimum;
}

/*
 * Since this function examines addresses much more numerically,
 * it takes the input and output pointers as 'unsigned long'.
 */
void choose_random_location(unsigned long input,
			    unsigned long input_size,
			    unsigned long *output,
			    unsigned long output_size,
			    unsigned long *virt_addr)
{
	unsigned long random_addr, min_addr;

	if (cmdline_find_option_bool("nokaslr")) {
		warn("KASLR disabled: 'nokaslr' on cmdline.");
		return;
	}

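	/*
	 * If the boot code has already enabled 5-level paging, adjust the
	 * page table geometry (pgdir_shift, ptrs_per_p4d) to match it.
	 */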
#ifdef CONFIG_X86_5LEVEL
	if (__read_cr4() & X86_CR4_LA57) {
		__pgtable_l5_enabled = 1;
		pgdir_shift = 48;
		ptrs_per_p4d = 512;
	}
#endif

	boot_params->hdr.loadflags |= KASLR_FLAG;

	/* Prepare to add new identity pagetables on demand. */
	initialize_identity_maps();

	/* Record the various known unsafe memory ranges. */
	mem_avoid_init(input, input_size, *output);

	/*
	 * Low end of the randomization range should be the
	 * smaller of 512M or the initial kernel image
	 * location:
	 */
	min_addr = min(*output, 512UL << 20);

	/* Walk available memory entries to find a random address. */
	random_addr = find_random_phys_addr(min_addr, output_size);
	if (!random_addr) {
		warn("Physical KASLR disabled: no suitable memory region!");
	} else {
		/* Update the new physical address location. */
		if (*output != random_addr) {
			add_identity_map(random_addr, output_size);
			*output = random_addr;
		}

		/*
		 * This loads the identity mapping page table.
		 * This should only be done if a new physical address
		 * is found for the kernel, otherwise we should keep
		 * the old page table so that it behaves like the
		 * "nokaslr" case.
		 */
		finalize_identity_maps();
	}


	/* Pick random virtual address starting from LOAD_PHYSICAL_ADDR. */
	if (IS_ENABLED(CONFIG_X86_64))
		random_addr = find_random_virt_addr(LOAD_PHYSICAL_ADDR, output_size);
	*virt_addr = random_addr;
}