/*
 * Copyright(c) 2017 Intel Corporation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * This code is based in part on work published here:
 *
 *	https://github.com/IAIK/KAISER
 *
 * The original work was written by and signed off for the Linux
 * kernel by:
 *
 *   Signed-off-by: Richard Fellner <richard.fellner@student.tugraz.at>
 *   Signed-off-by: Moritz Lipp <moritz.lipp@iaik.tugraz.at>
 *   Signed-off-by: Daniel Gruss <daniel.gruss@iaik.tugraz.at>
 *   Signed-off-by: Michael Schwarz <michael.schwarz@iaik.tugraz.at>
 *
 * Major changes to the original code by: Dave Hansen <dave.hansen@intel.com>
 * Mostly rewritten by Thomas Gleixner <tglx@linutronix.de> and
 * Andy Lutomirsky <luto@amacapital.net>
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/bug.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/uaccess.h>
#include <linux/cpu.h>

#include <asm/cpufeature.h>
#include <asm/hypervisor.h>
#include <asm/vsyscall.h>
#include <asm/cmdline.h>
#include <asm/pti.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/desc.h>
#include <asm/sections.h>

#undef pr_fmt
#define pr_fmt(fmt)	"Kernel/User page tables isolation: " fmt

/* Backporting helper */
#ifndef __GFP_NOTRACK
#define __GFP_NOTRACK	0
#endif

/*
 * Define the page-table levels we clone for user-space on 32
 * and 64 bit.
 */
#ifdef CONFIG_X86_64
#define	PTI_LEVEL_KERNEL_IMAGE	PTI_CLONE_PMD
#else
#define	PTI_LEVEL_KERNEL_IMAGE	PTI_CLONE_PTE
#endif

static void __init pti_print_if_insecure(const char *reason)
{
	if (boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
		pr_info("%s\n", reason);
}

static void __init pti_print_if_secure(const char *reason)
{
	if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
		pr_info("%s\n", reason);
}

static enum pti_mode {
	PTI_AUTO = 0,
	PTI_FORCE_OFF,
	PTI_FORCE_ON
} pti_mode;
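
/*
 * Boot-time selection, as implemented below: "pti=off", "nopti" or
 * globally disabled mitigations (cpu_mitigations_off()) force PTI off;
 * "pti=on" forces it on even on CPUs not marked as affected; "pti=auto"
 * (the default) enables it only when X86_BUG_CPU_MELTDOWN is set.  PTI
 * is also forced off when running as a Xen PV guest.
 */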

void __init pti_check_boottime_disable(void)
{
	char arg[5];
	int ret;

	/* Assume mode is auto unless overridden. */
	pti_mode = PTI_AUTO;

	if (hypervisor_is_type(X86_HYPER_XEN_PV)) {
		pti_mode = PTI_FORCE_OFF;
		pti_print_if_insecure("disabled on XEN PV.");
		return;
	}

	ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
	if (ret > 0)  {
		if (ret == 3 && !strncmp(arg, "off", 3)) {
			pti_mode = PTI_FORCE_OFF;
			pti_print_if_insecure("disabled on command line.");
			return;
		}
		if (ret == 2 && !strncmp(arg, "on", 2)) {
			pti_mode = PTI_FORCE_ON;
			pti_print_if_secure("force enabled on command line.");
			goto enable;
		}
		if (ret == 4 && !strncmp(arg, "auto", 4)) {
			pti_mode = PTI_AUTO;
			goto autosel;
		}
	}

	if (cmdline_find_option_bool(boot_command_line, "nopti") ||
	    cpu_mitigations_off()) {
		pti_mode = PTI_FORCE_OFF;
		pti_print_if_insecure("disabled on command line.");
		return;
	}

autosel:
	if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
		return;
enable:
	setup_force_cpu_cap(X86_FEATURE_PTI);
}

pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd)
{
	/*
	 * Changes to the high (kernel) portion of the kernelmode page
	 * tables are not automatically propagated to the usermode tables.
	 *
	 * Users should keep in mind that, unlike the kernelmode tables,
	 * there is no vmalloc_fault equivalent for the usermode tables.
	 * Top-level entries added to init_mm's usermode pgd after boot
	 * will not be automatically propagated to other mms.
	 */
	if (!pgdp_maps_userspace(pgdp))
		return pgd;

	/*
	 * The user page tables get the full PGD, accessible from
	 * userspace:
	 */
	kernel_to_user_pgdp(pgdp)->pgd = pgd.pgd;

	/*
	 * If this is normal user memory, make it NX in the kernel
	 * pagetables so that, if we somehow screw up and return to
	 * usermode with the kernel CR3 loaded, we'll get a page fault
	 * instead of allowing user code to execute with the wrong CR3.
	 *
	 * As exceptions, we don't set NX if:
	 *  - _PAGE_USER is not set.  This could be an executable
	 *     EFI runtime mapping or something similar, and the kernel
	 *     may execute from it
	 *  - we don't have NX support
	 *  - we're clearing the PGD (i.e. the new pgd is not present).
	 */
	if ((pgd.pgd & (_PAGE_USER|_PAGE_PRESENT)) == (_PAGE_USER|_PAGE_PRESENT) &&
	    (__supported_pte_mask & _PAGE_NX))
		pgd.pgd |= _PAGE_NX;

	/* return the copy of the PGD we want the kernel to use: */
	return pgd;
}

/*
 * Walk the user copy of the page tables (optionally) trying to allocate
 * page table pages on the way down.
 *
 * Returns a pointer to a P4D on success, or NULL on failure.
 */
static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
{
	pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address));
	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);

	if (address < PAGE_OFFSET) {
		WARN_ONCE(1, "attempt to walk user address\n");
		return NULL;
	}

	if (pgd_none(*pgd)) {
		unsigned long new_p4d_page = __get_free_page(gfp);
		if (WARN_ON_ONCE(!new_p4d_page))
			return NULL;

		set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
	}
	BUILD_BUG_ON(pgd_large(*pgd) != 0);

	return p4d_offset(pgd, address);
}

/*
 * Walk the user copy of the page tables (optionally) trying to allocate
 * page table pages on the way down.
 *
 * Returns a pointer to a PMD on success, or NULL on failure.
 */
static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
{
	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
	p4d_t *p4d;
	pud_t *pud;

	p4d = pti_user_pagetable_walk_p4d(address);
	if (!p4d)
		return NULL;

	BUILD_BUG_ON(p4d_large(*p4d) != 0);
	if (p4d_none(*p4d)) {
		unsigned long new_pud_page = __get_free_page(gfp);
		if (WARN_ON_ONCE(!new_pud_page))
			return NULL;

		set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
	}

	pud = pud_offset(p4d, address);
	/* The user page tables do not use large mappings: */
	if (pud_large(*pud)) {
		WARN_ON(1);
		return NULL;
	}
	if (pud_none(*pud)) {
		unsigned long new_pmd_page = __get_free_page(gfp);
		if (WARN_ON_ONCE(!new_pmd_page))
			return NULL;

		set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
	}

	return pmd_offset(pud, address);
}

/*
 * Walk the shadow copy of the page tables (optionally) trying to allocate
 * page table pages on the way down.  Does not support large pages.
 *
 * Note: this is only used when mapping *new* kernel data into the
 * user/shadow page tables.  It is never used for userspace data.
 *
 * Returns a pointer to a PTE on success, or NULL on failure.
 */
static pte_t *pti_user_pagetable_walk_pte(unsigned long address)
{
	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
	pmd_t *pmd;
	pte_t *pte;

	pmd = pti_user_pagetable_walk_pmd(address);
	if (!pmd)
		return NULL;

	/* We can't do anything sensible if we hit a large mapping. */
	if (pmd_large(*pmd)) {
		WARN_ON(1);
		return NULL;
	}

	if (pmd_none(*pmd)) {
		unsigned long new_pte_page = __get_free_page(gfp);
		if (!new_pte_page)
			return NULL;

		set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
	}

	pte = pte_offset_kernel(pmd, address);
	if (pte_flags(*pte) & _PAGE_USER) {
		WARN_ONCE(1, "attempt to walk to user pte\n");
		return NULL;
	}
	return pte;
}

#ifdef CONFIG_X86_VSYSCALL_EMULATION
static void __init pti_setup_vsyscall(void)
{
	pte_t *pte, *target_pte;
	unsigned int level;

	pte = lookup_address(VSYSCALL_ADDR, &level);
	if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte))
		return;

	target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR);
	if (WARN_ON(!target_pte))
		return;

	*target_pte = *pte;
	set_vsyscall_pgtable_user_bits(kernel_to_user_pgdp(swapper_pg_dir));
}
#else
static void __init pti_setup_vsyscall(void) { }
#endif

enum pti_clone_level {
	PTI_CLONE_PMD,
	PTI_CLONE_PTE,
};

static void
pti_clone_pgtable(unsigned long start, unsigned long end,
		  enum pti_clone_level level)
{
	unsigned long addr;

	/*
	 * Clone the populated PMDs which cover start to end. These PMD areas
	 * can have holes.
	 */
	for (addr = start; addr < end;) {
		pte_t *pte, *target_pte;
		pmd_t *pmd, *target_pmd;
		pgd_t *pgd;
		p4d_t *p4d;
		pud_t *pud;

		/* Overflow check */
		if (addr < start)
			break;

		pgd = pgd_offset_k(addr);
		if (WARN_ON(pgd_none(*pgd)))
			return;
		p4d = p4d_offset(pgd, addr);
		if (WARN_ON(p4d_none(*p4d)))
			return;

		pud = pud_offset(p4d, addr);
		if (pud_none(*pud)) {
			addr += PUD_SIZE;
			continue;
		}

		pmd = pmd_offset(pud, addr);
		if (pmd_none(*pmd)) {
			addr += PMD_SIZE;
			continue;
		}

		if (pmd_large(*pmd) || level == PTI_CLONE_PMD) {
			target_pmd = pti_user_pagetable_walk_pmd(addr);
			if (WARN_ON(!target_pmd))
				return;

			/*
			 * Only clone present PMDs.  This ensures only setting
			 * _PAGE_GLOBAL on present PMDs.  This should only be
			 * called on well-known addresses anyway, so a non-
			 * present PMD would be a surprise.
			 */
			if (WARN_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT)))
				return;

			/*
			 * Setting 'target_pmd' below creates a mapping in both
			 * the user and kernel page tables.  It is effectively
			 * global, so set it as global in both copies.  Note:
			 * the X86_FEATURE_PGE check is not _required_ because
			 * the CPU ignores _PAGE_GLOBAL when PGE is not
			 * supported.  The check keeps consistency with
			 * code that only sets this bit when supported.
			 */
			if (boot_cpu_has(X86_FEATURE_PGE))
				*pmd = pmd_set_flags(*pmd, _PAGE_GLOBAL);

			/*
			 * Copy the PMD.  That is, the kernelmode and usermode
			 * tables will share the last-level page tables of this
			 * address range.
			 */
			*target_pmd = *pmd;

			addr += PMD_SIZE;

		} else if (level == PTI_CLONE_PTE) {

			/* Walk the page-table down to the pte level */
			pte = pte_offset_kernel(pmd, addr);
			if (pte_none(*pte)) {
				addr += PAGE_SIZE;
				continue;
			}

			/* Only clone present PTEs */
			if (WARN_ON(!(pte_flags(*pte) & _PAGE_PRESENT)))
				return;

			/* Allocate PTE in the user page-table */
			target_pte = pti_user_pagetable_walk_pte(addr);
			if (WARN_ON(!target_pte))
				return;

			/* Set GLOBAL bit in both PTEs */
			if (boot_cpu_has(X86_FEATURE_PGE))
				*pte = pte_set_flags(*pte, _PAGE_GLOBAL);

			/* Clone the PTE */
			*target_pte = *pte;

			addr += PAGE_SIZE;

		} else {
			BUG();
		}
	}
}

#ifdef CONFIG_X86_64
/*
 * Clone a single p4d (i.e. a top-level entry on 4-level systems and a
 * next-level entry on 5-level systems).
 */
static void __init pti_clone_p4d(unsigned long addr)
{
	p4d_t *kernel_p4d, *user_p4d;
	pgd_t *kernel_pgd;

	user_p4d = pti_user_pagetable_walk_p4d(addr);
	if (!user_p4d)
		return;

	kernel_pgd = pgd_offset_k(addr);
	kernel_p4d = p4d_offset(kernel_pgd, addr);
	*user_p4d = *kernel_p4d;
}

/*
 * Clone the CPU_ENTRY_AREA and associated data into the user space visible
 * page table.
 */
static void __init pti_clone_user_shared(void)
{
	unsigned int cpu;

	pti_clone_p4d(CPU_ENTRY_AREA_BASE);

	for_each_possible_cpu(cpu) {
		/*
		 * The SYSCALL64 entry code needs to be able to find the
		 * thread stack and needs one word of scratch space in which
		 * to spill a register.  All of this lives in the TSS, in
		 * the sp1 and sp2 slots.
		 *
		 * This is done for all possible CPUs during boot to ensure
		 * that it's propagated to all mms.  If we were to add one of
		 * these mappings during CPU hotplug, we would need to take
		 * some measure to make sure that every mm that subsequently
		 * ran on that CPU would have the relevant PGD entry in its
		 * pagetables.  The usual vmalloc_fault() mechanism would not
		 * work for page faults taken in entry_SYSCALL_64 before RSP
		 * is set up.
		 */

		unsigned long va = (unsigned long)&per_cpu(cpu_tss_rw, cpu);
		phys_addr_t pa = per_cpu_ptr_to_phys((void *)va);
		pte_t *target_pte;

		target_pte = pti_user_pagetable_walk_pte(va);
		if (WARN_ON(!target_pte))
			return;

		*target_pte = pfn_pte(pa >> PAGE_SHIFT, PAGE_KERNEL);
	}
}

#else /* CONFIG_X86_64 */

/*
 * On 32 bit PAE systems with 1GB of Kernel address space there is only
 * one pgd/p4d for the whole kernel. Cloning that would map the whole
 * address space into the user page-tables, making PTI useless. So clone
 * the page-table on the PMD level to prevent that.
 */
static void __init pti_clone_user_shared(void)
{
	unsigned long start, end;

	start = CPU_ENTRY_AREA_BASE;
	end = start + (PAGE_SIZE * CPU_ENTRY_AREA_PAGES);

	pti_clone_pgtable(start, end, PTI_CLONE_PMD);
}
#endif /* CONFIG_X86_64 */

/*
 * Clone the ESPFIX P4D into the user space visible page table
 */
static void __init pti_setup_espfix64(void)
{
#ifdef CONFIG_X86_ESPFIX64
	pti_clone_p4d(ESPFIX_BASE_ADDR);
#endif
}

/*
 * Clone the populated PMDs of the entry and irqentry text and force it RO.
 */
static void pti_clone_entry_text(void)
{
	pti_clone_pgtable((unsigned long) __entry_text_start,
			  (unsigned long) __irqentry_text_end,
			  PTI_CLONE_PMD);
}

/*
 * Global pages and PCIDs are both ways to make kernel TLB entries
 * live longer, reduce TLB misses and improve kernel performance.
 * But, leaving all kernel text Global makes it potentially accessible
 * to Meltdown-style attacks which make it trivial to find gadgets or
 * defeat KASLR.
 *
 * Only use global pages when it is really worth it.
 */
static inline bool pti_kernel_image_global_ok(void)
{
	/*
	 * Systems with PCIDs get little benefit from global
	 * kernel text and are not worth the downsides.
	 */
	if (cpu_feature_enabled(X86_FEATURE_PCID))
		return false;

	/*
	 * Only do global kernel image for pti=auto.  Do the most
	 * secure thing (not global) if pti=on is specified.
	 */
	if (pti_mode != PTI_AUTO)
		return false;

	/*
	 * K8 may not tolerate the cleared _PAGE_RW on the userspace
	 * global kernel image pages.  Do the safe thing (disable
	 * global kernel image).  This is unlikely to ever be
	 * noticed because PTI is disabled by default on AMD CPUs.
	 */
	if (boot_cpu_has(X86_FEATURE_K8))
		return false;

	/*
	 * RANDSTRUCT derives its hardening benefits from the
	 * attacker's lack of knowledge about the layout of kernel
	 * data structures.  Keep the kernel image non-global in
	 * cases where RANDSTRUCT is in use to help keep the layout a
	 * secret.
	 */
	if (IS_ENABLED(CONFIG_GCC_PLUGIN_RANDSTRUCT))
		return false;

	return true;
}

/*
 * This is the only user for these and it is not arch-generic
 * like the other set_memory.h functions.  Just extern them.
 */
extern int set_memory_nonglobal(unsigned long addr, int numpages);
extern int set_memory_global(unsigned long addr, int numpages);

/*
 * For some configurations, map all of kernel text into the user page
 * tables.  This reduces TLB misses, especially on non-PCID systems.
 */
static void pti_clone_kernel_text(void)
{
	/*
	 * rodata is part of the kernel image and is normally
	 * readable on the filesystem or on the web.  But, do not
	 * clone the areas past rodata, they might contain secrets.
	 */
	unsigned long start = PFN_ALIGN(_text);
	unsigned long end_clone  = (unsigned long)__end_rodata_aligned;
	unsigned long end_global = PFN_ALIGN((unsigned long)__stop___ex_table);

	if (!pti_kernel_image_global_ok())
		return;

	pr_debug("mapping partial kernel image into user address space\n");

	/*
	 * Note that this will undo _some_ of the work that
	 * pti_set_kernel_image_nonglobal() did to clear the
	 * global bit.
	 */
	pti_clone_pgtable(start, end_clone, PTI_LEVEL_KERNEL_IMAGE);

	/*
	 * pti_clone_pgtable() will set the global bit in any PMDs
	 * that it clones, but we also need to get any PTEs in
	 * the last level for areas that are not huge-page-aligned.
	 */

	/* Set the global bit for normal non-__init kernel text: */
	set_memory_global(start, (end_global - start) >> PAGE_SHIFT);
}

static void pti_set_kernel_image_nonglobal(void)
{
	/*
	 * The identity map is created with PMDs, regardless of the
	 * actual length of the kernel.  We need to clear
	 * _PAGE_GLOBAL up to a PMD boundary, not just to the end
	 * of the image.
	 */
	unsigned long start = PFN_ALIGN(_text);
	unsigned long end = ALIGN((unsigned long)_end, PMD_PAGE_SIZE);

	/*
	 * This clears _PAGE_GLOBAL from the entire kernel image.
	 * pti_clone_kernel_text() may put _PAGE_GLOBAL back for
	 * areas that are mapped to userspace.
	 */
	set_memory_nonglobal(start, (end - start) >> PAGE_SHIFT);
}
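
/*
 * Overview of what the functions below end up mirroring into the user
 * page-tables: the cpu_entry_area, the ESPFIX area (64-bit), the
 * vsyscall page (when emulation is enabled), the entry/irqentry text,
 * and, only when pti_kernel_image_global_ok() allows it, the kernel
 * image up to the end of rodata.
 */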

/*
 * Initialize kernel page table isolation
 */
void __init pti_init(void)
{
	if (!boot_cpu_has(X86_FEATURE_PTI))
		return;

	pr_info("enabled\n");

#ifdef CONFIG_X86_32
	/*
	 * We check for X86_FEATURE_PCID here. But the init-code will
	 * clear the feature flag on 32 bit because the feature is not
	 * supported on 32 bit anyway. To print the warning we need to
	 * check with cpuid directly again.
	 */
	if (cpuid_ecx(0x1) & BIT(17)) {
		/* Use printk to work around pr_fmt() */
		printk(KERN_WARNING "\n");
		printk(KERN_WARNING "************************************************************\n");
		printk(KERN_WARNING "** WARNING! WARNING! WARNING! WARNING! WARNING! WARNING!  **\n");
		printk(KERN_WARNING "**                                                        **\n");
		printk(KERN_WARNING "** You are using 32-bit PTI on a 64-bit PCID-capable CPU. **\n");
		printk(KERN_WARNING "** Your performance will increase dramatically if you     **\n");
		printk(KERN_WARNING "** switch to a 64-bit kernel!                             **\n");
		printk(KERN_WARNING "**                                                        **\n");
		printk(KERN_WARNING "** WARNING! WARNING! WARNING! WARNING! WARNING! WARNING!  **\n");
		printk(KERN_WARNING "************************************************************\n");
	}
#endif

	pti_clone_user_shared();

	/* Undo all global bits from the init pagetables in head_64.S: */
	pti_set_kernel_image_nonglobal();
	/* Replace some of the global bits just for shared entry text: */
	pti_clone_entry_text();
	pti_setup_espfix64();
	pti_setup_vsyscall();
}

/*
 * Finalize the kernel mappings in the userspace page-table. Some of the
 * mappings for the kernel image might have changed since pti_init()
 * cloned them. This is because parts of the kernel image have been
 * mapped RO and/or NX.  These changes need to be cloned again to the
 * userspace page-table.
 */
void pti_finalize(void)
{
	/*
	 * We need to clone everything (again) that maps parts of the
	 * kernel image.
	 */
	pti_clone_entry_text();
	pti_clone_kernel_text();

	debug_checkwx_user();
}
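
/*
 * A note on annotations: unlike the setup helpers above, pti_finalize()
 * and the two clone functions it calls are not __init.  pti_finalize()
 * runs late in boot, once the kernel image protections (RO/NX) are in
 * place, so the helpers it relies on must survive the freeing of init
 * memory.
 */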