/*
 * Copyright(c) 2017 Intel Corporation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * This code is based in part on work published here:
 *
 *	https://github.com/IAIK/KAISER
 *
 * The original work was written and signed off for the Linux kernel by:
 *
 * Signed-off-by: Richard Fellner <richard.fellner@student.tugraz.at>
 * Signed-off-by: Moritz Lipp <moritz.lipp@iaik.tugraz.at>
 * Signed-off-by: Daniel Gruss <daniel.gruss@iaik.tugraz.at>
 * Signed-off-by: Michael Schwarz <michael.schwarz@iaik.tugraz.at>
 *
 * Major changes to the original code by: Dave Hansen <dave.hansen@intel.com>
 * Mostly rewritten by Thomas Gleixner <tglx@linutronix.de> and
 *		       Andy Lutomirsky <luto@amacapital.net>
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/bug.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/uaccess.h>

#include <asm/cpufeature.h>
#include <asm/hypervisor.h>
#include <asm/vsyscall.h>
#include <asm/cmdline.h>
#include <asm/pti.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/desc.h>

#undef pr_fmt
#define pr_fmt(fmt)	"Kernel/User page tables isolation: " fmt

/* Backporting helper */
#ifndef __GFP_NOTRACK
#define __GFP_NOTRACK	0
#endif

static void __init pti_print_if_insecure(const char *reason)
{
	if (boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
		pr_info("%s\n", reason);
}

static void __init pti_print_if_secure(const char *reason)
{
	if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
		pr_info("%s\n", reason);
}

void __init pti_check_boottime_disable(void)
{
	char arg[5];
	int ret;

	if (hypervisor_is_type(X86_HYPER_XEN_PV)) {
		pti_print_if_insecure("disabled on XEN PV.");
		return;
	}

	ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
	if (ret > 0)  {
		if (ret == 3 && !strncmp(arg, "off", 3)) {
			pti_print_if_insecure("disabled on command line.");
			return;
		}
		if (ret == 2 && !strncmp(arg, "on", 2)) {
			pti_print_if_secure("force enabled on command line.");
			goto enable;
		}
		if (ret == 4 && !strncmp(arg, "auto", 4))
			goto autosel;
	}

	if (cmdline_find_option_bool(boot_command_line, "nopti")) {
		pti_print_if_insecure("disabled on command line.");
		return;
	}

autosel:
	if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
		return;
enable:
	setup_force_cpu_cap(X86_FEATURE_PTI);
}
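
/*
 * __pti_set_user_pgd() below relies on two helpers from the x86 pgtable
 * headers.  For reference, a minimal sketch of what they are assumed to
 * look like (the exact definitions live outside this file): the kernel
 * and user PGDs share one order-1 (8k) allocation with the kernel copy
 * first, and only the low half of each 4k PGD page maps userspace:
 *
 *	static inline bool pgdp_maps_userspace(void *__ptr)
 *	{
 *		unsigned long ptr = (unsigned long)__ptr;
 *
 *		// Userspace PGD entries occupy the lower half of the page
 *		return (ptr & ~PAGE_MASK) < (PAGE_SIZE / 2);
 *	}
 *
 *	static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp)
 *	{
 *		// The allocation is 8k-aligned, so setting the PAGE_SIZE
 *		// bit selects the second (user) copy of the PGD.
 *		return (pgd_t *)((unsigned long)pgdp | PAGE_SIZE);
 *	}
 */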

pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
{
	/*
	 * Changes to the high (kernel) portion of the kernelmode page
	 * tables are not automatically propagated to the usermode tables.
	 *
	 * Users should keep in mind that, unlike the kernelmode tables,
	 * there is no vmalloc_fault equivalent for the usermode tables.
	 * Top-level entries added to init_mm's usermode pgd after boot
	 * will not be automatically propagated to other mms.
	 */
	if (!pgdp_maps_userspace(pgdp))
		return pgd;

	/*
	 * The user page tables get the full PGD, accessible from
	 * userspace:
	 */
	kernel_to_user_pgdp(pgdp)->pgd = pgd.pgd;

	/*
	 * If this is normal user memory, make it NX in the kernel
	 * pagetables so that, if we somehow screw up and return to
	 * usermode with the kernel CR3 loaded, we'll get a page fault
	 * instead of allowing user code to execute with the wrong CR3.
	 *
	 * As exceptions, we don't set NX if:
	 *  - _PAGE_USER is not set.  This could be an executable
	 *    EFI runtime mapping or something similar, and the kernel
	 *    may execute from it
	 *  - we don't have NX support
	 *  - we're clearing the PGD (i.e. the new pgd is not present).
	 */
	if ((pgd.pgd & (_PAGE_USER|_PAGE_PRESENT)) == (_PAGE_USER|_PAGE_PRESENT) &&
	    (__supported_pte_mask & _PAGE_NX))
		pgd.pgd |= _PAGE_NX;

	/* return the copy of the PGD we want the kernel to use: */
	return pgd;
}
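
/*
 * For context, a sketch of how __pti_set_user_pgd() is expected to be
 * reached (an assumption about code outside this file, not something
 * defined here): the pgtable headers wrap it in a pti_set_user_pgd()
 * helper that the PGD setters use, roughly:
 *
 *	static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
 *	{
 *		if (!static_cpu_has(X86_FEATURE_PTI))
 *			return pgd;
 *		return __pti_set_user_pgd(pgdp, pgd);
 *	}
 *
 *	static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
 *	{
 *		*pgdp = pti_set_user_pgd(pgdp, pgd);
 *	}
 *
 * The usermode copy is written by __pti_set_user_pgd() itself; the
 * value it returns is what ends up in the kernelmode PGD.
 */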

/*
 * Walk the user copy of the page tables (optionally) trying to allocate
 * page table pages on the way down.
 *
 * Returns a pointer to a P4D on success, or NULL on failure.
 */
static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
{
	pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address));
	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);

	if (address < PAGE_OFFSET) {
		WARN_ONCE(1, "attempt to walk user address\n");
		return NULL;
	}

	if (pgd_none(*pgd)) {
		unsigned long new_p4d_page = __get_free_page(gfp);
		if (!new_p4d_page)
			return NULL;

		if (pgd_none(*pgd)) {
			set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
			new_p4d_page = 0;
		}
		if (new_p4d_page)
			free_page(new_p4d_page);
	}
	BUILD_BUG_ON(pgd_large(*pgd) != 0);

	return p4d_offset(pgd, address);
}

/*
 * Walk the user copy of the page tables (optionally) trying to allocate
 * page table pages on the way down.
 *
 * Returns a pointer to a PMD on success, or NULL on failure.
 */
static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
{
	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
	p4d_t *p4d = pti_user_pagetable_walk_p4d(address);
	pud_t *pud;

	BUILD_BUG_ON(p4d_large(*p4d) != 0);
	if (p4d_none(*p4d)) {
		unsigned long new_pud_page = __get_free_page(gfp);
		if (!new_pud_page)
			return NULL;

		if (p4d_none(*p4d)) {
			set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
			new_pud_page = 0;
		}
		if (new_pud_page)
			free_page(new_pud_page);
	}

	pud = pud_offset(p4d, address);
	/* The user page tables do not use large mappings: */
	if (pud_large(*pud)) {
		WARN_ON(1);
		return NULL;
	}
	if (pud_none(*pud)) {
		unsigned long new_pmd_page = __get_free_page(gfp);
		if (!new_pmd_page)
			return NULL;

		if (pud_none(*pud)) {
			set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
			new_pmd_page = 0;
		}
		if (new_pmd_page)
			free_page(new_pmd_page);
	}

	return pmd_offset(pud, address);
}

#ifdef CONFIG_X86_VSYSCALL_EMULATION
/*
 * Walk the shadow copy of the page tables (optionally) trying to allocate
 * page table pages on the way down.  Does not support large pages.
 *
 * Note: this is only used when mapping *new* kernel data into the
 * user/shadow page tables.  It is never used for userspace data.
 *
 * Returns a pointer to a PTE on success, or NULL on failure.
 */
static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
{
	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
	pmd_t *pmd = pti_user_pagetable_walk_pmd(address);
	pte_t *pte;

	/* We can't do anything sensible if we hit a large mapping. */
	if (pmd_large(*pmd)) {
		WARN_ON(1);
		return NULL;
	}

	if (pmd_none(*pmd)) {
		unsigned long new_pte_page = __get_free_page(gfp);
		if (!new_pte_page)
			return NULL;

		if (pmd_none(*pmd)) {
			set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
			new_pte_page = 0;
		}
		if (new_pte_page)
			free_page(new_pte_page);
	}

	pte = pte_offset_kernel(pmd, address);
	if (pte_flags(*pte) & _PAGE_USER) {
		WARN_ONCE(1, "attempt to walk to user pte\n");
		return NULL;
	}
	return pte;
}

static void __init pti_setup_vsyscall(void)
{
	pte_t *pte, *target_pte;
	unsigned int level;

	pte = lookup_address(VSYSCALL_ADDR, &level);
	if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte))
		return;

	target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR);
	if (WARN_ON(!target_pte))
		return;

	*target_pte = *pte;
	set_vsyscall_pgtable_user_bits(kernel_to_user_pgdp(swapper_pg_dir));
}
#else
static void __init pti_setup_vsyscall(void) { }
#endif

static void __init
pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear)
{
	unsigned long addr;

	/*
	 * Clone the populated PMDs which cover start to end. These PMD areas
	 * can have holes.
	 */
	for (addr = start; addr < end; addr += PMD_SIZE) {
		pmd_t *pmd, *target_pmd;
		pgd_t *pgd;
		p4d_t *p4d;
		pud_t *pud;

		pgd = pgd_offset_k(addr);
		if (WARN_ON(pgd_none(*pgd)))
			return;
		p4d = p4d_offset(pgd, addr);
		if (WARN_ON(p4d_none(*p4d)))
			return;
		pud = pud_offset(p4d, addr);
		if (pud_none(*pud))
			continue;
		pmd = pmd_offset(pud, addr);
		if (pmd_none(*pmd))
			continue;

		target_pmd = pti_user_pagetable_walk_pmd(addr);
		if (WARN_ON(!target_pmd))
			return;

		/*
		 * Copy the PMD.  That is, the kernelmode and usermode
		 * tables will share the last-level page tables of this
		 * address range.
		 */
		*target_pmd = pmd_clear_flags(*pmd, clear);
	}
}

/*
 * Clone a single p4d (i.e. a top-level entry on 4-level systems and a
 * next-level entry on 5-level systems).
 */
static void __init pti_clone_p4d(unsigned long addr)
{
	p4d_t *kernel_p4d, *user_p4d;
	pgd_t *kernel_pgd;

	user_p4d = pti_user_pagetable_walk_p4d(addr);
	kernel_pgd = pgd_offset_k(addr);
	kernel_p4d = p4d_offset(kernel_pgd, addr);
	*user_p4d = *kernel_p4d;
}

/*
 * Clone the CPU_ENTRY_AREA into the user space visible page table.
 */
static void __init pti_clone_user_shared(void)
{
	pti_clone_p4d(CPU_ENTRY_AREA_BASE);
}

/*
 * Clone the ESPFIX P4D into the user space visible page table.
 */
static void __init pti_setup_espfix64(void)
{
#ifdef CONFIG_X86_ESPFIX64
	pti_clone_p4d(ESPFIX_BASE_ADDR);
#endif
}

/*
 * Clone the populated PMDs of the entry and irqentry text and force it RO.
 */
static void __init pti_clone_entry_text(void)
{
	pti_clone_pmds((unsigned long) __entry_text_start,
		       (unsigned long) __irqentry_text_end, _PAGE_RW);
}

/*
 * Initialize kernel page table isolation
 */
void __init pti_init(void)
{
	if (!static_cpu_has(X86_FEATURE_PTI))
		return;

	pr_info("enabled\n");

	pti_clone_user_shared();
	pti_clone_entry_text();
	pti_setup_espfix64();
	pti_setup_vsyscall();
}