/*
 * Copyright(c) 2017 Intel Corporation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * This code is based in part on work published here:
 *
 *	https://github.com/IAIK/KAISER
 *
 * The original work was written by, and signed off for the Linux
 * kernel by:
 *
 *   Signed-off-by: Richard Fellner <richard.fellner@student.tugraz.at>
 *   Signed-off-by: Moritz Lipp <moritz.lipp@iaik.tugraz.at>
 *   Signed-off-by: Daniel Gruss <daniel.gruss@iaik.tugraz.at>
 *   Signed-off-by: Michael Schwarz <michael.schwarz@iaik.tugraz.at>
 *
 * Major changes to the original code by: Dave Hansen <dave.hansen@intel.com>
 * Mostly rewritten by Thomas Gleixner <tglx@linutronix.de> and
 *		       Andy Lutomirsky <luto@amacapital.net>
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/bug.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/uaccess.h>

#include <asm/cpufeature.h>
#include <asm/hypervisor.h>
#include <asm/vsyscall.h>
#include <asm/cmdline.h>
#include <asm/pti.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/desc.h>

#undef pr_fmt
#define pr_fmt(fmt)	"Kernel/User page tables isolation: " fmt

/* Backporting helper */
#ifndef __GFP_NOTRACK
#define __GFP_NOTRACK	0
#endif

static void __init pti_print_if_insecure(const char *reason)
{
	if (boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
		pr_info("%s\n", reason);
}

static void __init pti_print_if_secure(const char *reason)
{
	if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
		pr_info("%s\n", reason);
}

void __init pti_check_boottime_disable(void)
{
	char arg[5];
	int ret;

	if (hypervisor_is_type(X86_HYPER_XEN_PV)) {
		pti_print_if_insecure("disabled on XEN PV.");
		return;
	}

	ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
	if (ret > 0)  {
		if (ret == 3 && !strncmp(arg, "off", 3)) {
			pti_print_if_insecure("disabled on command line.");
			return;
		}
		if (ret == 2 && !strncmp(arg, "on", 2)) {
			pti_print_if_secure("force enabled on command line.");
			goto enable;
		}
		if (ret == 4 && !strncmp(arg, "auto", 4))
			goto autosel;
	}

	if (cmdline_find_option_bool(boot_command_line, "nopti")) {
		pti_print_if_insecure("disabled on command line.");
		return;
	}

autosel:
	if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
		return;
enable:
	setup_force_cpu_cap(X86_FEATURE_PTI);
}

pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
{
	/*
	 * Changes to the high (kernel) portion of the kernelmode page
	 * tables are not automatically propagated to the usermode tables.
	 *
	 * Users should keep in mind that, unlike the kernelmode tables,
	 * there is no vmalloc_fault equivalent for the usermode tables.
	 * Top-level entries added to init_mm's usermode pgd after boot
	 * will not be automatically propagated to other mms.
	 */
	if (!pgdp_maps_userspace(pgdp))
		return pgd;

	/*
	 * The user page tables get the full PGD, accessible from
	 * userspace:
	 */
	kernel_to_user_pgdp(pgdp)->pgd = pgd.pgd;

	/*
	 * If this is normal user memory, make it NX in the kernel
	 * pagetables so that, if we somehow screw up and return to
	 * usermode with the kernel CR3 loaded, we'll get a page fault
	 * instead of allowing user code to execute with the wrong CR3.
	 *
	 * As exceptions, we don't set NX if:
	 *  - _PAGE_USER is not set.  This could be an executable
	 *    EFI runtime mapping or something similar, and the kernel
	 *    may execute from it
	 *  - we don't have NX support
	 *  - we're clearing the PGD (i.e. the new pgd is not present).
	 */
	if ((pgd.pgd & (_PAGE_USER|_PAGE_PRESENT)) == (_PAGE_USER|_PAGE_PRESENT) &&
	    (__supported_pte_mask & _PAGE_NX))
		pgd.pgd |= _PAGE_NX;

	/* return the copy of the PGD we want the kernel to use: */
	return pgd;
}

/*
 * Walk the user copy of the page tables (optionally) trying to allocate
 * page table pages on the way down.
 *
 * Returns a pointer to a P4D on success, or NULL on failure.
 */
static __init p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
{
	pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address));
	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);

	if (address < PAGE_OFFSET) {
		WARN_ONCE(1, "attempt to walk user address\n");
		return NULL;
	}

	if (pgd_none(*pgd)) {
		unsigned long new_p4d_page = __get_free_page(gfp);
		if (!new_p4d_page)
			return NULL;

		set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
	}
	BUILD_BUG_ON(pgd_large(*pgd) != 0);

	return p4d_offset(pgd, address);
}

/*
 * Walk the user copy of the page tables (optionally) trying to allocate
 * page table pages on the way down.
 *
 * Returns a pointer to a PMD on success, or NULL on failure.
 */
static __init pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
{
	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
	p4d_t *p4d = pti_user_pagetable_walk_p4d(address);
	pud_t *pud;

	BUILD_BUG_ON(p4d_large(*p4d) != 0);
	if (p4d_none(*p4d)) {
		unsigned long new_pud_page = __get_free_page(gfp);
		if (!new_pud_page)
			return NULL;

		set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
	}

	pud = pud_offset(p4d, address);
	/* The user page tables do not use large mappings: */
	if (pud_large(*pud)) {
		WARN_ON(1);
		return NULL;
	}
	if (pud_none(*pud)) {
		unsigned long new_pmd_page = __get_free_page(gfp);
		if (!new_pmd_page)
			return NULL;

		set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
	}

	return pmd_offset(pud, address);
}

#ifdef CONFIG_X86_VSYSCALL_EMULATION
/*
 * Walk the shadow copy of the page tables (optionally) trying to allocate
 * page table pages on the way down.  Does not support large pages.
 *
 * Note: this is only used when mapping *new* kernel data into the
 * user/shadow page tables.  It is never used for userspace data.
 *
 * Returns a pointer to a PTE on success, or NULL on failure.
 */
static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
{
	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
	pmd_t *pmd = pti_user_pagetable_walk_pmd(address);
	pte_t *pte;

	/* We can't do anything sensible if we hit a large mapping. */
	if (pmd_large(*pmd)) {
		WARN_ON(1);
		return NULL;
	}

	if (pmd_none(*pmd)) {
		unsigned long new_pte_page = __get_free_page(gfp);
		if (!new_pte_page)
			return NULL;

		set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
	}

	pte = pte_offset_kernel(pmd, address);
	if (pte_flags(*pte) & _PAGE_USER) {
		WARN_ONCE(1, "attempt to walk to user pte\n");
		return NULL;
	}
	return pte;
}

static void __init pti_setup_vsyscall(void)
{
	pte_t *pte, *target_pte;
	unsigned int level;

	pte = lookup_address(VSYSCALL_ADDR, &level);
	if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte))
		return;

	target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR);
	if (WARN_ON(!target_pte))
		return;

	*target_pte = *pte;
	set_vsyscall_pgtable_user_bits(kernel_to_user_pgdp(swapper_pg_dir));
}
#else
static void __init pti_setup_vsyscall(void) { }
#endif

static void __init
pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear)
{
	unsigned long addr;

	/*
	 * Clone the populated PMDs which cover start to end. These PMD areas
	 * can have holes.
	 */
	for (addr = start; addr < end; addr += PMD_SIZE) {
		pmd_t *pmd, *target_pmd;
		pgd_t *pgd;
		p4d_t *p4d;
		pud_t *pud;

		pgd = pgd_offset_k(addr);
		if (WARN_ON(pgd_none(*pgd)))
			return;
		p4d = p4d_offset(pgd, addr);
		if (WARN_ON(p4d_none(*p4d)))
			return;
		pud = pud_offset(p4d, addr);
		if (pud_none(*pud))
			continue;
		pmd = pmd_offset(pud, addr);
		if (pmd_none(*pmd))
			continue;

		target_pmd = pti_user_pagetable_walk_pmd(addr);
		if (WARN_ON(!target_pmd))
			return;

		/*
		 * Copy the PMD.  That is, the kernelmode and usermode
		 * tables will share the last-level page tables of this
		 * address range
		 */
		*target_pmd = pmd_clear_flags(*pmd, clear);
	}
}

/*
 * Clone a single p4d (i.e. a top-level entry on 4-level systems and a
 * next-level entry on 5-level systems).
 */
static void __init pti_clone_p4d(unsigned long addr)
{
	p4d_t *kernel_p4d, *user_p4d;
	pgd_t *kernel_pgd;

	user_p4d = pti_user_pagetable_walk_p4d(addr);
	kernel_pgd = pgd_offset_k(addr);
	kernel_p4d = p4d_offset(kernel_pgd, addr);
	*user_p4d = *kernel_p4d;
}

/*
 * Clone the CPU_ENTRY_AREA into the user space visible page table.
 */
static void __init pti_clone_user_shared(void)
{
	pti_clone_p4d(CPU_ENTRY_AREA_BASE);
}

/*
 * Clone the ESPFIX P4D into the user space visible page table
 */
static void __init pti_setup_espfix64(void)
{
#ifdef CONFIG_X86_ESPFIX64
	pti_clone_p4d(ESPFIX_BASE_ADDR);
#endif
}

/*
 * Clone the populated PMDs of the entry and irqentry text and force it RO.
 */
static void __init pti_clone_entry_text(void)
{
	pti_clone_pmds((unsigned long) __entry_text_start,
		       (unsigned long) __irqentry_text_end,
		       _PAGE_RW | _PAGE_GLOBAL);
}

/*
 * Initialize kernel page table isolation
 */
void __init pti_init(void)
{
	if (!static_cpu_has(X86_FEATURE_PTI))
		return;

	pr_info("enabled\n");

	pti_clone_user_shared();
	pti_clone_entry_text();
	pti_setup_espfix64();
	pti_setup_vsyscall();
}