1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Debug helper to dump the current kernel pagetables of the system 4 * so that we can see what the various memory ranges are set to. 5 * 6 * (C) Copyright 2008 Intel Corporation 7 * 8 * Author: Arjan van de Ven <arjan@linux.intel.com> 9 */ 10 11 #include <linux/debugfs.h> 12 #include <linux/kasan.h> 13 #include <linux/mm.h> 14 #include <linux/init.h> 15 #include <linux/sched.h> 16 #include <linux/seq_file.h> 17 #include <linux/highmem.h> 18 #include <linux/pci.h> 19 #include <linux/ptdump.h> 20 21 #include <asm/e820/types.h> 22 #include <asm/pgtable.h> 23 24 /* 25 * The dumper groups pagetable entries of the same type into one, and for 26 * that it needs to keep some state when walking, and flush this state 27 * when a "break" in the continuity is found. 28 */ 29 struct pg_state { 30 struct ptdump_state ptdump; 31 int level; 32 pgprotval_t current_prot; 33 pgprotval_t effective_prot; 34 pgprotval_t prot_levels[5]; 35 unsigned long start_address; 36 const struct addr_marker *marker; 37 unsigned long lines; 38 bool to_dmesg; 39 bool check_wx; 40 unsigned long wx_pages; 41 struct seq_file *seq; 42 }; 43 44 struct addr_marker { 45 unsigned long start_address; 46 const char *name; 47 unsigned long max_lines; 48 }; 49 50 /* Address space markers hints */ 51 52 #ifdef CONFIG_X86_64 53 54 enum address_markers_idx { 55 USER_SPACE_NR = 0, 56 KERNEL_SPACE_NR, 57 #ifdef CONFIG_MODIFY_LDT_SYSCALL 58 LDT_NR, 59 #endif 60 LOW_KERNEL_NR, 61 VMALLOC_START_NR, 62 VMEMMAP_START_NR, 63 #ifdef CONFIG_KASAN 64 KASAN_SHADOW_START_NR, 65 KASAN_SHADOW_END_NR, 66 #endif 67 CPU_ENTRY_AREA_NR, 68 #ifdef CONFIG_X86_ESPFIX64 69 ESPFIX_START_NR, 70 #endif 71 #ifdef CONFIG_EFI 72 EFI_END_NR, 73 #endif 74 HIGH_KERNEL_NR, 75 MODULES_VADDR_NR, 76 MODULES_END_NR, 77 FIXADDR_START_NR, 78 END_OF_SPACE_NR, 79 }; 80 81 static struct addr_marker address_markers[] = { 82 [USER_SPACE_NR] = { 0, "User Space" }, 83 [KERNEL_SPACE_NR] = { (1UL << 63), "Kernel Space" }, 84 [LOW_KERNEL_NR] = { 0UL, "Low Kernel Mapping" }, 85 [VMALLOC_START_NR] = { 0UL, "vmalloc() Area" }, 86 [VMEMMAP_START_NR] = { 0UL, "Vmemmap" }, 87 #ifdef CONFIG_KASAN 88 /* 89 * These fields get initialized with the (dynamic) 90 * KASAN_SHADOW_{START,END} values in pt_dump_init(). 91 */ 92 [KASAN_SHADOW_START_NR] = { 0UL, "KASAN shadow" }, 93 [KASAN_SHADOW_END_NR] = { 0UL, "KASAN shadow end" }, 94 #endif 95 #ifdef CONFIG_MODIFY_LDT_SYSCALL 96 [LDT_NR] = { 0UL, "LDT remap" }, 97 #endif 98 [CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" }, 99 #ifdef CONFIG_X86_ESPFIX64 100 [ESPFIX_START_NR] = { ESPFIX_BASE_ADDR, "ESPfix Area", 16 }, 101 #endif 102 #ifdef CONFIG_EFI 103 [EFI_END_NR] = { EFI_VA_END, "EFI Runtime Services" }, 104 #endif 105 [HIGH_KERNEL_NR] = { __START_KERNEL_map, "High Kernel Mapping" }, 106 [MODULES_VADDR_NR] = { MODULES_VADDR, "Modules" }, 107 [MODULES_END_NR] = { MODULES_END, "End Modules" }, 108 [FIXADDR_START_NR] = { FIXADDR_START, "Fixmap Area" }, 109 [END_OF_SPACE_NR] = { -1, NULL } 110 }; 111 112 #define INIT_PGD ((pgd_t *) &init_top_pgt) 113 114 #else /* CONFIG_X86_64 */ 115 116 enum address_markers_idx { 117 USER_SPACE_NR = 0, 118 KERNEL_SPACE_NR, 119 VMALLOC_START_NR, 120 VMALLOC_END_NR, 121 #ifdef CONFIG_HIGHMEM 122 PKMAP_BASE_NR, 123 #endif 124 #ifdef CONFIG_MODIFY_LDT_SYSCALL 125 LDT_NR, 126 #endif 127 CPU_ENTRY_AREA_NR, 128 FIXADDR_START_NR, 129 END_OF_SPACE_NR, 130 }; 131 132 static struct addr_marker address_markers[] = { 133 [USER_SPACE_NR] = { 0, "User Space" }, 134 [KERNEL_SPACE_NR] = { PAGE_OFFSET, "Kernel Mapping" }, 135 [VMALLOC_START_NR] = { 0UL, "vmalloc() Area" }, 136 [VMALLOC_END_NR] = { 0UL, "vmalloc() End" }, 137 #ifdef CONFIG_HIGHMEM 138 [PKMAP_BASE_NR] = { 0UL, "Persistent kmap() Area" }, 139 #endif 140 #ifdef CONFIG_MODIFY_LDT_SYSCALL 141 [LDT_NR] = { 0UL, "LDT remap" }, 142 #endif 143 [CPU_ENTRY_AREA_NR] = { 0UL, "CPU entry area" }, 144 [FIXADDR_START_NR] = { 0UL, "Fixmap area" }, 145 [END_OF_SPACE_NR] = { -1, NULL } 146 }; 147 148 #define INIT_PGD (swapper_pg_dir) 149 150 #endif /* !CONFIG_X86_64 */ 151 152 /* Multipliers for offsets within the PTEs */ 153 #define PTE_LEVEL_MULT (PAGE_SIZE) 154 #define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT) 155 #define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT) 156 #define P4D_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT) 157 #define PGD_LEVEL_MULT (PTRS_PER_P4D * P4D_LEVEL_MULT) 158 159 #define pt_dump_seq_printf(m, to_dmesg, fmt, args...) \ 160 ({ \ 161 if (to_dmesg) \ 162 printk(KERN_INFO fmt, ##args); \ 163 else \ 164 if (m) \ 165 seq_printf(m, fmt, ##args); \ 166 }) 167 168 #define pt_dump_cont_printf(m, to_dmesg, fmt, args...) \ 169 ({ \ 170 if (to_dmesg) \ 171 printk(KERN_CONT fmt, ##args); \ 172 else \ 173 if (m) \ 174 seq_printf(m, fmt, ##args); \ 175 }) 176 177 /* 178 * Print a readable form of a pgprot_t to the seq_file 179 */ 180 static void printk_prot(struct seq_file *m, pgprotval_t pr, int level, bool dmsg) 181 { 182 static const char * const level_name[] = 183 { "pgd", "p4d", "pud", "pmd", "pte" }; 184 185 if (!(pr & _PAGE_PRESENT)) { 186 /* Not present */ 187 pt_dump_cont_printf(m, dmsg, " "); 188 } else { 189 if (pr & _PAGE_USER) 190 pt_dump_cont_printf(m, dmsg, "USR "); 191 else 192 pt_dump_cont_printf(m, dmsg, " "); 193 if (pr & _PAGE_RW) 194 pt_dump_cont_printf(m, dmsg, "RW "); 195 else 196 pt_dump_cont_printf(m, dmsg, "ro "); 197 if (pr & _PAGE_PWT) 198 pt_dump_cont_printf(m, dmsg, "PWT "); 199 else 200 pt_dump_cont_printf(m, dmsg, " "); 201 if (pr & _PAGE_PCD) 202 pt_dump_cont_printf(m, dmsg, "PCD "); 203 else 204 pt_dump_cont_printf(m, dmsg, " "); 205 206 /* Bit 7 has a different meaning on level 3 vs 4 */ 207 if (level <= 3 && pr & _PAGE_PSE) 208 pt_dump_cont_printf(m, dmsg, "PSE "); 209 else 210 pt_dump_cont_printf(m, dmsg, " "); 211 if ((level == 4 && pr & _PAGE_PAT) || 212 ((level == 3 || level == 2) && pr & _PAGE_PAT_LARGE)) 213 pt_dump_cont_printf(m, dmsg, "PAT "); 214 else 215 pt_dump_cont_printf(m, dmsg, " "); 216 if (pr & _PAGE_GLOBAL) 217 pt_dump_cont_printf(m, dmsg, "GLB "); 218 else 219 pt_dump_cont_printf(m, dmsg, " "); 220 if (pr & _PAGE_NX) 221 pt_dump_cont_printf(m, dmsg, "NX "); 222 else 223 pt_dump_cont_printf(m, dmsg, "x "); 224 } 225 pt_dump_cont_printf(m, dmsg, "%s\n", level_name[level]); 226 } 227 228 static void note_wx(struct pg_state *st, unsigned long addr) 229 { 230 unsigned long npages; 231 232 npages = (addr - st->start_address) / PAGE_SIZE; 233 234 #ifdef CONFIG_PCI_BIOS 235 /* 236 * If PCI BIOS is enabled, the PCI BIOS area is forced to WX. 237 * Inform about it, but avoid the warning. 238 */ 239 if (pcibios_enabled && st->start_address >= PAGE_OFFSET + BIOS_BEGIN && 240 addr <= PAGE_OFFSET + BIOS_END) { 241 pr_warn_once("x86/mm: PCI BIOS W+X mapping %lu pages\n", npages); 242 return; 243 } 244 #endif 245 /* Account the WX pages */ 246 st->wx_pages += npages; 247 WARN_ONCE(__supported_pte_mask & _PAGE_NX, 248 "x86/mm: Found insecure W+X mapping at address %pS\n", 249 (void *)st->start_address); 250 } 251 252 static inline pgprotval_t effective_prot(pgprotval_t prot1, pgprotval_t prot2) 253 { 254 return (prot1 & prot2 & (_PAGE_USER | _PAGE_RW)) | 255 ((prot1 | prot2) & _PAGE_NX); 256 } 257 258 /* 259 * This function gets called on a break in a continuous series 260 * of PTE entries; the next one is different so we need to 261 * print what we collected so far. 262 */ 263 static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, 264 unsigned long val) 265 { 266 struct pg_state *st = container_of(pt_st, struct pg_state, ptdump); 267 pgprotval_t new_prot, new_eff; 268 pgprotval_t cur, eff; 269 static const char units[] = "BKMGTPE"; 270 struct seq_file *m = st->seq; 271 272 new_prot = val & PTE_FLAGS_MASK; 273 274 if (level > 0) { 275 new_eff = effective_prot(st->prot_levels[level - 1], 276 new_prot); 277 } else { 278 new_eff = new_prot; 279 } 280 281 if (level >= 0) 282 st->prot_levels[level] = new_eff; 283 284 /* 285 * If we have a "break" in the series, we need to flush the state that 286 * we have now. "break" is either changing perms, levels or 287 * address space marker. 288 */ 289 cur = st->current_prot; 290 eff = st->effective_prot; 291 292 if (st->level == -1) { 293 /* First entry */ 294 st->current_prot = new_prot; 295 st->effective_prot = new_eff; 296 st->level = level; 297 st->marker = address_markers; 298 st->lines = 0; 299 pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n", 300 st->marker->name); 301 } else if (new_prot != cur || new_eff != eff || level != st->level || 302 addr >= st->marker[1].start_address) { 303 const char *unit = units; 304 unsigned long delta; 305 int width = sizeof(unsigned long) * 2; 306 307 if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX)) 308 note_wx(st, addr); 309 310 /* 311 * Now print the actual finished series 312 */ 313 if (!st->marker->max_lines || 314 st->lines < st->marker->max_lines) { 315 pt_dump_seq_printf(m, st->to_dmesg, 316 "0x%0*lx-0x%0*lx ", 317 width, st->start_address, 318 width, addr); 319 320 delta = addr - st->start_address; 321 while (!(delta & 1023) && unit[1]) { 322 delta >>= 10; 323 unit++; 324 } 325 pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ", 326 delta, *unit); 327 printk_prot(m, st->current_prot, st->level, 328 st->to_dmesg); 329 } 330 st->lines++; 331 332 /* 333 * We print markers for special areas of address space, 334 * such as the start of vmalloc space etc. 335 * This helps in the interpretation. 336 */ 337 if (addr >= st->marker[1].start_address) { 338 if (st->marker->max_lines && 339 st->lines > st->marker->max_lines) { 340 unsigned long nskip = 341 st->lines - st->marker->max_lines; 342 pt_dump_seq_printf(m, st->to_dmesg, 343 "... %lu entr%s skipped ... \n", 344 nskip, 345 nskip == 1 ? "y" : "ies"); 346 } 347 st->marker++; 348 st->lines = 0; 349 pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n", 350 st->marker->name); 351 } 352 353 st->start_address = addr; 354 st->current_prot = new_prot; 355 st->effective_prot = new_eff; 356 st->level = level; 357 } 358 } 359 360 static void ptdump_walk_pgd_level_core(struct seq_file *m, 361 struct mm_struct *mm, pgd_t *pgd, 362 bool checkwx, bool dmesg) 363 { 364 const struct ptdump_range ptdump_ranges[] = { 365 #ifdef CONFIG_X86_64 366 367 #define normalize_addr_shift (64 - (__VIRTUAL_MASK_SHIFT + 1)) 368 #define normalize_addr(u) ((signed long)((u) << normalize_addr_shift) >> \ 369 normalize_addr_shift) 370 371 {0, PTRS_PER_PGD * PGD_LEVEL_MULT / 2}, 372 {normalize_addr(PTRS_PER_PGD * PGD_LEVEL_MULT / 2), ~0UL}, 373 #else 374 {0, ~0UL}, 375 #endif 376 {0, 0} 377 }; 378 379 struct pg_state st = { 380 .ptdump = { 381 .note_page = note_page, 382 .range = ptdump_ranges 383 }, 384 .level = -1, 385 .to_dmesg = dmesg, 386 .check_wx = checkwx, 387 .seq = m 388 }; 389 390 ptdump_walk_pgd(&st.ptdump, mm, pgd); 391 392 if (!checkwx) 393 return; 394 if (st.wx_pages) 395 pr_info("x86/mm: Checked W+X mappings: FAILED, %lu W+X pages found.\n", 396 st.wx_pages); 397 else 398 pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n"); 399 } 400 401 void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm) 402 { 403 ptdump_walk_pgd_level_core(m, mm, mm->pgd, false, true); 404 } 405 406 void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm, 407 bool user) 408 { 409 pgd_t *pgd = mm->pgd; 410 #ifdef CONFIG_PAGE_TABLE_ISOLATION 411 if (user && boot_cpu_has(X86_FEATURE_PTI)) 412 pgd = kernel_to_user_pgdp(pgd); 413 #endif 414 ptdump_walk_pgd_level_core(m, mm, pgd, false, false); 415 } 416 EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs); 417 418 void ptdump_walk_user_pgd_level_checkwx(void) 419 { 420 #ifdef CONFIG_PAGE_TABLE_ISOLATION 421 pgd_t *pgd = INIT_PGD; 422 423 if (!(__supported_pte_mask & _PAGE_NX) || 424 !boot_cpu_has(X86_FEATURE_PTI)) 425 return; 426 427 pr_info("x86/mm: Checking user space page tables\n"); 428 pgd = kernel_to_user_pgdp(pgd); 429 ptdump_walk_pgd_level_core(NULL, &init_mm, pgd, true, false); 430 #endif 431 } 432 433 void ptdump_walk_pgd_level_checkwx(void) 434 { 435 ptdump_walk_pgd_level_core(NULL, &init_mm, INIT_PGD, true, false); 436 } 437 438 static int __init pt_dump_init(void) 439 { 440 /* 441 * Various markers are not compile-time constants, so assign them 442 * here. 443 */ 444 #ifdef CONFIG_X86_64 445 address_markers[LOW_KERNEL_NR].start_address = PAGE_OFFSET; 446 address_markers[VMALLOC_START_NR].start_address = VMALLOC_START; 447 address_markers[VMEMMAP_START_NR].start_address = VMEMMAP_START; 448 #ifdef CONFIG_MODIFY_LDT_SYSCALL 449 address_markers[LDT_NR].start_address = LDT_BASE_ADDR; 450 #endif 451 #ifdef CONFIG_KASAN 452 address_markers[KASAN_SHADOW_START_NR].start_address = KASAN_SHADOW_START; 453 address_markers[KASAN_SHADOW_END_NR].start_address = KASAN_SHADOW_END; 454 #endif 455 #endif 456 #ifdef CONFIG_X86_32 457 address_markers[VMALLOC_START_NR].start_address = VMALLOC_START; 458 address_markers[VMALLOC_END_NR].start_address = VMALLOC_END; 459 # ifdef CONFIG_HIGHMEM 460 address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE; 461 # endif 462 address_markers[FIXADDR_START_NR].start_address = FIXADDR_START; 463 address_markers[CPU_ENTRY_AREA_NR].start_address = CPU_ENTRY_AREA_BASE; 464 # ifdef CONFIG_MODIFY_LDT_SYSCALL 465 address_markers[LDT_NR].start_address = LDT_BASE_ADDR; 466 # endif 467 #endif 468 return 0; 469 } 470 __initcall(pt_dump_init); 471