/*
 * Debug helper to dump the current kernel pagetables of the system
 * so that we can see what the various memory ranges are set to.
 *
 * (C) Copyright 2008 Intel Corporation
 *
 * Author: Arjan van de Ven <arjan@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; version 2
 * of the License.
 */

#include <linux/debugfs.h>
#include <linux/kasan.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/seq_file.h>

#include <asm/pgtable.h>

/*
 * The dumper groups pagetable entries of the same type into one, and for
 * that it needs to keep some state when walking, and flush this state
 * when a "break" in the continuity is found.
 */
struct pg_state {
	int level;
	pgprot_t current_prot;
	pgprotval_t effective_prot;
	unsigned long start_address;
	unsigned long current_address;
	const struct addr_marker *marker;
	unsigned long lines;
	bool to_dmesg;
	bool check_wx;
	unsigned long wx_pages;
};

struct addr_marker {
	unsigned long start_address;
	const char *name;
	unsigned long max_lines;
};

/* Address space markers hints */

#ifdef CONFIG_X86_64

enum address_markers_idx {
	USER_SPACE_NR = 0,
	KERNEL_SPACE_NR,
	LOW_KERNEL_NR,
#if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL)
	LDT_NR,
#endif
	VMALLOC_START_NR,
	VMEMMAP_START_NR,
#ifdef CONFIG_KASAN
	KASAN_SHADOW_START_NR,
	KASAN_SHADOW_END_NR,
#endif
	CPU_ENTRY_AREA_NR,
#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL)
	LDT_NR,
#endif
#ifdef CONFIG_X86_ESPFIX64
	ESPFIX_START_NR,
#endif
#ifdef CONFIG_EFI
	EFI_END_NR,
#endif
	HIGH_KERNEL_NR,
	MODULES_VADDR_NR,
	MODULES_END_NR,
	FIXADDR_START_NR,
	END_OF_SPACE_NR,
};

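/*
 * The markers must be kept in ascending start_address order: note_page()
 * advances to the next marker as soon as the walk address crosses
 * marker[1].start_address.  This ordering is also why LDT_NR is declared in
 * one of two possible slots in the enum above; the LDT remap area sits at a
 * different point in the layout depending on CONFIG_X86_5LEVEL.
 */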
static struct addr_marker address_markers[] = {
	[USER_SPACE_NR]		= { 0,			"User Space" },
	[KERNEL_SPACE_NR]	= { (1UL << 63),	"Kernel Space" },
	[LOW_KERNEL_NR]		= { 0UL,		"Low Kernel Mapping" },
	[VMALLOC_START_NR]	= { 0UL,		"vmalloc() Area" },
	[VMEMMAP_START_NR]	= { 0UL,		"Vmemmap" },
#ifdef CONFIG_KASAN
	/*
	 * These fields get initialized with the (dynamic)
	 * KASAN_SHADOW_{START,END} values in pt_dump_init().
	 */
	[KASAN_SHADOW_START_NR]	= { 0UL,		"KASAN shadow" },
	[KASAN_SHADOW_END_NR]	= { 0UL,		"KASAN shadow end" },
#endif
#ifdef CONFIG_MODIFY_LDT_SYSCALL
	[LDT_NR]		= { 0UL,		"LDT remap" },
#endif
	[CPU_ENTRY_AREA_NR]	= { CPU_ENTRY_AREA_BASE, "CPU entry Area" },
#ifdef CONFIG_X86_ESPFIX64
	[ESPFIX_START_NR]	= { ESPFIX_BASE_ADDR,	"ESPfix Area", 16 },
#endif
#ifdef CONFIG_EFI
	[EFI_END_NR]		= { EFI_VA_END,		"EFI Runtime Services" },
#endif
	[HIGH_KERNEL_NR]	= { __START_KERNEL_map,	"High Kernel Mapping" },
	[MODULES_VADDR_NR]	= { MODULES_VADDR,	"Modules" },
	[MODULES_END_NR]	= { MODULES_END,	"End Modules" },
	[FIXADDR_START_NR]	= { FIXADDR_START,	"Fixmap Area" },
	[END_OF_SPACE_NR]	= { -1, NULL }
};

#else /* CONFIG_X86_64 */

enum address_markers_idx {
	USER_SPACE_NR = 0,
	KERNEL_SPACE_NR,
	VMALLOC_START_NR,
	VMALLOC_END_NR,
#ifdef CONFIG_HIGHMEM
	PKMAP_BASE_NR,
#endif
	CPU_ENTRY_AREA_NR,
	FIXADDR_START_NR,
	END_OF_SPACE_NR,
};

static struct addr_marker address_markers[] = {
	[USER_SPACE_NR]		= { 0,			"User Space" },
	[KERNEL_SPACE_NR]	= { PAGE_OFFSET,	"Kernel Mapping" },
	[VMALLOC_START_NR]	= { 0UL,		"vmalloc() Area" },
	[VMALLOC_END_NR]	= { 0UL,		"vmalloc() End" },
#ifdef CONFIG_HIGHMEM
	[PKMAP_BASE_NR]		= { 0UL,		"Persistent kmap() Area" },
#endif
	[CPU_ENTRY_AREA_NR]	= { 0UL,		"CPU entry area" },
	[FIXADDR_START_NR]	= { 0UL,		"Fixmap area" },
	[END_OF_SPACE_NR]	= { -1, NULL }
};

#endif /* !CONFIG_X86_64 */

/* Multipliers for offsets within the PTEs */
#define PTE_LEVEL_MULT (PAGE_SIZE)
#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
#define P4D_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
#define PGD_LEVEL_MULT (PTRS_PER_P4D * P4D_LEVEL_MULT)

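/*
 * With 4 KiB pages on x86-64, the multipliers above mean one entry covers
 * 4 KiB at the PTE level, 2 MiB at the PMD level, 1 GiB at the PUD level and
 * 512 GiB at the P4D level.  A PGD entry covers 512 GiB with 4-level paging
 * (PTRS_PER_P4D == 1, the p4d level is folded) and 256 TiB with 5-level
 * paging.
 */
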
#define pt_dump_seq_printf(m, to_dmesg, fmt, args...)		\
({								\
	if (to_dmesg)						\
		printk(KERN_INFO fmt, ##args);			\
	else							\
		if (m)						\
			seq_printf(m, fmt, ##args);		\
})

#define pt_dump_cont_printf(m, to_dmesg, fmt, args...)		\
({								\
	if (to_dmesg)						\
		printk(KERN_CONT fmt, ##args);			\
	else							\
		if (m)						\
			seq_printf(m, fmt, ##args);		\
})

/*
 * Print a readable form of a pgprot_t to the seq_file
 */
static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
{
	pgprotval_t pr = pgprot_val(prot);
	static const char * const level_name[] =
		{ "cr3", "pgd", "p4d", "pud", "pmd", "pte" };

	if (!(pr & _PAGE_PRESENT)) {
		/* Not present */
		pt_dump_cont_printf(m, dmsg, "                              ");
	} else {
		if (pr & _PAGE_USER)
			pt_dump_cont_printf(m, dmsg, "USR ");
		else
			pt_dump_cont_printf(m, dmsg, "    ");
		if (pr & _PAGE_RW)
			pt_dump_cont_printf(m, dmsg, "RW ");
		else
			pt_dump_cont_printf(m, dmsg, "ro ");
		if (pr & _PAGE_PWT)
			pt_dump_cont_printf(m, dmsg, "PWT ");
		else
			pt_dump_cont_printf(m, dmsg, "    ");
		if (pr & _PAGE_PCD)
			pt_dump_cont_printf(m, dmsg, "PCD ");
		else
			pt_dump_cont_printf(m, dmsg, "    ");

		/* Bit 7 has a different meaning on level 3 vs 4 */
		if (level <= 4 && pr & _PAGE_PSE)
			pt_dump_cont_printf(m, dmsg, "PSE ");
		else
			pt_dump_cont_printf(m, dmsg, "    ");
		if ((level == 5 && pr & _PAGE_PAT) ||
		    ((level == 4 || level == 3) && pr & _PAGE_PAT_LARGE))
			pt_dump_cont_printf(m, dmsg, "PAT ");
		else
			pt_dump_cont_printf(m, dmsg, "    ");
		if (pr & _PAGE_GLOBAL)
			pt_dump_cont_printf(m, dmsg, "GLB ");
		else
			pt_dump_cont_printf(m, dmsg, "    ");
		if (pr & _PAGE_NX)
			pt_dump_cont_printf(m, dmsg, "NX ");
		else
			pt_dump_cont_printf(m, dmsg, "x  ");
	}
	pt_dump_cont_printf(m, dmsg, "%s\n", level_name[level]);
}

/*
 * On 64 bits, sign-extend the 48 bit address to 64 bit
 */
static unsigned long normalize_addr(unsigned long u)
{
	int shift;
	if (!IS_ENABLED(CONFIG_X86_64))
		return u;

	shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
	return (signed long)(u << shift) >> shift;
}

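/*
 * Worked example for normalize_addr(): with 4-level paging
 * __VIRTUAL_MASK_SHIFT is 47, so shift is 16 and 0x0000800000000000 (bit 47
 * set) is sign-extended to the canonical 0xffff800000000000.  With 5-level
 * paging the same happens at bit 56.
 */
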
/*
 * This function gets called on a break in a continuous series
 * of PTE entries; the next one is different so we need to
 * print what we collected so far.
 */
static void note_page(struct seq_file *m, struct pg_state *st,
		      pgprot_t new_prot, pgprotval_t new_eff, int level)
{
	pgprotval_t prot, cur, eff;
	static const char units[] = "BKMGTPE";

	/*
	 * If we have a "break" in the series, we need to flush the state that
	 * we have now. "break" is either changing perms, levels or
	 * address space marker.
	 */
	prot = pgprot_val(new_prot);
	cur = pgprot_val(st->current_prot);
	eff = st->effective_prot;

	if (!st->level) {
		/* First entry */
		st->current_prot = new_prot;
		st->effective_prot = new_eff;
		st->level = level;
		st->marker = address_markers;
		st->lines = 0;
		pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
				   st->marker->name);
	} else if (prot != cur || new_eff != eff || level != st->level ||
		   st->current_address >= st->marker[1].start_address) {
		const char *unit = units;
		unsigned long delta;
		int width = sizeof(unsigned long) * 2;

		if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX)) {
			WARN_ONCE(1,
				  "x86/mm: Found insecure W+X mapping at address %p/%pS\n",
				  (void *)st->start_address,
				  (void *)st->start_address);
			st->wx_pages += (st->current_address -
					 st->start_address) / PAGE_SIZE;
		}

		/*
		 * Now print the actual finished series
		 */
		if (!st->marker->max_lines ||
		    st->lines < st->marker->max_lines) {
			pt_dump_seq_printf(m, st->to_dmesg,
					   "0x%0*lx-0x%0*lx ",
					   width, st->start_address,
					   width, st->current_address);

			delta = st->current_address - st->start_address;
			while (!(delta & 1023) && unit[1]) {
				delta >>= 10;
				unit++;
			}
			pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ",
					    delta, *unit);
			printk_prot(m, st->current_prot, st->level,
				    st->to_dmesg);
		}
		st->lines++;

		/*
		 * We print markers for special areas of address space,
		 * such as the start of vmalloc space etc.
		 * This helps in the interpretation.
		 */
		if (st->current_address >= st->marker[1].start_address) {
			if (st->marker->max_lines &&
			    st->lines > st->marker->max_lines) {
				unsigned long nskip =
					st->lines - st->marker->max_lines;
				pt_dump_seq_printf(m, st->to_dmesg,
						   "... %lu entr%s skipped ... \n",
						   nskip,
						   nskip == 1 ? "y" : "ies");
			}
			st->marker++;
			st->lines = 0;
			pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
					   st->marker->name);
		}

		st->start_address = st->current_address;
		st->current_prot = new_prot;
		st->effective_prot = new_eff;
		st->level = level;
	}
}

static inline pgprotval_t effective_prot(pgprotval_t prot1, pgprotval_t prot2)
{
	return (prot1 & prot2 & (_PAGE_USER | _PAGE_RW)) |
	       ((prot1 | prot2) & _PAGE_NX);
}

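/*
 * effective_prot() accumulates permissions across page-table levels: USER and
 * RW are only effective if every level grants them, while NX takes effect if
 * any level sets it.  The W+X check in note_page() tests this accumulated
 * value rather than the leaf entry alone.
 */
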
static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,
			   pgprotval_t eff_in, unsigned long P)
{
	int i;
	pte_t *start;
	pgprotval_t prot, eff;

	start = (pte_t *)pmd_page_vaddr(addr);
	for (i = 0; i < PTRS_PER_PTE; i++) {
		prot = pte_flags(*start);
		eff = effective_prot(eff_in, prot);
		st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT);
		note_page(m, st, __pgprot(prot), eff, 5);
		start++;
	}
}

#ifdef CONFIG_KASAN

/*
 * This is an optimization for KASAN=y case. Since all kasan page tables
 * eventually point to the kasan_zero_page we could call note_page()
 * right away without walking through lower level page tables. This saves
 * us dozens of seconds (minutes for 5-level config) while checking for
 * W+X mapping or reading kernel_page_tables debugfs file.
 */
static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st,
				    void *pt)
{
	if (__pa(pt) == __pa(kasan_zero_pmd) ||
	    (pgtable_l5_enabled && __pa(pt) == __pa(kasan_zero_p4d)) ||
	    __pa(pt) == __pa(kasan_zero_pud)) {
		pgprotval_t prot = pte_flags(kasan_zero_pte[0]);
		note_page(m, st, __pgprot(prot), 0, 5);
		return true;
	}
	return false;
}
#else
static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st,
				    void *pt)
{
	return false;
}
#endif

#if PTRS_PER_PMD > 1

static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr,
			   pgprotval_t eff_in, unsigned long P)
{
	int i;
	pmd_t *start, *pmd_start;
	pgprotval_t prot, eff;

	pmd_start = start = (pmd_t *)pud_page_vaddr(addr);
	for (i = 0; i < PTRS_PER_PMD; i++) {
		st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
		if (!pmd_none(*start)) {
			prot = pmd_flags(*start);
			eff = effective_prot(eff_in, prot);
			if (pmd_large(*start) || !pmd_present(*start)) {
				note_page(m, st, __pgprot(prot), eff, 4);
			} else if (!kasan_page_table(m, st, pmd_start)) {
				walk_pte_level(m, st, *start, eff,
					       P + i * PMD_LEVEL_MULT);
			}
		} else
			note_page(m, st, __pgprot(0), 0, 4);
		start++;
	}
}

#else
#define walk_pmd_level(m,s,a,e,p) walk_pte_level(m,s,__pmd(pud_val(a)),e,p)
#define pud_large(a) pmd_large(__pmd(pud_val(a)))
#define pud_none(a)  pmd_none(__pmd(pud_val(a)))
#endif

#if PTRS_PER_PUD > 1

static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr,
			   pgprotval_t eff_in, unsigned long P)
{
	int i;
	pud_t *start, *pud_start;
	pgprotval_t prot, eff;
	pud_t *prev_pud = NULL;

	pud_start = start = (pud_t *)p4d_page_vaddr(addr);

	for (i = 0; i < PTRS_PER_PUD; i++) {
		st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
		if (!pud_none(*start)) {
			prot = pud_flags(*start);
			eff = effective_prot(eff_in, prot);
			if (pud_large(*start) || !pud_present(*start)) {
				note_page(m, st, __pgprot(prot), eff, 3);
			} else if (!kasan_page_table(m, st, pud_start)) {
				walk_pmd_level(m, st, *start, eff,
					       P + i * PUD_LEVEL_MULT);
			}
		} else
			note_page(m, st, __pgprot(0), 0, 3);

		prev_pud = start;
		start++;
	}
}

#else
#define walk_pud_level(m,s,a,e,p) walk_pmd_level(m,s,__pud(p4d_val(a)),e,p)
#define p4d_large(a) pud_large(__pud(p4d_val(a)))
#define p4d_none(a)  pud_none(__pud(p4d_val(a)))
#endif

static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
			   pgprotval_t eff_in, unsigned long P)
{
	int i;
	p4d_t *start, *p4d_start;
	pgprotval_t prot, eff;

	if (PTRS_PER_P4D == 1)
		return walk_pud_level(m, st, __p4d(pgd_val(addr)), eff_in, P);

	p4d_start = start = (p4d_t *)pgd_page_vaddr(addr);

	for (i = 0; i < PTRS_PER_P4D; i++) {
		st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT);
		if (!p4d_none(*start)) {
			prot = p4d_flags(*start);
			eff = effective_prot(eff_in, prot);
			if (p4d_large(*start) || !p4d_present(*start)) {
				note_page(m, st, __pgprot(prot), eff, 2);
			} else if (!kasan_page_table(m, st, p4d_start)) {
				walk_pud_level(m, st, *start, eff,
					       P + i * P4D_LEVEL_MULT);
			}
		} else
			note_page(m, st, __pgprot(0), 0, 2);

		start++;
	}
}

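/*
 * With boot-time 5-level switching, pgtable_l5_enabled decides at run time
 * whether the p4d level is folded into the pgd.  The wrappers below therefore
 * fall back to the p4d helpers when 5-level paging is disabled, so that a
 * folded pgd entry is interpreted correctly.
 */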
#define pgd_large(a) (pgtable_l5_enabled ? pgd_large(a) : p4d_large(__p4d(pgd_val(a))))
#define pgd_none(a)  (pgtable_l5_enabled ? pgd_none(a) : p4d_none(__p4d(pgd_val(a))))

static inline bool is_hypervisor_range(int idx)
{
#ifdef CONFIG_X86_64
	/*
	 * ffff800000000000 - ffff87ffffffffff is reserved for
	 * the hypervisor.
	 */
	return	(idx >= pgd_index(__PAGE_OFFSET) - 16) &&
		(idx <  pgd_index(__PAGE_OFFSET));
#else
	return false;
#endif
}

static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
				       bool checkwx, bool dmesg)
{
#ifdef CONFIG_X86_64
	pgd_t *start = (pgd_t *) &init_top_pgt;
#else
	pgd_t *start = swapper_pg_dir;
#endif
	pgprotval_t prot, eff;
	int i;
	struct pg_state st = {};

	if (pgd) {
		start = pgd;
		st.to_dmesg = dmesg;
	}

	st.check_wx = checkwx;
	if (checkwx)
		st.wx_pages = 0;

	for (i = 0; i < PTRS_PER_PGD; i++) {
		st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
		if (!pgd_none(*start) && !is_hypervisor_range(i)) {
			prot = pgd_flags(*start);
#ifdef CONFIG_X86_PAE
			eff = _PAGE_USER | _PAGE_RW;
#else
			eff = prot;
#endif
			if (pgd_large(*start) || !pgd_present(*start)) {
				note_page(m, &st, __pgprot(prot), eff, 1);
			} else {
				walk_p4d_level(m, &st, *start, eff,
					       i * PGD_LEVEL_MULT);
			}
		} else
			note_page(m, &st, __pgprot(0), 0, 1);

		cond_resched();
		start++;
	}

	/* Flush out the last page */
	st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT);
	note_page(m, &st, __pgprot(0), 0, 0);
	if (!checkwx)
		return;
	if (st.wx_pages)
		pr_info("x86/mm: Checked W+X mappings: FAILED, %lu W+X pages found.\n",
			st.wx_pages);
	else
		pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n");
}

void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
{
	ptdump_walk_pgd_level_core(m, pgd, false, true);
}

void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
	if (user && static_cpu_has(X86_FEATURE_PTI))
		pgd = kernel_to_user_pgdp(pgd);
#endif
	ptdump_walk_pgd_level_core(m, pgd, false, false);
}
EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs);

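/*
 * Under page-table isolation, each PGD allocation holds both a kernel-mode
 * and a user-mode copy of the top-level table; kernel_to_user_pgdp() above
 * returns the user copy.  The helper below runs the same W+X check over that
 * user copy.
 */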
static void ptdump_walk_user_pgd_level_checkwx(void)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
	pgd_t *pgd = (pgd_t *) &init_top_pgt;

	if (!static_cpu_has(X86_FEATURE_PTI))
		return;

	pr_info("x86/mm: Checking user space page tables\n");
	pgd = kernel_to_user_pgdp(pgd);
	ptdump_walk_pgd_level_core(NULL, pgd, true, false);
#endif
}

void ptdump_walk_pgd_level_checkwx(void)
{
	ptdump_walk_pgd_level_core(NULL, NULL, true, false);
	ptdump_walk_user_pgd_level_checkwx();
}

static int __init pt_dump_init(void)
{
	/*
	 * Various markers are not compile-time constants, so assign them
	 * here.
	 */
#ifdef CONFIG_X86_64
	address_markers[LOW_KERNEL_NR].start_address = PAGE_OFFSET;
	address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
	address_markers[VMEMMAP_START_NR].start_address = VMEMMAP_START;
#ifdef CONFIG_MODIFY_LDT_SYSCALL
	address_markers[LDT_NR].start_address = LDT_BASE_ADDR;
#endif
#ifdef CONFIG_KASAN
	address_markers[KASAN_SHADOW_START_NR].start_address = KASAN_SHADOW_START;
	address_markers[KASAN_SHADOW_END_NR].start_address = KASAN_SHADOW_END;
#endif
#endif
#ifdef CONFIG_X86_32
	address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
	address_markers[VMALLOC_END_NR].start_address = VMALLOC_END;
# ifdef CONFIG_HIGHMEM
	address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE;
# endif
	address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
	address_markers[CPU_ENTRY_AREA_NR].start_address = CPU_ENTRY_AREA_BASE;
#endif
	return 0;
}
__initcall(pt_dump_init);
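
/*
 * Note: the ptdump_walk_pgd_level*() entry points above are consumed outside
 * this file.  The debugfs front end (typically arch/x86/mm/debug_pagetables.c,
 * hence the EXPORT_SYMBOL_GPL above) feeds ptdump_walk_pgd_level_debugfs(),
 * and with CONFIG_DEBUG_WX the W+X scan is normally triggered via
 * debug_checkwx() once the kernel marks its read-only data.
 */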