/*
 * Debug helper to dump the current kernel pagetables of the system
 * so that we can see what the various memory ranges are set to.
 *
 * (C) Copyright 2008 Intel Corporation
 *
 * Author: Arjan van de Ven <arjan@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; version 2
 * of the License.
 */

#include <linux/debugfs.h>
#include <linux/kasan.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/highmem.h>

#include <asm/pgtable.h>

/*
 * The dumper groups pagetable entries of the same type into one, and for
 * that it needs to keep some state when walking, and flush this state
 * when a "break" in the continuity is found.
 */
struct pg_state {
	int level;
	pgprot_t current_prot;
	pgprotval_t effective_prot;
	unsigned long start_address;
	unsigned long current_address;
	const struct addr_marker *marker;
	unsigned long lines;
	bool to_dmesg;
	bool check_wx;
	unsigned long wx_pages;
};

struct addr_marker {
	unsigned long start_address;
	const char *name;
	unsigned long max_lines;
};

/* Address space markers hints */

#ifdef CONFIG_X86_64

enum address_markers_idx {
	USER_SPACE_NR = 0,
	KERNEL_SPACE_NR,
	LOW_KERNEL_NR,
#if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL)
	LDT_NR,
#endif
	VMALLOC_START_NR,
	VMEMMAP_START_NR,
#ifdef CONFIG_KASAN
	KASAN_SHADOW_START_NR,
	KASAN_SHADOW_END_NR,
#endif
	CPU_ENTRY_AREA_NR,
#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL)
	LDT_NR,
#endif
#ifdef CONFIG_X86_ESPFIX64
	ESPFIX_START_NR,
#endif
#ifdef CONFIG_EFI
	EFI_END_NR,
#endif
	HIGH_KERNEL_NR,
	MODULES_VADDR_NR,
	MODULES_END_NR,
	FIXADDR_START_NR,
	END_OF_SPACE_NR,
};

static struct addr_marker address_markers[] = {
	[USER_SPACE_NR]		= { 0,			"User Space" },
	[KERNEL_SPACE_NR]	= { (1UL << 63),	"Kernel Space" },
	[LOW_KERNEL_NR]		= { 0UL,		"Low Kernel Mapping" },
	[VMALLOC_START_NR]	= { 0UL,		"vmalloc() Area" },
	[VMEMMAP_START_NR]	= { 0UL,		"Vmemmap" },
#ifdef CONFIG_KASAN
	/*
	 * These fields get initialized with the (dynamic)
	 * KASAN_SHADOW_{START,END} values in pt_dump_init().
	 */
	[KASAN_SHADOW_START_NR]	= { 0UL,		"KASAN shadow" },
	[KASAN_SHADOW_END_NR]	= { 0UL,		"KASAN shadow end" },
#endif
#ifdef CONFIG_MODIFY_LDT_SYSCALL
	[LDT_NR]		= { 0UL,		"LDT remap" },
#endif
	[CPU_ENTRY_AREA_NR]	= { CPU_ENTRY_AREA_BASE,"CPU entry Area" },
#ifdef CONFIG_X86_ESPFIX64
	[ESPFIX_START_NR]	= { ESPFIX_BASE_ADDR,	"ESPfix Area", 16 },
#endif
#ifdef CONFIG_EFI
	[EFI_END_NR]		= { EFI_VA_END,		"EFI Runtime Services" },
#endif
	[HIGH_KERNEL_NR]	= { __START_KERNEL_map,	"High Kernel Mapping" },
	[MODULES_VADDR_NR]	= { MODULES_VADDR,	"Modules" },
	[MODULES_END_NR]	= { MODULES_END,	"End Modules" },
	[FIXADDR_START_NR]	= { FIXADDR_START,	"Fixmap Area" },
	[END_OF_SPACE_NR]	= { -1,			NULL }
};

#else /* CONFIG_X86_64 */

enum address_markers_idx {
	USER_SPACE_NR = 0,
	KERNEL_SPACE_NR,
	VMALLOC_START_NR,
	VMALLOC_END_NR,
#ifdef CONFIG_HIGHMEM
	PKMAP_BASE_NR,
#endif
	CPU_ENTRY_AREA_NR,
	FIXADDR_START_NR,
	END_OF_SPACE_NR,
};

static struct addr_marker address_markers[] = {
	[USER_SPACE_NR]		= { 0,			"User Space" },
	[KERNEL_SPACE_NR]	= { PAGE_OFFSET,	"Kernel Mapping" },
	[VMALLOC_START_NR]	= { 0UL,		"vmalloc() Area" },
	[VMALLOC_END_NR]	= { 0UL,		"vmalloc() End" },
#ifdef CONFIG_HIGHMEM
	[PKMAP_BASE_NR]		= { 0UL,		"Persistent kmap() Area" },
#endif
	[CPU_ENTRY_AREA_NR]	= { 0UL,		"CPU entry area" },
	[FIXADDR_START_NR]	= { 0UL,		"Fixmap area" },
	[END_OF_SPACE_NR]	= { -1,			NULL }
};

#endif /* !CONFIG_X86_64 */

/* Multipliers for offsets within the PTEs */
#define PTE_LEVEL_MULT (PAGE_SIZE)
#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
#define P4D_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
#define PGD_LEVEL_MULT (PTRS_PER_P4D * P4D_LEVEL_MULT)

#define pt_dump_seq_printf(m, to_dmesg, fmt, args...)		\
({								\
	if (to_dmesg)						\
		printk(KERN_INFO fmt, ##args);			\
	else							\
		if (m)						\
			seq_printf(m, fmt, ##args);		\
})

#define pt_dump_cont_printf(m, to_dmesg, fmt, args...)		\
({								\
	if (to_dmesg)						\
		printk(KERN_CONT fmt, ##args);			\
	else							\
		if (m)						\
			seq_printf(m, fmt, ##args);		\
})

/*
 * Print a readable form of a pgprot_t to the seq_file
 */
static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
{
	pgprotval_t pr = pgprot_val(prot);
	static const char * const level_name[] =
		{ "cr3", "pgd", "p4d", "pud", "pmd", "pte" };

	if (!(pr & _PAGE_PRESENT)) {
		/* Not present */
		pt_dump_cont_printf(m, dmsg, "                              ");
	} else {
		if (pr & _PAGE_USER)
			pt_dump_cont_printf(m, dmsg, "USR ");
		else
			pt_dump_cont_printf(m, dmsg, "    ");
		if (pr & _PAGE_RW)
			pt_dump_cont_printf(m, dmsg, "RW ");
		else
			pt_dump_cont_printf(m, dmsg, "ro ");
		if (pr & _PAGE_PWT)
			pt_dump_cont_printf(m, dmsg, "PWT ");
		else
			pt_dump_cont_printf(m, dmsg, "    ");
		if (pr & _PAGE_PCD)
			pt_dump_cont_printf(m, dmsg, "PCD ");
		else
			pt_dump_cont_printf(m, dmsg, "    ");

		/* Bit 7 has a different meaning on level 3 vs 4 */
		if (level <= 4 && pr & _PAGE_PSE)
			pt_dump_cont_printf(m, dmsg, "PSE ");
		else
			pt_dump_cont_printf(m, dmsg, "    ");
		if ((level == 5 && pr & _PAGE_PAT) ||
		    ((level == 4 || level == 3) && pr & _PAGE_PAT_LARGE))
			pt_dump_cont_printf(m, dmsg, "PAT ");
		else
			pt_dump_cont_printf(m, dmsg, "    ");
		if (pr & _PAGE_GLOBAL)
			pt_dump_cont_printf(m, dmsg, "GLB ");
		else
			pt_dump_cont_printf(m, dmsg, "    ");
		if (pr & _PAGE_NX)
			pt_dump_cont_printf(m, dmsg, "NX ");
		else
			pt_dump_cont_printf(m, dmsg, "x  ");
	}
	pt_dump_cont_printf(m, dmsg, "%s\n", level_name[level]);
}

/*
 * On 64 bits, sign-extend the 48 bit address to 64 bit
 */
static unsigned long normalize_addr(unsigned long u)
{
	int shift;
	if (!IS_ENABLED(CONFIG_X86_64))
		return u;

	shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
	return (signed long)(u << shift) >> shift;
}

/*
 * This function gets called on a break in a continuous series
 * of PTE entries; the next one is different so we need to
 * print what we collected so far.
 */
static void note_page(struct seq_file *m, struct pg_state *st,
		      pgprot_t new_prot, pgprotval_t new_eff, int level)
{
	pgprotval_t prot, cur, eff;
	static const char units[] = "BKMGTPE";

	/*
	 * If we have a "break" in the series, we need to flush the state that
	 * we have now. "break" is either changing perms, levels or
	 * address space marker.
	 */
	prot = pgprot_val(new_prot);
	cur = pgprot_val(st->current_prot);
	eff = st->effective_prot;

	if (!st->level) {
		/* First entry */
		st->current_prot = new_prot;
		st->effective_prot = new_eff;
		st->level = level;
		st->marker = address_markers;
		st->lines = 0;
		pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
				   st->marker->name);
	} else if (prot != cur || new_eff != eff || level != st->level ||
		   st->current_address >= st->marker[1].start_address) {
		const char *unit = units;
		unsigned long delta;
		int width = sizeof(unsigned long) * 2;

		if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX)) {
			WARN_ONCE(1,
				  "x86/mm: Found insecure W+X mapping at address %p/%pS\n",
				  (void *)st->start_address,
				  (void *)st->start_address);
			st->wx_pages += (st->current_address -
					 st->start_address) / PAGE_SIZE;
		}

		/*
		 * Now print the actual finished series
		 */
		if (!st->marker->max_lines ||
		    st->lines < st->marker->max_lines) {
			pt_dump_seq_printf(m, st->to_dmesg,
					   "0x%0*lx-0x%0*lx   ",
					   width, st->start_address,
					   width, st->current_address);

			delta = st->current_address - st->start_address;
			while (!(delta & 1023) && unit[1]) {
				delta >>= 10;
				unit++;
			}
			pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ",
					    delta, *unit);
			printk_prot(m, st->current_prot, st->level,
				    st->to_dmesg);
		}
		st->lines++;

		/*
		 * We print markers for special areas of address space,
		 * such as the start of vmalloc space etc.
		 * This helps in the interpretation.
		 */
		if (st->current_address >= st->marker[1].start_address) {
			if (st->marker->max_lines &&
			    st->lines > st->marker->max_lines) {
				unsigned long nskip =
					st->lines - st->marker->max_lines;
				pt_dump_seq_printf(m, st->to_dmesg,
						   "... %lu entr%s skipped ... \n",
						   nskip,
						   nskip == 1 ? "y" : "ies");
			}
			st->marker++;
			st->lines = 0;
			pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
					   st->marker->name);
		}

		st->start_address = st->current_address;
		st->current_prot = new_prot;
		st->effective_prot = new_eff;
		st->level = level;
	}
}

static inline pgprotval_t effective_prot(pgprotval_t prot1, pgprotval_t prot2)
{
	return (prot1 & prot2 & (_PAGE_USER | _PAGE_RW)) |
	       ((prot1 | prot2) & _PAGE_NX);
}

static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,
			   pgprotval_t eff_in, unsigned long P)
{
	int i;
	pte_t *pte;
	pgprotval_t prot, eff;

	for (i = 0; i < PTRS_PER_PTE; i++) {
		st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT);
		pte = pte_offset_map(&addr, st->current_address);
		prot = pte_flags(*pte);
		eff = effective_prot(eff_in, prot);
		note_page(m, st, __pgprot(prot), eff, 5);
		pte_unmap(pte);
	}
}

#ifdef CONFIG_KASAN

/*
 * This is an optimization for KASAN=y case. Since all kasan page tables
 * eventually point to the kasan_zero_page we could call note_page()
 * right away without walking through lower level page tables. This saves
 * us dozens of seconds (minutes for 5-level config) while checking for
 * W+X mapping or reading kernel_page_tables debugfs file.
 */
static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st,
				    void *pt)
{
	if (__pa(pt) == __pa(kasan_zero_pmd) ||
	    (pgtable_l5_enabled && __pa(pt) == __pa(kasan_zero_p4d)) ||
	    __pa(pt) == __pa(kasan_zero_pud)) {
		pgprotval_t prot = pte_flags(kasan_zero_pte[0]);
		note_page(m, st, __pgprot(prot), 0, 5);
		return true;
	}
	return false;
}
#else
static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st,
				    void *pt)
{
	return false;
}
#endif

#if PTRS_PER_PMD > 1

static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr,
			   pgprotval_t eff_in, unsigned long P)
{
	int i;
	pmd_t *start, *pmd_start;
	pgprotval_t prot, eff;

	pmd_start = start = (pmd_t *)pud_page_vaddr(addr);
	for (i = 0; i < PTRS_PER_PMD; i++) {
		st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
		if (!pmd_none(*start)) {
			prot = pmd_flags(*start);
			eff = effective_prot(eff_in, prot);
			if (pmd_large(*start) || !pmd_present(*start)) {
				note_page(m, st, __pgprot(prot), eff, 4);
			} else if (!kasan_page_table(m, st, pmd_start)) {
				walk_pte_level(m, st, *start, eff,
					       P + i * PMD_LEVEL_MULT);
			}
		} else
			note_page(m, st, __pgprot(0), 0, 4);
		start++;
	}
}

#else
#define walk_pmd_level(m,s,a,e,p) walk_pte_level(m,s,__pmd(pud_val(a)),e,p)
#define pud_large(a) pmd_large(__pmd(pud_val(a)))
#define pud_none(a)  pmd_none(__pmd(pud_val(a)))
#endif

#if PTRS_PER_PUD > 1

static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr,
			   pgprotval_t eff_in, unsigned long P)
{
	int i;
	pud_t *start, *pud_start;
	pgprotval_t prot, eff;
	pud_t *prev_pud = NULL;

	pud_start = start = (pud_t *)p4d_page_vaddr(addr);

	for (i = 0; i < PTRS_PER_PUD; i++) {
		st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
		if (!pud_none(*start)) {
			prot = pud_flags(*start);
			eff = effective_prot(eff_in, prot);
			if (pud_large(*start) || !pud_present(*start)) {
				note_page(m, st, __pgprot(prot), eff, 3);
			} else if (!kasan_page_table(m, st, pud_start)) {
				walk_pmd_level(m, st, *start, eff,
					       P + i * PUD_LEVEL_MULT);
			}
		} else
			note_page(m, st, __pgprot(0), 0, 3);

		prev_pud = start;
		start++;
	}
}

#else
#define walk_pud_level(m,s,a,e,p) walk_pmd_level(m,s,__pud(p4d_val(a)),e,p)
#define p4d_large(a) pud_large(__pud(p4d_val(a)))
#define p4d_none(a)  pud_none(__pud(p4d_val(a)))
#endif

static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
			   pgprotval_t eff_in, unsigned long P)
{
	int i;
	p4d_t *start, *p4d_start;
	pgprotval_t prot, eff;

	if (PTRS_PER_P4D == 1)
		return walk_pud_level(m, st, __p4d(pgd_val(addr)), eff_in, P);

	p4d_start = start = (p4d_t *)pgd_page_vaddr(addr);

	for (i = 0; i < PTRS_PER_P4D; i++) {
		st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT);
		if (!p4d_none(*start)) {
			prot = p4d_flags(*start);
			eff = effective_prot(eff_in, prot);
			if (p4d_large(*start) || !p4d_present(*start)) {
				note_page(m, st, __pgprot(prot), eff, 2);
			} else if (!kasan_page_table(m, st, p4d_start)) {
				walk_pud_level(m, st, *start, eff,
					       P + i * P4D_LEVEL_MULT);
			}
		} else
			note_page(m, st, __pgprot(0), 0, 2);

		start++;
	}
}

#define pgd_large(a) (pgtable_l5_enabled ? pgd_large(a) : p4d_large(__p4d(pgd_val(a))))
#define pgd_none(a)  (pgtable_l5_enabled ? pgd_none(a) : p4d_none(__p4d(pgd_val(a))))

static inline bool is_hypervisor_range(int idx)
{
#ifdef CONFIG_X86_64
	/*
	 * ffff800000000000 - ffff87ffffffffff is reserved for
	 * the hypervisor.
	 */
	return (idx >= pgd_index(__PAGE_OFFSET) - 16) &&
	       (idx <  pgd_index(__PAGE_OFFSET));
#else
	return false;
#endif
}

static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
				       bool checkwx, bool dmesg)
{
#ifdef CONFIG_X86_64
	pgd_t *start = (pgd_t *) &init_top_pgt;
#else
	pgd_t *start = swapper_pg_dir;
#endif
	pgprotval_t prot, eff;
	int i;
	struct pg_state st = {};

	if (pgd) {
		start = pgd;
		st.to_dmesg = dmesg;
	}

	st.check_wx = checkwx;
	if (checkwx)
		st.wx_pages = 0;

	for (i = 0; i < PTRS_PER_PGD; i++) {
		st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
		if (!pgd_none(*start) && !is_hypervisor_range(i)) {
			prot = pgd_flags(*start);
#ifdef CONFIG_X86_PAE
			eff = _PAGE_USER | _PAGE_RW;
#else
			eff = prot;
#endif
			if (pgd_large(*start) || !pgd_present(*start)) {
				note_page(m, &st, __pgprot(prot), eff, 1);
			} else {
				walk_p4d_level(m, &st, *start, eff,
					       i * PGD_LEVEL_MULT);
			}
		} else
			note_page(m, &st, __pgprot(0), 0, 1);

		cond_resched();
		start++;
	}

	/* Flush out the last page */
	st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT);
	note_page(m, &st, __pgprot(0), 0, 0);
	if (!checkwx)
		return;
	if (st.wx_pages)
		pr_info("x86/mm: Checked W+X mappings: FAILED, %lu W+X pages found.\n",
			st.wx_pages);
	else
		pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n");
}

void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
{
	ptdump_walk_pgd_level_core(m, pgd, false, true);
}

void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
	if (user && static_cpu_has(X86_FEATURE_PTI))
		pgd = kernel_to_user_pgdp(pgd);
#endif
	ptdump_walk_pgd_level_core(m, pgd, false, false);
}
EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs);

static void ptdump_walk_user_pgd_level_checkwx(void)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
	pgd_t *pgd = (pgd_t *) &init_top_pgt;

	if (!static_cpu_has(X86_FEATURE_PTI))
		return;

	pr_info("x86/mm: Checking user space page tables\n");
	pgd = kernel_to_user_pgdp(pgd);
	ptdump_walk_pgd_level_core(NULL, pgd, true, false);
#endif
}

void ptdump_walk_pgd_level_checkwx(void)
{
	ptdump_walk_pgd_level_core(NULL, NULL, true, false);
	ptdump_walk_user_pgd_level_checkwx();
}

static int __init pt_dump_init(void)
{
	/*
	 * Various markers are not compile-time constants, so assign them
	 * here.
	 */
#ifdef CONFIG_X86_64
	address_markers[LOW_KERNEL_NR].start_address = PAGE_OFFSET;
	address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
	address_markers[VMEMMAP_START_NR].start_address = VMEMMAP_START;
#ifdef CONFIG_MODIFY_LDT_SYSCALL
	address_markers[LDT_NR].start_address = LDT_BASE_ADDR;
#endif
#ifdef CONFIG_KASAN
	address_markers[KASAN_SHADOW_START_NR].start_address = KASAN_SHADOW_START;
	address_markers[KASAN_SHADOW_END_NR].start_address = KASAN_SHADOW_END;
#endif
#endif
#ifdef CONFIG_X86_32
	address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
	address_markers[VMALLOC_END_NR].start_address = VMALLOC_END;
# ifdef CONFIG_HIGHMEM
	address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE;
# endif
	address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
	address_markers[CPU_ENTRY_AREA_NR].start_address = CPU_ENTRY_AREA_BASE;
#endif
	return 0;
}
__initcall(pt_dump_init);