1 /* 2 * Debug helper to dump the current kernel pagetables of the system 3 * so that we can see what the various memory ranges are set to. 4 * 5 * (C) Copyright 2008 Intel Corporation 6 * 7 * Author: Arjan van de Ven <arjan@linux.intel.com> 8 * 9 * This program is free software; you can redistribute it and/or 10 * modify it under the terms of the GNU General Public License 11 * as published by the Free Software Foundation; version 2 12 * of the License. 13 */ 14 15 #include <linux/debugfs.h> 16 #include <linux/mm.h> 17 #include <linux/module.h> 18 #include <linux/seq_file.h> 19 20 #include <asm/pgtable.h> 21 22 /* 23 * The dumper groups pagetable entries of the same type into one, and for 24 * that it needs to keep some state when walking, and flush this state 25 * when a "break" in the continuity is found. 26 */ 27 struct pg_state { 28 int level; 29 pgprot_t current_prot; 30 unsigned long start_address; 31 unsigned long current_address; 32 const struct addr_marker *marker; 33 unsigned long lines; 34 bool to_dmesg; 35 bool check_wx; 36 unsigned long wx_pages; 37 }; 38 39 struct addr_marker { 40 unsigned long start_address; 41 const char *name; 42 unsigned long max_lines; 43 }; 44 45 /* indices for address_markers; keep sync'd w/ address_markers below */ 46 enum address_markers_idx { 47 USER_SPACE_NR = 0, 48 #ifdef CONFIG_X86_64 49 KERNEL_SPACE_NR, 50 LOW_KERNEL_NR, 51 VMALLOC_START_NR, 52 VMEMMAP_START_NR, 53 # ifdef CONFIG_X86_ESPFIX64 54 ESPFIX_START_NR, 55 # endif 56 HIGH_KERNEL_NR, 57 MODULES_VADDR_NR, 58 MODULES_END_NR, 59 #else 60 KERNEL_SPACE_NR, 61 VMALLOC_START_NR, 62 VMALLOC_END_NR, 63 # ifdef CONFIG_HIGHMEM 64 PKMAP_BASE_NR, 65 # endif 66 FIXADDR_START_NR, 67 #endif 68 }; 69 70 /* Address space markers hints */ 71 static struct addr_marker address_markers[] = { 72 { 0, "User Space" }, 73 #ifdef CONFIG_X86_64 74 { 0x8000000000000000UL, "Kernel Space" }, 75 { PAGE_OFFSET, "Low Kernel Mapping" }, 76 { VMALLOC_START, "vmalloc() Area" }, 77 { VMEMMAP_START, "Vmemmap" }, 78 # ifdef CONFIG_X86_ESPFIX64 79 { ESPFIX_BASE_ADDR, "ESPfix Area", 16 }, 80 # endif 81 # ifdef CONFIG_EFI 82 { EFI_VA_END, "EFI Runtime Services" }, 83 # endif 84 { __START_KERNEL_map, "High Kernel Mapping" }, 85 { MODULES_VADDR, "Modules" }, 86 { MODULES_END, "End Modules" }, 87 #else 88 { PAGE_OFFSET, "Kernel Mapping" }, 89 { 0/* VMALLOC_START */, "vmalloc() Area" }, 90 { 0/*VMALLOC_END*/, "vmalloc() End" }, 91 # ifdef CONFIG_HIGHMEM 92 { 0/*PKMAP_BASE*/, "Persisent kmap() Area" }, 93 # endif 94 { 0/*FIXADDR_START*/, "Fixmap Area" }, 95 #endif 96 { -1, NULL } /* End of list */ 97 }; 98 99 /* Multipliers for offsets within the PTEs */ 100 #define PTE_LEVEL_MULT (PAGE_SIZE) 101 #define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT) 102 #define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT) 103 #define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT) 104 105 #define pt_dump_seq_printf(m, to_dmesg, fmt, args...) \ 106 ({ \ 107 if (to_dmesg) \ 108 printk(KERN_INFO fmt, ##args); \ 109 else \ 110 if (m) \ 111 seq_printf(m, fmt, ##args); \ 112 }) 113 114 #define pt_dump_cont_printf(m, to_dmesg, fmt, args...) \ 115 ({ \ 116 if (to_dmesg) \ 117 printk(KERN_CONT fmt, ##args); \ 118 else \ 119 if (m) \ 120 seq_printf(m, fmt, ##args); \ 121 }) 122 123 /* 124 * Print a readable form of a pgprot_t to the seq_file 125 */ 126 static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg) 127 { 128 pgprotval_t pr = pgprot_val(prot); 129 static const char * const level_name[] = 130 { "cr3", "pgd", "pud", "pmd", "pte" }; 131 132 if (!pgprot_val(prot)) { 133 /* Not present */ 134 pt_dump_cont_printf(m, dmsg, " "); 135 } else { 136 if (pr & _PAGE_USER) 137 pt_dump_cont_printf(m, dmsg, "USR "); 138 else 139 pt_dump_cont_printf(m, dmsg, " "); 140 if (pr & _PAGE_RW) 141 pt_dump_cont_printf(m, dmsg, "RW "); 142 else 143 pt_dump_cont_printf(m, dmsg, "ro "); 144 if (pr & _PAGE_PWT) 145 pt_dump_cont_printf(m, dmsg, "PWT "); 146 else 147 pt_dump_cont_printf(m, dmsg, " "); 148 if (pr & _PAGE_PCD) 149 pt_dump_cont_printf(m, dmsg, "PCD "); 150 else 151 pt_dump_cont_printf(m, dmsg, " "); 152 153 /* Bit 7 has a different meaning on level 3 vs 4 */ 154 if (level <= 3 && pr & _PAGE_PSE) 155 pt_dump_cont_printf(m, dmsg, "PSE "); 156 else 157 pt_dump_cont_printf(m, dmsg, " "); 158 if ((level == 4 && pr & _PAGE_PAT) || 159 ((level == 3 || level == 2) && pr & _PAGE_PAT_LARGE)) 160 pt_dump_cont_printf(m, dmsg, "PAT "); 161 else 162 pt_dump_cont_printf(m, dmsg, " "); 163 if (pr & _PAGE_GLOBAL) 164 pt_dump_cont_printf(m, dmsg, "GLB "); 165 else 166 pt_dump_cont_printf(m, dmsg, " "); 167 if (pr & _PAGE_NX) 168 pt_dump_cont_printf(m, dmsg, "NX "); 169 else 170 pt_dump_cont_printf(m, dmsg, "x "); 171 } 172 pt_dump_cont_printf(m, dmsg, "%s\n", level_name[level]); 173 } 174 175 /* 176 * On 64 bits, sign-extend the 48 bit address to 64 bit 177 */ 178 static unsigned long normalize_addr(unsigned long u) 179 { 180 #ifdef CONFIG_X86_64 181 return (signed long)(u << 16) >> 16; 182 #else 183 return u; 184 #endif 185 } 186 187 /* 188 * This function gets called on a break in a continuous series 189 * of PTE entries; the next one is different so we need to 190 * print what we collected so far. 191 */ 192 static void note_page(struct seq_file *m, struct pg_state *st, 193 pgprot_t new_prot, int level) 194 { 195 pgprotval_t prot, cur; 196 static const char units[] = "BKMGTPE"; 197 198 /* 199 * If we have a "break" in the series, we need to flush the state that 200 * we have now. "break" is either changing perms, levels or 201 * address space marker. 202 */ 203 prot = pgprot_val(new_prot); 204 cur = pgprot_val(st->current_prot); 205 206 if (!st->level) { 207 /* First entry */ 208 st->current_prot = new_prot; 209 st->level = level; 210 st->marker = address_markers; 211 st->lines = 0; 212 pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n", 213 st->marker->name); 214 } else if (prot != cur || level != st->level || 215 st->current_address >= st->marker[1].start_address) { 216 const char *unit = units; 217 unsigned long delta; 218 int width = sizeof(unsigned long) * 2; 219 pgprotval_t pr = pgprot_val(st->current_prot); 220 221 if (st->check_wx && (pr & _PAGE_RW) && !(pr & _PAGE_NX)) { 222 WARN_ONCE(1, 223 "x86/mm: Found insecure W+X mapping at address %p/%pS\n", 224 (void *)st->start_address, 225 (void *)st->start_address); 226 st->wx_pages += (st->current_address - 227 st->start_address) / PAGE_SIZE; 228 } 229 230 /* 231 * Now print the actual finished series 232 */ 233 if (!st->marker->max_lines || 234 st->lines < st->marker->max_lines) { 235 pt_dump_seq_printf(m, st->to_dmesg, 236 "0x%0*lx-0x%0*lx ", 237 width, st->start_address, 238 width, st->current_address); 239 240 delta = st->current_address - st->start_address; 241 while (!(delta & 1023) && unit[1]) { 242 delta >>= 10; 243 unit++; 244 } 245 pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ", 246 delta, *unit); 247 printk_prot(m, st->current_prot, st->level, 248 st->to_dmesg); 249 } 250 st->lines++; 251 252 /* 253 * We print markers for special areas of address space, 254 * such as the start of vmalloc space etc. 255 * This helps in the interpretation. 256 */ 257 if (st->current_address >= st->marker[1].start_address) { 258 if (st->marker->max_lines && 259 st->lines > st->marker->max_lines) { 260 unsigned long nskip = 261 st->lines - st->marker->max_lines; 262 pt_dump_seq_printf(m, st->to_dmesg, 263 "... %lu entr%s skipped ... \n", 264 nskip, 265 nskip == 1 ? "y" : "ies"); 266 } 267 st->marker++; 268 st->lines = 0; 269 pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n", 270 st->marker->name); 271 } 272 273 st->start_address = st->current_address; 274 st->current_prot = new_prot; 275 st->level = level; 276 } 277 } 278 279 static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, 280 unsigned long P) 281 { 282 int i; 283 pte_t *start; 284 pgprotval_t prot; 285 286 start = (pte_t *) pmd_page_vaddr(addr); 287 for (i = 0; i < PTRS_PER_PTE; i++) { 288 prot = pte_flags(*start); 289 st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT); 290 note_page(m, st, __pgprot(prot), 4); 291 start++; 292 } 293 } 294 295 #if PTRS_PER_PMD > 1 296 297 static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, 298 unsigned long P) 299 { 300 int i; 301 pmd_t *start; 302 pgprotval_t prot; 303 304 start = (pmd_t *) pud_page_vaddr(addr); 305 for (i = 0; i < PTRS_PER_PMD; i++) { 306 st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT); 307 if (!pmd_none(*start)) { 308 if (pmd_large(*start) || !pmd_present(*start)) { 309 prot = pmd_flags(*start); 310 note_page(m, st, __pgprot(prot), 3); 311 } else { 312 walk_pte_level(m, st, *start, 313 P + i * PMD_LEVEL_MULT); 314 } 315 } else 316 note_page(m, st, __pgprot(0), 3); 317 start++; 318 } 319 } 320 321 #else 322 #define walk_pmd_level(m,s,a,p) walk_pte_level(m,s,__pmd(pud_val(a)),p) 323 #define pud_large(a) pmd_large(__pmd(pud_val(a))) 324 #define pud_none(a) pmd_none(__pmd(pud_val(a))) 325 #endif 326 327 #if PTRS_PER_PUD > 1 328 329 static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr, 330 unsigned long P) 331 { 332 int i; 333 pud_t *start; 334 pgprotval_t prot; 335 336 start = (pud_t *) pgd_page_vaddr(addr); 337 338 for (i = 0; i < PTRS_PER_PUD; i++) { 339 st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); 340 if (!pud_none(*start)) { 341 if (pud_large(*start) || !pud_present(*start)) { 342 prot = pud_flags(*start); 343 note_page(m, st, __pgprot(prot), 2); 344 } else { 345 walk_pmd_level(m, st, *start, 346 P + i * PUD_LEVEL_MULT); 347 } 348 } else 349 note_page(m, st, __pgprot(0), 2); 350 351 start++; 352 } 353 } 354 355 #else 356 #define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(pgd_val(a)),p) 357 #define pgd_large(a) pud_large(__pud(pgd_val(a))) 358 #define pgd_none(a) pud_none(__pud(pgd_val(a))) 359 #endif 360 361 #ifdef CONFIG_X86_64 362 static inline bool is_hypervisor_range(int idx) 363 { 364 /* 365 * ffff800000000000 - ffff87ffffffffff is reserved for 366 * the hypervisor. 367 */ 368 return paravirt_enabled() && 369 (idx >= pgd_index(__PAGE_OFFSET) - 16) && 370 (idx < pgd_index(__PAGE_OFFSET)); 371 } 372 #else 373 static inline bool is_hypervisor_range(int idx) { return false; } 374 #endif 375 376 static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, 377 bool checkwx) 378 { 379 #ifdef CONFIG_X86_64 380 pgd_t *start = (pgd_t *) &init_level4_pgt; 381 #else 382 pgd_t *start = swapper_pg_dir; 383 #endif 384 pgprotval_t prot; 385 int i; 386 struct pg_state st = {}; 387 388 if (pgd) { 389 start = pgd; 390 st.to_dmesg = true; 391 } 392 393 st.check_wx = checkwx; 394 if (checkwx) 395 st.wx_pages = 0; 396 397 for (i = 0; i < PTRS_PER_PGD; i++) { 398 st.current_address = normalize_addr(i * PGD_LEVEL_MULT); 399 if (!pgd_none(*start) && !is_hypervisor_range(i)) { 400 if (pgd_large(*start) || !pgd_present(*start)) { 401 prot = pgd_flags(*start); 402 note_page(m, &st, __pgprot(prot), 1); 403 } else { 404 walk_pud_level(m, &st, *start, 405 i * PGD_LEVEL_MULT); 406 } 407 } else 408 note_page(m, &st, __pgprot(0), 1); 409 410 start++; 411 } 412 413 /* Flush out the last page */ 414 st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT); 415 note_page(m, &st, __pgprot(0), 0); 416 if (!checkwx) 417 return; 418 if (st.wx_pages) 419 pr_info("x86/mm: Checked W+X mappings: FAILED, %lu W+X pages found.\n", 420 st.wx_pages); 421 else 422 pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n"); 423 } 424 425 void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) 426 { 427 ptdump_walk_pgd_level_core(m, pgd, false); 428 } 429 430 void ptdump_walk_pgd_level_checkwx(void) 431 { 432 ptdump_walk_pgd_level_core(NULL, NULL, true); 433 } 434 435 #ifdef CONFIG_X86_PTDUMP 436 static int ptdump_show(struct seq_file *m, void *v) 437 { 438 ptdump_walk_pgd_level(m, NULL); 439 return 0; 440 } 441 442 static int ptdump_open(struct inode *inode, struct file *filp) 443 { 444 return single_open(filp, ptdump_show, NULL); 445 } 446 447 static const struct file_operations ptdump_fops = { 448 .open = ptdump_open, 449 .read = seq_read, 450 .llseek = seq_lseek, 451 .release = single_release, 452 }; 453 #endif 454 455 static int pt_dump_init(void) 456 { 457 #ifdef CONFIG_X86_PTDUMP 458 struct dentry *pe; 459 #endif 460 461 #ifdef CONFIG_X86_32 462 /* Not a compile-time constant on x86-32 */ 463 address_markers[VMALLOC_START_NR].start_address = VMALLOC_START; 464 address_markers[VMALLOC_END_NR].start_address = VMALLOC_END; 465 # ifdef CONFIG_HIGHMEM 466 address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE; 467 # endif 468 address_markers[FIXADDR_START_NR].start_address = FIXADDR_START; 469 #endif 470 471 #ifdef CONFIG_X86_PTDUMP 472 pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL, 473 &ptdump_fops); 474 if (!pe) 475 return -ENOMEM; 476 #endif 477 478 return 0; 479 } 480 481 __initcall(pt_dump_init); 482 MODULE_LICENSE("GPL"); 483 MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>"); 484 MODULE_DESCRIPTION("Kernel debugging helper that dumps pagetables"); 485