// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2016, Rashmica Gupta, IBM Corp.
 *
 * This traverses the kernel pagetables and dumps information
 * about the used sections of memory to
 * /sys/kernel/debug/kernel_page_tables.
 *
 * Derived from the arm64 implementation:
 * Copyright (c) 2014, The Linux Foundation, Laura Abbott.
 * (C) Copyright 2008 Intel Corporation, Arjan van de Ven.
 */
#include <linux/debugfs.h>
#include <linux/fs.h>
#include <linux/hugetlb.h>
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <asm/fixmap.h>
#include <linux/const.h>
#include <asm/page.h>
#include <asm/pgalloc.h>
#include <asm/hugetlb.h>

#include <mm/mmu_decl.h>

#include "ptdump.h"

/*
 * To visualise what is happening,
 *
 *  - PTRS_PER_P** = how many entries there are in the corresponding P**
 *  - P**_SHIFT = how many bits of the address we use to index into the
 *    corresponding P**
 *  - P**_SIZE is how much memory we can access through the table - not the
 *    size of the table itself.
 * P**={PGD, PUD, PMD, PTE}
 *
 * Each entry of the PGD points to a PUD. Each entry of a PUD points to a
 * PMD. Each entry of a PMD points to a PTE. And every PTE points to a
 * page.
 *
 * In the case where there are only 3 levels, the PUD is folded into the
 * PGD: every PUD has only one entry which points to the PMD.
 *
 * The page dumper groups page table entries of the same type into a single
 * description. It uses pg_state to track the range information while
 * iterating over the PTE entries. When the continuity is broken it then
 * dumps out a description of the range, i.e. PTEs that are virtually
 * contiguous with the same PTE flags are chunked together. This makes it
 * clear how different areas of the kernel virtual memory are used.
 * See the worked example after the address_markers table below.
 */
struct pg_state {
	struct seq_file *seq;
	const struct addr_marker *marker;
	unsigned long start_address;
	unsigned long start_pa;
	unsigned long last_pa;
	unsigned long page_size;
	unsigned int level;
	u64 current_flags;
	bool check_wx;
	unsigned long wx_pages;
};

struct addr_marker {
	unsigned long start_address;
	const char *name;
};

static struct addr_marker address_markers[] = {
	{ 0,	"Start of kernel VM" },
	{ 0,	"vmalloc() Area" },
	{ 0,	"vmalloc() End" },
#ifdef CONFIG_PPC64
	{ 0,	"isa I/O start" },
	{ 0,	"isa I/O end" },
	{ 0,	"phb I/O start" },
	{ 0,	"phb I/O end" },
	{ 0,	"I/O remap start" },
	{ 0,	"I/O remap end" },
	{ 0,	"vmemmap start" },
#else
	{ 0,	"Early I/O remap start" },
	{ 0,	"Early I/O remap end" },
#ifdef CONFIG_HIGHMEM
	{ 0,	"Highmem PTEs start" },
	{ 0,	"Highmem PTEs end" },
#endif
	{ 0,	"Fixmap start" },
	{ 0,	"Fixmap end" },
#endif
#ifdef CONFIG_KASAN
	{ 0,	"kasan shadow mem start" },
	{ 0,	"kasan shadow mem end" },
#endif
	{ -1,	NULL },
};
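
/*
 * Worked example of the relationships described at the top of this file
 * (a sketch only; the real shift and size values come from the platform
 * headers). At each level, P**_SIZE == 1UL << P**_SHIFT, so one PMD
 * entry maps PMD_SIZE bytes and a fully populated PMD maps
 * PTRS_PER_PMD * PMD_SIZE bytes. The helper below is hypothetical and
 * not used by the dumper; it only spells out the index arithmetic that
 * pgd_index() and friends perform.
 */
static inline unsigned long example_pgd_slot(unsigned long addr)
{
	/* Bits above PGDIR_SHIFT select which PGD entry covers 'addr'. */
	return (addr >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1);
}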

#define pt_dump_seq_printf(m, fmt, args...)	\
({						\
	if (m)					\
		seq_printf(m, fmt, ##args);	\
})

#define pt_dump_seq_putc(m, c)		\
({					\
	if (m)				\
		seq_putc(m, c);		\
})

void pt_dump_size(struct seq_file *m, unsigned long size)
{
	static const char units[] = "KMGTPE";
	const char *unit = units;

	/* Work out the appropriate unit to use */
	while (!(size & 1023) && unit[1]) {
		size >>= 10;
		unit++;
	}
	pt_dump_seq_printf(m, "%9lu%c ", size, *unit);
}

static void dump_flag_info(struct pg_state *st, const struct flag_info *flag,
			   u64 pte, int num)
{
	unsigned int i;

	for (i = 0; i < num; i++, flag++) {
		const char *s = NULL;
		u64 val;

		/* flag not defined so don't check it */
		if (flag->mask == 0)
			continue;
		/* Some 'flags' are actually values */
		if (flag->is_val) {
			val = pte & flag->val;
			if (flag->shift)
				val = val >> flag->shift;
			pt_dump_seq_printf(st->seq, " %s:%llx", flag->set, val);
		} else {
			if ((pte & flag->mask) == flag->val)
				s = flag->set;
			else
				s = flag->clear;
			if (s)
				pt_dump_seq_printf(st->seq, " %s", s);
		}
		st->current_flags &= ~flag->mask;
	}
	if (st->current_flags != 0)
		pt_dump_seq_printf(st->seq, " unknown flags:%llx", st->current_flags);
}

static void dump_addr(struct pg_state *st, unsigned long addr)
{
	unsigned long delta;

#ifdef CONFIG_PPC64
#define REG	"0x%016lx"
#else
#define REG	"0x%08lx"
#endif

	pt_dump_seq_printf(st->seq, REG "-" REG " ", st->start_address, addr - 1);
	if (st->start_pa == st->last_pa && st->start_address + st->page_size != addr) {
		/* The whole range maps the same physical page */
		pt_dump_seq_printf(st->seq, "[" REG "]", st->start_pa);
		delta = st->page_size >> 10;
	} else {
		pt_dump_seq_printf(st->seq, " " REG " ", st->start_pa);
		delta = (addr - st->start_address) >> 10;
	}
	pt_dump_size(st->seq, delta);
}

static void note_prot_wx(struct pg_state *st, unsigned long addr)
{
	pte_t pte = __pte(st->current_flags);

	if (!IS_ENABLED(CONFIG_PPC_DEBUG_WX) || !st->check_wx)
		return;

	if (!pte_write(pte) || !pte_exec(pte))
		return;

	WARN_ONCE(1, "powerpc/mm: Found insecure W+X mapping at address %p/%pS\n",
		  (void *)st->start_address, (void *)st->start_address);

	st->wx_pages += (addr - st->start_address) / PAGE_SIZE;
}

static void note_page(struct pg_state *st, unsigned long addr,
		      unsigned int level, u64 val, unsigned long page_size)
{
	u64 flag = val & pg_level[level].mask;
	u64 pa = val & PTE_RPN_MASK;

	/* At first no level is set */
	if (!st->level) {
		st->level = level;
		st->current_flags = flag;
		st->start_address = addr;
		st->start_pa = pa;
		st->last_pa = pa;
		st->page_size = page_size;
		pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
	/*
	 * Dump the section of virtual memory when:
	 *   - the PTE flags from one entry to the next differ.
	 *   - we change levels in the tree.
	 *   - the address is in a different section of memory and is thus
	 *     used for a different purpose, regardless of the flags.
	 *   - the pa of this page is not adjacent to the last inspected page.
	 */
	} else if (flag != st->current_flags || level != st->level ||
		   addr >= st->marker[1].start_address ||
		   (pa != st->last_pa + st->page_size &&
		    (pa != st->start_pa || st->start_pa != st->last_pa))) {

		/* Check the PTE flags */
		if (st->current_flags) {
			note_prot_wx(st, addr);
			dump_addr(st, addr);

			/* Dump all the flags */
			if (pg_level[st->level].flag)
				dump_flag_info(st, pg_level[st->level].flag,
					       st->current_flags,
					       pg_level[st->level].num);

			pt_dump_seq_putc(st->seq, '\n');
		}

		/*
		 * Address indicates we have passed the end of the
		 * current section of virtual memory
		 */
		while (addr >= st->marker[1].start_address) {
			st->marker++;
			pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
		}
		st->start_address = addr;
		st->start_pa = pa;
		st->last_pa = pa;
		st->page_size = page_size;
		st->current_flags = flag;
		st->level = level;
	} else {
		st->last_pa = pa;
	}
}
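
/*
 * Illustration of the grouping note_page() performs (the addresses,
 * flags and spacing are made up for this example): sixteen virtually
 * contiguous 4K PTEs with identical flags and adjacent physical pages
 * are reported as a single 64K range, roughly:
 *
 *	0xc000000000000000-0xc00000000000ffff  0x0000000000000000    64K ...
 *
 * A change in flags, a change of level, a break in physical
 * contiguity, or crossing into the next addr_marker region all
 * terminate the current range and start a new one.
 */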

static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start)
{
	pte_t *pte = pte_offset_kernel(pmd, 0);
	unsigned long addr;
	unsigned int i;

	for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
		addr = start + i * PAGE_SIZE;
		note_page(st, addr, 4, pte_val(*pte), PAGE_SIZE);
	}
}

static void walk_hugepd(struct pg_state *st, hugepd_t *phpd, unsigned long start,
			int pdshift, int level)
{
#ifdef CONFIG_ARCH_HAS_HUGEPD
	unsigned int i;
	int shift = hugepd_shift(*phpd);
	int ptrs_per_hpd = pdshift - shift > 0 ? 1 << (pdshift - shift) : 1;

	/*
	 * A hugepd can be referenced by several directory entries;
	 * only walk it once, at its naturally aligned start address.
	 */
	if (start & ((1 << shift) - 1))
		return;

	for (i = 0; i < ptrs_per_hpd; i++) {
		unsigned long addr = start + (i << shift);
		pte_t *pte = hugepte_offset(*phpd, addr, pdshift);

		note_page(st, addr, level + 1, pte_val(*pte), 1 << shift);
	}
#endif
}

static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start)
{
	pmd_t *pmd = pmd_offset(pud, 0);
	unsigned long addr;
	unsigned int i;

	for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
		addr = start + i * PMD_SIZE;
		if (!pmd_none(*pmd) && !pmd_is_leaf(*pmd))
			/* pmd exists */
			walk_pte(st, pmd, addr);
		else
			note_page(st, addr, 3, pmd_val(*pmd), PMD_SIZE);
	}
}

static void walk_pud(struct pg_state *st, p4d_t *p4d, unsigned long start)
{
	pud_t *pud = pud_offset(p4d, 0);
	unsigned long addr;
	unsigned int i;

	for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
		addr = start + i * PUD_SIZE;
		if (!pud_none(*pud) && !pud_is_leaf(*pud))
			/* pud exists */
			walk_pmd(st, pud, addr);
		else
			note_page(st, addr, 2, pud_val(*pud), PUD_SIZE);
	}
}
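
/*
 * The level numbers the walkers above pass to note_page() map onto the
 * tree as follows (level 0 means "not yet set" in pg_state):
 *
 *	1: PGD/P4D entries, PGDIR_SIZE each
 *	2: PUD entries, PUD_SIZE each
 *	3: PMD entries, PMD_SIZE each
 *	4: PTE entries, PAGE_SIZE each
 *
 * Leaf (huge) entries are reported at the level where they occur, with
 * the size that a single entry covers at that level.
 */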

static void walk_pagetables(struct pg_state *st)
{
	unsigned int i;
	unsigned long addr = st->start_address & PGDIR_MASK;
	pgd_t *pgd = pgd_offset_k(addr);

	/*
	 * Traverse the linux pagetable structure and dump each valid
	 * range of pages that is mapped.
	 */
	for (i = pgd_index(addr); i < PTRS_PER_PGD; i++, pgd++, addr += PGDIR_SIZE) {
		p4d_t *p4d = p4d_offset(pgd, 0);

		if (p4d_none(*p4d) || p4d_is_leaf(*p4d))
			note_page(st, addr, 1, p4d_val(*p4d), PGDIR_SIZE);
		else if (is_hugepd(__hugepd(p4d_val(*p4d))))
			walk_hugepd(st, (hugepd_t *)p4d, addr, PGDIR_SHIFT, 1);
		else
			/* p4d exists */
			walk_pud(st, p4d, addr);
	}
}

static void populate_markers(void)
{
	int i = 0;

	address_markers[i++].start_address = PAGE_OFFSET;
	address_markers[i++].start_address = VMALLOC_START;
	address_markers[i++].start_address = VMALLOC_END;
#ifdef CONFIG_PPC64
	address_markers[i++].start_address = ISA_IO_BASE;
	address_markers[i++].start_address = ISA_IO_END;
	address_markers[i++].start_address = PHB_IO_BASE;
	address_markers[i++].start_address = PHB_IO_END;
	address_markers[i++].start_address = IOREMAP_BASE;
	address_markers[i++].start_address = IOREMAP_END;
	/*
	 * Book3S_64 lays out vmemmap from the hash MMU's
	 * H_VMEMMAP_START; other 64-bit platforms define
	 * VMEMMAP_BASE instead.
	 */
#ifdef CONFIG_PPC_BOOK3S_64
	address_markers[i++].start_address = H_VMEMMAP_START;
#else
	address_markers[i++].start_address = VMEMMAP_BASE;
#endif
#else /* !CONFIG_PPC64 */
	address_markers[i++].start_address = ioremap_bot;
	address_markers[i++].start_address = IOREMAP_TOP;
#ifdef CONFIG_HIGHMEM
	address_markers[i++].start_address = PKMAP_BASE;
	address_markers[i++].start_address = PKMAP_ADDR(LAST_PKMAP);
#endif
	address_markers[i++].start_address = FIXADDR_START;
	address_markers[i++].start_address = FIXADDR_TOP;
#endif /* CONFIG_PPC64 */
	/* Populated for both 32- and 64-bit, matching address_markers[] */
#ifdef CONFIG_KASAN
	address_markers[i++].start_address = KASAN_SHADOW_START;
	address_markers[i++].start_address = KASAN_SHADOW_END;
#endif
}

static int ptdump_show(struct seq_file *m, void *v)
{
	struct pg_state st = {
		.seq = m,
		.marker = address_markers,
		.start_address = PAGE_OFFSET,
	};

#ifdef CONFIG_PPC64
	if (!radix_enabled())
		st.start_address = KERN_VIRT_START;
#endif

	/* Traverse kernel page tables */
	walk_pagetables(&st);
	/* Flush out the last accumulated range */
	note_page(&st, 0, 0, 0, 0);
	return 0;
}

static int ptdump_open(struct inode *inode, struct file *file)
{
	return single_open(file, ptdump_show, NULL);
}

static const struct file_operations ptdump_fops = {
	.open		= ptdump_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static void build_pgtable_complete_mask(void)
{
	unsigned int i, j;

	for (i = 0; i < ARRAY_SIZE(pg_level); i++)
		if (pg_level[i].flag)
			for (j = 0; j < pg_level[i].num; j++)
				pg_level[i].mask |= pg_level[i].flag[j].mask;
}

#ifdef CONFIG_PPC_DEBUG_WX
void ptdump_check_wx(void)
{
	struct pg_state st = {
		.seq = NULL,
		.marker = address_markers,
		.check_wx = true,
		.start_address = PAGE_OFFSET,
	};

#ifdef CONFIG_PPC64
	if (!radix_enabled())
		st.start_address = KERN_VIRT_START;
#endif

	walk_pagetables(&st);

	if (st.wx_pages)
		pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n",
			st.wx_pages);
	else
		pr_info("Checked W+X mappings: passed, no W+X pages found\n");
}
#endif

static int ptdump_init(void)
{
	populate_markers();
	build_pgtable_complete_mask();
	debugfs_create_file("kernel_page_tables", 0400, NULL, NULL,
			    &ptdump_fops);
	return 0;
}
device_initcall(ptdump_init);
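
/*
 * Usage sketch (from userspace, as root; assumes debugfs is mounted in
 * the conventional place):
 *
 *	# mount -t debugfs none /sys/kernel/debug
 *	# cat /sys/kernel/debug/kernel_page_tables
 *
 * With CONFIG_PPC_DEBUG_WX, arch code can additionally call
 * ptdump_check_wx() once the kernel text/data protections are set up,
 * to warn if any writable and executable mappings remain.
 */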