/*
 * This file contains common routines for dealing with the freeing of page
 * tables, along with common page table handling code.
 *
 * Derived from arch/powerpc/mm/tlb_64.c:
 *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
 *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
 *    Copyright (C) 1996 Paul Mackerras
 *
 *  Derived from "arch/i386/mm/init.c"
 *    Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
 *
 *  Dave Engebretsen <engebret@us.ibm.com>
 *      Rework for PPC64 port.
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU General Public License
 *  as published by the Free Software Foundation; either version
 *  2 of the License, or (at your option) any later version.
 */

#include <linux/kernel.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/hugetlb.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <asm/hugetlb.h>

static inline int is_exec_fault(void)
{
        return current->thread.regs && TRAP(current->thread.regs) == 0x400;
}

/* We only try to do i/d cache coherency on stuff that looks like
 * reasonably "normal" PTEs. We currently require a PTE to be present,
 * and we avoid _PAGE_SPECIAL and cache-inhibited PTEs. We also only do
 * that on userspace PTEs.
 */
static inline int pte_looks_normal(pte_t pte)
{
        if (pte_present(pte) && !pte_special(pte)) {
                if (pte_ci(pte))
                        return 0;
                if (pte_user(pte))
                        return 1;
        }
        return 0;
}

static struct page *maybe_pte_to_page(pte_t pte)
{
        unsigned long pfn = pte_pfn(pte);
        struct page *page;

        if (unlikely(!pfn_valid(pfn)))
                return NULL;
        page = pfn_to_page(pfn);
        if (PageReserved(page))
                return NULL;
        return page;
}

#ifdef CONFIG_PPC_BOOK3S

/* Server-style MMU handles coherency when hashing if HW exec permission
 * is supported per page (currently 64-bit only). If not, then we always
 * flush the cache for valid PTEs in set_pte. Embedded CPUs without HW exec
 * support fall into the same category.
 */

static pte_t set_pte_filter_hash(pte_t pte)
{
        if (radix_enabled())
                return pte;

        pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
        if (pte_looks_normal(pte) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) ||
                                       cpu_has_feature(CPU_FTR_NOEXECUTE))) {
                struct page *pg = maybe_pte_to_page(pte);
                if (!pg)
                        return pte;
                if (!test_bit(PG_arch_1, &pg->flags)) {
                        flush_dcache_icache_page(pg);
                        set_bit(PG_arch_1, &pg->flags);
                }
        }
        return pte;
}

#else /* CONFIG_PPC_BOOK3S */

static pte_t set_pte_filter_hash(pte_t pte) { return pte; }

#endif /* CONFIG_PPC_BOOK3S */

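/*
 * Overview of the lazy I$/D$ coherency scheme implemented below: on
 * embedded MMUs with HW exec permission (the !MMU_FTR_HPTE_TABLE case),
 * set_pte_filter() strips _PAGE_EXEC from PTEs whose backing page has not
 * been icache-cleaned yet (PG_arch_1 clear), unless the PTE is being
 * installed from an exec fault, in which case it flushes right away. The
 * first instruction fetch from a stripped page then takes an exec fault,
 * at which point set_access_flags_filter() flushes the caches, marks the
 * page clean with PG_arch_1 and restores _PAGE_EXEC.
 */
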
/* Embedded type MMU with HW exec support. This is a bit more complicated
 * as we don't have two bits to spare for _PAGE_EXEC and _PAGE_HWEXEC, so
 * instead we "filter out" the exec permission for non-clean pages.
 */
static pte_t set_pte_filter(pte_t pte)
{
        struct page *pg;

        if (mmu_has_feature(MMU_FTR_HPTE_TABLE))
                return set_pte_filter_hash(pte);

        /* No exec permission in the first place, move on */
        if (!pte_exec(pte) || !pte_looks_normal(pte))
                return pte;

        /* If you set _PAGE_EXEC on weird pages you're on your own */
        pg = maybe_pte_to_page(pte);
        if (unlikely(!pg))
                return pte;

        /* If the page is clean, we move on */
        if (test_bit(PG_arch_1, &pg->flags))
                return pte;

        /* If it's an exec fault, we flush the cache and make it clean */
        if (is_exec_fault()) {
                flush_dcache_icache_page(pg);
                set_bit(PG_arch_1, &pg->flags);
                return pte;
        }

        /* Else, we filter out _PAGE_EXEC */
        return pte_exprotect(pte);
}

static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,
                                     int dirty)
{
        struct page *pg;

        if (mmu_has_feature(MMU_FTR_HPTE_TABLE))
                return pte;

        /* So here, we only care about exec faults, as we use them
         * to recover lost _PAGE_EXEC and perform I$/D$ coherency
         * if necessary. Also if _PAGE_EXEC is already set, same deal,
         * we just bail out.
         */
        if (dirty || pte_exec(pte) || !is_exec_fault())
                return pte;

#ifdef CONFIG_DEBUG_VM
        /* So this is an exec fault, _PAGE_EXEC is not set. If it was
         * an error we would have bailed out earlier in do_page_fault(),
         * but let's make sure of it.
         */
        if (WARN_ON(!(vma->vm_flags & VM_EXEC)))
                return pte;
#endif /* CONFIG_DEBUG_VM */

        /* If you set _PAGE_EXEC on weird pages you're on your own */
        pg = maybe_pte_to_page(pte);
        if (unlikely(!pg))
                goto bail;

        /* If the page is already clean, we move on */
        if (test_bit(PG_arch_1, &pg->flags))
                goto bail;

        /* Clean the page and set PG_arch_1 */
        flush_dcache_icache_page(pg);
        set_bit(PG_arch_1, &pg->flags);

bail:
        return pte_mkexec(pte);
}

/*
 * set_pte stores a linux PTE into the linux page table.
 */
void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
                pte_t pte)
{
        /*
         * Make sure hardware valid bit is not set. We don't do
         * tlb flush for this update.
         */
        VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep));

        /* Add the pte bit when trying to set a pte */
        pte = pte_mkpte(pte);

        /* Note: mm->context.id might not yet have been assigned as
         * this context might not have been activated yet when this
         * is called.
         */
        pte = set_pte_filter(pte);

        /* Perform the setting of the PTE */
        __set_pte_at(mm, addr, ptep, pte, 0);
}

/*
 * This is called when relaxing access to a PTE. It's also called in the page
 * fault path when we don't hit any of the major fault cases, ie, a minor
 * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have
 * handled those two for us; we additionally deal with missing execute
 * permission here on some processors.
 */
int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address,
                          pte_t *ptep, pte_t entry, int dirty)
{
        int changed;
        entry = set_access_flags_filter(entry, vma, dirty);
        changed = !pte_same(*(ptep), entry);
        if (changed) {
                assert_pte_locked(vma->vm_mm, address);
                __ptep_set_access_flags(vma, ptep, entry,
                                        address, mmu_virtual_psize);
        }
        return changed;
}

#ifdef CONFIG_HUGETLB_PAGE
int huge_ptep_set_access_flags(struct vm_area_struct *vma,
                               unsigned long addr, pte_t *ptep,
                               pte_t pte, int dirty)
{
#ifdef HUGETLB_NEED_PRELOAD
        /*
         * The "return 1" forces a call to update_mmu_cache, which will write
         * a TLB entry. Without this, platforms that don't do a write of the
         * TLB entry in the TLB miss handler asm will fault ad infinitum.
         */
        ptep_set_access_flags(vma, addr, ptep, pte, dirty);
        return 1;
#else
        int changed, psize;

        pte = set_access_flags_filter(pte, vma, dirty);
        changed = !pte_same(*(ptep), pte);
        if (changed) {
#ifdef CONFIG_PPC_BOOK3S_64
                struct hstate *h = hstate_vma(vma);

                psize = hstate_get_psize(h);
#ifdef CONFIG_DEBUG_VM
                assert_spin_locked(huge_pte_lockptr(h, vma->vm_mm, ptep));
#endif
#else
                /*
                 * Not used on non-book3s64 platforms. But 8xx
                 * can possibly use tsize derived from hstate.
                 */
                psize = 0;
#endif
                __ptep_set_access_flags(vma, ptep, pte, addr, psize);
        }
        return changed;
#endif
}
#endif /* CONFIG_HUGETLB_PAGE */

#ifdef CONFIG_DEBUG_VM
void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;

        if (mm == &init_mm)
                return;
        pgd = mm->pgd + pgd_index(addr);
        BUG_ON(pgd_none(*pgd));
        pud = pud_offset(pgd, addr);
        BUG_ON(pud_none(*pud));
        pmd = pmd_offset(pud, addr);
        /*
         * When khugepaged collapses normal pages to a hugepage, it first
         * sets the pmd to none to force page fault/gup to take mmap_sem.
         * After the pmd is set to none, we do a pte_clear which triggers
         * this assertion, so if we find the pmd none, just return.
         */
        if (pmd_none(*pmd))
                return;
        BUG_ON(!pmd_present(*pmd));
        assert_spin_locked(pte_lockptr(mm, pmd));
}
#endif /* CONFIG_DEBUG_VM */

unsigned long vmalloc_to_phys(void *va)
{
        unsigned long pfn = vmalloc_to_pfn(va);

        BUG_ON(!pfn);
        return __pa(pfn_to_kaddr(pfn)) + offset_in_page(va);
}
EXPORT_SYMBOL_GPL(vmalloc_to_phys);

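/*
 * Example (illustrative, not part of the original code): a typical use of
 * vmalloc_to_phys() is obtaining the physical address of a vmalloc()ed
 * buffer, e.g. to hand it to hardware or firmware:
 *
 *	unsigned long pa = vmalloc_to_phys(buf + offset);
 *
 * Note the BUG_ON() above: the mapping must already be populated.
 */
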
/*
 * We have 4 cases for pgds and pmds:
 * (1) invalid (all zeroes)
 * (2) pointer to next table, as normal; bottom 6 bits == 0
 * (3) leaf pte for huge page, _PAGE_PTE set
 * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table
 *
 * So long as we atomically load page table pointers we are safe against
 * teardown, and we can follow the address down to the page and take a ref
 * on it. This function needs to be called with interrupts disabled. We use
 * this variant when we have MSR[EE] = 0 but paca->irq_soft_mask =
 * IRQS_ENABLED.
 */
pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
                        bool *is_thp, unsigned *hpage_shift)
{
        pgd_t pgd, *pgdp;
        pud_t pud, *pudp;
        pmd_t pmd, *pmdp;
        pte_t *ret_pte;
        hugepd_t *hpdp = NULL;
        unsigned pdshift = PGDIR_SHIFT;

        if (hpage_shift)
                *hpage_shift = 0;

        if (is_thp)
                *is_thp = false;

        pgdp = pgdir + pgd_index(ea);
        pgd = READ_ONCE(*pgdp);
        /*
         * Always operate on the local stack value. This makes sure the
         * value doesn't get updated by a parallel THP split/collapse,
         * page fault or a page unmap. The returned pte_t * is still not
         * stable, so the caller must re-check it for the above conditions.
         */
        if (pgd_none(pgd))
                return NULL;

        if (pgd_huge(pgd)) {
                ret_pte = (pte_t *)pgdp;
                goto out;
        }
        if (is_hugepd(__hugepd(pgd_val(pgd)))) {
                hpdp = (hugepd_t *)&pgd;
                goto out_huge;
        }

        /*
         * Even if we end up with an unmap, the pgtable will not
         * be freed, because we do an RCU free and here we have
         * interrupts disabled.
         */
        pdshift = PUD_SHIFT;
        pudp = pud_offset(&pgd, ea);
        pud = READ_ONCE(*pudp);

        if (pud_none(pud))
                return NULL;

        if (pud_huge(pud)) {
                ret_pte = (pte_t *)pudp;
                goto out;
        }
        if (is_hugepd(__hugepd(pud_val(pud)))) {
                hpdp = (hugepd_t *)&pud;
                goto out_huge;
        }
        pdshift = PMD_SHIFT;
        pmdp = pmd_offset(&pud, ea);
        pmd = READ_ONCE(*pmdp);
        /*
         * A hugepage collapse is captured by pmd_none, because
         * it marks the pmd none and does a hpte invalidate.
         */
        if (pmd_none(pmd))
                return NULL;

        if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) {
                if (is_thp)
                        *is_thp = true;
                ret_pte = (pte_t *)pmdp;
                goto out;
        }
        /*
         * The pmd_large() check below will handle the swap pmd pte.
         * We need to do both checks because they are config dependent.
         */
        if (pmd_huge(pmd) || pmd_large(pmd)) {
                ret_pte = (pte_t *)pmdp;
                goto out;
        }
        if (is_hugepd(__hugepd(pmd_val(pmd)))) {
                hpdp = (hugepd_t *)&pmd;
                goto out_huge;
        }

        return pte_offset_kernel(&pmd, ea);

out_huge:
        if (!hpdp)
                return NULL;

        ret_pte = hugepte_offset(*hpdp, ea, pdshift);
        pdshift = hugepd_shift(*hpdp);
out:
        if (hpage_shift)
                *hpage_shift = pdshift;
        return ret_pte;
}
EXPORT_SYMBOL_GPL(__find_linux_pte);
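
/*
 * Illustrative sketch (not part of the original code): the usage pattern the
 * contract above implies for callers of __find_linux_pte(). The helper name
 * is made up for the example; real callers typically go through small
 * wrappers such as those in asm/pte-walk.h. The walk is only safe while
 * interrupts are disabled, and the returned pointer must not be
 * dereferenced after interrupts are re-enabled.
 */
static inline unsigned long example_find_linux_pfn(struct mm_struct *mm,
                                                   unsigned long ea)
{
        unsigned long flags, pfn = 0;
        pte_t *ptep;

        /* Keep interrupts off so the page tables cannot be freed under us */
        local_irq_save(flags);
        /* Pass &is_thp / &hpage_shift instead of NULL if huge mappings matter */
        ptep = __find_linux_pte(mm->pgd, ea, NULL, NULL);
        if (ptep) {
                /* The entry can still change; snapshot it and re-check */
                pte_t pte = READ_ONCE(*ptep);

                if (pte_present(pte))
                        pfn = pte_pfn(pte);
        }
        local_irq_restore(flags);

        return pfn;
}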