// SPDX-License-Identifier: GPL-2.0
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/hugetlb.h>

static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pte_t *pte;
	int err = 0;
	const struct mm_walk_ops *ops = walk->ops;

	pte = pte_offset_map(pmd, addr);
	for (;;) {
		err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
		if (err)
			break;
		addr += PAGE_SIZE;
		if (addr == end)
			break;
		pte++;
	}

	pte_unmap(pte);
	return err;
}

static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pmd_t *pmd;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	pmd = pmd_offset(pud, addr);
	do {
again:
		next = pmd_addr_end(addr, end);
		if (pmd_none(*pmd) || !walk->vma) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, walk);
			if (err)
				break;
			continue;
		}
		/*
		 * This implies that each ->pmd_entry() handler
		 * needs to know about pmd_trans_huge() pmds
		 */
		if (ops->pmd_entry)
			err = ops->pmd_entry(pmd, addr, next, walk);
		if (err)
			break;

		/*
		 * Check this here so we only break down trans_huge
		 * pages when we _need_ to
		 */
		if (!ops->pte_entry)
			continue;

		split_huge_pmd(walk->vma, pmd, addr);
		if (pmd_trans_unstable(pmd))
			goto again;
		err = walk_pte_range(pmd, addr, next, walk);
		if (err)
			break;
	} while (pmd++, addr = next, addr != end);

	return err;
}

static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pud_t *pud;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	pud = pud_offset(p4d, addr);
	do {
again:
		next = pud_addr_end(addr, end);
		if (pud_none(*pud) || !walk->vma) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, walk);
			if (err)
				break;
			continue;
		}

		if (ops->pud_entry) {
			spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma);

			if (ptl) {
				err = ops->pud_entry(pud, addr, next, walk);
				spin_unlock(ptl);
				if (err)
					break;
				continue;
			}
		}

		split_huge_pud(walk->vma, pud, addr);
		if (pud_none(*pud))
			goto again;

		if (ops->pmd_entry || ops->pte_entry)
			err = walk_pmd_range(pud, addr, next, walk);
		if (err)
			break;
	} while (pud++, addr = next, addr != end);

	return err;
}

static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	p4d_t *p4d;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, walk);
			if (err)
				break;
			continue;
		}
		if (ops->pmd_entry || ops->pte_entry)
			err = walk_pud_range(p4d, addr, next, walk);
		if (err)
			break;
	} while (p4d++, addr = next, addr != end);

	return err;
}

static int walk_pgd_range(unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pgd_t *pgd;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	pgd = pgd_offset(walk->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, walk);
			if (err)
				break;
			continue;
		}
		if (ops->pmd_entry || ops->pte_entry)
			err = walk_p4d_range(pgd, addr, next, walk);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);

	return err;
}
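/*
 * Illustrative sketch, not part of pagewalk.c: as the comment in
 * walk_pmd_range() notes, ->pmd_entry() is invoked before any splitting, so
 * every handler may see pmd_trans_huge() pmds; a walker that registers only
 * ->pmd_entry() also has to scan the pte level itself, much like in-tree
 * users such as mincore do. The walker below, which counts present pages
 * into a caller-supplied counter passed via walk->private, is hypothetical;
 * it assumes CONFIG_TRANSPARENT_HUGEPAGE for HPAGE_PMD_NR and is kept under
 * #if 0 because it is only an example.
 */
#if 0
static int count_present_pmd_entry(pmd_t *pmd, unsigned long addr,
				   unsigned long end, struct mm_walk *walk)
{
	unsigned long *nr_present = walk->private;
	spinlock_t *ptl;
	pte_t *pte;

	/* Trans-huge pmd: account all base pages it maps and move on. */
	ptl = pmd_trans_huge_lock(pmd, walk->vma);
	if (ptl) {
		*nr_present += HPAGE_PMD_NR;
		spin_unlock(ptl);
		return 0;
	}

	if (pmd_trans_unstable(pmd))
		return 0;

	/* Normal pmd: scan the pte level ourselves, under the pte lock. */
	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	for (; addr != end; pte++, addr += PAGE_SIZE)
		if (pte_present(*pte))
			(*nr_present)++;
	pte_unmap_unlock(pte - 1, ptl);

	return 0;
}
#endif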
#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
				       unsigned long end)
{
	unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
	return boundary < end ? boundary : end;
}

static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct hstate *h = hstate_vma(vma);
	unsigned long next;
	unsigned long hmask = huge_page_mask(h);
	unsigned long sz = huge_page_size(h);
	pte_t *pte;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	do {
		next = hugetlb_entry_end(h, addr, end);
		pte = huge_pte_offset(walk->mm, addr & hmask, sz);

		if (pte)
			err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
		else if (ops->pte_hole)
			err = ops->pte_hole(addr, next, walk);

		if (err)
			break;
	} while (addr = next, addr != end);

	return err;
}

#else /* CONFIG_HUGETLB_PAGE */
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	return 0;
}

#endif /* CONFIG_HUGETLB_PAGE */

/*
 * Decide whether we really walk over the current vma on [@start, @end)
 * or skip it via the returned value. Return 0 if we do walk over the
 * current vma, and return 1 if we skip the vma. A negative value means
 * an error, in which case we abort the current walk.
 */
static int walk_page_test(unsigned long start, unsigned long end,
			  struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	const struct mm_walk_ops *ops = walk->ops;

	if (ops->test_walk)
		return ops->test_walk(start, end, walk);

	/*
	 * vma(VM_PFNMAP) doesn't have any valid struct pages behind the
	 * VM_PFNMAP range, so we don't walk over it as we do for normal
	 * vmas. However, some callers are interested in handling hole
	 * ranges and don't want to simply ignore any single address range.
	 * Such users are expected to define their ->pte_hole() callbacks,
	 * so let's delegate vma(VM_PFNMAP) to them.
	 */
	if (vma->vm_flags & VM_PFNMAP) {
		int err = 1;

		if (ops->pte_hole)
			err = ops->pte_hole(start, end, walk);
		return err ? err : 1;
	}
	return 0;
}

static int __walk_page_range(unsigned long start, unsigned long end,
			     struct mm_walk *walk)
{
	int err = 0;
	struct vm_area_struct *vma = walk->vma;

	if (vma && is_vm_hugetlb_page(vma)) {
		if (walk->ops->hugetlb_entry)
			err = walk_hugetlb_range(start, end, walk);
	} else
		err = walk_pgd_range(start, end, walk);

	return err;
}
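/*
 * Illustrative sketch, not part of pagewalk.c: walk_page_test() above lets a
 * caller's ->test_walk() decide per vma whether to walk it at all. A
 * hypothetical callback that restricts a walk to mlock()ed vmas could look
 * like this (kept under #if 0 because it is only an example).
 */
#if 0
static int only_locked_test_walk(unsigned long start, unsigned long end,
				 struct mm_walk *walk)
{
	/* 1 skips this vma, 0 walks it, a negative value aborts the walk. */
	if (!(walk->vma->vm_flags & VM_LOCKED))
		return 1;
	return 0;
}
#endif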
/**
 * walk_page_range - walk page table with caller-specific callbacks
 * @mm:		mm_struct representing the target process of page table walk
 * @start:	start address of the virtual address range
 * @end:	end address of the virtual address range
 * @ops:	operation to call during the walk
 * @private:	private data for callbacks' usage
 *
 * Recursively walk the page table tree of the process represented by @mm
 * within the virtual address range [@start, @end). During the walk, we can do
 * some caller-specific work for each entry, by setting up pmd_entry(),
 * pte_entry(), and/or hugetlb_entry(). If you don't set up some of these
 * callbacks, the associated entries/pages are just ignored.
 * The return values of these callbacks are commonly defined like below:
 *
 *  - 0  : succeeded to handle the current entry; if the end address has not
 *         been reached yet, continue to walk.
 *  - >0 : succeeded to handle the current entry, and return to the caller
 *         with a caller-specific value.
 *  - <0 : failed to handle the current entry, and return to the caller
 *         with an error code.
 *
 * Before starting to walk the page table, some callers want to check whether
 * they really want to walk over the current vma, typically by checking
 * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
 * purpose.
 *
 * struct mm_walk keeps current values of some common data like vma and pmd,
 * which are useful for the access from callbacks. If you want to pass some
 * caller-specific data to callbacks, @private should be helpful.
 *
 * Locking:
 *   Callers of walk_page_range() and walk_page_vma() should hold
 *   @mm->mmap_sem, because these functions traverse the vma list and/or
 *   access the vma's data.
 */
int walk_page_range(struct mm_struct *mm, unsigned long start,
		unsigned long end, const struct mm_walk_ops *ops,
		void *private)
{
	int err = 0;
	unsigned long next;
	struct vm_area_struct *vma;
	struct mm_walk walk = {
		.ops		= ops,
		.mm		= mm,
		.private	= private,
	};

	if (start >= end)
		return -EINVAL;

	if (!walk.mm)
		return -EINVAL;

	lockdep_assert_held(&walk.mm->mmap_sem);

	vma = find_vma(walk.mm, start);
	do {
		if (!vma) { /* after the last vma */
			walk.vma = NULL;
			next = end;
		} else if (start < vma->vm_start) { /* outside vma */
			walk.vma = NULL;
			next = min(end, vma->vm_start);
		} else { /* inside vma */
			walk.vma = vma;
			next = min(end, vma->vm_end);
			vma = vma->vm_next;

			err = walk_page_test(start, next, &walk);
			if (err > 0) {
				/*
				 * positive return values are purely for
				 * controlling the pagewalk, so should never
				 * be passed to the callers.
				 */
				err = 0;
				continue;
			}
			if (err < 0)
				break;
		}
		if (walk.vma || walk.ops->pte_hole)
			err = __walk_page_range(start, next, &walk);
		if (err)
			break;
	} while (start = next, start < end);
	return err;
}

int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
		void *private)
{
	struct mm_walk walk = {
		.ops		= ops,
		.mm		= vma->vm_mm,
		.vma		= vma,
		.private	= private,
	};
	int err;

	if (!walk.mm)
		return -EINVAL;

	lockdep_assert_held(&walk.mm->mmap_sem);

	err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
	if (err > 0)
		return 0;
	if (err < 0)
		return err;
	return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
}
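/*
 * Illustrative usage sketch, not part of pagewalk.c: a hypothetical caller
 * that counts the present base pages in [start, end) of @mm. Only
 * walk_page_range() and the callback signatures used above are real; the
 * ops, helper and counter names are made up for the example, which is kept
 * under #if 0. Note the mmap_sem requirement documented above. Huge pmds
 * are split for us because only ->pte_entry() is registered, and hugetlb
 * vmas are skipped since no ->hugetlb_entry() is provided.
 */
#if 0
static int count_present_pte_entry(pte_t *pte, unsigned long addr,
				   unsigned long next, struct mm_walk *walk)
{
	unsigned long *nr_present = walk->private;

	if (pte_present(*pte))
		(*nr_present)++;
	return 0;
}

static const struct mm_walk_ops count_present_ops = {
	.pte_entry	= count_present_pte_entry,
};

static unsigned long count_present_pages(struct mm_struct *mm,
					 unsigned long start,
					 unsigned long end)
{
	unsigned long nr_present = 0;

	down_read(&mm->mmap_sem);
	walk_page_range(mm, start, end, &count_present_ops, &nr_present);
	up_read(&mm->mmap_sem);

	return nr_present;
}
#endif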