// SPDX-License-Identifier: GPL-2.0
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/hugetlb.h>

static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pte_t *pte;
	int err = 0;
	const struct mm_walk_ops *ops = walk->ops;
	spinlock_t *ptl;

	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	for (;;) {
		err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
		if (err)
			break;
		addr += PAGE_SIZE;
		if (addr == end)
			break;
		pte++;
	}

	pte_unmap_unlock(pte, ptl);
	return err;
}

static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pmd_t *pmd;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	pmd = pmd_offset(pud, addr);
	do {
again:
		next = pmd_addr_end(addr, end);
		if (pmd_none(*pmd) || !walk->vma) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, walk);
			if (err)
				break;
			continue;
		}
		/*
		 * This implies that each ->pmd_entry() handler
		 * needs to know about pmd_trans_huge() pmds
		 */
		if (ops->pmd_entry)
			err = ops->pmd_entry(pmd, addr, next, walk);
		if (err)
			break;

		/*
		 * Check this here so we only break down trans_huge
		 * pages when we _need_ to
		 */
		if (!ops->pte_entry)
			continue;

		split_huge_pmd(walk->vma, pmd, addr);
		if (pmd_trans_unstable(pmd))
			goto again;
		err = walk_pte_range(pmd, addr, next, walk);
		if (err)
			break;
	} while (pmd++, addr = next, addr != end);

	return err;
}
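
/*
 * Illustrative sketch only (not compiled): a minimal ->pmd_entry() callback.
 * As noted above, every ->pmd_entry() handler must be prepared to see a
 * pmd_trans_huge() pmd. The callback name below is hypothetical and not part
 * of this file.
 */
#if 0
static int example_pmd_entry(pmd_t *pmd, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	spinlock_t *ptl = pmd_trans_huge_lock(pmd, walk->vma);

	if (ptl) {
		/* [addr, next) is backed by a single trans-huge pmd. */
		spin_unlock(ptl);
		return 0;
	}

	/*
	 * Not a stable huge pmd; if ->pte_entry is also set, the walker
	 * splits the pmd and visits the individual ptes.
	 */
	return 0;
}
#endif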

static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pud_t *pud;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	pud = pud_offset(p4d, addr);
	do {
again:
		next = pud_addr_end(addr, end);
		if (pud_none(*pud) || !walk->vma) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, walk);
			if (err)
				break;
			continue;
		}

		if (ops->pud_entry) {
			spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma);

			if (ptl) {
				err = ops->pud_entry(pud, addr, next, walk);
				spin_unlock(ptl);
				if (err)
					break;
				continue;
			}
		}

		split_huge_pud(walk->vma, pud, addr);
		if (pud_none(*pud))
			goto again;

		if (ops->pmd_entry || ops->pte_entry)
			err = walk_pmd_range(pud, addr, next, walk);
		if (err)
			break;
	} while (pud++, addr = next, addr != end);

	return err;
}

static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	p4d_t *p4d;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, walk);
			if (err)
				break;
			continue;
		}
		if (ops->pmd_entry || ops->pte_entry)
			err = walk_pud_range(p4d, addr, next, walk);
		if (err)
			break;
	} while (p4d++, addr = next, addr != end);

	return err;
}

static int walk_pgd_range(unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pgd_t *pgd;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	pgd = pgd_offset(walk->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, walk);
			if (err)
				break;
			continue;
		}
		if (ops->pmd_entry || ops->pte_entry)
			err = walk_p4d_range(pgd, addr, next, walk);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);

	return err;
}

#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
				       unsigned long end)
{
	unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
	return boundary < end ? boundary : end;
}

static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct hstate *h = hstate_vma(vma);
	unsigned long next;
	unsigned long hmask = huge_page_mask(h);
	unsigned long sz = huge_page_size(h);
	pte_t *pte;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	do {
		next = hugetlb_entry_end(h, addr, end);
		pte = huge_pte_offset(walk->mm, addr & hmask, sz);

		if (pte)
			err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
		else if (ops->pte_hole)
			err = ops->pte_hole(addr, next, walk);

		if (err)
			break;
	} while (addr = next, addr != end);

	return err;
}

#else /* CONFIG_HUGETLB_PAGE */
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	return 0;
}

#endif /* CONFIG_HUGETLB_PAGE */

/*
 * Decide whether we really walk over the current vma on [@start, @end)
 * or skip it via the returned value. Return 0 if we do walk over the
 * current vma, and return 1 if we skip the vma. Negative values mean an
 * error, in which case the current walk is aborted.
 */
static int walk_page_test(unsigned long start, unsigned long end,
			  struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	const struct mm_walk_ops *ops = walk->ops;

	if (ops->test_walk)
		return ops->test_walk(start, end, walk);

	/*
	 * A VM_PFNMAP vma has no valid struct pages behind its range, so we
	 * don't walk over it as we do for normal vmas. However, some callers
	 * are interested in handling holes and don't want any address range
	 * to be silently skipped. Such callers define a ->pte_hole()
	 * callback, so delegate VM_PFNMAP vmas to it.
	 */
	if (vma->vm_flags & VM_PFNMAP) {
		int err = 1;
		if (ops->pte_hole)
			err = ops->pte_hole(start, end, walk);
		return err ? err : 1;
	}
	return 0;
}
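
/*
 * Illustrative sketch only (not compiled): a ->test_walk() callback using the
 * convention described above (0 = walk the vma, >0 = skip it, <0 = abort the
 * walk). The callback name and the VM_LOCKED policy are hypothetical.
 */
#if 0
static int example_test_walk(unsigned long start, unsigned long end,
			     struct mm_walk *walk)
{
	/* Only walk mlocked vmas; silently skip everything else. */
	if (!(walk->vma->vm_flags & VM_LOCKED))
		return 1;
	return 0;
}
#endif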

static int __walk_page_range(unsigned long start, unsigned long end,
			     struct mm_walk *walk)
{
	int err = 0;
	struct vm_area_struct *vma = walk->vma;
	const struct mm_walk_ops *ops = walk->ops;

	if (vma && ops->pre_vma) {
		err = ops->pre_vma(start, end, walk);
		if (err)
			return err;
	}

	if (vma && is_vm_hugetlb_page(vma)) {
		if (ops->hugetlb_entry)
			err = walk_hugetlb_range(start, end, walk);
	} else
		err = walk_pgd_range(start, end, walk);

	if (vma && ops->post_vma)
		ops->post_vma(walk);

	return err;
}

/**
 * walk_page_range - walk page table with caller specific callbacks
 * @mm: mm_struct representing the target process of page table walk
 * @start: start address of the virtual address range
 * @end: end address of the virtual address range
 * @ops: operation to call during the walk
 * @private: private data for callbacks' usage
 *
 * Recursively walk the page table tree of the process represented by @mm
 * within the virtual address range [@start, @end). During the walk, caller-
 * specific work can be done for each entry by setting up the pmd_entry(),
 * pte_entry(), and/or hugetlb_entry() callbacks. If some of these callbacks
 * are not provided, the associated entries/pages are simply ignored.
 * The return values of these callbacks are commonly defined as follows:
 *
 *  - 0  : succeeded to handle the current entry; if the end address has not
 *         been reached yet, the walk continues.
 *  - >0 : succeeded to handle the current entry; return to the caller
 *         with this caller-specific value.
 *  - <0 : failed to handle the current entry; return to the caller
 *         with the error code.
 *
 * Before starting to walk the page table, some callers want to check whether
 * they really want to walk over the current vma, typically by checking
 * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
 * purpose.
 *
 * If operations need to be staged before and committed after a vma is walked,
 * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(),
 * since it is intended to handle commit-type operations, can't return any
 * errors.
 *
 * struct mm_walk keeps current values of some common data like vma and pmd,
 * which are useful for callbacks to access. If you want to pass caller-
 * specific data to the callbacks, use @private.
 *
 * Locking:
 *   Callers of walk_page_range() and walk_page_vma() should hold
 *   @mm->mmap_sem, because these functions traverse the vma list and/or
 *   access the vma's data.
 */
int walk_page_range(struct mm_struct *mm, unsigned long start,
		unsigned long end, const struct mm_walk_ops *ops,
		void *private)
{
	int err = 0;
	unsigned long next;
	struct vm_area_struct *vma;
	struct mm_walk walk = {
		.ops = ops,
		.mm = mm,
		.private = private,
	};

	if (start >= end)
		return -EINVAL;

	if (!walk.mm)
		return -EINVAL;

	lockdep_assert_held(&walk.mm->mmap_sem);

	vma = find_vma(walk.mm, start);
	do {
		if (!vma) { /* after the last vma */
			walk.vma = NULL;
			next = end;
		} else if (start < vma->vm_start) { /* outside vma */
			walk.vma = NULL;
			next = min(end, vma->vm_start);
		} else { /* inside vma */
			walk.vma = vma;
			next = min(end, vma->vm_end);
			vma = vma->vm_next;

			err = walk_page_test(start, next, &walk);
			if (err > 0) {
				/*
				 * positive return values are purely for
				 * controlling the pagewalk, so should never
				 * be passed to the callers.
				 */
				err = 0;
				continue;
			}
			if (err < 0)
				break;
		}
		if (walk.vma || walk.ops->pte_hole)
			err = __walk_page_range(start, next, &walk);
		if (err)
			break;
	} while (start = next, start < end);
	return err;
}

int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
		void *private)
{
	struct mm_walk walk = {
		.ops = ops,
		.mm = vma->vm_mm,
		.vma = vma,
		.private = private,
	};
	int err;

	if (!walk.mm)
		return -EINVAL;

	lockdep_assert_held(&walk.mm->mmap_sem);

	err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
	if (err > 0)
		return 0;
	if (err < 0)
		return err;
	return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
}
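
/*
 * Illustrative sketch only (not compiled): a minimal walk_page_range() user
 * that counts present ptes in a range. All names below (count_present_pte,
 * count_present_ops, count_present) are hypothetical and not part of this
 * file; only struct mm_walk_ops, struct mm_walk and walk_page_range() are.
 */
#if 0
static int count_present_pte(pte_t *pte, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	unsigned long *nr_present = walk->private;

	if (pte_present(*pte))
		(*nr_present)++;
	return 0;	/* keep walking */
}

static const struct mm_walk_ops count_present_ops = {
	.pte_entry = count_present_pte,
};

static unsigned long count_present(struct mm_struct *mm, unsigned long start,
				   unsigned long end)
{
	unsigned long nr_present = 0;

	down_read(&mm->mmap_sem);	/* the walk requires mmap_sem held */
	walk_page_range(mm, start, end, &count_present_ops, &nr_present);
	up_read(&mm->mmap_sem);

	return nr_present;
}
#endif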

/**
 * walk_page_mapping - walk all memory areas mapped into a struct address_space.
 * @mapping: Pointer to the struct address_space
 * @first_index: First page offset in the address_space
 * @nr: Number of incremental page offsets to cover
 * @ops: operation to call during the walk
 * @private: private data for callbacks' usage
 *
 * This function walks all memory areas mapped into a struct address_space.
 * The walk is limited to only the given page-size index range, but if
 * the index boundaries cross a huge page-table entry, that entry will be
 * included.
 *
 * Also see walk_page_range() for additional information.
 *
 * Locking:
 *   This function can't require that the struct mm_struct::mmap_sem is held,
 *   since @mapping may be mapped by multiple processes. Instead
 *   @mapping->i_mmap_rwsem must be held. This might have implications for the
 *   callbacks, and it's up to the caller to ensure that the
 *   struct mm_struct::mmap_sem is not needed.
 *
 *   Also this means that a caller can't rely on the struct
 *   vm_area_struct::vm_flags to be constant across a call,
 *   except for immutable flags. Callers requiring this shouldn't use
 *   this function.
 *
 * Return: 0 on success, negative error code on failure, positive number on
 * caller defined premature termination.
 */
int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
		      pgoff_t nr, const struct mm_walk_ops *ops,
		      void *private)
{
	struct mm_walk walk = {
		.ops = ops,
		.private = private,
	};
	struct vm_area_struct *vma;
	pgoff_t vba, vea, cba, cea;
	unsigned long start_addr, end_addr;
	int err = 0;

	lockdep_assert_held(&mapping->i_mmap_rwsem);
	vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index,
				  first_index + nr - 1) {
		/* Clip to the vma */
		vba = vma->vm_pgoff;
		vea = vba + vma_pages(vma);
		cba = first_index;
		cba = max(cba, vba);
		cea = first_index + nr;
		cea = min(cea, vea);

		start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start;
		end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start;
		if (start_addr >= end_addr)
			continue;

		walk.vma = vma;
		walk.mm = vma->vm_mm;

		err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
		if (err > 0) {
			err = 0;
			break;
		} else if (err < 0)
			break;

		err = __walk_page_range(start_addr, end_addr, &walk);
		if (err)
			break;
	}

	return err;
}
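
/*
 * Illustrative sketch only (not compiled): calling walk_page_mapping() with
 * @mapping->i_mmap_rwsem held via i_mmap_lock_read()/i_mmap_unlock_read().
 * The names file_range_pte, file_range_ops and walk_file_range are
 * hypothetical and not part of this file.
 */
#if 0
static int file_range_pte(pte_t *pte, unsigned long addr,
			  unsigned long next, struct mm_walk *walk)
{
	/* caller-specific per-pte work would go here */
	return 0;
}

static const struct mm_walk_ops file_range_ops = {
	.pte_entry = file_range_pte,
};

static int walk_file_range(struct address_space *mapping, pgoff_t first_index,
			   pgoff_t nr, void *private)
{
	int err;

	i_mmap_lock_read(mapping);	/* hold i_mmap_rwsem across the walk */
	err = walk_page_mapping(mapping, first_index, nr, &file_range_ops,
				private);
	i_mmap_unlock_read(mapping);

	return err;
}
#endif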