#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/hugetlb.h>

static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pte_t *pte;
	int err = 0;

	pte = pte_offset_map(pmd, addr);
	for (;;) {
		err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
		if (err)
			break;
		addr += PAGE_SIZE;
		if (addr == end)
			break;
		pte++;
	}

	pte_unmap(pte);
	return err;
}

static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pmd_t *pmd;
	unsigned long next;
	int err = 0;

	pmd = pmd_offset(pud, addr);
	do {
again:
		next = pmd_addr_end(addr, end);
		if (pmd_none(*pmd) || !walk->vma) {
			if (walk->pte_hole)
				err = walk->pte_hole(addr, next, walk);
			if (err)
				break;
			continue;
		}
		/*
		 * This implies that each ->pmd_entry() handler
		 * needs to know about pmd_trans_huge() pmds
		 */
		if (walk->pmd_entry)
			err = walk->pmd_entry(pmd, addr, next, walk);
		if (err)
			break;

		/*
		 * Check this here so we only break down trans_huge
		 * pages when we _need_ to
		 */
		if (!walk->pte_entry)
			continue;

		split_huge_pmd(walk->vma, pmd, addr);
		if (pmd_trans_unstable(pmd))
			goto again;
		err = walk_pte_range(pmd, addr, next, walk);
		if (err)
			break;
	} while (pmd++, addr = next, addr != end);

	return err;
}

static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pud_t *pud;
	unsigned long next;
	int err = 0;

	pud = pud_offset(p4d, addr);
	do {
again:
		next = pud_addr_end(addr, end);
		if (pud_none(*pud) || !walk->vma) {
			if (walk->pte_hole)
				err = walk->pte_hole(addr, next, walk);
			if (err)
				break;
			continue;
		}

		if (walk->pud_entry) {
			spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma);

			if (ptl) {
				err = walk->pud_entry(pud, addr, next, walk);
				spin_unlock(ptl);
				if (err)
					break;
				continue;
			}
		}

		split_huge_pud(walk->vma, pud, addr);
		if (pud_none(*pud))
			goto again;

		if (walk->pmd_entry || walk->pte_entry)
			err = walk_pmd_range(pud, addr, next, walk);
		if (err)
			break;
	} while (pud++, addr = next, addr != end);

	return err;
}

static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	p4d_t *p4d;
	unsigned long next;
	int err = 0;

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d)) {
			if (walk->pte_hole)
				err = walk->pte_hole(addr, next, walk);
			if (err)
				break;
			continue;
		}
		if (walk->pmd_entry || walk->pte_entry)
			err = walk_pud_range(p4d, addr, next, walk);
		if (err)
			break;
	} while (p4d++, addr = next, addr != end);

	return err;
}

static int walk_pgd_range(unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pgd_t *pgd;
	unsigned long next;
	int err = 0;

	pgd = pgd_offset(walk->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd)) {
			if (walk->pte_hole)
				err = walk->pte_hole(addr, next, walk);
			if (err)
				break;
			continue;
		}
		if (walk->pmd_entry || walk->pte_entry)
			err = walk_p4d_range(pgd, addr, next, walk);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);

	return err;
}

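/*
 * Example (not part of this file): a hypothetical ->pmd_entry() callback.
 * As the comment in walk_pmd_range() above notes, ->pmd_entry() is called
 * before any THP splitting, so a handler has to cope with pmd_trans_huge()
 * pmds itself. This is only a sketch of that convention; the callback name
 * and the empty bodies are illustrative, not an existing user.
 *
 *	static int example_pmd_entry(pmd_t *pmd, unsigned long addr,
 *				     unsigned long end, struct mm_walk *walk)
 *	{
 *		// illustrative only: take the ptl iff this is a huge pmd
 *		spinlock_t *ptl = pmd_trans_huge_lock(pmd, walk->vma);
 *
 *		if (ptl) {
 *			// [addr, end) lies within one huge pmd; handle it
 *			// at pmd granularity under the ptl.
 *			spin_unlock(ptl);
 *			return 0;
 *		}
 *		// A normal pmd: either handle the ptes here, or leave them
 *		// to a registered ->pte_entry() callback.
 *		return 0;
 *	}
 *
 * A walker that registers only such a pmd_entry (and no pte_entry) never
 * forces split_huge_pmd(), which is exactly the "only break down trans_huge
 * pages when we _need_ to" behaviour described above.
 */
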
#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
				       unsigned long end)
{
	unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
	return boundary < end ? boundary : end;
}

static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct hstate *h = hstate_vma(vma);
	unsigned long next;
	unsigned long hmask = huge_page_mask(h);
	unsigned long sz = huge_page_size(h);
	pte_t *pte;
	int err = 0;

	do {
		next = hugetlb_entry_end(h, addr, end);
		pte = huge_pte_offset(walk->mm, addr & hmask, sz);
		if (pte && walk->hugetlb_entry)
			err = walk->hugetlb_entry(pte, hmask, addr, next, walk);
		if (err)
			break;
	} while (addr = next, addr != end);

	return err;
}

#else /* CONFIG_HUGETLB_PAGE */
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	return 0;
}

#endif /* CONFIG_HUGETLB_PAGE */

/*
 * Decide whether we really walk over the current vma on [@start, @end)
 * or skip it via the returned value. Return 0 if we do walk over the
 * current vma, and return 1 if we skip the vma. A negative value means
 * an error, in which case we abort the current walk.
 */
static int walk_page_test(unsigned long start, unsigned long end,
			  struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;

	if (walk->test_walk)
		return walk->test_walk(start, end, walk);

	/*
	 * vma(VM_PFNMAP) doesn't have any valid struct pages behind the
	 * VM_PFNMAP range, so we don't walk over it as we do for normal
	 * vmas. However, some callers are interested in handling hole
	 * ranges and don't want to just ignore any single address range.
	 * Such users define their own ->pte_hole() callbacks, so let's
	 * delegate to them for vma(VM_PFNMAP).
	 */
	if (vma->vm_flags & VM_PFNMAP) {
		int err = 1;

		if (walk->pte_hole)
			err = walk->pte_hole(start, end, walk);
		return err ? err : 1;
	}
	return 0;
}

static int __walk_page_range(unsigned long start, unsigned long end,
			     struct mm_walk *walk)
{
	int err = 0;
	struct vm_area_struct *vma = walk->vma;

	if (vma && is_vm_hugetlb_page(vma)) {
		if (walk->hugetlb_entry)
			err = walk_hugetlb_range(start, end, walk);
	} else
		err = walk_pgd_range(start, end, walk);

	return err;
}

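/*
 * Example (not part of this file): a hypothetical ->test_walk() callback.
 * walk_page_test() above delegates the "walk or skip?" decision to
 * ->test_walk() when it is set: return 1 to skip the vma, 0 to walk it,
 * and a negative value to abort the walk. The VM_LOCKED policy below is
 * purely illustrative of that convention.
 *
 *	static int example_test_walk(unsigned long start, unsigned long end,
 *				     struct mm_walk *walk)
 *	{
 *		// illustrative policy: skip mlock()ed vmas entirely
 *		if (walk->vma->vm_flags & VM_LOCKED)
 *			return 1;
 *		return 0;	// walk everything else
 *	}
 */
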
/**
 * walk_page_range - walk page table with caller specific callbacks
 *
 * Recursively walk the page table tree of the process represented by
 * @walk->mm within the virtual address range [@start, @end). During
 * walking, we can do some caller-specific work for each entry, by setting
 * up pmd_entry(), pte_entry(), and/or hugetlb_entry(). If you don't set
 * up one of these callbacks, the associated entries/pages are just ignored.
 * The return values of these callbacks are commonly defined like below:
 *
 *  - 0  : succeeded to handle the current entry, and if you don't reach
 *         the end address yet, continue to walk.
 *  - >0 : succeeded to handle the current entry, and return to the caller
 *         with caller specific value.
 *  - <0 : failed to handle the current entry, and return to the caller
 *         with error code.
 *
 * Before starting to walk the page table, some callers want to check
 * whether they really want to walk over the current vma, typically by
 * checking its vm_flags. walk_page_test() and @walk->test_walk() are
 * used for this purpose.
 *
 * struct mm_walk keeps the current values of some common data like vma
 * and pmd, which are useful for access from the callbacks. If you want
 * to pass some caller-specific data to the callbacks, @walk->private
 * should be helpful.
 *
 * Locking:
 *   Callers of walk_page_range() and walk_page_vma() should hold
 *   @walk->mm->mmap_sem, because these functions traverse the vma list
 *   and/or access the vma's data.
 */
int walk_page_range(unsigned long start, unsigned long end,
		    struct mm_walk *walk)
{
	int err = 0;
	unsigned long next;
	struct vm_area_struct *vma;

	if (start >= end)
		return -EINVAL;

	if (!walk->mm)
		return -EINVAL;

	VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm);

	vma = find_vma(walk->mm, start);
	do {
		if (!vma) { /* after the last vma */
			walk->vma = NULL;
			next = end;
		} else if (start < vma->vm_start) { /* outside vma */
			walk->vma = NULL;
			next = min(end, vma->vm_start);
		} else { /* inside vma */
			walk->vma = vma;
			next = min(end, vma->vm_end);
			vma = vma->vm_next;

			err = walk_page_test(start, next, walk);
			if (err > 0) {
				/*
				 * positive return values are purely for
				 * controlling the pagewalk, so should never
				 * be passed to the callers.
				 */
				err = 0;
				continue;
			}
			if (err < 0)
				break;
		}
		if (walk->vma || walk->pte_hole)
			err = __walk_page_range(start, next, walk);
		if (err)
			break;
	} while (start = next, start < end);
	return err;
}

int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk)
{
	int err;

	if (!walk->mm)
		return -EINVAL;

	VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
	VM_BUG_ON(!vma);
	walk->vma = vma;
	err = walk_page_test(vma->vm_start, vma->vm_end, walk);
	if (err > 0)
		return 0;
	if (err < 0)
		return err;
	return __walk_page_range(vma->vm_start, vma->vm_end, walk);
}

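/*
 * Example (not part of this file): a minimal caller of walk_page_range().
 * It counts the present ptes in a range using only a ->pte_entry()
 * callback and ->private for the result, holding mmap_sem as required by
 * the kerneldoc above. The function and variable names are hypothetical.
 *
 *	static int count_pte_entry(pte_t *pte, unsigned long addr,
 *				   unsigned long end, struct mm_walk *walk)
 *	{
 *		unsigned long *count = walk->private;	// caller-private data
 *
 *		if (pte_present(*pte))
 *			(*count)++;
 *		return 0;	// keep walking
 *	}
 *
 *	static unsigned long count_present_ptes(struct mm_struct *mm,
 *						unsigned long start,
 *						unsigned long end)
 *	{
 *		unsigned long count = 0;
 *		struct mm_walk walk = {
 *			.pte_entry	= count_pte_entry,
 *			.mm		= mm,
 *			.private	= &count,
 *		};
 *
 *		down_read(&mm->mmap_sem);
 *		walk_page_range(start, end, &walk);
 *		up_read(&mm->mmap_sem);
 *		return count;
 *	}
 */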