#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/hugetlb.h>

static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pte_t *pte;
        int err = 0;

        pte = pte_offset_map(pmd, addr);
        for (;;) {
                err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
                if (err)
                        break;
                addr += PAGE_SIZE;
                if (addr == end)
                        break;
                pte++;
        }

        pte_unmap(pte);
        return err;
}

static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pmd_t *pmd;
        unsigned long next;
        int err = 0;

        pmd = pmd_offset(pud, addr);
        do {
again:
                next = pmd_addr_end(addr, end);
                if (pmd_none(*pmd) || !walk->vma) {
                        if (walk->pte_hole)
                                err = walk->pte_hole(addr, next, walk);
                        if (err)
                                break;
                        continue;
                }
                /*
                 * This implies that each ->pmd_entry() handler
                 * needs to know about pmd_trans_huge() pmds
                 */
                if (walk->pmd_entry)
                        err = walk->pmd_entry(pmd, addr, next, walk);
                if (err)
                        break;

                /*
                 * Check this here so we only break down trans_huge
                 * pages when we _need_ to
                 */
                if (!walk->pte_entry)
                        continue;

                split_huge_pmd(walk->vma, pmd, addr);
                if (pmd_trans_unstable(pmd))
                        goto again;
                err = walk_pte_range(pmd, addr, next, walk);
                if (err)
                        break;
        } while (pmd++, addr = next, addr != end);

        return err;
}

static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pud_t *pud;
        unsigned long next;
        int err = 0;

        pud = pud_offset(p4d, addr);
        do {
again:
                next = pud_addr_end(addr, end);
                if (pud_none(*pud) || !walk->vma) {
                        if (walk->pte_hole)
                                err = walk->pte_hole(addr, next, walk);
                        if (err)
                                break;
                        continue;
                }

                if (walk->pud_entry) {
                        spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma);

                        if (ptl) {
                                err = walk->pud_entry(pud, addr, next, walk);
                                spin_unlock(ptl);
                                if (err)
                                        break;
                                continue;
                        }
                }

                split_huge_pud(walk->vma, pud, addr);
                if (pud_none(*pud))
                        goto again;

                if (walk->pmd_entry || walk->pte_entry)
                        err = walk_pmd_range(pud, addr, next, walk);
                if (err)
                        break;
        } while (pud++, addr = next, addr != end);

        return err;
}

static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        p4d_t *p4d;
        unsigned long next;
        int err = 0;

        p4d = p4d_offset(pgd, addr);
        do {
                next = p4d_addr_end(addr, end);
                if (p4d_none_or_clear_bad(p4d)) {
                        if (walk->pte_hole)
                                err = walk->pte_hole(addr, next, walk);
                        if (err)
                                break;
                        continue;
                }
                if (walk->pmd_entry || walk->pte_entry)
                        err = walk_pud_range(p4d, addr, next, walk);
                if (err)
                        break;
        } while (p4d++, addr = next, addr != end);

        return err;
}

static int walk_pgd_range(unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pgd_t *pgd;
        unsigned long next;
        int err = 0;

        pgd = pgd_offset(walk->mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd)) {
                        if (walk->pte_hole)
                                err = walk->pte_hole(addr, next, walk);
                        if (err)
                                break;
                        continue;
                }
                if (walk->pmd_entry || walk->pte_entry)
                        err = walk_p4d_range(pgd, addr, next, walk);
                if (err)
                        break;
        } while (pgd++, addr = next, addr != end);

        return err;
}

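/*
 * Illustrative sketch, not used anywhere in this file: a ->pmd_entry()
 * handler that knows about pmd_trans_huge() pmds, as the comment in
 * walk_pmd_range() requires.  The name example_pmd_entry() is hypothetical;
 * pmd_trans_huge_lock() is assumed to be available through the same headers
 * that already provide pud_trans_huge_lock() for walk_pud_range() above.
 */
static int __maybe_unused example_pmd_entry(pmd_t *pmd, unsigned long addr,
                                            unsigned long next,
                                            struct mm_walk *walk)
{
        spinlock_t *ptl = pmd_trans_huge_lock(pmd, walk->vma);

        if (ptl) {
                /* A single huge pmd maps the whole of [addr, next). */
                spin_unlock(ptl);
                return 0;
        }

        /*
         * Not (or no longer) a huge pmd: either handle the pte level here,
         * or leave it to a ->pte_entry() callback and just keep walking.
         */
        return 0;
}
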
#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
                                       unsigned long end)
{
        unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
        return boundary < end ? boundary : end;
}

static int walk_hugetlb_range(unsigned long addr, unsigned long end,
                              struct mm_walk *walk)
{
        struct vm_area_struct *vma = walk->vma;
        struct hstate *h = hstate_vma(vma);
        unsigned long next;
        unsigned long hmask = huge_page_mask(h);
        pte_t *pte;
        int err = 0;

        do {
                next = hugetlb_entry_end(h, addr, end);
                pte = huge_pte_offset(walk->mm, addr & hmask);
                if (pte && walk->hugetlb_entry)
                        err = walk->hugetlb_entry(pte, hmask, addr, next, walk);
                if (err)
                        break;
        } while (addr = next, addr != end);

        return err;
}

#else /* CONFIG_HUGETLB_PAGE */
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
                              struct mm_walk *walk)
{
        return 0;
}

#endif /* CONFIG_HUGETLB_PAGE */

/*
 * Decide whether we really walk over the current vma on [@start, @end)
 * or skip it via the returned value. Return 0 if we do walk over the
 * current vma, and return 1 if we skip the vma. Negative values mean an
 * error, in which case we abort the current walk.
 */
static int walk_page_test(unsigned long start, unsigned long end,
                          struct mm_walk *walk)
{
        struct vm_area_struct *vma = walk->vma;

        if (walk->test_walk)
                return walk->test_walk(start, end, walk);

        /*
         * vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP
         * range, so we don't walk over it as we do for normal vmas. However,
         * some callers are interested in handling hole ranges and they don't
         * want to just ignore any single address range. Such users certainly
         * define their ->pte_hole() callbacks, so let's delegate them to handle
         * vma(VM_PFNMAP).
         */
        if (vma->vm_flags & VM_PFNMAP) {
                int err = 1;

                if (walk->pte_hole)
                        err = walk->pte_hole(start, end, walk);
                return err ? err : 1;
        }
        return 0;
}

static int __walk_page_range(unsigned long start, unsigned long end,
                             struct mm_walk *walk)
{
        int err = 0;
        struct vm_area_struct *vma = walk->vma;

        if (vma && is_vm_hugetlb_page(vma)) {
                if (walk->hugetlb_entry)
                        err = walk_hugetlb_range(start, end, walk);
        } else
                err = walk_pgd_range(start, end, walk);

        return err;
}

/**
 * walk_page_range - walk the page table with caller-specific callbacks
 *
 * Recursively walk the page table tree of the process represented by @walk->mm
 * within the virtual address range [@start, @end). During walking, we can do
 * some caller-specific work for each entry, by setting up pmd_entry(),
 * pte_entry(), and/or hugetlb_entry(). If you don't set up some of these
 * callbacks, the associated entries/pages are just ignored.
 * The return values of these callbacks are commonly defined like below:
 *  - 0  : the current entry was handled successfully; if the end address
 *         has not been reached yet, continue to walk.
 *  - >0 : the current entry was handled successfully; return to the caller
 *         with a caller-specific value.
 *  - <0 : handling the current entry failed; return to the caller
 *         with the error code.
 *
 * Before starting to walk the page table, some callers want to check whether
 * they really want to walk over the current vma, typically by checking
 * its vm_flags. walk_page_test() and @walk->test_walk() are used for this
 * purpose.
 *
 * struct mm_walk keeps current values of some common data like vma and pmd,
 * which are useful for the callbacks to access. If you want to pass some
 * caller-specific data to the callbacks, @walk->private should be helpful.
 *
 * Locking:
 *   Callers of walk_page_range() and walk_page_vma() should hold
 *   @walk->mm->mmap_sem, because these functions traverse the vma list
 *   and/or access the vma's data.
 */
int walk_page_range(unsigned long start, unsigned long end,
                    struct mm_walk *walk)
{
        int err = 0;
        unsigned long next;
        struct vm_area_struct *vma;

        if (start >= end)
                return -EINVAL;

        if (!walk->mm)
                return -EINVAL;

        VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm);

        vma = find_vma(walk->mm, start);
        do {
                if (!vma) { /* after the last vma */
                        walk->vma = NULL;
                        next = end;
                } else if (start < vma->vm_start) { /* outside vma */
                        walk->vma = NULL;
                        next = min(end, vma->vm_start);
                } else { /* inside vma */
                        walk->vma = vma;
                        next = min(end, vma->vm_end);
                        vma = vma->vm_next;

                        err = walk_page_test(start, next, walk);
                        if (err > 0) {
                                /*
                                 * Positive return values are purely for
                                 * controlling the pagewalk, so should never
                                 * be passed to the callers.
                                 */
                                err = 0;
                                continue;
                        }
                        if (err < 0)
                                break;
                }
                if (walk->vma || walk->pte_hole)
                        err = __walk_page_range(start, next, walk);
                if (err)
                        break;
        } while (start = next, start < end);
        return err;
}

int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk)
{
        int err;

        if (!walk->mm)
                return -EINVAL;

        VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
        VM_BUG_ON(!vma);
        walk->vma = vma;
        err = walk_page_test(vma->vm_start, vma->vm_end, walk);
        if (err > 0)
                return 0;
        if (err < 0)
                return err;
        return __walk_page_range(vma->vm_start, vma->vm_end, walk);
}
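
/*
 * Usage sketch, not part of this file: a minimal, hypothetical caller that
 * counts present ptes in a single vma.  count_pte_range() and
 * count_present_ptes() are illustrative names only; they rely solely on the
 * callback signature and struct mm_walk fields used above, and the caller is
 * assumed to already hold mmap_sem as required by the locking comment of
 * walk_page_range().
 */
static int count_pte_range(pte_t *pte, unsigned long addr,
                           unsigned long next, struct mm_walk *walk)
{
        unsigned long *count = walk->private;

        if (pte_present(*pte))
                (*count)++;
        return 0;       /* 0 means: keep walking */
}

static unsigned long __maybe_unused count_present_ptes(struct vm_area_struct *vma)
{
        unsigned long count = 0;
        struct mm_walk count_walk = {
                .pte_entry      = count_pte_range,
                .mm             = vma->vm_mm,
                .private        = &count,
        };

        /* walk_page_vma() fills in ->vma and runs walk_page_test() itself. */
        walk_page_vma(vma, &count_walk);
        return count;
}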