// SPDX-License-Identifier: GPL-2.0
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/hugetlb.h>

/*
 * We want to know the real level where an entry is located, ignoring any
 * folding of levels which may be happening. For example, if p4d is folded then
 * a missing entry found at level 1 (p4d) is actually at level 0 (pgd).
 */
static int real_depth(int depth)
{
        if (depth == 3 && PTRS_PER_PMD == 1)
                depth = 2;
        if (depth == 2 && PTRS_PER_PUD == 1)
                depth = 1;
        if (depth == 1 && PTRS_PER_P4D == 1)
                depth = 0;
        return depth;
}

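/*
 * Invoke ops->pte_entry() on each PTE covering [addr, end). The caller is
 * responsible for mapping (and, where needed, locking) the PTE page that
 * @pte points into.
 */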
static int walk_pte_range_inner(pte_t *pte, unsigned long addr,
                                unsigned long end, struct mm_walk *walk)
{
        const struct mm_walk_ops *ops = walk->ops;
        int err = 0;

        for (;;) {
                err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
                if (err)
                        break;
                if (addr >= end - PAGE_SIZE)
                        break;
                addr += PAGE_SIZE;
                pte++;
        }
        return err;
}

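/*
 * Map the PTE page under @pmd and hand it to walk_pte_range_inner(). A
 * normal VMA walk takes the PTE lock; a no_vma (kernel or firmware) walk
 * runs unlocked. If the PTE page cannot be mapped, request a retry via
 * ACTION_AGAIN.
 */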
static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pte_t *pte;
        int err = 0;
        spinlock_t *ptl;

        if (walk->no_vma) {
                /*
                 * pte_offset_map() might apply user-specific validation.
                 * Indeed, on x86_64 the pmd entries set up by init_espfix_ap()
                 * fit its pmd_bad() check (_PAGE_NX set and _PAGE_RW clear),
                 * and CONFIG_EFI_PGT_DUMP efi_mm goes so far as to walk them.
                 */
                if (walk->mm == &init_mm || addr >= TASK_SIZE)
                        pte = pte_offset_kernel(pmd, addr);
                else
                        pte = pte_offset_map(pmd, addr);
                if (pte) {
                        err = walk_pte_range_inner(pte, addr, end, walk);
                        if (walk->mm != &init_mm && addr < TASK_SIZE)
                                pte_unmap(pte);
                }
        } else {
                pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
                if (pte) {
                        err = walk_pte_range_inner(pte, addr, end, walk);
                        pte_unmap_unlock(pte, ptl);
                }
        }
        if (!pte)
                walk->action = ACTION_AGAIN;
        return err;
}

#ifdef CONFIG_ARCH_HAS_HUGEPD
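/*
 * Walk a hugepage directory entry, invoking ops->pte_entry() once per huge
 * page it maps. Each entry is looked up and handed to the callback with
 * mm->page_table_lock held.
 */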
static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr,
                             unsigned long end, struct mm_walk *walk, int pdshift)
{
        int err = 0;
        const struct mm_walk_ops *ops = walk->ops;
        int shift = hugepd_shift(*phpd);
        int page_size = 1 << shift;

        if (!ops->pte_entry)
                return 0;

        if (addr & (page_size - 1))
                return 0;

        for (;;) {
                pte_t *pte;

                spin_lock(&walk->mm->page_table_lock);
                pte = hugepte_offset(*phpd, addr, pdshift);
                err = ops->pte_entry(pte, addr, addr + page_size, walk);
                spin_unlock(&walk->mm->page_table_lock);

                if (err)
                        break;
                if (addr >= end - page_size)
                        break;
                addr += page_size;
        }
        return err;
}
#else
static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr,
                             unsigned long end, struct mm_walk *walk, int pdshift)
{
        return 0;
}
#endif

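/*
 * Walk the PMD entries covering [addr, end): report holes via ops->pte_hole(),
 * call ops->pmd_entry() on populated entries and, unless the callback asked to
 * skip the subtree, descend to the PTE (or hugepd) level, splitting huge PMDs
 * when a vma is present.
 */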
static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pmd_t *pmd;
        unsigned long next;
        const struct mm_walk_ops *ops = walk->ops;
        int err = 0;
        int depth = real_depth(3);

        pmd = pmd_offset(pud, addr);
        do {
again:
                next = pmd_addr_end(addr, end);
                if (pmd_none(*pmd)) {
                        if (ops->pte_hole)
                                err = ops->pte_hole(addr, next, depth, walk);
                        if (err)
                                break;
                        continue;
                }

                walk->action = ACTION_SUBTREE;

                /*
                 * This implies that each ->pmd_entry() handler
                 * needs to know about pmd_trans_huge() pmds
                 */
                if (ops->pmd_entry)
                        err = ops->pmd_entry(pmd, addr, next, walk);
                if (err)
                        break;

                if (walk->action == ACTION_AGAIN)
                        goto again;

                /*
                 * Check this here so we only break down trans_huge
                 * pages when we _need_ to
                 */
                if ((!walk->vma && (pmd_leaf(*pmd) || !pmd_present(*pmd))) ||
                    walk->action == ACTION_CONTINUE ||
                    !(ops->pte_entry))
                        continue;

                if (walk->vma)
                        split_huge_pmd(walk->vma, pmd, addr);

                if (is_hugepd(__hugepd(pmd_val(*pmd))))
                        err = walk_hugepd_range((hugepd_t *)pmd, addr, next, walk, PMD_SHIFT);
                else
                        err = walk_pte_range(pmd, addr, next, walk);
                if (err)
                        break;

                if (walk->action == ACTION_AGAIN)
                        goto again;

        } while (pmd++, addr = next, addr != end);

        return err;
}

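/*
 * Walk the PUD entries covering [addr, end): report holes, call
 * ops->pud_entry() on populated entries and descend to the PMD (or hugepd)
 * level unless the callback requested otherwise, splitting huge PUDs when a
 * vma is present.
 */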
static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pud_t *pud;
        unsigned long next;
        const struct mm_walk_ops *ops = walk->ops;
        int err = 0;
        int depth = real_depth(2);

        pud = pud_offset(p4d, addr);
        do {
again:
                next = pud_addr_end(addr, end);
                if (pud_none(*pud)) {
                        if (ops->pte_hole)
                                err = ops->pte_hole(addr, next, depth, walk);
                        if (err)
                                break;
                        continue;
                }

                walk->action = ACTION_SUBTREE;

                if (ops->pud_entry)
                        err = ops->pud_entry(pud, addr, next, walk);
                if (err)
                        break;

                if (walk->action == ACTION_AGAIN)
                        goto again;

                if ((!walk->vma && (pud_leaf(*pud) || !pud_present(*pud))) ||
                    walk->action == ACTION_CONTINUE ||
                    !(ops->pmd_entry || ops->pte_entry))
                        continue;

                if (walk->vma)
                        split_huge_pud(walk->vma, pud, addr);
                if (pud_none(*pud))
                        goto again;

                if (is_hugepd(__hugepd(pud_val(*pud))))
                        err = walk_hugepd_range((hugepd_t *)pud, addr, next, walk, PUD_SHIFT);
                else
                        err = walk_pmd_range(pud, addr, next, walk);
                if (err)
                        break;
        } while (pud++, addr = next, addr != end);

        return err;
}

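/*
 * Walk the P4D entries covering [addr, end), reporting holes and descending
 * to the PUD level whenever a lower-level callback is installed.
 */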
static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        p4d_t *p4d;
        unsigned long next;
        const struct mm_walk_ops *ops = walk->ops;
        int err = 0;
        int depth = real_depth(1);

        p4d = p4d_offset(pgd, addr);
        do {
                next = p4d_addr_end(addr, end);
                if (p4d_none_or_clear_bad(p4d)) {
                        if (ops->pte_hole)
                                err = ops->pte_hole(addr, next, depth, walk);
                        if (err)
                                break;
                        continue;
                }
                if (ops->p4d_entry) {
                        err = ops->p4d_entry(p4d, addr, next, walk);
                        if (err)
                                break;
                }
                if (is_hugepd(__hugepd(p4d_val(*p4d))))
                        err = walk_hugepd_range((hugepd_t *)p4d, addr, next, walk, P4D_SHIFT);
                else if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
                        err = walk_pud_range(p4d, addr, next, walk);
                if (err)
                        break;
        } while (p4d++, addr = next, addr != end);

        return err;
}

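/*
 * Top of the walk: iterate over the PGD entries covering [addr, end), using
 * walk->pgd if one was supplied and walk->mm's page table otherwise, and
 * descend whenever a lower-level callback is installed.
 */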
static int walk_pgd_range(unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pgd_t *pgd;
        unsigned long next;
        const struct mm_walk_ops *ops = walk->ops;
        int err = 0;

        if (walk->pgd)
                pgd = walk->pgd + pgd_index(addr);
        else
                pgd = pgd_offset(walk->mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd)) {
                        if (ops->pte_hole)
                                err = ops->pte_hole(addr, next, 0, walk);
                        if (err)
                                break;
                        continue;
                }
                if (ops->pgd_entry) {
                        err = ops->pgd_entry(pgd, addr, next, walk);
                        if (err)
                                break;
                }
                if (is_hugepd(__hugepd(pgd_val(*pgd))))
                        err = walk_hugepd_range((hugepd_t *)pgd, addr, next, walk, PGDIR_SHIFT);
                else if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry || ops->pte_entry)
                        err = walk_p4d_range(pgd, addr, next, walk);
                if (err)
                        break;
        } while (pgd++, addr = next, addr != end);

        return err;
}

#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
                                       unsigned long end)
{
        unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
        return boundary < end ? boundary : end;
}

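/*
 * Walk a hugetlb vma one huge page at a time with the hugetlb vma lock held
 * for read, calling ops->hugetlb_entry() for present entries and
 * ops->pte_hole() for unmapped ranges.
 */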
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
                              struct mm_walk *walk)
{
        struct vm_area_struct *vma = walk->vma;
        struct hstate *h = hstate_vma(vma);
        unsigned long next;
        unsigned long hmask = huge_page_mask(h);
        unsigned long sz = huge_page_size(h);
        pte_t *pte;
        const struct mm_walk_ops *ops = walk->ops;
        int err = 0;

        hugetlb_vma_lock_read(vma);
        do {
                next = hugetlb_entry_end(h, addr, end);
                pte = hugetlb_walk(vma, addr & hmask, sz);
                if (pte)
                        err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
                else if (ops->pte_hole)
                        err = ops->pte_hole(addr, next, -1, walk);
                if (err)
                        break;
        } while (addr = next, addr != end);
        hugetlb_vma_unlock_read(vma);

        return err;
}

#else /* CONFIG_HUGETLB_PAGE */
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
                              struct mm_walk *walk)
{
        return 0;
}

#endif /* CONFIG_HUGETLB_PAGE */

/*
 * Decide whether we really walk over the current vma on [@start, @end)
 * or skip it via the returned value. Return 0 if we do walk over the
 * current vma, and return 1 if we skip the vma. Negative values mean
 * error, in which case we abort the current walk.
 */
static int walk_page_test(unsigned long start, unsigned long end,
                          struct mm_walk *walk)
{
        struct vm_area_struct *vma = walk->vma;
        const struct mm_walk_ops *ops = walk->ops;

        if (ops->test_walk)
                return ops->test_walk(start, end, walk);

        /*
         * A vma with VM_PFNMAP doesn't have any valid struct pages behind its
         * range, so we don't walk over it as we do for normal vmas. However,
         * some callers are interested in handling hole ranges and they don't
         * want to just ignore any single address range. Such users certainly
         * define their ->pte_hole() callbacks, so let's delegate the handling
         * of vma(VM_PFNMAP) to them.
         */
        if (vma->vm_flags & VM_PFNMAP) {
                int err = 1;

                if (ops->pte_hole)
                        err = ops->pte_hole(start, end, -1, walk);
                return err ? err : 1;
        }
        return 0;
}

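/*
 * Walk [start, end) within walk->vma: run the optional pre_vma()/post_vma()
 * hooks around the walk and dispatch hugetlb vmas to walk_hugetlb_range().
 */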
static int __walk_page_range(unsigned long start, unsigned long end,
                             struct mm_walk *walk)
{
        int err = 0;
        struct vm_area_struct *vma = walk->vma;
        const struct mm_walk_ops *ops = walk->ops;

        if (ops->pre_vma) {
                err = ops->pre_vma(start, end, walk);
                if (err)
                        return err;
        }

        if (is_vm_hugetlb_page(vma)) {
                if (ops->hugetlb_entry)
                        err = walk_hugetlb_range(start, end, walk);
        } else
                err = walk_pgd_range(start, end, walk);

        if (ops->post_vma)
                ops->post_vma(walk);

        return err;
}

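/*
 * Assert that mmap_lock is held as required by @walk_lock: any lock for
 * PGWALK_RDLOCK, the write lock otherwise.
 */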
static inline void process_mm_walk_lock(struct mm_struct *mm,
                                        enum page_walk_lock walk_lock)
{
        if (walk_lock == PGWALK_RDLOCK)
                mmap_assert_locked(mm);
        else
                mmap_assert_write_locked(mm);
}

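/*
 * With CONFIG_PER_VMA_LOCK, write-lock the vma (PGWALK_WRLOCK) or assert that
 * it is already write-locked (PGWALK_WRLOCK_VERIFY) before walking it;
 * PGWALK_RDLOCK walks need no per-vma action.
 */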
static inline void process_vma_walk_lock(struct vm_area_struct *vma,
                                         enum page_walk_lock walk_lock)
{
#ifdef CONFIG_PER_VMA_LOCK
        switch (walk_lock) {
        case PGWALK_WRLOCK:
                vma_start_write(vma);
                break;
        case PGWALK_WRLOCK_VERIFY:
                vma_assert_write_locked(vma);
                break;
        case PGWALK_RDLOCK:
                /* PGWALK_RDLOCK is handled by process_mm_walk_lock */
                break;
        }
#endif
}

/**
 * walk_page_range - walk page table with caller specific callbacks
 * @mm: mm_struct representing the target process of page table walk
 * @start: start address of the virtual address range
 * @end: end address of the virtual address range
 * @ops: operation to call during the walk
 * @private: private data for callbacks' usage
 *
 * Recursively walk the page table tree of the process represented by @mm
 * within the virtual address range [@start, @end). During walking, we can do
 * some caller-specific work for each entry, by setting up pmd_entry(),
 * pte_entry(), and/or hugetlb_entry(). If you don't set up some of these
 * callbacks, the associated entries/pages are just ignored.
 * The return values of these callbacks are commonly defined as follows:
 *
 *  - 0  : the current entry was handled successfully; if the end address has
 *         not been reached yet, the walk continues.
 *  - >0 : the current entry was handled successfully; the walk stops and this
 *         caller-specific value is returned.
 *  - <0 : handling the current entry failed; the walk stops and this error
 *         code is returned to the caller.
 *
 * Before starting to walk the page tables, some callers want to check whether
 * they really want to walk over the current vma, typically by checking
 * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
 * purpose.
 *
 * If operations need to be staged before and committed after a vma is walked,
 * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(),
 * since it is intended to handle commit-type operations, can't return any
 * errors.
 *
 * struct mm_walk keeps current values of some common data like vma and pmd,
 * which are useful for access from the callbacks. If you want to pass some
 * caller-specific data to the callbacks, @private should be helpful.
 *
 * Locking:
 *   Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_lock,
 *   because these functions traverse the vma list and/or access the vma's data.
 */
int walk_page_range(struct mm_struct *mm, unsigned long start,
                unsigned long end, const struct mm_walk_ops *ops,
                void *private)
{
        int err = 0;
        unsigned long next;
        struct vm_area_struct *vma;
        struct mm_walk walk = {
                .ops = ops,
                .mm = mm,
                .private = private,
        };

        if (start >= end)
                return -EINVAL;

        if (!walk.mm)
                return -EINVAL;

        process_mm_walk_lock(walk.mm, ops->walk_lock);

        vma = find_vma(walk.mm, start);
        do {
                if (!vma) { /* after the last vma */
                        walk.vma = NULL;
                        next = end;
                        if (ops->pte_hole)
                                err = ops->pte_hole(start, next, -1, &walk);
                } else if (start < vma->vm_start) { /* outside vma */
                        walk.vma = NULL;
                        next = min(end, vma->vm_start);
                        if (ops->pte_hole)
                                err = ops->pte_hole(start, next, -1, &walk);
                } else { /* inside vma */
                        process_vma_walk_lock(vma, ops->walk_lock);
                        walk.vma = vma;
                        next = min(end, vma->vm_end);
                        vma = find_vma(mm, vma->vm_end);

                        err = walk_page_test(start, next, &walk);
                        if (err > 0) {
                                /*
                                 * positive return values are purely for
                                 * controlling the pagewalk, so should never
                                 * be passed to the callers.
                                 */
                                err = 0;
                                continue;
                        }
                        if (err < 0)
                                break;
                        err = __walk_page_range(start, next, &walk);
                }
                if (err)
                        break;
        } while (start = next, start < end);
        return err;
}

/**
 * walk_page_range_novma - walk a range of pagetables not backed by a vma
 * @mm: mm_struct representing the target process of page table walk
 * @start: start address of the virtual address range
 * @end: end address of the virtual address range
 * @ops: operation to call during the walk
 * @pgd: pgd to walk if different from mm->pgd
 * @private: private data for callbacks' usage
 *
 * Similar to walk_page_range() but can walk any page tables even if they are
 * not backed by VMAs. Because 'unusual' entries may be walked this function
 * will also not lock the PTEs for the pte_entry() callback. This is useful for
 * walking the kernel page tables or page tables for firmware.
 */
int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
                          unsigned long end, const struct mm_walk_ops *ops,
                          pgd_t *pgd,
                          void *private)
{
        struct mm_walk walk = {
                .ops = ops,
                .mm = mm,
                .pgd = pgd,
                .private = private,
                .no_vma = true
        };

        if (start >= end || !walk.mm)
                return -EINVAL;

        mmap_assert_write_locked(walk.mm);

        return walk_pgd_range(start, end, &walk);
}

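/*
 * Walk the page tables of a single vma, restricted to [start, end), which
 * must lie entirely within the vma. Locking requirements match
 * walk_page_range().
 */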
int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
                        unsigned long end, const struct mm_walk_ops *ops,
                        void *private)
{
        struct mm_walk walk = {
                .ops = ops,
                .mm = vma->vm_mm,
                .vma = vma,
                .private = private,
        };

        if (start >= end || !walk.mm)
                return -EINVAL;
        if (start < vma->vm_start || end > vma->vm_end)
                return -EINVAL;

        process_mm_walk_lock(walk.mm, ops->walk_lock);
        process_vma_walk_lock(vma, ops->walk_lock);
        return __walk_page_range(start, end, &walk);
}

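/*
 * Walk all of the page tables backing a single vma, from vma->vm_start to
 * vma->vm_end. Locking requirements match walk_page_range().
 */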
int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
                void *private)
{
        struct mm_walk walk = {
                .ops = ops,
                .mm = vma->vm_mm,
                .vma = vma,
                .private = private,
        };

        if (!walk.mm)
                return -EINVAL;

        process_mm_walk_lock(walk.mm, ops->walk_lock);
        process_vma_walk_lock(vma, ops->walk_lock);
        return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
}

/**
 * walk_page_mapping - walk all memory areas mapped into a struct address_space.
 * @mapping: Pointer to the struct address_space
 * @first_index: First page offset in the address_space
 * @nr: Number of incremental page offsets to cover
 * @ops: operation to call during the walk
 * @private: private data for callbacks' usage
 *
 * This function walks all memory areas mapped into a struct address_space.
 * The walk is limited to only the given page-size index range, but if
 * the index boundaries cross a huge page-table entry, that entry will be
 * included.
 *
 * Also see walk_page_range() for additional information.
 *
 * Locking:
 *   This function can't require that the struct mm_struct::mmap_lock is held,
 *   since @mapping may be mapped by multiple processes. Instead
 *   @mapping->i_mmap_rwsem must be held. This might have implications in the
 *   callbacks, and it's up to the caller to ensure that the
 *   struct mm_struct::mmap_lock is not needed.
 *
 *   Also this means that a caller can't rely on the struct
 *   vm_area_struct::vm_flags to be constant across a call,
 *   except for immutable flags. Callers requiring this shouldn't use
 *   this function.
 *
 * Return: 0 on success, negative error code on failure, positive number on
 * caller defined premature termination.
 */
int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
                      pgoff_t nr, const struct mm_walk_ops *ops,
                      void *private)
{
        struct mm_walk walk = {
                .ops = ops,
                .private = private,
        };
        struct vm_area_struct *vma;
        pgoff_t vba, vea, cba, cea;
        unsigned long start_addr, end_addr;
        int err = 0;

        lockdep_assert_held(&mapping->i_mmap_rwsem);
        vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index,
                                  first_index + nr - 1) {
                /* Clip to the vma */
                vba = vma->vm_pgoff;
                vea = vba + vma_pages(vma);
                cba = first_index;
                cba = max(cba, vba);
                cea = first_index + nr;
                cea = min(cea, vea);

                start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start;
                end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start;
                if (start_addr >= end_addr)
                        continue;

                walk.vma = vma;
                walk.mm = vma->vm_mm;

                err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
                if (err > 0) {
                        err = 0;
                        break;
                } else if (err < 0)
                        break;

                err = __walk_page_range(start_addr, end_addr, &walk);
                if (err)
                        break;
        }

        return err;
}