/*
 * arch/arm64/mm/hugetlbpage.c
 *
 * Copyright (C) 2013 Linaro Ltd.
 *
 * Based on arch/x86/mm/hugetlbpage.c.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/err.h>
#include <linux/sysctl.h>
#include <asm/mman.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/pgalloc.h>

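/*
 * Huge page migration is supported only for the sizes this file knows
 * how to map: a single block entry at the PMD or PUD level, or a
 * contiguous range of entries at the PTE or PMD level. Anything else
 * is rejected.
 */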
#ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
bool arch_hugetlb_migration_supported(struct hstate *h)
{
	size_t pagesize = huge_page_size(h);

	switch (pagesize) {
#ifdef CONFIG_ARM64_4K_PAGES
	case PUD_SIZE:
#endif
	case PMD_SIZE:
	case CONT_PMD_SIZE:
	case CONT_PTE_SIZE:
		return true;
	}
	pr_warn("%s: unrecognized huge page size 0x%lx\n",
			__func__, pagesize);
	return false;
}
#endif

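/*
 * A non-empty PMD/PUD entry is a huge (block) mapping if its table bit
 * is clear: bit 1 of a valid VMSAv8 descriptor distinguishes a block
 * entry from a pointer to a next-level table.
 */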
int pmd_huge(pmd_t pmd)
{
	return pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT);
}

int pud_huge(pud_t pud)
{
#ifndef __PAGETABLE_PMD_FOLDED
	return pud_val(pud) && !(pud_val(pud) & PUD_TABLE_BIT);
#else
	return 0;
#endif
}

/*
 * Select all bits except the pfn
 */
static inline pgprot_t pte_pgprot(pte_t pte)
{
	unsigned long pfn = pte_pfn(pte);

	return __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte));
}

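/*
 * Walk the page table just far enough to tell whether @ptep refers to
 * a contiguous range of PTEs or of PMDs: if @ptep turns out to be the
 * PMD entry for @addr, the mapping is a contiguous PMD range,
 * otherwise it is taken to be a contiguous PTE range.
 */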
static int find_num_contig(struct mm_struct *mm, unsigned long addr,
			   pte_t *ptep, size_t *pgsize)
{
	pgd_t *pgdp = pgd_offset(mm, addr);
	pud_t *pudp;
	pmd_t *pmdp;

	*pgsize = PAGE_SIZE;
	pudp = pud_offset(pgdp, addr);
	pmdp = pmd_offset(pudp, addr);
	if ((pte_t *)pmdp == ptep) {
		*pgsize = PMD_SIZE;
		return CONT_PMDS;
	}
	return CONT_PTES;
}

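/*
 * For a given huge page size, return the number of page table entries
 * making up the mapping and, via @pgsize, the size covered by each
 * entry. A single block mapping is one entry; a contiguous range is
 * CONT_PTES or CONT_PMDS entries.
 */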
static inline int num_contig_ptes(unsigned long size, size_t *pgsize)
{
	int contig_ptes = 0;

	*pgsize = size;

	switch (size) {
#ifdef CONFIG_ARM64_4K_PAGES
	case PUD_SIZE:
#endif
	case PMD_SIZE:
		contig_ptes = 1;
		break;
	case CONT_PMD_SIZE:
		*pgsize = PMD_SIZE;
		contig_ptes = CONT_PMDS;
		break;
	case CONT_PTE_SIZE:
		*pgsize = PAGE_SIZE;
		contig_ptes = CONT_PTES;
		break;
	}

	return contig_ptes;
}

/*
 * Changing some bits of contiguous entries requires us to follow a
 * Break-Before-Make approach, breaking the whole contiguous set
 * before we can change any entries. See ARM DDI 0487A.k_iss10775,
 * "Misprogramming of the Contiguous bit", page D4-1762.
 *
 * This helper performs the break step.
 */
static pte_t get_clear_flush(struct mm_struct *mm,
			     unsigned long addr,
			     pte_t *ptep,
			     unsigned long pgsize,
			     unsigned long ncontig)
{
	pte_t orig_pte = huge_ptep_get(ptep);
	bool valid = pte_valid(orig_pte);
	unsigned long i, saddr = addr;

	for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) {
		pte_t pte = ptep_get_and_clear(mm, addr, ptep);

		/*
		 * If HW_AFDBM is enabled, then the HW could turn on
		 * the dirty or accessed bit for any page in the set,
		 * so check them all.
		 */
		if (pte_dirty(pte))
			orig_pte = pte_mkdirty(orig_pte);

		if (pte_young(pte))
			orig_pte = pte_mkyoung(orig_pte);
	}

	if (valid) {
		struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
		flush_tlb_range(&vma, saddr, addr);
	}
	return orig_pte;
}

/*
 * Changing some bits of contiguous entries requires us to follow a
 * Break-Before-Make approach, breaking the whole contiguous set
 * before we can change any entries. See ARM DDI 0487A.k_iss10775,
 * "Misprogramming of the Contiguous bit", page D4-1762.
 *
 * This helper performs the break step for use cases where the
 * original pte is not needed.
 */
static void clear_flush(struct mm_struct *mm,
			unsigned long addr,
			pte_t *ptep,
			unsigned long pgsize,
			unsigned long ncontig)
{
	struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
	unsigned long i, saddr = addr;

	for (i = 0; i < ncontig; i++, addr += pgsize, ptep++)
		pte_clear(mm, addr, ptep);

	flush_tlb_range(&vma, saddr, addr);
}

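/*
 * Install a huge pte. A non-contiguous huge page is a single entry and
 * can be written directly; a contiguous range must first be broken
 * (cleared and flushed) and then repopulated entry by entry, stepping
 * the pfn forward by one entry's worth of pages each time.
 */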
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, pte_t pte)
{
	size_t pgsize;
	int i;
	int ncontig;
	unsigned long pfn, dpfn;
	pgprot_t hugeprot;

	/*
	 * Code needs to be expanded to handle huge swap and migration
	 * entries. Needed for HUGETLB and MEMORY_FAILURE.
	 */
	WARN_ON(!pte_present(pte));

	if (!pte_cont(pte)) {
		set_pte_at(mm, addr, ptep, pte);
		return;
	}

	ncontig = find_num_contig(mm, addr, ptep, &pgsize);
	pfn = pte_pfn(pte);
	dpfn = pgsize >> PAGE_SHIFT;
	hugeprot = pte_pgprot(pte);

	clear_flush(mm, addr, ptep, pgsize, ncontig);

	for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn)
		set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot));
}

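/*
 * Write a (non-present) swap or migration entry into every slot
 * covering the huge page. No TLB maintenance is done here: the entries
 * being written are never valid, so the hardware cannot have cached
 * them, and any break of the previous mapping is the caller's job.
 */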
void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
			  pte_t *ptep, pte_t pte, unsigned long sz)
{
	int i, ncontig;
	size_t pgsize;

	ncontig = num_contig_ptes(sz, &pgsize);

	for (i = 0; i < ncontig; i++, ptep++)
		set_pte(ptep, pte);
}

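/*
 * Allocate the page table levels needed for a huge mapping of @sz at
 * @addr and return a pointer to the entry (or the first entry of a
 * contiguous range) to populate:
 *
 *   PUD_SIZE      - the PUD entry itself
 *   CONT_PTE span - a PTE, after allocating the PMD and PTE levels
 *   PMD_SIZE      - the PMD entry (possibly shared via huge_pmd_share())
 *   CONT_PMD span - the first PMD entry of the range
 */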
pte_t *huge_pte_alloc(struct mm_struct *mm,
		      unsigned long addr, unsigned long sz)
{
	pgd_t *pgdp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep = NULL;

	pgdp = pgd_offset(mm, addr);
	pudp = pud_alloc(mm, pgdp, addr);
	if (!pudp)
		return NULL;

	if (sz == PUD_SIZE) {
		ptep = (pte_t *)pudp;
	} else if (sz == (PAGE_SIZE * CONT_PTES)) {
		pmdp = pmd_alloc(mm, pudp, addr);
		if (!pmdp)
			return NULL;

		WARN_ON(addr & (sz - 1));
		/*
		 * Note that if this code were ever ported to the
		 * 32-bit arm platform then it will cause trouble in
		 * the case where CONFIG_HIGHPTE is set, since there
		 * will be no pte_unmap() to correspond with this
		 * pte_alloc_map().
		 */
		ptep = pte_alloc_map(mm, pmdp, addr);
	} else if (sz == PMD_SIZE) {
		if (IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) &&
		    pud_none(READ_ONCE(*pudp)))
			ptep = huge_pmd_share(mm, addr, pudp);
		else
			ptep = (pte_t *)pmd_alloc(mm, pudp, addr);
	} else if (sz == (PMD_SIZE * CONT_PMDS)) {
		pmdp = pmd_alloc(mm, pudp, addr);
		WARN_ON(addr & (sz - 1));
		return (pte_t *)pmdp;
	}

	return ptep;
}

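/*
 * Look up the entry mapping @addr for a huge page of size @sz. A
 * pointer is returned both for present block mappings and for
 * non-present entries (swap/migration), so callers can inspect either;
 * NULL means no entry of the expected kind exists.
 */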
pte_t *huge_pte_offset(struct mm_struct *mm,
		       unsigned long addr, unsigned long sz)
{
	pgd_t *pgdp;
	pud_t *pudp, pud;
	pmd_t *pmdp, pmd;

	pgdp = pgd_offset(mm, addr);
	if (!pgd_present(READ_ONCE(*pgdp)))
		return NULL;

	pudp = pud_offset(pgdp, addr);
	pud = READ_ONCE(*pudp);
	if (sz != PUD_SIZE && pud_none(pud))
		return NULL;
	/* hugepage or swap? */
	if (pud_huge(pud) || !pud_present(pud))
		return (pte_t *)pudp;
	/* table; check the next level */

	if (sz == CONT_PMD_SIZE)
		addr &= CONT_PMD_MASK;

	pmdp = pmd_offset(pudp, addr);
	pmd = READ_ONCE(*pmdp);
	if (!(sz == PMD_SIZE || sz == CONT_PMD_SIZE) &&
	    pmd_none(pmd))
		return NULL;
	if (pmd_huge(pmd) || !pmd_present(pmd))
		return (pte_t *)pmdp;

	if (sz == CONT_PTE_SIZE)
		return pte_offset_kernel(pmdp, (addr & CONT_PTE_MASK));

	return NULL;
}

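/*
 * The only architecture-specific fixup a huge pte needs is the
 * contiguous bit, for the two contiguous hugepage sizes; block
 * mappings pass through unchanged.
 */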
pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
			 struct page *page, int writable)
{
	size_t pagesize = huge_page_size(hstate_vma(vma));

	if (pagesize == CONT_PTE_SIZE) {
		entry = pte_mkcont(entry);
	} else if (pagesize == CONT_PMD_SIZE) {
		entry = pmd_pte(pmd_mkcont(pte_pmd(entry)));
	} else if (pagesize != PUD_SIZE && pagesize != PMD_SIZE) {
		pr_warn("%s: unrecognized huge page size 0x%lx\n",
			__func__, pagesize);
	}
	return entry;
}

void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
		    pte_t *ptep, unsigned long sz)
{
	int i, ncontig;
	size_t pgsize;

	ncontig = num_contig_ptes(sz, &pgsize);

	for (i = 0; i < ncontig; i++, addr += pgsize, ptep++)
		pte_clear(mm, addr, ptep);
}

pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
			      unsigned long addr, pte_t *ptep)
{
	int ncontig;
	size_t pgsize;
	pte_t orig_pte = huge_ptep_get(ptep);

	if (!pte_cont(orig_pte))
		return ptep_get_and_clear(mm, addr, ptep);

	ncontig = find_num_contig(mm, addr, ptep, &pgsize);

	return get_clear_flush(mm, addr, ptep, pgsize, ncontig);
}

/*
 * huge_ptep_set_access_flags will update access flags (dirty, accessed)
 * and write permission.
 *
 * For a contiguous huge pte range, write permission only needs to be
 * checked on the first pte of the set. Then, for every pte in the
 * range, we check whether the dirty or young state differs from the
 * new value.
 */
static int __cont_access_flags_changed(pte_t *ptep, pte_t pte, int ncontig)
{
	int i;

	if (pte_write(pte) != pte_write(huge_ptep_get(ptep)))
		return 1;

	for (i = 0; i < ncontig; i++) {
		pte_t orig_pte = huge_ptep_get(ptep + i);

		if (pte_dirty(pte) != pte_dirty(orig_pte))
			return 1;

		if (pte_young(pte) != pte_young(orig_pte))
			return 1;
	}

	return 0;
}

int huge_ptep_set_access_flags(struct vm_area_struct *vma,
			       unsigned long addr, pte_t *ptep,
			       pte_t pte, int dirty)
{
	int ncontig, i;
	size_t pgsize = 0;
	unsigned long pfn = pte_pfn(pte), dpfn;
	pgprot_t hugeprot;
	pte_t orig_pte;

	if (!pte_cont(pte))
		return ptep_set_access_flags(vma, addr, ptep, pte, dirty);

	ncontig = find_num_contig(vma->vm_mm, addr, ptep, &pgsize);
	dpfn = pgsize >> PAGE_SHIFT;

	if (!__cont_access_flags_changed(ptep, pte, ncontig))
		return 0;

	orig_pte = get_clear_flush(vma->vm_mm, addr, ptep, pgsize, ncontig);

	/* Make sure we don't lose the dirty or young state */
	if (pte_dirty(orig_pte))
		pte = pte_mkdirty(pte);

	if (pte_young(orig_pte))
		pte = pte_mkyoung(pte);

	hugeprot = pte_pgprot(pte);
	for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn)
		set_pte_at(vma->vm_mm, addr, ptep, pfn_pte(pfn, hugeprot));

	return 1;
}

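/*
 * Write-protecting a contiguous range is a break-before-make sequence
 * as well: clear and flush the whole set, drop write permission from
 * the accumulated pte, then rewrite every entry.
 */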
void huge_ptep_set_wrprotect(struct mm_struct *mm,
			     unsigned long addr, pte_t *ptep)
{
	unsigned long pfn, dpfn;
	pgprot_t hugeprot;
	int ncontig, i;
	size_t pgsize;
	pte_t pte;

	if (!pte_cont(READ_ONCE(*ptep))) {
		ptep_set_wrprotect(mm, addr, ptep);
		return;
	}

	ncontig = find_num_contig(mm, addr, ptep, &pgsize);
	dpfn = pgsize >> PAGE_SHIFT;

	pte = get_clear_flush(mm, addr, ptep, pgsize, ncontig);
	pte = pte_wrprotect(pte);

	hugeprot = pte_pgprot(pte);
	pfn = pte_pfn(pte);

	for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn)
		set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot));
}

void huge_ptep_clear_flush(struct vm_area_struct *vma,
			   unsigned long addr, pte_t *ptep)
{
	size_t pgsize;
	int ncontig;

	if (!pte_cont(READ_ONCE(*ptep))) {
		ptep_clear_flush(vma, addr, ptep);
		return;
	}

	ncontig = find_num_contig(vma->vm_mm, addr, ptep, &pgsize);
	clear_flush(vma->vm_mm, addr, ptep, pgsize, ncontig);
}

static void __init add_huge_page_size(unsigned long size)
{
	if (size_to_hstate(size))
		return;

	hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
}

static int __init hugetlbpage_init(void)
{
#ifdef CONFIG_ARM64_4K_PAGES
	add_huge_page_size(PUD_SIZE);
#endif
	add_huge_page_size(PMD_SIZE * CONT_PMDS);
	add_huge_page_size(PMD_SIZE);
	add_huge_page_size(PAGE_SIZE * CONT_PTES);

	return 0;
}
arch_initcall(hugetlbpage_init);

static __init int setup_hugepagesz(char *opt)
{
	unsigned long ps = memparse(opt, &opt);

	switch (ps) {
#ifdef CONFIG_ARM64_4K_PAGES
	case PUD_SIZE:
#endif
	case PMD_SIZE * CONT_PMDS:
	case PMD_SIZE:
	case PAGE_SIZE * CONT_PTES:
		add_huge_page_size(ps);
		return 1;
	}

	hugetlb_bad_size();
	pr_err("hugepagesz: Unsupported page size %lu K\n", ps >> 10);
	return 0;
}
__setup("hugepagesz=", setup_hugepagesz);
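
/*
 * The accepted "hugepagesz=" values follow the sizes registered in
 * hugetlbpage_init() above. For example, with a 4K base page size
 * (and the usual CONT_PTES/CONT_PMDS of 16) the kernel command line
 * can request any of:
 *
 *   hugepagesz=64K hugepagesz=2M hugepagesz=32M hugepagesz=1G
 */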