xref: /openbmc/linux/arch/s390/mm/pgalloc.c (revision 794fa52b)
// SPDX-License-Identifier: GPL-2.0
/*
 *  Page table allocation functions
 *
 *    Copyright IBM Corp. 2016
 *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sysctl.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
#include <asm/gmap.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>

#ifdef CONFIG_PGSTE

int page_table_allocate_pgste = 0;
EXPORT_SYMBOL(page_table_allocate_pgste);

static struct ctl_table page_table_sysctl[] = {
	{
		.procname	= "allocate_pgste",
		.data		= &page_table_allocate_pgste,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO | S_IWUSR,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
	{ }
};

static int __init page_table_register_sysctl(void)
{
	return register_sysctl("vm", page_table_sysctl) ? 0 : -ENOMEM;
}
__initcall(page_table_register_sysctl);

#endif /* CONFIG_PGSTE */

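/*
 * Region and segment (crst) tables are allocated as a higher-order block of
 * CRST_ALLOC_ORDER pages and marked as DAT table pages via
 * arch_set_page_dat() before being handed out.
 */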
unsigned long *crst_table_alloc(struct mm_struct *mm)
{
	struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, CRST_ALLOC_ORDER);

	if (!ptdesc)
		return NULL;
	arch_set_page_dat(ptdesc_page(ptdesc), CRST_ALLOC_ORDER);
	return (unsigned long *) ptdesc_to_virt(ptdesc);
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	if (!table)
		return;
	pagetable_free(virt_to_ptdesc(table));
}

static void __crst_table_upgrade(void *arg)
{
	struct mm_struct *mm = arg;

	/* change all active ASCEs to avoid the creation of new TLBs */
	if (current->active_mm == mm) {
		S390_lowcore.user_asce = mm->context.asce;
		__ctl_load(S390_lowcore.user_asce, 7, 7);
	}
	__tlb_flush_local();
}

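/*
 * Upgrade the page table hierarchy of @mm so that it can map addresses up
 * to @end: allocate and install new top-level region tables (3 -> 4 and/or
 * 4 -> 5 levels), update the ASCE, and make all CPUs currently running this
 * mm load the new ASCE via __crst_table_upgrade().
 */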
int crst_table_upgrade(struct mm_struct *mm, unsigned long end)
{
	unsigned long *pgd = NULL, *p4d = NULL, *__pgd;
	unsigned long asce_limit = mm->context.asce_limit;

	/* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */
	VM_BUG_ON(asce_limit < _REGION2_SIZE);

	if (end <= asce_limit)
		return 0;

	if (asce_limit == _REGION2_SIZE) {
		p4d = crst_table_alloc(mm);
		if (unlikely(!p4d))
			goto err_p4d;
		crst_table_init(p4d, _REGION2_ENTRY_EMPTY);
	}
	if (end > _REGION1_SIZE) {
		pgd = crst_table_alloc(mm);
		if (unlikely(!pgd))
			goto err_pgd;
		crst_table_init(pgd, _REGION1_ENTRY_EMPTY);
	}

	spin_lock_bh(&mm->page_table_lock);

	/*
	 * This routine gets called with mmap_lock held and there is no
	 * reason to optimize for the case where it is not. However, if
	 * that should ever change, the check below will let us know.
	 */
	VM_BUG_ON(asce_limit != mm->context.asce_limit);

	if (p4d) {
		__pgd = (unsigned long *) mm->pgd;
		p4d_populate(mm, (p4d_t *) p4d, (pud_t *) __pgd);
		mm->pgd = (pgd_t *) p4d;
		mm->context.asce_limit = _REGION1_SIZE;
		mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
			_ASCE_USER_BITS | _ASCE_TYPE_REGION2;
		mm_inc_nr_puds(mm);
	}
	if (pgd) {
		__pgd = (unsigned long *) mm->pgd;
		pgd_populate(mm, (pgd_t *) pgd, (p4d_t *) __pgd);
		mm->pgd = (pgd_t *) pgd;
		mm->context.asce_limit = TASK_SIZE_MAX;
		mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
			_ASCE_USER_BITS | _ASCE_TYPE_REGION1;
	}

	spin_unlock_bh(&mm->page_table_lock);

	on_each_cpu(__crst_table_upgrade, mm, 0);

	return 0;

err_pgd:
	crst_table_free(mm, p4d);
err_p4d:
	return -ENOMEM;
}

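/* Toggle @bits in @v and return the resulting value of the counter. */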
static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
{
	return atomic_fetch_xor(bits, v) ^ bits;
}

#ifdef CONFIG_PGSTE

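/*
 * PGSTE memory spaces (KVM guests) always use a full 4K page table: the
 * lower 2K hold the pte entries (initialized to _PAGE_INVALID), the upper
 * 2K hold the corresponding pgstes (initialized to zero).
 */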
struct page *page_table_alloc_pgste(struct mm_struct *mm)
{
	struct ptdesc *ptdesc;
	u64 *table;

	ptdesc = pagetable_alloc(GFP_KERNEL, 0);
	if (ptdesc) {
		table = (u64 *)ptdesc_to_virt(ptdesc);
		arch_set_page_dat(virt_to_page(table), 0);
		memset64(table, _PAGE_INVALID, PTRS_PER_PTE);
		memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
	}
	return ptdesc_page(ptdesc);
}

void page_table_free_pgste(struct page *page)
{
	pagetable_free(page_ptdesc(page));
}

#endif /* CONFIG_PGSTE */

/*
 * A 2KB-pgtable is either the upper or the lower half of a normal page.
 * The second half of the page may be unused or used as another
 * 2KB-pgtable.
 *
 * Whenever possible the parent page for a new 2KB-pgtable is picked
 * from the list of partially allocated pages mm_context_t::pgtable_list.
 * In case the list is empty a new parent page is allocated and added to
 * the list.
 *
 * When a parent page gets fully allocated it contains 2KB-pgtables in both
 * upper and lower halves and is removed from mm_context_t::pgtable_list.
 *
 * When a 2KB-pgtable is freed from a fully allocated parent page, that
 * page turns partially allocated and is added to mm_context_t::pgtable_list.
 *
 * If a 2KB-pgtable is freed from a partially allocated parent page, that
 * page turns unused and gets removed from mm_context_t::pgtable_list.
 * Furthermore, the unused parent page is released.
 *
 * As follows from the above, no unallocated or fully allocated parent
 * pages are contained in mm_context_t::pgtable_list.
 *
 * The upper byte (bits 24-31) of the parent page _refcount is used
 * for tracking contained 2KB-pgtables and has the following format:
 *
 *   PP  AA
 * 01234567    upper byte (bits 24-31) of struct page::_refcount
 *   ||  ||
 *   ||  |+--- upper 2KB-pgtable is allocated
 *   ||  +---- lower 2KB-pgtable is allocated
 *   |+------- upper 2KB-pgtable is pending for removal
 *   +-------- lower 2KB-pgtable is pending for removal
 *
 * (See commit 620b4e903179 ("s390: use _refcount for pgtables") on why
 * using _refcount is possible).
 *
 * When a 2KB-pgtable is allocated the corresponding AA bit is set to 1.
 * The parent page is either:
 *   - added to mm_context_t::pgtable_list in case the second half of the
 *     parent page is still unallocated;
 *   - removed from mm_context_t::pgtable_list in case both halves of the
 *     parent page are allocated;
 * These operations are protected with mm_context_t::lock.
 *
 * When a 2KB-pgtable is deallocated the corresponding AA bit is set to 0
 * and the corresponding PP bit is set to 1 in a single atomic operation.
 * Thus, PP and AA bits corresponding to the same 2KB-pgtable are mutually
 * exclusive and may never be both set to 1!
 * The parent page is either:
 *   - added to mm_context_t::pgtable_list in case the second half of the
 *     parent page is still allocated;
 *   - removed from mm_context_t::pgtable_list in case the second half of
 *     the parent page is unallocated;
 * These operations are protected with mm_context_t::lock.
 *
 * It is important to understand that mm_context_t::lock only protects
 * mm_context_t::pgtable_list and the AA bits, but not the parent page
 * itself and the PP bits.
 *
 * Releasing the parent page happens whenever the PP bit turns from 1 to 0,
 * while both AA bits and the second PP bit are already unset. Then the
 * parent page does not contain any 2KB-pgtable fragment anymore, and it has
 * also been removed from mm_context_t::pgtable_list. It is therefore safe
 * to release the page.
 *
 * PGSTE memory spaces use full 4KB-pgtables and do not need most of the
 * logic described above. Both AA bits are set to 1 to denote a 4KB-pgtable
 * while the PP bits are never used, nor is such a page added to or removed
 * from mm_context_t::pgtable_list.
 *
 * pte_free_defer() overrides those rules: it takes the page off pgtable_list,
 * and prevents both 2K fragments from being reused. pte_free_defer() has to
 * guarantee that its pgtable cannot be reused before the RCU grace period
 * has elapsed (which page_table_free_rcu() does not actually guarantee).
 * But for simplicity, because page->rcu_head overlays page->lru, and because
 * the RCU callback might not be called before the mm_context_t has been freed,
 * pte_free_defer() in this implementation prevents both fragments from being
 * reused, and delays making the call to RCU until both fragments are freed.
 */
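/*
 * Worked example (upper byte of _refcount, hex): a fresh page starts at
 * 0x00. page_table_alloc() takes the lower 2K half: 0x00 -> 0x01, and the
 * page is added to pgtable_list. A second allocation takes the upper half:
 * 0x01 -> 0x03, and the page is removed from the list. page_table_free()
 * of the lower half first flips 0x03 -> 0x12 (AA bit cleared, PP bit set)
 * and puts the page back on the list, then clears the pending bit:
 * 0x12 -> 0x02. Freeing the upper half flips 0x02 -> 0x20 and removes the
 * page from the list; the final 0x20 -> 0x00 transition releases the page.
 */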
unsigned long *page_table_alloc(struct mm_struct *mm)
{
	unsigned long *table;
	struct ptdesc *ptdesc;
	unsigned int mask, bit;

	/* Try to get a fragment of a 4K page as a 2K page table */
	if (!mm_alloc_pgste(mm)) {
		table = NULL;
		spin_lock_bh(&mm->context.lock);
		if (!list_empty(&mm->context.pgtable_list)) {
			ptdesc = list_first_entry(&mm->context.pgtable_list,
						struct ptdesc, pt_list);
			mask = atomic_read(&ptdesc->_refcount) >> 24;
			/*
			 * The pending removal bits must also be checked.
			 * Failure to do so might lead to an impossible
			 * value (e.g. 0x13 or 0x23) being written to
			 * _refcount. Such values violate the assumption
			 * that pending and allocation bits are mutually
			 * exclusive, and the rest of the code derails as
			 * a result. That could lead to a whole bunch of
			 * races and corruptions.
			 */
			mask = (mask | (mask >> 4)) & 0x03U;
			if (mask != 0x03U) {
				table = (unsigned long *) ptdesc_to_virt(ptdesc);
				bit = mask & 1;		/* =1 -> second 2K */
				if (bit)
					table += PTRS_PER_PTE;
				atomic_xor_bits(&ptdesc->_refcount,
							0x01U << (bit + 24));
				list_del_init(&ptdesc->pt_list);
			}
		}
		spin_unlock_bh(&mm->context.lock);
		if (table)
			return table;
	}
	/* Allocate a fresh page */
	ptdesc = pagetable_alloc(GFP_KERNEL, 0);
	if (!ptdesc)
		return NULL;
	if (!pagetable_pte_ctor(ptdesc)) {
		pagetable_free(ptdesc);
		return NULL;
	}
	arch_set_page_dat(ptdesc_page(ptdesc), 0);
	/* Initialize page table */
	table = (unsigned long *) ptdesc_to_virt(ptdesc);
	if (mm_alloc_pgste(mm)) {
		/* Return 4K page table with PGSTEs */
		INIT_LIST_HEAD(&ptdesc->pt_list);
		atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24);
		memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
		memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
	} else {
		/* Return the first 2K fragment of the page */
		atomic_xor_bits(&ptdesc->_refcount, 0x01U << 24);
		memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE);
		spin_lock_bh(&mm->context.lock);
		list_add(&ptdesc->pt_list, &mm->context.pgtable_list);
		spin_unlock_bh(&mm->context.lock);
	}
	return table;
}

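/*
 * Debugging aid (CONFIG_DEBUG_VM only): dump the page if, at release time,
 * allocation/pending bits are still set or the page is still linked into
 * pgtable_list.
 */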
static void page_table_release_check(struct page *page, void *table,
				     unsigned int half, unsigned int mask)
{
	char msg[128];

	if (!IS_ENABLED(CONFIG_DEBUG_VM))
		return;
	if (!mask && list_empty(&page->lru))
		return;
	snprintf(msg, sizeof(msg),
		 "Invalid pgtable %p release half 0x%02x mask 0x%02x",
		 table, half, mask);
	dump_page(page, msg);
}

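/*
 * Final release of a 4K page-table page; called directly or as an RCU
 * callback once the grace period has elapsed.
 */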
static void pte_free_now(struct rcu_head *head)
{
	struct ptdesc *ptdesc;

	ptdesc = container_of(head, struct ptdesc, pt_rcu_head);
	pagetable_pte_dtor(ptdesc);
	pagetable_free(ptdesc);
}

void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	unsigned int mask, bit, half;
	struct ptdesc *ptdesc = virt_to_ptdesc(table);

	if (!mm_alloc_pgste(mm)) {
		/* Free 2K page table fragment of a 4K page */
		bit = ((unsigned long) table & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
		spin_lock_bh(&mm->context.lock);
		/*
		 * Mark the page for delayed release. The actual release
		 * will happen outside of the critical section from this
		 * function or from __tlb_remove_table()
		 */
		mask = atomic_xor_bits(&ptdesc->_refcount, 0x11U << (bit + 24));
		mask >>= 24;
		if ((mask & 0x03U) && !folio_test_active(ptdesc_folio(ptdesc))) {
			/*
			 * Other half is allocated, and neither half has had
			 * its free deferred: add page to head of list, to make
			 * this freed half available for immediate reuse.
			 */
			list_add(&ptdesc->pt_list, &mm->context.pgtable_list);
		} else {
			/* If page is on list, now remove it. */
			list_del_init(&ptdesc->pt_list);
		}
		spin_unlock_bh(&mm->context.lock);
		mask = atomic_xor_bits(&ptdesc->_refcount, 0x10U << (bit + 24));
		mask >>= 24;
		if (mask != 0x00U)
			return;
		half = 0x01U << bit;
	} else {
		half = 0x03U;
		mask = atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24);
		mask >>= 24;
	}

	page_table_release_check(ptdesc_page(ptdesc), table, half, mask);
	if (folio_test_clear_active(ptdesc_folio(ptdesc)))
		call_rcu(&ptdesc->pt_rcu_head, pte_free_now);
	else
		pte_free_now(&ptdesc->pt_rcu_head);
}

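/*
 * Queue a page table for removal via the mmu_gather TLB batching code.
 * The fragment information is encoded in the low bits of the table address
 * (0x01/0x02 for a 2K half, 0x03 for a pgste table) and decoded again in
 * __tlb_remove_table() once the TLB has been flushed.
 */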
void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
			 unsigned long vmaddr)
{
	struct mm_struct *mm;
	unsigned int bit, mask;
	struct ptdesc *ptdesc = virt_to_ptdesc(table);

	mm = tlb->mm;
	if (mm_alloc_pgste(mm)) {
		gmap_unlink(mm, table, vmaddr);
		table = (unsigned long *) ((unsigned long)table | 0x03U);
		tlb_remove_ptdesc(tlb, table);
		return;
	}
	bit = ((unsigned long) table & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
	spin_lock_bh(&mm->context.lock);
	/*
	 * Mark the page for delayed release. The actual release will happen
	 * outside of the critical section from __tlb_remove_table() or from
	 * page_table_free()
	 */
	mask = atomic_xor_bits(&ptdesc->_refcount, 0x11U << (bit + 24));
	mask >>= 24;
	if ((mask & 0x03U) && !folio_test_active(ptdesc_folio(ptdesc))) {
		/*
		 * Other half is allocated, and neither half has had
		 * its free deferred: add page to end of list, to make
		 * this freed half available for reuse once its pending
		 * bit has been cleared by __tlb_remove_table().
		 */
		list_add_tail(&ptdesc->pt_list, &mm->context.pgtable_list);
	} else {
		/* If page is on list, now remove it. */
		list_del_init(&ptdesc->pt_list);
	}
	spin_unlock_bh(&mm->context.lock);
	table = (unsigned long *) ((unsigned long) table | (0x01U << bit));
	tlb_remove_table(tlb, table);
}

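/*
 * Called after the TLB flush: decode the low two bits of the table pointer
 * (0x00 crst table, 0x01/0x02 lower/upper 2K fragment, 0x03 pgste table)
 * and release the page once no fragment remains in use.
 */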
void __tlb_remove_table(void *_table)
{
	unsigned int mask = (unsigned long) _table & 0x03U, half = mask;
	void *table = (void *)((unsigned long) _table ^ mask);
	struct ptdesc *ptdesc = virt_to_ptdesc(table);

	switch (half) {
	case 0x00U:	/* pmd, pud, or p4d */
		pagetable_free(ptdesc);
		return;
	case 0x01U:	/* lower 2K of a 4K page table */
	case 0x02U:	/* higher 2K of a 4K page table */
		mask = atomic_xor_bits(&ptdesc->_refcount, mask << (4 + 24));
		mask >>= 24;
		if (mask != 0x00U)
			return;
		break;
	case 0x03U:	/* 4K page table with pgstes */
		mask = atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24);
		mask >>= 24;
		break;
	}

	page_table_release_check(ptdesc_page(ptdesc), table, half, mask);
	if (folio_test_clear_active(ptdesc_folio(ptdesc)))
		call_rcu(&ptdesc->pt_rcu_head, pte_free_now);
	else
		pte_free_now(&ptdesc->pt_rcu_head);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
{
	struct page *page;

	page = virt_to_page(pgtable);
	SetPageActive(page);
	page_table_free(mm, (unsigned long *)pgtable);
	/*
	 * page_table_free() does not do the pgste gmap_unlink() which
	 * page_table_free_rcu() does: warn us if pgste ever reaches here.
	 */
	WARN_ON_ONCE(mm_has_pgste(mm));
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * Base infrastructure required to generate basic asces, region, segment,
 * and page tables that do not make use of enhanced features like EDAT1.
 */

static struct kmem_cache *base_pgt_cache;

static unsigned long *base_pgt_alloc(void)
{
	unsigned long *table;

	table = kmem_cache_alloc(base_pgt_cache, GFP_KERNEL);
	if (table)
		memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
	return table;
}

static void base_pgt_free(unsigned long *table)
{
	kmem_cache_free(base_pgt_cache, table);
}

static unsigned long *base_crst_alloc(unsigned long val)
{
	unsigned long *table;
	struct ptdesc *ptdesc;

	ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, CRST_ALLOC_ORDER);
	if (!ptdesc)
		return NULL;
	table = ptdesc_address(ptdesc);

	crst_table_init(table, val);
	return table;
}

static void base_crst_free(unsigned long *table)
{
	if (!table)
		return;
	pagetable_free(virt_to_ptdesc(table));
}

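/*
 * BASE_ADDR_END_FUNC(NAME, SIZE) generates a helper that returns the end of
 * the current NAME-sized region or @end, whichever comes first. For example,
 * base_segment_addr_end(addr, end) rounds addr up to the next segment
 * boundary, clamped to end; the "- 1" comparison keeps the check correct
 * when next wraps around to 0 at the very top of the address space.
 */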
#define BASE_ADDR_END_FUNC(NAME, SIZE)					\
static inline unsigned long base_##NAME##_addr_end(unsigned long addr,	\
						   unsigned long end)	\
{									\
	unsigned long next = (addr + (SIZE)) & ~((SIZE) - 1);		\
									\
	return (next - 1) < (end - 1) ? next : end;			\
}

BASE_ADDR_END_FUNC(page,    _PAGE_SIZE)
BASE_ADDR_END_FUNC(segment, _SEGMENT_SIZE)
BASE_ADDR_END_FUNC(region3, _REGION3_SIZE)
BASE_ADDR_END_FUNC(region2, _REGION2_SIZE)
BASE_ADDR_END_FUNC(region1, _REGION1_SIZE)

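/*
 * LRA (load real address) translates a virtual address through the current
 * DAT tables and returns the corresponding real address.
 */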
static inline unsigned long base_lra(unsigned long address)
{
	unsigned long real;

	asm volatile(
		"	lra	%0,0(%1)\n"
		: "=d" (real) : "a" (address) : "cc");
	return real;
}

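/*
 * The base_*_walk() helpers share one pattern: with alloc != 0 they build
 * the table hierarchy for [addr, end) down to page tables whose entries are
 * filled with the real addresses obtained via base_lra(); with alloc == 0
 * they walk the existing hierarchy and free the lower-level tables again.
 */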
static int base_page_walk(unsigned long *origin, unsigned long addr,
			  unsigned long end, int alloc)
{
	unsigned long *pte, next;

	if (!alloc)
		return 0;
	pte = origin;
	pte += (addr & _PAGE_INDEX) >> _PAGE_SHIFT;
	do {
		next = base_page_addr_end(addr, end);
		*pte = base_lra(addr);
	} while (pte++, addr = next, addr < end);
	return 0;
}

static int base_segment_walk(unsigned long *origin, unsigned long addr,
			     unsigned long end, int alloc)
{
	unsigned long *ste, next, *table;
	int rc;

	ste = origin;
	ste += (addr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
	do {
		next = base_segment_addr_end(addr, end);
		if (*ste & _SEGMENT_ENTRY_INVALID) {
			if (!alloc)
				continue;
			table = base_pgt_alloc();
			if (!table)
				return -ENOMEM;
			*ste = __pa(table) | _SEGMENT_ENTRY;
		}
		table = __va(*ste & _SEGMENT_ENTRY_ORIGIN);
		rc = base_page_walk(table, addr, next, alloc);
		if (rc)
			return rc;
		if (!alloc)
			base_pgt_free(table);
		cond_resched();
	} while (ste++, addr = next, addr < end);
	return 0;
}

static int base_region3_walk(unsigned long *origin, unsigned long addr,
			     unsigned long end, int alloc)
{
	unsigned long *rtte, next, *table;
	int rc;

	rtte = origin;
	rtte += (addr & _REGION3_INDEX) >> _REGION3_SHIFT;
	do {
		next = base_region3_addr_end(addr, end);
		if (*rtte & _REGION_ENTRY_INVALID) {
			if (!alloc)
				continue;
			table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY);
			if (!table)
				return -ENOMEM;
			*rtte = __pa(table) | _REGION3_ENTRY;
		}
		table = __va(*rtte & _REGION_ENTRY_ORIGIN);
		rc = base_segment_walk(table, addr, next, alloc);
		if (rc)
			return rc;
		if (!alloc)
			base_crst_free(table);
	} while (rtte++, addr = next, addr < end);
	return 0;
}

static int base_region2_walk(unsigned long *origin, unsigned long addr,
			     unsigned long end, int alloc)
{
	unsigned long *rste, next, *table;
	int rc;

	rste = origin;
	rste += (addr & _REGION2_INDEX) >> _REGION2_SHIFT;
	do {
		next = base_region2_addr_end(addr, end);
		if (*rste & _REGION_ENTRY_INVALID) {
			if (!alloc)
				continue;
			table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
			if (!table)
				return -ENOMEM;
			*rste = __pa(table) | _REGION2_ENTRY;
		}
		table = __va(*rste & _REGION_ENTRY_ORIGIN);
		rc = base_region3_walk(table, addr, next, alloc);
		if (rc)
			return rc;
		if (!alloc)
			base_crst_free(table);
	} while (rste++, addr = next, addr < end);
	return 0;
}

static int base_region1_walk(unsigned long *origin, unsigned long addr,
			     unsigned long end, int alloc)
{
	unsigned long *rfte, next, *table;
	int rc;

	rfte = origin;
	rfte += (addr & _REGION1_INDEX) >> _REGION1_SHIFT;
	do {
		next = base_region1_addr_end(addr, end);
		if (*rfte & _REGION_ENTRY_INVALID) {
			if (!alloc)
				continue;
			table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
			if (!table)
				return -ENOMEM;
			*rfte = __pa(table) | _REGION1_ENTRY;
		}
		table = __va(*rfte & _REGION_ENTRY_ORIGIN);
		rc = base_region2_walk(table, addr, next, alloc);
		if (rc)
			return rc;
		if (!alloc)
			base_crst_free(table);
	} while (rfte++, addr = next, addr < end);
	return 0;
}

/**
 * base_asce_free - free asce and tables returned from base_asce_alloc()
 * @asce: asce to be freed
 *
 * Frees all region, segment, and page tables that were allocated with a
 * corresponding base_asce_alloc() call.
 */
void base_asce_free(unsigned long asce)
{
	unsigned long *table = __va(asce & _ASCE_ORIGIN);

	if (!asce)
		return;
	switch (asce & _ASCE_TYPE_MASK) {
	case _ASCE_TYPE_SEGMENT:
		base_segment_walk(table, 0, _REGION3_SIZE, 0);
		break;
	case _ASCE_TYPE_REGION3:
		base_region3_walk(table, 0, _REGION2_SIZE, 0);
		break;
	case _ASCE_TYPE_REGION2:
		base_region2_walk(table, 0, _REGION1_SIZE, 0);
		break;
	case _ASCE_TYPE_REGION1:
		base_region1_walk(table, 0, TASK_SIZE_MAX, 0);
		break;
	}
	base_crst_free(table);
}

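/*
 * Lazily create the kmem cache used for the 2K base page tables; a mutex
 * serializes concurrent first-time callers.
 */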
static int base_pgt_cache_init(void)
{
	static DEFINE_MUTEX(base_pgt_cache_mutex);
	unsigned long sz = _PAGE_TABLE_SIZE;

	if (base_pgt_cache)
		return 0;
	mutex_lock(&base_pgt_cache_mutex);
	if (!base_pgt_cache)
		base_pgt_cache = kmem_cache_create("base_pgt", sz, sz, 0, NULL);
	mutex_unlock(&base_pgt_cache_mutex);
	return base_pgt_cache ? 0 : -ENOMEM;
}

/**
 * base_asce_alloc - create kernel mapping without enhanced DAT features
 * @addr: virtual start address of kernel mapping
 * @num_pages: number of consecutive pages
 *
 * Generate an asce, including all required region, segment and page tables,
 * that can be used to access the virtual kernel mapping. The difference is
 * that the returned asce does not make use of any enhanced DAT features like
 * e.g. large pages. This is required for some I/O functions that pass an
 * asce, like e.g. some service call requests.
 *
 * Note: the returned asce may NEVER be attached to any cpu. It may only be
 *	 used for I/O requests. tlb entries that might result because the
 *	 asce was attached to a cpu won't be cleared.
 */
unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages)
{
	unsigned long asce, *table, end;
	int rc;

	if (base_pgt_cache_init())
		return 0;
	end = addr + num_pages * PAGE_SIZE;
	if (end <= _REGION3_SIZE) {
		table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY);
		if (!table)
			return 0;
		rc = base_segment_walk(table, addr, end, 1);
		asce = __pa(table) | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH;
	} else if (end <= _REGION2_SIZE) {
		table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
		if (!table)
			return 0;
		rc = base_region3_walk(table, addr, end, 1);
		asce = __pa(table) | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
	} else if (end <= _REGION1_SIZE) {
		table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
		if (!table)
			return 0;
		rc = base_region2_walk(table, addr, end, 1);
		asce = __pa(table) | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH;
	} else {
		table = base_crst_alloc(_REGION1_ENTRY_EMPTY);
		if (!table)
			return 0;
		rc = base_region1_walk(table, addr, end, 1);
		asce = __pa(table) | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH;
	}
	if (rc) {
		base_asce_free(asce);
		asce = 0;
	}
	return asce;
}