1b2441318SGreg Kroah-Hartman // SPDX-License-Identifier: GPL-2.0
21e133ab2SMartin Schwidefsky /*
31e133ab2SMartin Schwidefsky * Page table allocation functions
41e133ab2SMartin Schwidefsky *
51e133ab2SMartin Schwidefsky * Copyright IBM Corp. 2016
61e133ab2SMartin Schwidefsky * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
71e133ab2SMartin Schwidefsky */
81e133ab2SMartin Schwidefsky
91e133ab2SMartin Schwidefsky #include <linux/sysctl.h>
101caf170dSHeiko Carstens #include <linux/slab.h>
111caf170dSHeiko Carstens #include <linux/mm.h>
121e133ab2SMartin Schwidefsky #include <asm/mmu_context.h>
131e133ab2SMartin Schwidefsky #include <asm/pgalloc.h>
141e133ab2SMartin Schwidefsky #include <asm/gmap.h>
151e133ab2SMartin Schwidefsky #include <asm/tlb.h>
161e133ab2SMartin Schwidefsky #include <asm/tlbflush.h>
171e133ab2SMartin Schwidefsky
181e133ab2SMartin Schwidefsky #ifdef CONFIG_PGSTE
191e133ab2SMartin Schwidefsky
201e133ab2SMartin Schwidefsky int page_table_allocate_pgste = 0;
211e133ab2SMartin Schwidefsky EXPORT_SYMBOL(page_table_allocate_pgste);
221e133ab2SMartin Schwidefsky
231e133ab2SMartin Schwidefsky static struct ctl_table page_table_sysctl[] = {
241e133ab2SMartin Schwidefsky {
251e133ab2SMartin Schwidefsky .procname = "allocate_pgste",
261e133ab2SMartin Schwidefsky .data = &page_table_allocate_pgste,
271e133ab2SMartin Schwidefsky .maxlen = sizeof(int),
281e133ab2SMartin Schwidefsky .mode = S_IRUGO | S_IWUSR,
295bedf8aaSVasily Gorbik .proc_handler = proc_dointvec_minmax,
30ac7a0fceSVasily Gorbik .extra1 = SYSCTL_ZERO,
31ac7a0fceSVasily Gorbik .extra2 = SYSCTL_ONE,
321e133ab2SMartin Schwidefsky },
331e133ab2SMartin Schwidefsky { }
341e133ab2SMartin Schwidefsky };
351e133ab2SMartin Schwidefsky
page_table_register_sysctl(void)361e133ab2SMartin Schwidefsky static int __init page_table_register_sysctl(void)
371e133ab2SMartin Schwidefsky {
387ddc873dSLuis Chamberlain return register_sysctl("vm", page_table_sysctl) ? 0 : -ENOMEM;
391e133ab2SMartin Schwidefsky }
401e133ab2SMartin Schwidefsky __initcall(page_table_register_sysctl);
411e133ab2SMartin Schwidefsky
421e133ab2SMartin Schwidefsky #endif /* CONFIG_PGSTE */
431e133ab2SMartin Schwidefsky
crst_table_alloc(struct mm_struct * mm)441e133ab2SMartin Schwidefsky unsigned long *crst_table_alloc(struct mm_struct *mm)
451e133ab2SMartin Schwidefsky {
466326c26cSVishal Moola (Oracle) struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, CRST_ALLOC_ORDER);
471e133ab2SMartin Schwidefsky
486326c26cSVishal Moola (Oracle) if (!ptdesc)
491e133ab2SMartin Schwidefsky return NULL;
506326c26cSVishal Moola (Oracle) arch_set_page_dat(ptdesc_page(ptdesc), CRST_ALLOC_ORDER);
516326c26cSVishal Moola (Oracle) return (unsigned long *) ptdesc_to_virt(ptdesc);
521e133ab2SMartin Schwidefsky }
531e133ab2SMartin Schwidefsky
/* Free a table obtained from crst_table_alloc(); NULL is a no-op. */
void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	if (table)
		pagetable_free(virt_to_ptdesc(table));
}
601e133ab2SMartin Schwidefsky
/*
 * Per-CPU callback run via on_each_cpu() after crst_table_upgrade():
 * if this CPU is currently running @arg's mm, load the new (upgraded)
 * ASCE into control register 7, then flush the local TLB.
 */
static void __crst_table_upgrade(void *arg)
{
	struct mm_struct *mm = arg;

	/* change all active ASCEs to avoid the creation of new TLBs */
	if (current->active_mm == mm) {
		S390_lowcore.user_asce = mm->context.asce;
		__ctl_load(S390_lowcore.user_asce, 7, 7);
	}
	__tlb_flush_local();
}
721e133ab2SMartin Schwidefsky
/*
 * Grow @mm's page table hierarchy so it can map addresses up to @end:
 * 3 -> 4, 3 -> 5 or 4 -> 5 region table levels. Called with mmap_lock
 * held. Returns 0 on success or if no upgrade is needed, -ENOMEM if a
 * new top-level table could not be allocated.
 */
int crst_table_upgrade(struct mm_struct *mm, unsigned long end)
{
	unsigned long *pgd = NULL, *p4d = NULL, *__pgd;
	unsigned long asce_limit = mm->context.asce_limit;

	/* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */
	VM_BUG_ON(asce_limit < _REGION2_SIZE);

	if (end <= asce_limit)
		return 0;

	/* Allocate the new top-level tables before taking the lock. */
	if (asce_limit == _REGION2_SIZE) {
		p4d = crst_table_alloc(mm);
		if (unlikely(!p4d))
			goto err_p4d;
		crst_table_init(p4d, _REGION2_ENTRY_EMPTY);
	}
	if (end > _REGION1_SIZE) {
		pgd = crst_table_alloc(mm);
		if (unlikely(!pgd))
			goto err_pgd;
		crst_table_init(pgd, _REGION1_ENTRY_EMPTY);
	}

	spin_lock_bh(&mm->page_table_lock);

	/*
	 * This routine gets called with mmap_lock lock held and there is
	 * no reason to optimize for the case of otherwise. However, if
	 * that would ever change, the below check will let us know.
	 */
	VM_BUG_ON(asce_limit != mm->context.asce_limit);

	/* Hook the new tables on top of the old hierarchy, one level at a time. */
	if (p4d) {
		__pgd = (unsigned long *) mm->pgd;
		p4d_populate(mm, (p4d_t *) p4d, (pud_t *) __pgd);
		mm->pgd = (pgd_t *) p4d;
		mm->context.asce_limit = _REGION1_SIZE;
		mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
			_ASCE_USER_BITS | _ASCE_TYPE_REGION2;
		mm_inc_nr_puds(mm);
	}
	if (pgd) {
		__pgd = (unsigned long *) mm->pgd;
		pgd_populate(mm, (pgd_t *) pgd, (p4d_t *) __pgd);
		mm->pgd = (pgd_t *) pgd;
		mm->context.asce_limit = TASK_SIZE_MAX;
		mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
			_ASCE_USER_BITS | _ASCE_TYPE_REGION1;
	}

	spin_unlock_bh(&mm->page_table_lock);

	/* Make every CPU currently running this mm pick up the new ASCE. */
	on_each_cpu(__crst_table_upgrade, mm, 0);

	return 0;

err_pgd:
	crst_table_free(mm, p4d);
err_p4d:
	return -ENOMEM;
}
1351e133ab2SMartin Schwidefsky
/* Atomically XOR @bits into *@v and return the new value of *@v. */
static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
{
	unsigned int old;

	old = atomic_fetch_xor(bits, v);
	return old ^ bits;
}
1401e133ab2SMartin Schwidefsky
1414be130a0SMartin Schwidefsky #ifdef CONFIG_PGSTE
1424be130a0SMartin Schwidefsky
page_table_alloc_pgste(struct mm_struct * mm)1434be130a0SMartin Schwidefsky struct page *page_table_alloc_pgste(struct mm_struct *mm)
1444be130a0SMartin Schwidefsky {
1456326c26cSVishal Moola (Oracle) struct ptdesc *ptdesc;
14641879ff6SHeiko Carstens u64 *table;
1474be130a0SMartin Schwidefsky
1486326c26cSVishal Moola (Oracle) ptdesc = pagetable_alloc(GFP_KERNEL, 0);
1496326c26cSVishal Moola (Oracle) if (ptdesc) {
1506326c26cSVishal Moola (Oracle) table = (u64 *)ptdesc_to_virt(ptdesc);
151bfabe8d0SHeiko Carstens arch_set_page_dat(virt_to_page(table), 0);
15241879ff6SHeiko Carstens memset64(table, _PAGE_INVALID, PTRS_PER_PTE);
15341879ff6SHeiko Carstens memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
1544be130a0SMartin Schwidefsky }
1556326c26cSVishal Moola (Oracle) return ptdesc_page(ptdesc);
1564be130a0SMartin Schwidefsky }
1574be130a0SMartin Schwidefsky
/* Free a 4K pgste page table obtained from page_table_alloc_pgste(). */
void page_table_free_pgste(struct page *page)
{
	struct ptdesc *ptdesc = page_ptdesc(page);

	pagetable_free(ptdesc);
}
1624be130a0SMartin Schwidefsky
1634be130a0SMartin Schwidefsky #endif /* CONFIG_PGSTE */
1644be130a0SMartin Schwidefsky
1651e133ab2SMartin Schwidefsky /*
1661194372dSAlexander Gordeev * A 2KB-pgtable is either upper or lower half of a normal page.
1671194372dSAlexander Gordeev * The second half of the page may be unused or used as another
1681194372dSAlexander Gordeev * 2KB-pgtable.
1691194372dSAlexander Gordeev *
1701194372dSAlexander Gordeev * Whenever possible the parent page for a new 2KB-pgtable is picked
1711194372dSAlexander Gordeev * from the list of partially allocated pages mm_context_t::pgtable_list.
1721194372dSAlexander Gordeev * In case the list is empty a new parent page is allocated and added to
1731194372dSAlexander Gordeev * the list.
1741194372dSAlexander Gordeev *
1751194372dSAlexander Gordeev * When a parent page gets fully allocated it contains 2KB-pgtables in both
1761194372dSAlexander Gordeev * upper and lower halves and is removed from mm_context_t::pgtable_list.
1771194372dSAlexander Gordeev *
 * When a 2KB-pgtable is freed from the fully allocated parent page that
1791194372dSAlexander Gordeev * page turns partially allocated and added to mm_context_t::pgtable_list.
1801194372dSAlexander Gordeev *
1811194372dSAlexander Gordeev * If 2KB-pgtable is freed from the partially allocated parent page that
1821194372dSAlexander Gordeev * page turns unused and gets removed from mm_context_t::pgtable_list.
1831194372dSAlexander Gordeev * Furthermore, the unused parent page is released.
1841194372dSAlexander Gordeev *
1851194372dSAlexander Gordeev * As follows from the above, no unallocated or fully allocated parent
1861194372dSAlexander Gordeev * pages are contained in mm_context_t::pgtable_list.
1871194372dSAlexander Gordeev *
1881194372dSAlexander Gordeev * The upper byte (bits 24-31) of the parent page _refcount is used
1891194372dSAlexander Gordeev * for tracking contained 2KB-pgtables and has the following format:
1901194372dSAlexander Gordeev *
1911194372dSAlexander Gordeev * PP AA
1921194372dSAlexander Gordeev * 01234567 upper byte (bits 24-31) of struct page::_refcount
1931194372dSAlexander Gordeev * || ||
1941194372dSAlexander Gordeev * || |+--- upper 2KB-pgtable is allocated
1951194372dSAlexander Gordeev * || +---- lower 2KB-pgtable is allocated
1961194372dSAlexander Gordeev * |+------- upper 2KB-pgtable is pending for removal
1971194372dSAlexander Gordeev * +-------- lower 2KB-pgtable is pending for removal
1981194372dSAlexander Gordeev *
1991194372dSAlexander Gordeev * (See commit 620b4e903179 ("s390: use _refcount for pgtables") on why
2001194372dSAlexander Gordeev * using _refcount is possible).
2011194372dSAlexander Gordeev *
2021194372dSAlexander Gordeev * When 2KB-pgtable is allocated the corresponding AA bit is set to 1.
2031194372dSAlexander Gordeev * The parent page is either:
2041194372dSAlexander Gordeev * - added to mm_context_t::pgtable_list in case the second half of the
2051194372dSAlexander Gordeev * parent page is still unallocated;
 *   parent page are allocated, i.e. both halves of the
2071194372dSAlexander Gordeev * parent page are allocated;
2081194372dSAlexander Gordeev * These operations are protected with mm_context_t::lock.
2091194372dSAlexander Gordeev *
2101194372dSAlexander Gordeev * When 2KB-pgtable is deallocated the corresponding AA bit is set to 0
2111194372dSAlexander Gordeev * and the corresponding PP bit is set to 1 in a single atomic operation.
2121194372dSAlexander Gordeev * Thus, PP and AA bits corresponding to the same 2KB-pgtable are mutually
2131194372dSAlexander Gordeev * exclusive and may never be both set to 1!
2141194372dSAlexander Gordeev * The parent page is either:
2151194372dSAlexander Gordeev * - added to mm_context_t::pgtable_list in case the second half of the
2161194372dSAlexander Gordeev * parent page is still allocated;
2171194372dSAlexander Gordeev * - removed from mm_context_t::pgtable_list in case the second half of
2181194372dSAlexander Gordeev * the parent page is unallocated;
2191194372dSAlexander Gordeev * These operations are protected with mm_context_t::lock.
2201194372dSAlexander Gordeev *
2211194372dSAlexander Gordeev * It is important to understand that mm_context_t::lock only protects
2221194372dSAlexander Gordeev * mm_context_t::pgtable_list and AA bits, but not the parent page itself
2231194372dSAlexander Gordeev * and PP bits.
2241194372dSAlexander Gordeev *
2251194372dSAlexander Gordeev * Releasing the parent page happens whenever the PP bit turns from 1 to 0,
2261194372dSAlexander Gordeev * while both AA bits and the second PP bit are already unset. Then the
2271194372dSAlexander Gordeev * parent page does not contain any 2KB-pgtable fragment anymore, and it has
2281194372dSAlexander Gordeev * also been removed from mm_context_t::pgtable_list. It is safe to release
2291194372dSAlexander Gordeev * the page therefore.
2301194372dSAlexander Gordeev *
2311194372dSAlexander Gordeev * PGSTE memory spaces use full 4KB-pgtables and do not need most of the
2321194372dSAlexander Gordeev * logic described above. Both AA bits are set to 1 to denote a 4KB-pgtable
2331194372dSAlexander Gordeev * while the PP bits are never used, nor such a page is added to or removed
2341194372dSAlexander Gordeev * from mm_context_t::pgtable_list.
2358211dad6SHugh Dickins *
2368211dad6SHugh Dickins * pte_free_defer() overrides those rules: it takes the page off pgtable_list,
2378211dad6SHugh Dickins * and prevents both 2K fragments from being reused. pte_free_defer() has to
2388211dad6SHugh Dickins * guarantee that its pgtable cannot be reused before the RCU grace period
2398211dad6SHugh Dickins * has elapsed (which page_table_free_rcu() does not actually guarantee).
2408211dad6SHugh Dickins * But for simplicity, because page->rcu_head overlays page->lru, and because
2418211dad6SHugh Dickins * the RCU callback might not be called before the mm_context_t has been freed,
2428211dad6SHugh Dickins * pte_free_defer() in this implementation prevents both fragments from being
2438211dad6SHugh Dickins * reused, and delays making the call to RCU until both fragments are freed.
2441e133ab2SMartin Schwidefsky */
/*
 * Allocate a page table for @mm. For pgste mms a full 4K table
 * (ptes + zeroed pgstes) is returned; otherwise a 2K fragment, reused
 * from a partially allocated page on mm_context_t::pgtable_list when
 * possible, or carved out of a freshly allocated page. Returns NULL
 * on allocation failure. See the big comment above for the AA/PP
 * _refcount bit protocol.
 */
unsigned long *page_table_alloc(struct mm_struct *mm)
{
	unsigned long *table;
	struct ptdesc *ptdesc;
	unsigned int mask, bit;

	/* Try to get a fragment of a 4K page as a 2K page table */
	if (!mm_alloc_pgste(mm)) {
		table = NULL;
		spin_lock_bh(&mm->context.lock);
		if (!list_empty(&mm->context.pgtable_list)) {
			ptdesc = list_first_entry(&mm->context.pgtable_list,
						  struct ptdesc, pt_list);
			/* AA/PP tracking bits live in the top byte of _refcount */
			mask = atomic_read(&ptdesc->_refcount) >> 24;
			/*
			 * The pending removal bits must also be checked.
			 * Failure to do so might lead to an impossible
			 * value of (i.e 0x13 or 0x23) written to _refcount.
			 * Such values violate the assumption that pending and
			 * allocation bits are mutually exclusive, and the rest
			 * of the code derails as result. That could lead to
			 * a whole bunch of races and corruptions.
			 */
			mask = (mask | (mask >> 4)) & 0x03U;
			if (mask != 0x03U) {
				table = (unsigned long *) ptdesc_to_virt(ptdesc);
				bit = mask & 1;	/* =1 -> second 2K */
				if (bit)
					table += PTRS_PER_PTE;
				/* mark this half allocated (set its AA bit) */
				atomic_xor_bits(&ptdesc->_refcount,
						0x01U << (bit + 24));
				/* both halves now in use: off the list it goes */
				list_del_init(&ptdesc->pt_list);
			}
		}
		spin_unlock_bh(&mm->context.lock);
		if (table)
			return table;
	}
	/* Allocate a fresh page */
	ptdesc = pagetable_alloc(GFP_KERNEL, 0);
	if (!ptdesc)
		return NULL;
	if (!pagetable_pte_ctor(ptdesc)) {
		pagetable_free(ptdesc);
		return NULL;
	}
	arch_set_page_dat(ptdesc_page(ptdesc), 0);
	/* Initialize page table */
	table = (unsigned long *) ptdesc_to_virt(ptdesc);
	if (mm_alloc_pgste(mm)) {
		/* Return 4K page table with PGSTEs */
		INIT_LIST_HEAD(&ptdesc->pt_list);
		atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24);
		memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
		memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
	} else {
		/* Return the first 2K fragment of the page */
		atomic_xor_bits(&ptdesc->_refcount, 0x01U << 24);
		memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE);
		/* second half still free: publish the page on the list */
		spin_lock_bh(&mm->context.lock);
		list_add(&ptdesc->pt_list, &mm->context.pgtable_list);
		spin_unlock_bh(&mm->context.lock);
	}
	return table;
}
3101e133ab2SMartin Schwidefsky
page_table_release_check(struct page * page,void * table,unsigned int half,unsigned int mask)3114c88bb96SAlexander Gordeev static void page_table_release_check(struct page *page, void *table,
3124c88bb96SAlexander Gordeev unsigned int half, unsigned int mask)
3134c88bb96SAlexander Gordeev {
3144c88bb96SAlexander Gordeev char msg[128];
3154c88bb96SAlexander Gordeev
3168211dad6SHugh Dickins if (!IS_ENABLED(CONFIG_DEBUG_VM))
3178211dad6SHugh Dickins return;
3188211dad6SHugh Dickins if (!mask && list_empty(&page->lru))
3194c88bb96SAlexander Gordeev return;
3204c88bb96SAlexander Gordeev snprintf(msg, sizeof(msg),
3214c88bb96SAlexander Gordeev "Invalid pgtable %p release half 0x%02x mask 0x%02x",
3224c88bb96SAlexander Gordeev table, half, mask);
3234c88bb96SAlexander Gordeev dump_page(page, msg);
3244c88bb96SAlexander Gordeev }
3254c88bb96SAlexander Gordeev
pte_free_now(struct rcu_head * head)3268211dad6SHugh Dickins static void pte_free_now(struct rcu_head *head)
3278211dad6SHugh Dickins {
3286326c26cSVishal Moola (Oracle) struct ptdesc *ptdesc;
3298211dad6SHugh Dickins
3306326c26cSVishal Moola (Oracle) ptdesc = container_of(head, struct ptdesc, pt_rcu_head);
3316326c26cSVishal Moola (Oracle) pagetable_pte_dtor(ptdesc);
3326326c26cSVishal Moola (Oracle) pagetable_free(ptdesc);
3338211dad6SHugh Dickins }
3348211dad6SHugh Dickins
/*
 * Release a page table obtained from page_table_alloc(). For a 2K
 * fragment the containing page is only released once both halves are
 * free; the release goes through RCU if pte_free_defer() marked the
 * page (PageActive). See the big comment above for the bit protocol.
 */
void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	unsigned int mask, bit, half;
	struct ptdesc *ptdesc = virt_to_ptdesc(table);

	if (!mm_alloc_pgste(mm)) {
		/* Free 2K page table fragment of a 4K page */
		bit = ((unsigned long) table & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
		spin_lock_bh(&mm->context.lock);
		/*
		 * Mark the page for delayed release. The actual release
		 * will happen outside of the critical section from this
		 * function or from __tlb_remove_table()
		 */
		mask = atomic_xor_bits(&ptdesc->_refcount, 0x11U << (bit + 24));
		mask >>= 24;
		if ((mask & 0x03U) && !folio_test_active(ptdesc_folio(ptdesc))) {
			/*
			 * Other half is allocated, and neither half has had
			 * its free deferred: add page to head of list, to make
			 * this freed half available for immediate reuse.
			 */
			list_add(&ptdesc->pt_list, &mm->context.pgtable_list);
		} else {
			/* If page is on list, now remove it. */
			list_del_init(&ptdesc->pt_list);
		}
		spin_unlock_bh(&mm->context.lock);
		/* drop the pending (PP) bit again; AA was cleared above */
		mask = atomic_xor_bits(&ptdesc->_refcount, 0x10U << (bit + 24));
		mask >>= 24;
		if (mask != 0x00U)
			return;
		half = 0x01U << bit;
	} else {
		/* pgste mm: whole 4K table, clear both AA bits at once */
		half = 0x03U;
		mask = atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24);
		mask >>= 24;
	}

	page_table_release_check(ptdesc_page(ptdesc), table, half, mask);
	if (folio_test_clear_active(ptdesc_folio(ptdesc)))
		call_rcu(&ptdesc->pt_rcu_head, pte_free_now);
	else
		pte_free_now(&ptdesc->pt_rcu_head);
}
3801e133ab2SMartin Schwidefsky
/*
 * Queue a page table for release after the mmu_gather's TLB flush.
 * The low two bits of the pointer handed to the tlb layer encode for
 * __tlb_remove_table() what is being freed: 0x03 = 4K pgste table,
 * 0x01/0x02 = lower/upper 2K half.
 */
void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
			 unsigned long vmaddr)
{
	struct mm_struct *mm;
	unsigned int bit, mask;
	struct ptdesc *ptdesc = virt_to_ptdesc(table);

	mm = tlb->mm;
	if (mm_alloc_pgste(mm)) {
		gmap_unlink(mm, table, vmaddr);
		/* tag as full 4K pgste table */
		table = (unsigned long *) ((unsigned long)table | 0x03U);
		tlb_remove_ptdesc(tlb, table);
		return;
	}
	bit = ((unsigned long) table & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
	spin_lock_bh(&mm->context.lock);
	/*
	 * Mark the page for delayed release. The actual release will happen
	 * outside of the critical section from __tlb_remove_table() or from
	 * page_table_free()
	 */
	mask = atomic_xor_bits(&ptdesc->_refcount, 0x11U << (bit + 24));
	mask >>= 24;
	if ((mask & 0x03U) && !folio_test_active(ptdesc_folio(ptdesc))) {
		/*
		 * Other half is allocated, and neither half has had
		 * its free deferred: add page to end of list, to make
		 * this freed half available for reuse once its pending
		 * bit has been cleared by __tlb_remove_table().
		 */
		list_add_tail(&ptdesc->pt_list, &mm->context.pgtable_list);
	} else {
		/* If page is on list, now remove it. */
		list_del_init(&ptdesc->pt_list);
	}
	spin_unlock_bh(&mm->context.lock);
	/* tag the pointer with the freed half (0x01 lower, 0x02 upper) */
	table = (unsigned long *) ((unsigned long) table | (0x01U << bit));
	tlb_remove_table(tlb, table);
}
4201e133ab2SMartin Schwidefsky
/*
 * mmu_gather callback, invoked after the TLB flush: decode the tag bits
 * stashed in the pointer's low bits by page_table_free_rcu() and
 * actually release the (half) page table when nothing references it.
 */
void __tlb_remove_table(void *_table)
{
	unsigned int mask = (unsigned long) _table & 0x03U, half = mask;
	void *table = (void *)((unsigned long) _table ^ mask);
	struct ptdesc *ptdesc = virt_to_ptdesc(table);

	switch (half) {
	case 0x00U:	/* pmd, pud, or p4d */
		pagetable_free(ptdesc);
		return;
	case 0x01U:	/* lower 2K of a 4K page table */
	case 0x02U:	/* higher 2K of a 4K page table */
		/* clear this half's pending (PP) bit */
		mask = atomic_xor_bits(&ptdesc->_refcount, mask << (4 + 24));
		mask >>= 24;
		if (mask != 0x00U)
			return;	/* other half still allocated or pending */
		break;
	case 0x03U:	/* 4K page table with pgstes */
		mask = atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24);
		mask >>= 24;
		break;
	}

	page_table_release_check(ptdesc_page(ptdesc), table, half, mask);
	if (folio_test_clear_active(ptdesc_folio(ptdesc)))
		call_rcu(&ptdesc->pt_rcu_head, pte_free_now);
	else
		pte_free_now(&ptdesc->pt_rcu_head);
}
4501e133ab2SMartin Schwidefsky
4518211dad6SHugh Dickins #ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Free a pgtable with the guarantee that it is not reused before an RCU
 * grace period has elapsed: PageActive makes page_table_free() defer the
 * release via call_rcu() (see the comment block above page_table_alloc()).
 */
void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
{
	SetPageActive(virt_to_page(pgtable));
	page_table_free(mm, (unsigned long *)pgtable);
	/*
	 * page_table_free() does not do the pgste gmap_unlink() which
	 * page_table_free_rcu() does: warn us if pgste ever reaches here.
	 */
	WARN_ON_ONCE(mm_has_pgste(mm));
}
4658211dad6SHugh Dickins #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
4668211dad6SHugh Dickins
4671caf170dSHeiko Carstens /*
4681caf170dSHeiko Carstens * Base infrastructure required to generate basic asces, region, segment,
4691caf170dSHeiko Carstens * and page tables that do not make use of enhanced features like EDAT1.
4701caf170dSHeiko Carstens */
4711caf170dSHeiko Carstens
/* slab cache for the 2K page tables used by the base asce code below */
static struct kmem_cache *base_pgt_cache;
4731caf170dSHeiko Carstens
base_pgt_alloc(void)474da001fceSHeiko Carstens static unsigned long *base_pgt_alloc(void)
4751caf170dSHeiko Carstens {
476da001fceSHeiko Carstens unsigned long *table;
4771caf170dSHeiko Carstens
4781caf170dSHeiko Carstens table = kmem_cache_alloc(base_pgt_cache, GFP_KERNEL);
4791caf170dSHeiko Carstens if (table)
480da001fceSHeiko Carstens memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
4811caf170dSHeiko Carstens return table;
4821caf170dSHeiko Carstens }
4831caf170dSHeiko Carstens
/* Return a base page table to the slab cache. */
static void base_pgt_free(unsigned long *table)
{
	kmem_cache_free(base_pgt_cache, table);
}
488da001fceSHeiko Carstens
base_crst_alloc(unsigned long val)489da001fceSHeiko Carstens static unsigned long *base_crst_alloc(unsigned long val)
490da001fceSHeiko Carstens {
491da001fceSHeiko Carstens unsigned long *table;
4926326c26cSVishal Moola (Oracle) struct ptdesc *ptdesc;
493da001fceSHeiko Carstens
4946326c26cSVishal Moola (Oracle) ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, CRST_ALLOC_ORDER);
4956326c26cSVishal Moola (Oracle) if (!ptdesc)
4966326c26cSVishal Moola (Oracle) return NULL;
4976326c26cSVishal Moola (Oracle) table = ptdesc_address(ptdesc);
4986326c26cSVishal Moola (Oracle)
499da001fceSHeiko Carstens crst_table_init(table, val);
500da001fceSHeiko Carstens return table;
501da001fceSHeiko Carstens }
502da001fceSHeiko Carstens
/* Free a table obtained from base_crst_alloc(); NULL is a no-op. */
static void base_crst_free(unsigned long *table)
{
	if (table)
		pagetable_free(virt_to_ptdesc(table));
}
5091caf170dSHeiko Carstens
/*
 * Generate base_<level>_addr_end() helpers: advance @addr to the next
 * SIZE-aligned boundary, clamped to @end. The "- 1" in the comparison
 * keeps the result correct when next or end wrap to 0 at the very top
 * of the address space.
 */
#define BASE_ADDR_END_FUNC(NAME, SIZE)					\
static inline unsigned long base_##NAME##_addr_end(unsigned long addr,	\
						   unsigned long end)	\
{									\
	unsigned long next = (addr + (SIZE)) & ~((SIZE) - 1);		\
									\
	return (next - 1) < (end - 1) ? next : end;			\
}

BASE_ADDR_END_FUNC(page, _PAGE_SIZE)
BASE_ADDR_END_FUNC(segment, _SEGMENT_SIZE)
BASE_ADDR_END_FUNC(region3, _REGION3_SIZE)
BASE_ADDR_END_FUNC(region2, _REGION2_SIZE)
BASE_ADDR_END_FUNC(region1, _REGION1_SIZE)
5241caf170dSHeiko Carstens
/*
 * Translate @address with the LRA (Load Real Address) instruction and
 * return the instruction's result register. The condition code set by
 * LRA is clobbered but not inspected here.
 */
static inline unsigned long base_lra(unsigned long address)
{
	unsigned long real;

	asm volatile(
		"	lra	%0,0(%1)\n"
		: "=d" (real) : "a" (address) : "cc");
	return real;
}
5341caf170dSHeiko Carstens
base_page_walk(unsigned long * origin,unsigned long addr,unsigned long end,int alloc)535da001fceSHeiko Carstens static int base_page_walk(unsigned long *origin, unsigned long addr,
5361caf170dSHeiko Carstens unsigned long end, int alloc)
5371caf170dSHeiko Carstens {
5381caf170dSHeiko Carstens unsigned long *pte, next;
5391caf170dSHeiko Carstens
5401caf170dSHeiko Carstens if (!alloc)
5411caf170dSHeiko Carstens return 0;
542da001fceSHeiko Carstens pte = origin;
5431caf170dSHeiko Carstens pte += (addr & _PAGE_INDEX) >> _PAGE_SHIFT;
5441caf170dSHeiko Carstens do {
5451caf170dSHeiko Carstens next = base_page_addr_end(addr, end);
5461caf170dSHeiko Carstens *pte = base_lra(addr);
5471caf170dSHeiko Carstens } while (pte++, addr = next, addr < end);
5481caf170dSHeiko Carstens return 0;
5491caf170dSHeiko Carstens }
5501caf170dSHeiko Carstens
base_segment_walk(unsigned long * origin,unsigned long addr,unsigned long end,int alloc)551da001fceSHeiko Carstens static int base_segment_walk(unsigned long *origin, unsigned long addr,
5521caf170dSHeiko Carstens unsigned long end, int alloc)
5531caf170dSHeiko Carstens {
554da001fceSHeiko Carstens unsigned long *ste, next, *table;
5551caf170dSHeiko Carstens int rc;
5561caf170dSHeiko Carstens
557da001fceSHeiko Carstens ste = origin;
5581caf170dSHeiko Carstens ste += (addr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
5591caf170dSHeiko Carstens do {
5601caf170dSHeiko Carstens next = base_segment_addr_end(addr, end);
5611caf170dSHeiko Carstens if (*ste & _SEGMENT_ENTRY_INVALID) {
5621caf170dSHeiko Carstens if (!alloc)
5631caf170dSHeiko Carstens continue;
5641caf170dSHeiko Carstens table = base_pgt_alloc();
5651caf170dSHeiko Carstens if (!table)
5661caf170dSHeiko Carstens return -ENOMEM;
5672f882800SHeiko Carstens *ste = __pa(table) | _SEGMENT_ENTRY;
5681caf170dSHeiko Carstens }
569da001fceSHeiko Carstens table = __va(*ste & _SEGMENT_ENTRY_ORIGIN);
5701caf170dSHeiko Carstens rc = base_page_walk(table, addr, next, alloc);
5711caf170dSHeiko Carstens if (rc)
5721caf170dSHeiko Carstens return rc;
5731caf170dSHeiko Carstens if (!alloc)
5741caf170dSHeiko Carstens base_pgt_free(table);
5751caf170dSHeiko Carstens cond_resched();
5761caf170dSHeiko Carstens } while (ste++, addr = next, addr < end);
5771caf170dSHeiko Carstens return 0;
5781caf170dSHeiko Carstens }
5791caf170dSHeiko Carstens
base_region3_walk(unsigned long * origin,unsigned long addr,unsigned long end,int alloc)580da001fceSHeiko Carstens static int base_region3_walk(unsigned long *origin, unsigned long addr,
5811caf170dSHeiko Carstens unsigned long end, int alloc)
5821caf170dSHeiko Carstens {
583da001fceSHeiko Carstens unsigned long *rtte, next, *table;
5841caf170dSHeiko Carstens int rc;
5851caf170dSHeiko Carstens
586da001fceSHeiko Carstens rtte = origin;
5871caf170dSHeiko Carstens rtte += (addr & _REGION3_INDEX) >> _REGION3_SHIFT;
5881caf170dSHeiko Carstens do {
5891caf170dSHeiko Carstens next = base_region3_addr_end(addr, end);
5901caf170dSHeiko Carstens if (*rtte & _REGION_ENTRY_INVALID) {
5911caf170dSHeiko Carstens if (!alloc)
5921caf170dSHeiko Carstens continue;
5931caf170dSHeiko Carstens table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY);
5941caf170dSHeiko Carstens if (!table)
5951caf170dSHeiko Carstens return -ENOMEM;
5962f882800SHeiko Carstens *rtte = __pa(table) | _REGION3_ENTRY;
5971caf170dSHeiko Carstens }
598da001fceSHeiko Carstens table = __va(*rtte & _REGION_ENTRY_ORIGIN);
5991caf170dSHeiko Carstens rc = base_segment_walk(table, addr, next, alloc);
6001caf170dSHeiko Carstens if (rc)
6011caf170dSHeiko Carstens return rc;
6021caf170dSHeiko Carstens if (!alloc)
6031caf170dSHeiko Carstens base_crst_free(table);
6041caf170dSHeiko Carstens } while (rtte++, addr = next, addr < end);
6051caf170dSHeiko Carstens return 0;
6061caf170dSHeiko Carstens }
6071caf170dSHeiko Carstens
base_region2_walk(unsigned long * origin,unsigned long addr,unsigned long end,int alloc)608da001fceSHeiko Carstens static int base_region2_walk(unsigned long *origin, unsigned long addr,
6091caf170dSHeiko Carstens unsigned long end, int alloc)
6101caf170dSHeiko Carstens {
611da001fceSHeiko Carstens unsigned long *rste, next, *table;
6121caf170dSHeiko Carstens int rc;
6131caf170dSHeiko Carstens
614da001fceSHeiko Carstens rste = origin;
6151caf170dSHeiko Carstens rste += (addr & _REGION2_INDEX) >> _REGION2_SHIFT;
6161caf170dSHeiko Carstens do {
6171caf170dSHeiko Carstens next = base_region2_addr_end(addr, end);
6181caf170dSHeiko Carstens if (*rste & _REGION_ENTRY_INVALID) {
6191caf170dSHeiko Carstens if (!alloc)
6201caf170dSHeiko Carstens continue;
6211caf170dSHeiko Carstens table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
6221caf170dSHeiko Carstens if (!table)
6231caf170dSHeiko Carstens return -ENOMEM;
6242f882800SHeiko Carstens *rste = __pa(table) | _REGION2_ENTRY;
6251caf170dSHeiko Carstens }
626da001fceSHeiko Carstens table = __va(*rste & _REGION_ENTRY_ORIGIN);
6271caf170dSHeiko Carstens rc = base_region3_walk(table, addr, next, alloc);
6281caf170dSHeiko Carstens if (rc)
6291caf170dSHeiko Carstens return rc;
6301caf170dSHeiko Carstens if (!alloc)
6311caf170dSHeiko Carstens base_crst_free(table);
6321caf170dSHeiko Carstens } while (rste++, addr = next, addr < end);
6331caf170dSHeiko Carstens return 0;
6341caf170dSHeiko Carstens }
6351caf170dSHeiko Carstens
base_region1_walk(unsigned long * origin,unsigned long addr,unsigned long end,int alloc)636da001fceSHeiko Carstens static int base_region1_walk(unsigned long *origin, unsigned long addr,
6371caf170dSHeiko Carstens unsigned long end, int alloc)
6381caf170dSHeiko Carstens {
639da001fceSHeiko Carstens unsigned long *rfte, next, *table;
6401caf170dSHeiko Carstens int rc;
6411caf170dSHeiko Carstens
642da001fceSHeiko Carstens rfte = origin;
6431caf170dSHeiko Carstens rfte += (addr & _REGION1_INDEX) >> _REGION1_SHIFT;
6441caf170dSHeiko Carstens do {
6451caf170dSHeiko Carstens next = base_region1_addr_end(addr, end);
6461caf170dSHeiko Carstens if (*rfte & _REGION_ENTRY_INVALID) {
6471caf170dSHeiko Carstens if (!alloc)
6481caf170dSHeiko Carstens continue;
6491caf170dSHeiko Carstens table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
6501caf170dSHeiko Carstens if (!table)
6511caf170dSHeiko Carstens return -ENOMEM;
6522f882800SHeiko Carstens *rfte = __pa(table) | _REGION1_ENTRY;
6531caf170dSHeiko Carstens }
654da001fceSHeiko Carstens table = __va(*rfte & _REGION_ENTRY_ORIGIN);
6551caf170dSHeiko Carstens rc = base_region2_walk(table, addr, next, alloc);
6561caf170dSHeiko Carstens if (rc)
6571caf170dSHeiko Carstens return rc;
6581caf170dSHeiko Carstens if (!alloc)
6591caf170dSHeiko Carstens base_crst_free(table);
6601caf170dSHeiko Carstens } while (rfte++, addr = next, addr < end);
6611caf170dSHeiko Carstens return 0;
6621caf170dSHeiko Carstens }
6631caf170dSHeiko Carstens
6641caf170dSHeiko Carstens /**
6651caf170dSHeiko Carstens * base_asce_free - free asce and tables returned from base_asce_alloc()
6661caf170dSHeiko Carstens * @asce: asce to be freed
6671caf170dSHeiko Carstens *
6681caf170dSHeiko Carstens * Frees all region, segment, and page tables that were allocated with a
6691caf170dSHeiko Carstens * corresponding base_asce_alloc() call.
6701caf170dSHeiko Carstens */
base_asce_free(unsigned long asce)6711caf170dSHeiko Carstens void base_asce_free(unsigned long asce)
6721caf170dSHeiko Carstens {
673da001fceSHeiko Carstens unsigned long *table = __va(asce & _ASCE_ORIGIN);
6741caf170dSHeiko Carstens
6751caf170dSHeiko Carstens if (!asce)
6761caf170dSHeiko Carstens return;
6771caf170dSHeiko Carstens switch (asce & _ASCE_TYPE_MASK) {
6781caf170dSHeiko Carstens case _ASCE_TYPE_SEGMENT:
6791caf170dSHeiko Carstens base_segment_walk(table, 0, _REGION3_SIZE, 0);
6801caf170dSHeiko Carstens break;
6811caf170dSHeiko Carstens case _ASCE_TYPE_REGION3:
6821caf170dSHeiko Carstens base_region3_walk(table, 0, _REGION2_SIZE, 0);
6831caf170dSHeiko Carstens break;
6841caf170dSHeiko Carstens case _ASCE_TYPE_REGION2:
6851caf170dSHeiko Carstens base_region2_walk(table, 0, _REGION1_SIZE, 0);
6861caf170dSHeiko Carstens break;
6871caf170dSHeiko Carstens case _ASCE_TYPE_REGION1:
688f7555608SAlexander Gordeev base_region1_walk(table, 0, TASK_SIZE_MAX, 0);
6891caf170dSHeiko Carstens break;
6901caf170dSHeiko Carstens }
6911caf170dSHeiko Carstens base_crst_free(table);
6921caf170dSHeiko Carstens }
6931caf170dSHeiko Carstens
/*
 * Lazily create the kmem cache used for base page tables.
 * The unlocked fast-path check is safe here: base_pgt_cache is only
 * ever set (never cleared), and the mutex serializes concurrent
 * first-time callers so the cache is created at most once.
 */
static int base_pgt_cache_init(void)
{
	static DEFINE_MUTEX(base_pgt_cache_mutex);
	unsigned long sz = _PAGE_TABLE_SIZE;

	if (base_pgt_cache)
		return 0;
	mutex_lock(&base_pgt_cache_mutex);
	if (!base_pgt_cache)
		base_pgt_cache = kmem_cache_create("base_pgt", sz, sz, 0, NULL);
	mutex_unlock(&base_pgt_cache_mutex);
	return base_pgt_cache ? 0 : -ENOMEM;
}
7071caf170dSHeiko Carstens
7081caf170dSHeiko Carstens /**
7091caf170dSHeiko Carstens * base_asce_alloc - create kernel mapping without enhanced DAT features
7101caf170dSHeiko Carstens * @addr: virtual start address of kernel mapping
7111caf170dSHeiko Carstens * @num_pages: number of consecutive pages
7121caf170dSHeiko Carstens *
7131caf170dSHeiko Carstens * Generate an asce, including all required region, segment and page tables,
7141caf170dSHeiko Carstens * that can be used to access the virtual kernel mapping. The difference is
7151caf170dSHeiko Carstens * that the returned asce does not make use of any enhanced DAT features like
7161caf170dSHeiko Carstens * e.g. large pages. This is required for some I/O functions that pass an
7171caf170dSHeiko Carstens * asce, like e.g. some service call requests.
7181caf170dSHeiko Carstens *
7191caf170dSHeiko Carstens * Note: the returned asce may NEVER be attached to any cpu. It may only be
7201caf170dSHeiko Carstens * used for I/O requests. tlb entries that might result because the
7211caf170dSHeiko Carstens * asce was attached to a cpu won't be cleared.
7221caf170dSHeiko Carstens */
unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages)
{
	unsigned long asce, *table, end;
	int rc;

	/* Make sure the cache for the lowest-level page tables exists. */
	if (base_pgt_cache_init())
		return 0;
	end = addr + num_pages * PAGE_SIZE;
	/*
	 * Pick the shallowest table hierarchy that covers [0, end); each
	 * branch pairs the top-level table type with the matching walker
	 * and asce designation type. Returns 0 on any allocation failure.
	 */
	if (end <= _REGION3_SIZE) {
		table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY);
		if (!table)
			return 0;
		rc = base_segment_walk(table, addr, end, 1);
		asce = __pa(table) | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH;
	} else if (end <= _REGION2_SIZE) {
		table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
		if (!table)
			return 0;
		rc = base_region3_walk(table, addr, end, 1);
		asce = __pa(table) | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
	} else if (end <= _REGION1_SIZE) {
		table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
		if (!table)
			return 0;
		rc = base_region2_walk(table, addr, end, 1);
		asce = __pa(table) | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH;
	} else {
		table = base_crst_alloc(_REGION1_ENTRY_EMPTY);
		if (!table)
			return 0;
		rc = base_region1_walk(table, addr, end, 1);
		asce = __pa(table) | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH;
	}
	/* Partial failure: tear down everything allocated so far. */
	if (rc) {
		base_asce_free(asce);
		asce = 0;
	}
	return asce;
}
762