/* xref: /openbmc/linux/arch/arm64/kvm/mmu.c (revision 8a511e7efc5a72173f64d191f01cda236d54e27a) */
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
 */

#include <linux/mman.h>
#include <linux/kvm_host.h>
#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/sched/signal.h>
#include <trace/events/kvm.h>
#include <asm/pgalloc.h>
#include <asm/cacheflush.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_pgtable.h>
#include <asm/kvm_ras.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/virt.h>

#include "trace.h"

static struct kvm_pgtable *hyp_pgtable;
static DEFINE_MUTEX(kvm_hyp_pgd_mutex);

static unsigned long __ro_after_init hyp_idmap_start;
static unsigned long __ro_after_init hyp_idmap_end;
static phys_addr_t __ro_after_init hyp_idmap_vector;

static unsigned long __ro_after_init io_map_base;

static phys_addr_t __stage2_range_addr_end(phys_addr_t addr, phys_addr_t end,
					   phys_addr_t size)
{
	phys_addr_t boundary = ALIGN_DOWN(addr + size, size);

	return (boundary - 1 < end - 1) ? boundary : end;
}

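/*
 * Bound each iteration of a stage-2 walk to a single block at the minimum
 * block-mapping level, giving stage2_apply_range() a natural point at which
 * to drop the MMU lock and reschedule.
 */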
static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end)
{
	phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL);

	return __stage2_range_addr_end(addr, end, size);
}

/*
 * Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
 * we may see kernel panics with CONFIG_DETECT_HUNG_TASK,
 * CONFIG_LOCKUP_DETECTOR, CONFIG_LOCKDEP. Additionally, holding the lock too
 * long will also starve other vCPUs. We also have to make sure that the page
 * tables are not freed while the lock is released.
 */
static int stage2_apply_range(struct kvm_s2_mmu *mmu, phys_addr_t addr,
			      phys_addr_t end,
			      int (*fn)(struct kvm_pgtable *, u64, u64),
			      bool resched)
{
	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
	int ret;
	u64 next;

	do {
		struct kvm_pgtable *pgt = mmu->pgt;
		if (!pgt)
			return -EINVAL;

		next = stage2_range_addr_end(addr, end);
		ret = fn(pgt, addr, next - addr);
		if (ret)
			break;

		if (resched && next != end)
			cond_resched_rwlock_write(&kvm->mmu_lock);
	} while (addr = next, addr != end);

	return ret;
}

#define stage2_apply_range_resched(mmu, addr, end, fn)			\
	stage2_apply_range(mmu, addr, end, fn, true)

/*
 * Get the maximum number of page-table pages needed to split a range
 * of blocks into PAGE_SIZE PTEs. It assumes the range is already
 * mapped at level 2, or at level 1 if allowed.
 */
static int kvm_mmu_split_nr_page_tables(u64 range)
{
	int n = 0;

	if (KVM_PGTABLE_MIN_BLOCK_LEVEL < 2)
		n += DIV_ROUND_UP(range, PUD_SIZE);
	n += DIV_ROUND_UP(range, PMD_SIZE);
	return n;
}

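/*
 * Eager page splitting pauses whenever the local cache of page-table pages
 * runs low, or when the caller should yield the CPU or the mmu_lock.
 */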
static bool need_split_memcache_topup_or_resched(struct kvm *kvm)
{
	struct kvm_mmu_memory_cache *cache;
	u64 chunk_size, min;

	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock))
		return true;

	chunk_size = kvm->arch.mmu.split_page_chunk_size;
	min = kvm_mmu_split_nr_page_tables(chunk_size);
	cache = &kvm->arch.mmu.split_page_cache;
	return kvm_mmu_memory_cache_nr_free_objects(cache) < min;
}

static int kvm_mmu_split_huge_pages(struct kvm *kvm, phys_addr_t addr,
				    phys_addr_t end)
{
	struct kvm_mmu_memory_cache *cache;
	struct kvm_pgtable *pgt;
	int ret, cache_capacity;
	u64 next, chunk_size;

	lockdep_assert_held_write(&kvm->mmu_lock);

	chunk_size = kvm->arch.mmu.split_page_chunk_size;
	cache_capacity = kvm_mmu_split_nr_page_tables(chunk_size);

	if (chunk_size == 0)
		return 0;

	cache = &kvm->arch.mmu.split_page_cache;

	do {
		if (need_split_memcache_topup_or_resched(kvm)) {
			write_unlock(&kvm->mmu_lock);
			cond_resched();
			/* Eager page splitting is best-effort. */
			ret = __kvm_mmu_topup_memory_cache(cache,
							   cache_capacity,
							   cache_capacity);
			write_lock(&kvm->mmu_lock);
			if (ret)
				break;
		}

		pgt = kvm->arch.mmu.pgt;
		if (!pgt)
			return -EINVAL;

		next = __stage2_range_addr_end(addr, end, chunk_size);
		ret = kvm_pgtable_stage2_split(pgt, addr, next - addr, cache);
		if (ret)
			break;
	} while (addr = next, addr != end);

	return ret;
}

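/* A memslot is being dirty-logged if it has a dirty bitmap and is writable. */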
static bool memslot_is_logging(struct kvm_memory_slot *memslot)
{
	return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
}

/**
 * kvm_arch_flush_remote_tlbs() - flush all VM TLB entries for v7/8
 * @kvm:	pointer to kvm structure.
 *
 * Interface to HYP function to flush all VM TLB entries
 */
int kvm_arch_flush_remote_tlbs(struct kvm *kvm)
{
	kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
	return 0;
}

int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm,
				      gfn_t gfn, u64 nr_pages)
{
	kvm_tlb_flush_vmid_range(&kvm->arch.mmu,
				gfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT);
	return 0;
}

static bool kvm_is_device_pfn(unsigned long pfn)
{
	return !pfn_is_map_memory(pfn);
}

static void *stage2_memcache_zalloc_page(void *arg)
{
	struct kvm_mmu_memory_cache *mc = arg;
	void *virt;

	/* Allocated with __GFP_ZERO, so no need to zero */
	virt = kvm_mmu_memory_cache_alloc(mc);
	if (virt)
		kvm_account_pgtable_pages(virt, 1);
	return virt;
}

static void *kvm_host_zalloc_pages_exact(size_t size)
{
	return alloc_pages_exact(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
}

static void *kvm_s2_zalloc_pages_exact(size_t size)
{
	void *virt = kvm_host_zalloc_pages_exact(size);

	if (virt)
		kvm_account_pgtable_pages(virt, (size >> PAGE_SHIFT));
	return virt;
}

static void kvm_s2_free_pages_exact(void *virt, size_t size)
{
	kvm_account_pgtable_pages(virt, -(size >> PAGE_SHIFT));
	free_pages_exact(virt, size);
}

static struct kvm_pgtable_mm_ops kvm_s2_mm_ops;

static void stage2_free_unlinked_table_rcu_cb(struct rcu_head *head)
{
	struct page *page = container_of(head, struct page, rcu_head);
	void *pgtable = page_to_virt(page);
	u32 level = page_private(page);

	kvm_pgtable_stage2_free_unlinked(&kvm_s2_mm_ops, pgtable, level);
}

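/*
 * Free an unlinked table only after an RCU grace period, so that any
 * concurrent (RCU-protected) walkers are guaranteed to have finished.
 */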
static void stage2_free_unlinked_table(void *addr, u32 level)
{
	struct page *page = virt_to_page(addr);

	set_page_private(page, (unsigned long)level);
	call_rcu(&page->rcu_head, stage2_free_unlinked_table_rcu_cb);
}

static void kvm_host_get_page(void *addr)
{
	get_page(virt_to_page(addr));
}

static void kvm_host_put_page(void *addr)
{
	put_page(virt_to_page(addr));
}

static void kvm_s2_put_page(void *addr)
{
	struct page *p = virt_to_page(addr);
	/* Dropping last refcount, the page will be freed */
	if (page_count(p) == 1)
		kvm_account_pgtable_pages(addr, -1);
	put_page(p);
}

static int kvm_host_page_count(void *addr)
{
	return page_count(virt_to_page(addr));
}

static phys_addr_t kvm_host_pa(void *addr)
{
	return __pa(addr);
}

static void *kvm_host_va(phys_addr_t phys)
{
	return __va(phys);
}

static void clean_dcache_guest_page(void *va, size_t size)
{
	__clean_dcache_guest_page(va, size);
}

static void invalidate_icache_guest_page(void *va, size_t size)
{
	__invalidate_icache_guest_page(va, size);
}

/*
 * Unmapping vs dcache management:
 *
 * If a guest maps certain memory pages as uncached, all writes will
 * bypass the data cache and go directly to RAM.  However, the CPUs
 * can still speculate reads (not writes) and fill cache lines with
 * data.
 *
 * Those cache lines will be *clean* cache lines though, so a
 * clean+invalidate operation is equivalent to an invalidate
 * operation, because no cache lines are marked dirty.
 *
 * Those clean cache lines could be filled prior to an uncached write
 * by the guest, and the cache coherent IO subsystem would therefore
 * end up writing old data to disk.
 *
 * This is why right after unmapping a page/section and invalidating
 * the corresponding TLBs, we flush to make sure the IO subsystem will
 * never hit in the cache.
 *
 * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
 * we then fully enforce cacheability of RAM, no matter what the guest
 * does.
 */
/**
 * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
 * @mmu:   The KVM stage-2 MMU pointer
 * @start: The intermediate physical base address of the range to unmap
 * @size:  The size of the area to unmap
 * @may_block: Whether or not we are permitted to block
 *
 * Clear a range of stage-2 mappings, lowering the various ref-counts.  Must
 * be called while holding mmu_lock (unless for freeing the stage2 pgd before
 * destroying the VM), otherwise another faulting VCPU may come in and mess
 * with things behind our backs.
 */
static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size,
				 bool may_block)
{
	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
	phys_addr_t end = start + size;

	lockdep_assert_held_write(&kvm->mmu_lock);
	WARN_ON(size & ~PAGE_MASK);
	WARN_ON(stage2_apply_range(mmu, start, end, kvm_pgtable_stage2_unmap,
				   may_block));
}

static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
{
	__unmap_stage2_range(mmu, start, size, true);
}

static void stage2_flush_memslot(struct kvm *kvm,
				 struct kvm_memory_slot *memslot)
{
	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
	phys_addr_t end = addr + PAGE_SIZE * memslot->npages;

	stage2_apply_range_resched(&kvm->arch.mmu, addr, end, kvm_pgtable_stage2_flush);
}

/**
 * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
 * @kvm: The struct kvm pointer
 *
 * Go through the stage 2 page tables and invalidate any cache lines
 * backing memory already mapped to the VM.
 */
static void stage2_flush_vm(struct kvm *kvm)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int idx, bkt;

	idx = srcu_read_lock(&kvm->srcu);
	write_lock(&kvm->mmu_lock);

	slots = kvm_memslots(kvm);
	kvm_for_each_memslot(memslot, bkt, slots)
		stage2_flush_memslot(kvm, memslot);

	write_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
}

/**
 * free_hyp_pgds - free Hyp-mode page tables
 */
void __init free_hyp_pgds(void)
{
	mutex_lock(&kvm_hyp_pgd_mutex);
	if (hyp_pgtable) {
		kvm_pgtable_hyp_destroy(hyp_pgtable);
		kfree(hyp_pgtable);
		hyp_pgtable = NULL;
	}
	mutex_unlock(&kvm_hyp_pgd_mutex);
}

static bool kvm_host_owns_hyp_mappings(void)
{
	if (is_kernel_in_hyp_mode())
		return false;

	if (static_branch_likely(&kvm_protected_mode_initialized))
		return false;

	/*
	 * This can happen at boot time when __create_hyp_mappings() is called
	 * after the hyp protection has been enabled, but the static key has
	 * not been flipped yet.
	 */
	if (!hyp_pgtable && is_protected_kvm_enabled())
		return false;

	WARN_ON(!hyp_pgtable);

	return true;
}

int __create_hyp_mappings(unsigned long start, unsigned long size,
			  unsigned long phys, enum kvm_pgtable_prot prot)
{
	int err;

	if (WARN_ON(!kvm_host_owns_hyp_mappings()))
		return -EINVAL;

	mutex_lock(&kvm_hyp_pgd_mutex);
	err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot);
	mutex_unlock(&kvm_hyp_pgd_mutex);

	return err;
}

static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
{
	if (!is_vmalloc_addr(kaddr)) {
		BUG_ON(!virt_addr_valid(kaddr));
		return __pa(kaddr);
	} else {
		return page_to_phys(vmalloc_to_page(kaddr)) +
		       offset_in_page(kaddr);
	}
}

struct hyp_shared_pfn {
	u64 pfn;
	int count;
	struct rb_node node;
};

static DEFINE_MUTEX(hyp_shared_pfns_lock);
static struct rb_root hyp_shared_pfns = RB_ROOT;

static struct hyp_shared_pfn *find_shared_pfn(u64 pfn, struct rb_node ***node,
					      struct rb_node **parent)
{
	struct hyp_shared_pfn *this;

	*node = &hyp_shared_pfns.rb_node;
	*parent = NULL;
	while (**node) {
		this = container_of(**node, struct hyp_shared_pfn, node);
		*parent = **node;
		if (this->pfn < pfn)
			*node = &((**node)->rb_left);
		else if (this->pfn > pfn)
			*node = &((**node)->rb_right);
		else
			return this;
	}

	return NULL;
}

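/*
 * Share a page with the hypervisor. Repeated shares of the same pfn are
 * refcounted so the hypercall is only issued for the first user.
 */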
static int share_pfn_hyp(u64 pfn)
{
	struct rb_node **node, *parent;
	struct hyp_shared_pfn *this;
	int ret = 0;

	mutex_lock(&hyp_shared_pfns_lock);
	this = find_shared_pfn(pfn, &node, &parent);
	if (this) {
		this->count++;
		goto unlock;
	}

	this = kzalloc(sizeof(*this), GFP_KERNEL);
	if (!this) {
		ret = -ENOMEM;
		goto unlock;
	}

	this->pfn = pfn;
	this->count = 1;
	rb_link_node(&this->node, parent, node);
	rb_insert_color(&this->node, &hyp_shared_pfns);
	ret = kvm_call_hyp_nvhe(__pkvm_host_share_hyp, pfn, 1);
unlock:
	mutex_unlock(&hyp_shared_pfns_lock);

	return ret;
}

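/*
 * Drop a reference on a pfn shared with the hypervisor; the page is only
 * unshared once the last user has gone away.
 */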
static int unshare_pfn_hyp(u64 pfn)
{
	struct rb_node **node, *parent;
	struct hyp_shared_pfn *this;
	int ret = 0;

	mutex_lock(&hyp_shared_pfns_lock);
	this = find_shared_pfn(pfn, &node, &parent);
	if (WARN_ON(!this)) {
		ret = -ENOENT;
		goto unlock;
	}

	this->count--;
	if (this->count)
		goto unlock;

	rb_erase(&this->node, &hyp_shared_pfns);
	kfree(this);
	ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_hyp, pfn, 1);
unlock:
	mutex_unlock(&hyp_shared_pfns_lock);

	return ret;
}

int kvm_share_hyp(void *from, void *to)
{
	phys_addr_t start, end, cur;
	u64 pfn;
	int ret;

	if (is_kernel_in_hyp_mode())
		return 0;

	/*
	 * The share hcall maps things in the 'fixed-offset' region of the hyp
	 * VA space, so we can only share physically contiguous data-structures
	 * for now.
	 */
	if (is_vmalloc_or_module_addr(from) || is_vmalloc_or_module_addr(to))
		return -EINVAL;

	if (kvm_host_owns_hyp_mappings())
		return create_hyp_mappings(from, to, PAGE_HYP);

	start = ALIGN_DOWN(__pa(from), PAGE_SIZE);
	end = PAGE_ALIGN(__pa(to));
	for (cur = start; cur < end; cur += PAGE_SIZE) {
		pfn = __phys_to_pfn(cur);
		ret = share_pfn_hyp(pfn);
		if (ret)
			return ret;
	}

	return 0;
}

void kvm_unshare_hyp(void *from, void *to)
{
	phys_addr_t start, end, cur;
	u64 pfn;

	if (is_kernel_in_hyp_mode() || kvm_host_owns_hyp_mappings() || !from)
		return;

	start = ALIGN_DOWN(__pa(from), PAGE_SIZE);
	end = PAGE_ALIGN(__pa(to));
	for (cur = start; cur < end; cur += PAGE_SIZE) {
		pfn = __phys_to_pfn(cur);
		WARN_ON(unshare_pfn_hyp(pfn));
	}
}

/**
 * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
 * @from:	The virtual kernel start address of the range
 * @to:		The virtual kernel end address of the range (exclusive)
 * @prot:	The protection to be applied to this range
 *
 * The same virtual address as the kernel virtual address is also used
 * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
 * physical pages.
 */
int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
{
	phys_addr_t phys_addr;
	unsigned long virt_addr;
	unsigned long start = kern_hyp_va((unsigned long)from);
	unsigned long end = kern_hyp_va((unsigned long)to);

	if (is_kernel_in_hyp_mode())
		return 0;

	if (!kvm_host_owns_hyp_mappings())
		return -EPERM;

	start = start & PAGE_MASK;
	end = PAGE_ALIGN(end);

	for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
		int err;

		phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
		err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr,
					    prot);
		if (err)
			return err;
	}

	return 0;
}

static int __hyp_alloc_private_va_range(unsigned long base)
{
	lockdep_assert_held(&kvm_hyp_pgd_mutex);

	if (!PAGE_ALIGNED(base))
		return -EINVAL;

	/*
	 * Verify that BIT(VA_BITS - 1) hasn't been flipped by
	 * allocating the new area, as it would indicate we've
	 * overflowed the idmap/IO address range.
	 */
	if ((base ^ io_map_base) & BIT(VA_BITS - 1))
		return -ENOMEM;

	io_map_base = base;

	return 0;
}

/**
 * hyp_alloc_private_va_range - Allocates a private VA range.
 * @size:	The size of the VA range to reserve.
 * @haddr:	The hypervisor virtual start address of the allocation.
 *
 * The private virtual address (VA) range is allocated below io_map_base
 * and aligned based on the order of @size.
 *
 * Return: 0 on success or negative error code on failure.
 */
int hyp_alloc_private_va_range(size_t size, unsigned long *haddr)
{
	unsigned long base;
	int ret = 0;

	mutex_lock(&kvm_hyp_pgd_mutex);

	/*
	 * This assumes that we have enough space below the idmap
	 * page to allocate our VAs. If not, the check in
	 * __hyp_alloc_private_va_range() will kick in. A potential
	 * alternative would be to detect that overflow and switch
	 * to an allocation above the idmap.
	 *
	 * The allocated size is always a multiple of PAGE_SIZE.
	 */
	size = PAGE_ALIGN(size);
	base = io_map_base - size;
	ret = __hyp_alloc_private_va_range(base);

	mutex_unlock(&kvm_hyp_pgd_mutex);

	if (!ret)
		*haddr = base;

	return ret;
}

static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
					unsigned long *haddr,
					enum kvm_pgtable_prot prot)
{
	unsigned long addr;
	int ret = 0;

	if (!kvm_host_owns_hyp_mappings()) {
		addr = kvm_call_hyp_nvhe(__pkvm_create_private_mapping,
					 phys_addr, size, prot);
		if (IS_ERR_VALUE(addr))
			return addr;
		*haddr = addr;

		return 0;
	}

	size = PAGE_ALIGN(size + offset_in_page(phys_addr));
	ret = hyp_alloc_private_va_range(size, &addr);
	if (ret)
		return ret;

	ret = __create_hyp_mappings(addr, size, phys_addr, prot);
	if (ret)
		return ret;

	*haddr = addr + offset_in_page(phys_addr);
	return ret;
}

int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr)
{
	unsigned long base;
	size_t size;
	int ret;

	mutex_lock(&kvm_hyp_pgd_mutex);
	/*
	 * Efficient stack verification using the PAGE_SHIFT bit implies
	 * an alignment of our allocation on the order of the size.
	 */
	size = PAGE_SIZE * 2;
	base = ALIGN_DOWN(io_map_base - size, size);

	ret = __hyp_alloc_private_va_range(base);

	mutex_unlock(&kvm_hyp_pgd_mutex);

	if (ret) {
		kvm_err("Cannot allocate hyp stack guard page\n");
		return ret;
	}

	/*
	 * Since the stack grows downwards, map the stack to the page
	 * at the higher address and leave the lower guard page
	 * unbacked.
	 *
	 * Any valid stack address now has the PAGE_SHIFT bit as 1
	 * and addresses corresponding to the guard page have the
	 * PAGE_SHIFT bit as 0 - this is used for overflow detection.
	 */
	ret = __create_hyp_mappings(base + PAGE_SIZE, PAGE_SIZE, phys_addr,
				    PAGE_HYP);
	if (ret)
		kvm_err("Cannot map hyp stack\n");

	*haddr = base + size;

	return ret;
}

/**
 * create_hyp_io_mappings - Map IO into both kernel and HYP
 * @phys_addr:	The physical start address which gets mapped
 * @size:	Size of the region being mapped
 * @kaddr:	Kernel VA for this mapping
 * @haddr:	HYP VA for this mapping
 */
int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
			   void __iomem **kaddr,
			   void __iomem **haddr)
{
	unsigned long addr;
	int ret;

	if (is_protected_kvm_enabled())
		return -EPERM;

	*kaddr = ioremap(phys_addr, size);
	if (!*kaddr)
		return -ENOMEM;

	if (is_kernel_in_hyp_mode()) {
		*haddr = *kaddr;
		return 0;
	}

	ret = __create_hyp_private_mapping(phys_addr, size,
					   &addr, PAGE_HYP_DEVICE);
	if (ret) {
		iounmap(*kaddr);
		*kaddr = NULL;
		*haddr = NULL;
		return ret;
	}

	*haddr = (void __iomem *)addr;
	return 0;
}

/**
 * create_hyp_exec_mappings - Map an executable range into HYP
 * @phys_addr:	The physical start address which gets mapped
 * @size:	Size of the region being mapped
 * @haddr:	HYP VA for this mapping
 */
int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
			     void **haddr)
{
	unsigned long addr;
	int ret;

	BUG_ON(is_kernel_in_hyp_mode());

	ret = __create_hyp_private_mapping(phys_addr, size,
					   &addr, PAGE_HYP_EXEC);
	if (ret) {
		*haddr = NULL;
		return ret;
	}

	*haddr = (void *)addr;
	return 0;
}

static struct kvm_pgtable_mm_ops kvm_user_mm_ops = {
	/* We shouldn't need any other callback to walk the PT */
	.phys_to_virt		= kvm_host_va,
};

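/*
 * Walk the userspace (stage-1) page tables of the VM's mm to work out the
 * size of the mapping backing @addr, or return a negative error if the walk
 * fails or no valid leaf entry is found.
 */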
static int get_user_mapping_size(struct kvm *kvm, u64 addr)
{
	struct kvm_pgtable pgt = {
		.pgd		= (kvm_pteref_t)kvm->mm->pgd,
		.ia_bits	= vabits_actual,
		.start_level	= (KVM_PGTABLE_MAX_LEVELS -
				   CONFIG_PGTABLE_LEVELS),
		.mm_ops		= &kvm_user_mm_ops,
	};
	unsigned long flags;
	kvm_pte_t pte = 0;	/* Keep GCC quiet... */
	u32 level = ~0;
	int ret;

	/*
	 * Disable IRQs so that we hazard against a concurrent
	 * teardown of the userspace page tables (which relies on
	 * IPI-ing threads).
	 */
	local_irq_save(flags);
	ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level);
	local_irq_restore(flags);

	if (ret)
		return ret;

	/*
	 * Not seeing an error, but not updating level? Something went
	 * deeply wrong...
	 */
	if (WARN_ON(level >= KVM_PGTABLE_MAX_LEVELS))
		return -EFAULT;

	/* Oops, the userspace PTs are gone... Replay the fault */
	if (!kvm_pte_valid(pte))
		return -EAGAIN;

	return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level));
}

static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
	.zalloc_page		= stage2_memcache_zalloc_page,
	.zalloc_pages_exact	= kvm_s2_zalloc_pages_exact,
	.free_pages_exact	= kvm_s2_free_pages_exact,
	.free_unlinked_table	= stage2_free_unlinked_table,
	.get_page		= kvm_host_get_page,
	.put_page		= kvm_s2_put_page,
	.page_count		= kvm_host_page_count,
	.phys_to_virt		= kvm_host_va,
	.virt_to_phys		= kvm_host_pa,
	.dcache_clean_inval_poc	= clean_dcache_guest_page,
	.icache_inval_pou	= invalidate_icache_guest_page,
};

/**
 * kvm_init_stage2_mmu - Initialise a S2 MMU structure
 * @kvm:	The pointer to the KVM structure
 * @mmu:	The pointer to the s2 MMU structure
 * @type:	The machine type of the virtual machine
 *
 * Allocates only the stage-2 HW PGD level table(s).
 * Note we don't need locking here as this is only called when the VM is
 * created, which can only be done once.
 */
int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type)
{
	u32 kvm_ipa_limit = get_kvm_ipa_limit();
	int cpu, err;
	struct kvm_pgtable *pgt;
	u64 mmfr0, mmfr1;
	u32 phys_shift;

	if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK)
		return -EINVAL;

	phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type);
	if (is_protected_kvm_enabled()) {
		phys_shift = kvm_ipa_limit;
	} else if (phys_shift) {
		if (phys_shift > kvm_ipa_limit ||
		    phys_shift < ARM64_MIN_PARANGE_BITS)
			return -EINVAL;
	} else {
		phys_shift = KVM_PHYS_SHIFT;
		if (phys_shift > kvm_ipa_limit) {
			pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n",
				     current->comm);
			return -EINVAL;
		}
	}

	mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
	mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
	kvm->arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);

	if (mmu->pgt != NULL) {
		kvm_err("kvm_arch already initialized?\n");
		return -EINVAL;
	}

	pgt = kzalloc(sizeof(*pgt), GFP_KERNEL_ACCOUNT);
	if (!pgt)
		return -ENOMEM;

	mmu->arch = &kvm->arch;
	err = kvm_pgtable_stage2_init(pgt, mmu, &kvm_s2_mm_ops);
	if (err)
		goto out_free_pgtable;

	mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
	if (!mmu->last_vcpu_ran) {
		err = -ENOMEM;
		goto out_destroy_pgtable;
	}

	for_each_possible_cpu(cpu)
		*per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;

	/* The eager page splitting is disabled by default */
	mmu->split_page_chunk_size = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT;
	mmu->split_page_cache.gfp_zero = __GFP_ZERO;

	mmu->pgt = pgt;
	mmu->pgd_phys = __pa(pgt->pgd);
	return 0;

out_destroy_pgtable:
	kvm_pgtable_stage2_destroy(pgt);
out_free_pgtable:
	kfree(pgt);
	return err;
}

void kvm_uninit_stage2_mmu(struct kvm *kvm)
{
	kvm_free_stage2_pgd(&kvm->arch.mmu);
	kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
}

static void stage2_unmap_memslot(struct kvm *kvm,
				 struct kvm_memory_slot *memslot)
{
	hva_t hva = memslot->userspace_addr;
	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
	phys_addr_t size = PAGE_SIZE * memslot->npages;
	hva_t reg_end = hva + size;

	/*
	 * A memory region could potentially cover multiple VMAs, and any holes
	 * between them, so iterate over all of them to find out if we should
	 * unmap any of them.
	 *
	 *     +--------------------------------------------+
	 * +---------------+----------------+   +----------------+
	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
	 * +---------------+----------------+   +----------------+
	 *     |               memory region                |
	 *     +--------------------------------------------+
	 */
	do {
		struct vm_area_struct *vma;
		hva_t vm_start, vm_end;

		vma = find_vma_intersection(current->mm, hva, reg_end);
		if (!vma)
			break;

		/*
		 * Take the intersection of this VMA with the memory region
		 */
		vm_start = max(hva, vma->vm_start);
		vm_end = min(reg_end, vma->vm_end);

		if (!(vma->vm_flags & VM_PFNMAP)) {
			gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
			unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start);
		}
		hva = vm_end;
	} while (hva < reg_end);
}

/**
 * stage2_unmap_vm - Unmap Stage-2 RAM mappings
 * @kvm: The struct kvm pointer
 *
 * Go through the memregions and unmap any regular RAM
 * backing memory already mapped to the VM.
 */
void stage2_unmap_vm(struct kvm *kvm)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int idx, bkt;

	idx = srcu_read_lock(&kvm->srcu);
	mmap_read_lock(current->mm);
	write_lock(&kvm->mmu_lock);

	slots = kvm_memslots(kvm);
	kvm_for_each_memslot(memslot, bkt, slots)
		stage2_unmap_memslot(kvm, memslot);

	write_unlock(&kvm->mmu_lock);
	mmap_read_unlock(current->mm);
	srcu_read_unlock(&kvm->srcu, idx);
}

void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
{
	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
	struct kvm_pgtable *pgt = NULL;

	write_lock(&kvm->mmu_lock);
	pgt = mmu->pgt;
	if (pgt) {
		mmu->pgd_phys = 0;
		mmu->pgt = NULL;
		free_percpu(mmu->last_vcpu_ran);
	}
	write_unlock(&kvm->mmu_lock);

	if (pgt) {
		kvm_pgtable_stage2_destroy(pgt);
		kfree(pgt);
	}
}

static void hyp_mc_free_fn(void *addr, void *unused)
{
	free_page((unsigned long)addr);
}

static void *hyp_mc_alloc_fn(void *unused)
{
	return (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
}

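/*
 * Hyp memcaches are only used by protected KVM; on non-protected hosts
 * these helpers are no-ops.
 */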
free_hyp_memcache(struct kvm_hyp_memcache * mc)1039717a7eebSQuentin Perret void free_hyp_memcache(struct kvm_hyp_memcache *mc)
1040717a7eebSQuentin Perret {
1041717a7eebSQuentin Perret 	if (is_protected_kvm_enabled())
1042717a7eebSQuentin Perret 		__free_hyp_memcache(mc, hyp_mc_free_fn,
1043717a7eebSQuentin Perret 				    kvm_host_va, NULL);
1044717a7eebSQuentin Perret }
1045717a7eebSQuentin Perret 
topup_hyp_memcache(struct kvm_hyp_memcache * mc,unsigned long min_pages)1046717a7eebSQuentin Perret int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages)
1047717a7eebSQuentin Perret {
1048717a7eebSQuentin Perret 	if (!is_protected_kvm_enabled())
1049717a7eebSQuentin Perret 		return 0;
1050717a7eebSQuentin Perret 
1051717a7eebSQuentin Perret 	return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_fn,
1052717a7eebSQuentin Perret 				    kvm_host_pa, NULL);
1053717a7eebSQuentin Perret }
1054717a7eebSQuentin Perret 
10559ed24f4bSMarc Zyngier /**
10569ed24f4bSMarc Zyngier  * kvm_phys_addr_ioremap - map a device range to guest IPA
10579ed24f4bSMarc Zyngier  *
10589ed24f4bSMarc Zyngier  * @kvm:	The KVM pointer
10599ed24f4bSMarc Zyngier  * @guest_ipa:	The IPA at which to insert the mapping
10609ed24f4bSMarc Zyngier  * @pa:		The physical address of the device
10619ed24f4bSMarc Zyngier  * @size:	The size of the mapping
1062c9c0279cSXiaofei Tan  * @writable:   Whether or not to create a writable mapping
10639ed24f4bSMarc Zyngier  */
10649ed24f4bSMarc Zyngier int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
10659ed24f4bSMarc Zyngier 			  phys_addr_t pa, unsigned long size, bool writable)
10669ed24f4bSMarc Zyngier {
106702bbd374SWill Deacon 	phys_addr_t addr;
10689ed24f4bSMarc Zyngier 	int ret = 0;
1069837f66c7SDavid Matlack 	struct kvm_mmu_memory_cache cache = { .gfp_zero = __GFP_ZERO };
107002bbd374SWill Deacon 	struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
107102bbd374SWill Deacon 	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE |
107202bbd374SWill Deacon 				     KVM_PGTABLE_PROT_R |
107302bbd374SWill Deacon 				     (writable ? KVM_PGTABLE_PROT_W : 0);
10749ed24f4bSMarc Zyngier 
1075bff01cb6SQuentin Perret 	if (is_protected_kvm_enabled())
1076bff01cb6SQuentin Perret 		return -EPERM;
1077bff01cb6SQuentin Perret 
107802bbd374SWill Deacon 	size += offset_in_page(guest_ipa);
107902bbd374SWill Deacon 	guest_ipa &= PAGE_MASK;
10809ed24f4bSMarc Zyngier 
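	/*
	 * Map the range one page at a time, topping up the memory cache
	 * outside the mmu_lock before each stage-2 update.
	 */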
108102bbd374SWill Deacon 	for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) {
1082c1a33aebSSean Christopherson 		ret = kvm_mmu_topup_memory_cache(&cache,
108361ffb3a5SSean Christopherson 						 kvm_mmu_cache_min_pages(kvm));
10849ed24f4bSMarc Zyngier 		if (ret)
108502bbd374SWill Deacon 			break;
108602bbd374SWill Deacon 
1087fcc5bf89SJing Zhang 		write_lock(&kvm->mmu_lock);
108802bbd374SWill Deacon 		ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot,
10891577cb58SOliver Upton 					     &cache, 0);
1090fcc5bf89SJing Zhang 		write_unlock(&kvm->mmu_lock);
10919ed24f4bSMarc Zyngier 		if (ret)
109202bbd374SWill Deacon 			break;
10939ed24f4bSMarc Zyngier 
109402bbd374SWill Deacon 		pa += PAGE_SIZE;
10959ed24f4bSMarc Zyngier 	}
10969ed24f4bSMarc Zyngier 
1097c1a33aebSSean Christopherson 	kvm_mmu_free_memory_cache(&cache);
10989ed24f4bSMarc Zyngier 	return ret;
10999ed24f4bSMarc Zyngier }
11009ed24f4bSMarc Zyngier 
11019ed24f4bSMarc Zyngier /**
11029ed24f4bSMarc Zyngier  * stage2_wp_range() - write protect stage2 memory region range
1103c9c0279cSXiaofei Tan  * @mmu:        The KVM stage-2 MMU pointer
11049ed24f4bSMarc Zyngier  * @addr:	Start address of range
11059ed24f4bSMarc Zyngier  * @end:	End address of range
11069ed24f4bSMarc Zyngier  */
1107a0e50aa3SChristoffer Dall static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
11089ed24f4bSMarc Zyngier {
11098531bd63SMarc Zyngier 	stage2_apply_range_resched(mmu, addr, end, kvm_pgtable_stage2_wrprotect);
11109ed24f4bSMarc Zyngier }
11119ed24f4bSMarc Zyngier 
11129ed24f4bSMarc Zyngier /**
11139ed24f4bSMarc Zyngier  * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
11149ed24f4bSMarc Zyngier  * @kvm:	The KVM pointer
11159ed24f4bSMarc Zyngier  * @slot:	The memory slot to write protect
11169ed24f4bSMarc Zyngier  *
11179ed24f4bSMarc Zyngier  * Called to start logging dirty pages after the KVM_MEM_LOG_DIRTY_PAGES
11189ed24f4bSMarc Zyngier  * operation is called on a memory region. After this function returns,
11199ed24f4bSMarc Zyngier  * all present PUD, PMD and PTE entries in the memory region are write
11209ed24f4bSMarc Zyngier  * protected. Afterwards the dirty page log can be read.
11219ed24f4bSMarc Zyngier  *
11229ed24f4bSMarc Zyngier  * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
11239ed24f4bSMarc Zyngier  * serializing operations for VM memory regions.
11249ed24f4bSMarc Zyngier  */
1125eab62148SGavin Shan static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
11269ed24f4bSMarc Zyngier {
11279ed24f4bSMarc Zyngier 	struct kvm_memslots *slots = kvm_memslots(kvm);
11289ed24f4bSMarc Zyngier 	struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
11299ed24f4bSMarc Zyngier 	phys_addr_t start, end;
11309ed24f4bSMarc Zyngier 
11319ed24f4bSMarc Zyngier 	if (WARN_ON_ONCE(!memslot))
11329ed24f4bSMarc Zyngier 		return;
11339ed24f4bSMarc Zyngier 
11349ed24f4bSMarc Zyngier 	start = memslot->base_gfn << PAGE_SHIFT;
11359ed24f4bSMarc Zyngier 	end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
11369ed24f4bSMarc Zyngier 
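	/*
	 * Write-protect the whole slot under the write lock, then invalidate
	 * any TLB entries that could still permit writes to it.
	 */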
1137fcc5bf89SJing Zhang 	write_lock(&kvm->mmu_lock);
1138a0e50aa3SChristoffer Dall 	stage2_wp_range(&kvm->arch.mmu, start, end);
1139fcc5bf89SJing Zhang 	write_unlock(&kvm->mmu_lock);
11403756b6f2SRaghavendra Rao Ananta 	kvm_flush_remote_tlbs_memslot(kvm, memslot);
11419ed24f4bSMarc Zyngier }
11429ed24f4bSMarc Zyngier 
11439ed24f4bSMarc Zyngier /**
1144e7bf7a49SRicardo Koller  * kvm_mmu_split_memory_region() - split the stage 2 blocks into PAGE_SIZE
1145e7bf7a49SRicardo Koller  *				   pages for memory slot
1146e7bf7a49SRicardo Koller  * @kvm:	The KVM pointer
1147e7bf7a49SRicardo Koller  * @slot:	The memory slot to split
1148e7bf7a49SRicardo Koller  *
1149e7bf7a49SRicardo Koller  * Acquires kvm->mmu_lock. Called with kvm->slots_lock mutex acquired,
1150e7bf7a49SRicardo Koller  * serializing operations for VM memory regions.
1151e7bf7a49SRicardo Koller  */
1152e7bf7a49SRicardo Koller static void kvm_mmu_split_memory_region(struct kvm *kvm, int slot)
1153e7bf7a49SRicardo Koller {
1154e7bf7a49SRicardo Koller 	struct kvm_memslots *slots;
1155e7bf7a49SRicardo Koller 	struct kvm_memory_slot *memslot;
1156e7bf7a49SRicardo Koller 	phys_addr_t start, end;
1157e7bf7a49SRicardo Koller 
1158e7bf7a49SRicardo Koller 	lockdep_assert_held(&kvm->slots_lock);
1159e7bf7a49SRicardo Koller 
1160e7bf7a49SRicardo Koller 	slots = kvm_memslots(kvm);
1161e7bf7a49SRicardo Koller 	memslot = id_to_memslot(slots, slot);
1162e7bf7a49SRicardo Koller 
1163e7bf7a49SRicardo Koller 	start = memslot->base_gfn << PAGE_SHIFT;
1164e7bf7a49SRicardo Koller 	end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
1165e7bf7a49SRicardo Koller 
1166e7bf7a49SRicardo Koller 	write_lock(&kvm->mmu_lock);
1167e7bf7a49SRicardo Koller 	kvm_mmu_split_huge_pages(kvm, start, end);
1168e7bf7a49SRicardo Koller 	write_unlock(&kvm->mmu_lock);
1169e7bf7a49SRicardo Koller }
1170e7bf7a49SRicardo Koller 
11719ed24f4bSMarc Zyngier /*
11723005f6f2SRicardo Koller  * kvm_arch_mmu_enable_log_dirty_pt_masked() - enable dirty logging for selected pages.
11733005f6f2SRicardo Koller  * @kvm:	The KVM pointer
11743005f6f2SRicardo Koller  * @slot:	The memory slot associated with mask
11753005f6f2SRicardo Koller  * @gfn_offset:	The gfn offset in memory slot
11763005f6f2SRicardo Koller  * @mask:	The mask of pages at offset 'gfn_offset' in this memory
11773005f6f2SRicardo Koller  *		slot to enable dirty logging on
11789ed24f4bSMarc Zyngier  *
11796acf5166SRicardo Koller  * Write-protects selected pages to enable dirty logging, and then
11806acf5166SRicardo Koller  * splits them to PAGE_SIZE. The caller must hold kvm->mmu_lock for write.
11819ed24f4bSMarc Zyngier  */
11829ed24f4bSMarc Zyngier void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
11839ed24f4bSMarc Zyngier 		struct kvm_memory_slot *slot,
11849ed24f4bSMarc Zyngier 		gfn_t gfn_offset, unsigned long mask)
11859ed24f4bSMarc Zyngier {
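	/*
	 * @mask covers at most 64 pages starting at @gfn_offset; the IPA range
	 * to write-protect is bounded by its lowest and highest set bits.
	 */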
11863005f6f2SRicardo Koller 	phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
11873005f6f2SRicardo Koller 	phys_addr_t start = (base_gfn +  __ffs(mask)) << PAGE_SHIFT;
11883005f6f2SRicardo Koller 	phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
11893005f6f2SRicardo Koller 
11903005f6f2SRicardo Koller 	lockdep_assert_held_write(&kvm->mmu_lock);
11913005f6f2SRicardo Koller 
11923005f6f2SRicardo Koller 	stage2_wp_range(&kvm->arch.mmu, start, end);
11936acf5166SRicardo Koller 
11946acf5166SRicardo Koller 	/*
11956acf5166SRicardo Koller 	 * Eager-splitting is done when manual-protect is set.  We
11966acf5166SRicardo Koller 	 * also check for initially-all-set because eager-splitting
11976acf5166SRicardo Koller 	 * can be skipped when initially-all-set is false: in that
11986acf5166SRicardo Koller 	 * case the huge pages were already split when dirty logging
11996acf5166SRicardo Koller 	 * was enabled, so there is no need to do it again.
12016acf5166SRicardo Koller 	 */
12026acf5166SRicardo Koller 	if (kvm_dirty_log_manual_protect_and_init_set(kvm))
12036acf5166SRicardo Koller 		kvm_mmu_split_huge_pages(kvm, start, end);
12049ed24f4bSMarc Zyngier }
12059ed24f4bSMarc Zyngier 
12069ed24f4bSMarc Zyngier static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
12079ed24f4bSMarc Zyngier {
12089ed24f4bSMarc Zyngier 	send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
12099ed24f4bSMarc Zyngier }
12109ed24f4bSMarc Zyngier 
12119ed24f4bSMarc Zyngier static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
12129ed24f4bSMarc Zyngier 					       unsigned long hva,
12139ed24f4bSMarc Zyngier 					       unsigned long map_size)
12149ed24f4bSMarc Zyngier {
12159ed24f4bSMarc Zyngier 	gpa_t gpa_start;
12169ed24f4bSMarc Zyngier 	hva_t uaddr_start, uaddr_end;
12179ed24f4bSMarc Zyngier 	size_t size;
12189ed24f4bSMarc Zyngier 
12199f283614SSuzuki K Poulose 	/* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */
12209f283614SSuzuki K Poulose 	if (map_size == PAGE_SIZE)
12219f283614SSuzuki K Poulose 		return true;
12229f283614SSuzuki K Poulose 
12239ed24f4bSMarc Zyngier 	size = memslot->npages * PAGE_SIZE;
12249ed24f4bSMarc Zyngier 
12259ed24f4bSMarc Zyngier 	gpa_start = memslot->base_gfn << PAGE_SHIFT;
12269ed24f4bSMarc Zyngier 
12279ed24f4bSMarc Zyngier 	uaddr_start = memslot->userspace_addr;
12289ed24f4bSMarc Zyngier 	uaddr_end = uaddr_start + size;
12299ed24f4bSMarc Zyngier 
12309ed24f4bSMarc Zyngier 	/*
12319ed24f4bSMarc Zyngier 	 * Pages belonging to memslots that don't have the same alignment
12329ed24f4bSMarc Zyngier 	 * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
12339ed24f4bSMarc Zyngier 	 * PMD/PUD entries, because we'll end up mapping the wrong pages.
12349ed24f4bSMarc Zyngier 	 *
12359ed24f4bSMarc Zyngier 	 * Consider a layout like the following:
12369ed24f4bSMarc Zyngier 	 *
12379ed24f4bSMarc Zyngier 	 *    memslot->userspace_addr:
12389ed24f4bSMarc Zyngier 	 *    +-----+--------------------+--------------------+---+
12399ed24f4bSMarc Zyngier 	 *    |abcde|fgh  Stage-1 block  |    Stage-1 block tv|xyz|
12409ed24f4bSMarc Zyngier 	 *    +-----+--------------------+--------------------+---+
12419ed24f4bSMarc Zyngier 	 *
12429f283614SSuzuki K Poulose 	 *    memslot->base_gfn << PAGE_SHIFT:
12439ed24f4bSMarc Zyngier 	 *      +---+--------------------+--------------------+-----+
12449ed24f4bSMarc Zyngier 	 *      |abc|def  Stage-2 block  |    Stage-2 block   |tvxyz|
12459ed24f4bSMarc Zyngier 	 *      +---+--------------------+--------------------+-----+
12469ed24f4bSMarc Zyngier 	 *
12479ed24f4bSMarc Zyngier 	 * If we create those stage-2 blocks, we'll end up with this incorrect
12489ed24f4bSMarc Zyngier 	 * mapping:
12499ed24f4bSMarc Zyngier 	 *   d -> f
12509ed24f4bSMarc Zyngier 	 *   e -> g
12519ed24f4bSMarc Zyngier 	 *   f -> h
12529ed24f4bSMarc Zyngier 	 */
12539ed24f4bSMarc Zyngier 	if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
12549ed24f4bSMarc Zyngier 		return false;
12559ed24f4bSMarc Zyngier 
12569ed24f4bSMarc Zyngier 	/*
12579ed24f4bSMarc Zyngier 	 * Next, let's make sure we're not trying to map anything not covered
12589ed24f4bSMarc Zyngier 	 * by the memslot. This means we have to prohibit block size mappings
12599ed24f4bSMarc Zyngier 	 * for the beginning and end of a non-block aligned and non-block sized
12609ed24f4bSMarc Zyngier 	 * memory slot (illustrated by the head and tail parts of the
12619ed24f4bSMarc Zyngier 	 * userspace view above containing pages 'abcde' and 'xyz',
12629ed24f4bSMarc Zyngier 	 * respectively).
12639ed24f4bSMarc Zyngier 	 *
12649ed24f4bSMarc Zyngier 	 * Note that it doesn't matter if we do the check using the
12659ed24f4bSMarc Zyngier 	 * userspace_addr or the base_gfn, as both are equally aligned (per
12669ed24f4bSMarc Zyngier 	 * the check above) and equally sized.
12679ed24f4bSMarc Zyngier 	 */
12689ed24f4bSMarc Zyngier 	return (hva & ~(map_size - 1)) >= uaddr_start &&
12699ed24f4bSMarc Zyngier 	       (hva & ~(map_size - 1)) + map_size <= uaddr_end;
12709ed24f4bSMarc Zyngier }
12719ed24f4bSMarc Zyngier 
12720529c902SSuzuki K Poulose /*
12730529c902SSuzuki K Poulose  * Check if the given hva is backed by a transparent huge page (THP) and
12740529c902SSuzuki K Poulose  * whether it can be mapped using block mapping in stage2. If so, adjust
12750529c902SSuzuki K Poulose  * the stage2 PFN and IPA accordingly. Only PMD_SIZE THPs are currently
12760529c902SSuzuki K Poulose  * supported. This will need to be updated to support other THP sizes.
12770529c902SSuzuki K Poulose  *
12780529c902SSuzuki K Poulose  * Returns the size of the mapping.
12790529c902SSuzuki K Poulose  */
1280e86fc1a3SMarc Zyngier static long
12816011cf68SMarc Zyngier transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
12820529c902SSuzuki K Poulose 			    unsigned long hva, kvm_pfn_t *pfnp,
12830529c902SSuzuki K Poulose 			    phys_addr_t *ipap)
12840529c902SSuzuki K Poulose {
12850529c902SSuzuki K Poulose 	kvm_pfn_t pfn = *pfnp;
12860529c902SSuzuki K Poulose 
12870529c902SSuzuki K Poulose 	/*
12880529c902SSuzuki K Poulose 	 * Make sure the adjustment is done only for THP pages. Also make
12890529c902SSuzuki K Poulose 	 * sure that the HVA and IPA are sufficiently aligned and that the
12900529c902SSuzuki K Poulose 	 * block map is contained within the memslot.
12910529c902SSuzuki K Poulose 	 */
1292e86fc1a3SMarc Zyngier 	if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
1293e86fc1a3SMarc Zyngier 		int sz = get_user_mapping_size(kvm, hva);
1294e86fc1a3SMarc Zyngier 
1295e86fc1a3SMarc Zyngier 		if (sz < 0)
1296e86fc1a3SMarc Zyngier 			return sz;
1297e86fc1a3SMarc Zyngier 
1298e86fc1a3SMarc Zyngier 		if (sz < PMD_SIZE)
1299e86fc1a3SMarc Zyngier 			return PAGE_SIZE;
1300e86fc1a3SMarc Zyngier 
13010529c902SSuzuki K Poulose 		/*
13020529c902SSuzuki K Poulose 		 * The address we faulted on is backed by a transparent huge
13030529c902SSuzuki K Poulose 		 * page.  However, because we map the compound huge page and
13040529c902SSuzuki K Poulose 		 * not the individual tail page, we need to transfer the
13050529c902SSuzuki K Poulose 		 * refcount to the head page.  We have to be careful that the
13060529c902SSuzuki K Poulose 		 * THP doesn't start to split while we are adjusting the
13070529c902SSuzuki K Poulose 		 * refcounts.
13080529c902SSuzuki K Poulose 		 *
130920ec3ebdSChao Peng 		 * We are sure this doesn't happen, because mmu_invalidate_retry
13100529c902SSuzuki K Poulose 		 * was successful and we are holding the mmu_lock, so if this
13110529c902SSuzuki K Poulose 		 * THP is trying to split, it will be blocked in the mmu
13120529c902SSuzuki K Poulose 		 * notifier before touching any of the pages, specifically
13130529c902SSuzuki K Poulose 		 * before being able to call __split_huge_page_refcount().
13140529c902SSuzuki K Poulose 		 *
13150529c902SSuzuki K Poulose 		 * We can therefore safely transfer the refcount from PG_tail
13160529c902SSuzuki K Poulose 		 * to PG_head and switch the pfn from a tail page to the head
13170529c902SSuzuki K Poulose 		 * page accordingly.
13180529c902SSuzuki K Poulose 		 */
13190529c902SSuzuki K Poulose 		*ipap &= PMD_MASK;
13200529c902SSuzuki K Poulose 		kvm_release_pfn_clean(pfn);
13210529c902SSuzuki K Poulose 		pfn &= ~(PTRS_PER_PMD - 1);
13220fe49630SMarc Zyngier 		get_page(pfn_to_page(pfn));
13230529c902SSuzuki K Poulose 		*pfnp = pfn;
13240529c902SSuzuki K Poulose 
13250529c902SSuzuki K Poulose 		return PMD_SIZE;
13260529c902SSuzuki K Poulose 	}
13270529c902SSuzuki K Poulose 
13280529c902SSuzuki K Poulose 	/* Use page mapping if we cannot use block mapping. */
13290529c902SSuzuki K Poulose 	return PAGE_SIZE;
13300529c902SSuzuki K Poulose }
13310529c902SSuzuki K Poulose 
13322aa53d68SKeqian Zhu static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva)
13332aa53d68SKeqian Zhu {
13342aa53d68SKeqian Zhu 	unsigned long pa;
13352aa53d68SKeqian Zhu 
13362aa53d68SKeqian Zhu 	if (is_vm_hugetlb_page(vma) && !(vma->vm_flags & VM_PFNMAP))
13372aa53d68SKeqian Zhu 		return huge_page_shift(hstate_vma(vma));
13382aa53d68SKeqian Zhu 
13392aa53d68SKeqian Zhu 	if (!(vma->vm_flags & VM_PFNMAP))
13402aa53d68SKeqian Zhu 		return PAGE_SHIFT;
13412aa53d68SKeqian Zhu 
13422aa53d68SKeqian Zhu 	VM_BUG_ON(is_vm_hugetlb_page(vma));
13432aa53d68SKeqian Zhu 
13442aa53d68SKeqian Zhu 	pa = (vma->vm_pgoff << PAGE_SHIFT) + (hva - vma->vm_start);
13452aa53d68SKeqian Zhu 
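	/*
	 * For a VM_PFNMAP mapping, pick the largest block size (PUD, then PMD)
	 * for which the HVA and PA share the same alignment and the whole
	 * block lies inside the VMA; otherwise fall back to PAGE_SHIFT.
	 */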
13462aa53d68SKeqian Zhu #ifndef __PAGETABLE_PMD_FOLDED
13472aa53d68SKeqian Zhu 	if ((hva & (PUD_SIZE - 1)) == (pa & (PUD_SIZE - 1)) &&
13482aa53d68SKeqian Zhu 	    ALIGN_DOWN(hva, PUD_SIZE) >= vma->vm_start &&
13492aa53d68SKeqian Zhu 	    ALIGN(hva, PUD_SIZE) <= vma->vm_end)
13502aa53d68SKeqian Zhu 		return PUD_SHIFT;
13512aa53d68SKeqian Zhu #endif
13522aa53d68SKeqian Zhu 
13532aa53d68SKeqian Zhu 	if ((hva & (PMD_SIZE - 1)) == (pa & (PMD_SIZE - 1)) &&
13542aa53d68SKeqian Zhu 	    ALIGN_DOWN(hva, PMD_SIZE) >= vma->vm_start &&
13552aa53d68SKeqian Zhu 	    ALIGN(hva, PMD_SIZE) <= vma->vm_end)
13562aa53d68SKeqian Zhu 		return PMD_SHIFT;
13572aa53d68SKeqian Zhu 
13582aa53d68SKeqian Zhu 	return PAGE_SHIFT;
13592aa53d68SKeqian Zhu }
13602aa53d68SKeqian Zhu 
1361ea7fc1bbSSteven Price /*
1362ea7fc1bbSSteven Price  * The page will be mapped in stage 2 as Normal Cacheable, so the VM will be
1363ea7fc1bbSSteven Price  * able to see the page's tags and therefore they must be initialised first. If
1364ea7fc1bbSSteven Price  * PG_mte_tagged is set, tags have already been initialised.
1365ea7fc1bbSSteven Price  *
1366ea7fc1bbSSteven Price  * The race in the test/set of the PG_mte_tagged flag is handled by:
1367ea7fc1bbSSteven Price  * - preventing VM_SHARED mappings in a memslot with MTE preventing two VMs
1368ea7fc1bbSSteven Price  *   racing to sanitise the same page
1369ea7fc1bbSSteven Price  * - mmap_lock protects between a VM faulting a page in and the VMM performing
1370ea7fc1bbSSteven Price  *   an mprotect() to add VM_MTE
1371ea7fc1bbSSteven Price  */
13722dbf12aeSCatalin Marinas static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
1373ea7fc1bbSSteven Price 			      unsigned long size)
1374ea7fc1bbSSteven Price {
1375ea7fc1bbSSteven Price 	unsigned long i, nr_pages = size >> PAGE_SHIFT;
13762dbf12aeSCatalin Marinas 	struct page *page = pfn_to_page(pfn);
1377ea7fc1bbSSteven Price 
1378ea7fc1bbSSteven Price 	if (!kvm_has_mte(kvm))
13792dbf12aeSCatalin Marinas 		return;
1380ea7fc1bbSSteven Price 
1381ea7fc1bbSSteven Price 	for (i = 0; i < nr_pages; i++, page++) {
1382d77e59a8SCatalin Marinas 		if (try_page_mte_tagging(page)) {
1383ea7fc1bbSSteven Price 			mte_clear_page_tags(page_address(page));
1384e059853dSCatalin Marinas 			set_page_mte_tagged(page);
1385ea7fc1bbSSteven Price 		}
1386ea7fc1bbSSteven Price 	}
1387ea7fc1bbSSteven Price }
1388ea7fc1bbSSteven Price 
1389d89585fbSPeter Collingbourne static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
1390d89585fbSPeter Collingbourne {
1391d89585fbSPeter Collingbourne 	return vma->vm_flags & VM_MTE_ALLOWED;
1392ea7fc1bbSSteven Price }
1393ea7fc1bbSSteven Price 
13949ed24f4bSMarc Zyngier static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
13959ed24f4bSMarc Zyngier 			  struct kvm_memory_slot *memslot, unsigned long hva,
13969ed24f4bSMarc Zyngier 			  unsigned long fault_status)
13979ed24f4bSMarc Zyngier {
1398ffd1b63aSWill Deacon 	int ret = 0;
13999ed24f4bSMarc Zyngier 	bool write_fault, writable, force_pte = false;
14008c2e8ac8SMarc Zyngier 	bool exec_fault, mte_allowed;
14016f745f1bSWill Deacon 	bool device = false;
14029ed24f4bSMarc Zyngier 	unsigned long mmu_seq;
14039ed24f4bSMarc Zyngier 	struct kvm *kvm = vcpu->kvm;
14049ed24f4bSMarc Zyngier 	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
14059ed24f4bSMarc Zyngier 	struct vm_area_struct *vma;
14069ed24f4bSMarc Zyngier 	short vma_shift;
14076f745f1bSWill Deacon 	gfn_t gfn;
14089ed24f4bSMarc Zyngier 	kvm_pfn_t pfn;
14099ed24f4bSMarc Zyngier 	bool logging_active = memslot_is_logging(memslot);
14107d894834SYanan Wang 	unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu);
1411e86fc1a3SMarc Zyngier 	long vma_pagesize, fault_granule;
14126f745f1bSWill Deacon 	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
14136f745f1bSWill Deacon 	struct kvm_pgtable *pgt;
14149ed24f4bSMarc Zyngier 
14157d894834SYanan Wang 	fault_granule = 1UL << ARM64_HW_PGTABLE_LEVEL_SHIFT(fault_level);
14169ed24f4bSMarc Zyngier 	write_fault = kvm_is_write_fault(vcpu);
1417c4ad98e4SMarc Zyngier 	exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
14189ed24f4bSMarc Zyngier 	VM_BUG_ON(write_fault && exec_fault);
14199ed24f4bSMarc Zyngier 
1420b0803ba7SMarc Zyngier 	if (fault_status == ESR_ELx_FSC_PERM && !write_fault && !exec_fault) {
14219ed24f4bSMarc Zyngier 		kvm_err("Unexpected L2 read permission error\n");
14229ed24f4bSMarc Zyngier 		return -EFAULT;
14239ed24f4bSMarc Zyngier 	}
14249ed24f4bSMarc Zyngier 
14252aa53d68SKeqian Zhu 	/*
142613ec9308SDavid Matlack 	 * Permission faults just need to update the existing leaf entry,
142713ec9308SDavid Matlack 	 * and so normally don't require allocations from the memcache. The
142813ec9308SDavid Matlack 	 * only exception to this is when dirty logging is enabled at runtime
142913ec9308SDavid Matlack 	 * and a write fault needs to collapse a block entry into a table.
143013ec9308SDavid Matlack 	 */
143113ec9308SDavid Matlack 	if (fault_status != ESR_ELx_FSC_PERM ||
143213ec9308SDavid Matlack 	    (logging_active && write_fault)) {
143313ec9308SDavid Matlack 		ret = kvm_mmu_topup_memory_cache(memcache,
143413ec9308SDavid Matlack 						 kvm_mmu_cache_min_pages(kvm));
143513ec9308SDavid Matlack 		if (ret)
143613ec9308SDavid Matlack 			return ret;
143713ec9308SDavid Matlack 	}
143813ec9308SDavid Matlack 
143913ec9308SDavid Matlack 	/*
14402aa53d68SKeqian Zhu 	 * Let's check if we will get back a huge page backed by hugetlbfs, or
14412aa53d68SKeqian Zhu 	 * get block mapping for device MMIO region.
14422aa53d68SKeqian Zhu 	 */
144389154dd5SMichel Lespinasse 	mmap_read_lock(current->mm);
144409eef83aSLiam Howlett 	vma = vma_lookup(current->mm, hva);
14459ed24f4bSMarc Zyngier 	if (unlikely(!vma)) {
14469ed24f4bSMarc Zyngier 		kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
144789154dd5SMichel Lespinasse 		mmap_read_unlock(current->mm);
14489ed24f4bSMarc Zyngier 		return -EFAULT;
14499ed24f4bSMarc Zyngier 	}
14509ed24f4bSMarc Zyngier 
14512aa53d68SKeqian Zhu 	/*
14522aa53d68SKeqian Zhu 	 * logging_active is guaranteed to never be true for VM_PFNMAP
14532aa53d68SKeqian Zhu 	 * memslots.
14542aa53d68SKeqian Zhu 	 */
14552aa53d68SKeqian Zhu 	if (logging_active) {
14569ed24f4bSMarc Zyngier 		force_pte = true;
1457523b3999SAlexandru Elisei 		vma_shift = PAGE_SHIFT;
14582aa53d68SKeqian Zhu 	} else {
14592aa53d68SKeqian Zhu 		vma_shift = get_vma_page_shift(vma, hva);
14609ed24f4bSMarc Zyngier 	}
14619ed24f4bSMarc Zyngier 
14622f40c460SGavin Shan 	switch (vma_shift) {
1463faf00039SGavin Shan #ifndef __PAGETABLE_PMD_FOLDED
14642f40c460SGavin Shan 	case PUD_SHIFT:
14652f40c460SGavin Shan 		if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE))
14662f40c460SGavin Shan 			break;
14672f40c460SGavin Shan 		fallthrough;
1468faf00039SGavin Shan #endif
14692f40c460SGavin Shan 	case CONT_PMD_SHIFT:
1470523b3999SAlexandru Elisei 		vma_shift = PMD_SHIFT;
14712f40c460SGavin Shan 		fallthrough;
14722f40c460SGavin Shan 	case PMD_SHIFT:
14732f40c460SGavin Shan 		if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE))
14742f40c460SGavin Shan 			break;
14752f40c460SGavin Shan 		fallthrough;
14762f40c460SGavin Shan 	case CONT_PTE_SHIFT:
1477523b3999SAlexandru Elisei 		vma_shift = PAGE_SHIFT;
14782f40c460SGavin Shan 		force_pte = true;
14792f40c460SGavin Shan 		fallthrough;
14802f40c460SGavin Shan 	case PAGE_SHIFT:
14812f40c460SGavin Shan 		break;
14822f40c460SGavin Shan 	default:
14832f40c460SGavin Shan 		WARN_ONCE(1, "Unknown vma_shift %d", vma_shift);
1484523b3999SAlexandru Elisei 	}
1485523b3999SAlexandru Elisei 
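	/*
	 * Compute the mapping size and, for PMD/PUD-sized blocks, align the
	 * faulting IPA down to the block boundary.
	 */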
1486523b3999SAlexandru Elisei 	vma_pagesize = 1UL << vma_shift;
14876f745f1bSWill Deacon 	if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE)
1488523b3999SAlexandru Elisei 		fault_ipa &= ~(vma_pagesize - 1);
14896f745f1bSWill Deacon 
14906f745f1bSWill Deacon 	gfn = fault_ipa >> PAGE_SHIFT;
14918c2e8ac8SMarc Zyngier 	mte_allowed = kvm_vma_mte_allowed(vma);
14928c2e8ac8SMarc Zyngier 
14938c2e8ac8SMarc Zyngier 	/* Don't use the VMA after the unlock -- it may have vanished */
14948c2e8ac8SMarc Zyngier 	vma = NULL;
14959ed24f4bSMarc Zyngier 
14966f745f1bSWill Deacon 	/*
149713ec9308SDavid Matlack 	 * Read mmu_invalidate_seq so that KVM can detect if the results of
149813ec9308SDavid Matlack 	 * vma_lookup() or __gfn_to_pfn_memslot() become stale prior to
149913ec9308SDavid Matlack 	 * acquiring kvm->mmu_lock.
150010ba2d17SGavin Shan 	 *
150113ec9308SDavid Matlack 	 * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
150213ec9308SDavid Matlack 	 * with the smp_wmb() in kvm_mmu_invalidate_end().
15039ed24f4bSMarc Zyngier 	 */
150413ec9308SDavid Matlack 	mmu_seq = vcpu->kvm->mmu_invalidate_seq;
150513ec9308SDavid Matlack 	mmap_read_unlock(current->mm);
15069ed24f4bSMarc Zyngier 
1507c8b88b33SPeter Xu 	pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL,
150810ba2d17SGavin Shan 				   write_fault, &writable, NULL);
15099ed24f4bSMarc Zyngier 	if (pfn == KVM_PFN_ERR_HWPOISON) {
15109ed24f4bSMarc Zyngier 		kvm_send_hwpoison_signal(hva, vma_shift);
15119ed24f4bSMarc Zyngier 		return 0;
15129ed24f4bSMarc Zyngier 	}
15139ed24f4bSMarc Zyngier 	if (is_error_noslot_pfn(pfn))
15149ed24f4bSMarc Zyngier 		return -EFAULT;
15159ed24f4bSMarc Zyngier 
15169ed24f4bSMarc Zyngier 	if (kvm_is_device_pfn(pfn)) {
15172aa53d68SKeqian Zhu 		/*
15182aa53d68SKeqian Zhu 		 * If the page was identified as device early by looking at
15192aa53d68SKeqian Zhu 		 * the VMA flags, vma_pagesize is already representing the
15202aa53d68SKeqian Zhu 		 * largest quantity we can map.  If instead it was mapped
15212aa53d68SKeqian Zhu 		 * via gfn_to_pfn_prot(), vma_pagesize is set to PAGE_SIZE
15222aa53d68SKeqian Zhu 		 * and must not be upgraded.
15232aa53d68SKeqian Zhu 		 *
15242aa53d68SKeqian Zhu 		 * In both cases, we don't let transparent_hugepage_adjust()
15252aa53d68SKeqian Zhu 		 * change things at the last minute.
15262aa53d68SKeqian Zhu 		 */
15276f745f1bSWill Deacon 		device = true;
15286f745f1bSWill Deacon 	} else if (logging_active && !write_fault) {
15299ed24f4bSMarc Zyngier 		/*
15309ed24f4bSMarc Zyngier 		 * Only actually map the page as writable if this was a write
15319ed24f4bSMarc Zyngier 		 * fault.
15329ed24f4bSMarc Zyngier 		 */
15339ed24f4bSMarc Zyngier 		writable = false;
15349ed24f4bSMarc Zyngier 	}
15359ed24f4bSMarc Zyngier 
15366f745f1bSWill Deacon 	if (exec_fault && device)
15379ed24f4bSMarc Zyngier 		return -ENOEXEC;
15389ed24f4bSMarc Zyngier 
1539f783ef1cSJing Zhang 	read_lock(&kvm->mmu_lock);
15406f745f1bSWill Deacon 	pgt = vcpu->arch.hw_mmu->pgt;
154120ec3ebdSChao Peng 	if (mmu_invalidate_retry(kvm, mmu_seq))
15429ed24f4bSMarc Zyngier 		goto out_unlock;
15439ed24f4bSMarc Zyngier 
15449ed24f4bSMarc Zyngier 	/*
15450529c902SSuzuki K Poulose 	 * If we are not forced to use page mapping, check if we are
15460529c902SSuzuki K Poulose 	 * backed by a THP and thus use block mapping if possible.
15479ed24f4bSMarc Zyngier 	 */
1548f2cc3273SMarc Zyngier 	if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) {
1549b0803ba7SMarc Zyngier 		if (fault_status ==  ESR_ELx_FSC_PERM &&
1550b0803ba7SMarc Zyngier 		    fault_granule > PAGE_SIZE)
1551f2cc3273SMarc Zyngier 			vma_pagesize = fault_granule;
1552f2cc3273SMarc Zyngier 		else
1553f2cc3273SMarc Zyngier 			vma_pagesize = transparent_hugepage_adjust(kvm, memslot,
1554f2cc3273SMarc Zyngier 								   hva, &pfn,
1555f2cc3273SMarc Zyngier 								   &fault_ipa);
1556e86fc1a3SMarc Zyngier 
1557e86fc1a3SMarc Zyngier 		if (vma_pagesize < 0) {
1558e86fc1a3SMarc Zyngier 			ret = vma_pagesize;
1559e86fc1a3SMarc Zyngier 			goto out_unlock;
1560e86fc1a3SMarc Zyngier 		}
1561f2cc3273SMarc Zyngier 	}
15629f03db66SMarc Zyngier 
1563b0803ba7SMarc Zyngier 	if (fault_status != ESR_ELx_FSC_PERM && !device && kvm_has_mte(kvm)) {
1564d89585fbSPeter Collingbourne 		/* Check the VMM hasn't introduced a new disallowed VMA */
15658c2e8ac8SMarc Zyngier 		if (mte_allowed) {
15662dbf12aeSCatalin Marinas 			sanitise_mte_tags(kvm, pfn, vma_pagesize);
15672dbf12aeSCatalin Marinas 		} else {
15689f03db66SMarc Zyngier 			ret = -EFAULT;
15699f03db66SMarc Zyngier 			goto out_unlock;
15709f03db66SMarc Zyngier 		}
15712dbf12aeSCatalin Marinas 	}
15729f03db66SMarc Zyngier 
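	/*
	 * Build the stage-2 permissions: W if the page may be mapped writable,
	 * X for exec faults, and Device attributes for device mappings; normal
	 * memory is also mapped executable when the CPUs advertise
	 * ARM64_HAS_CACHE_DIC.
	 */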
1573509552e6SYanan Wang 	if (writable)
15746f745f1bSWill Deacon 		prot |= KVM_PGTABLE_PROT_W;
15759ed24f4bSMarc Zyngier 
157625aa2869SYanan Wang 	if (exec_fault)
15776f745f1bSWill Deacon 		prot |= KVM_PGTABLE_PROT_X;
15786f745f1bSWill Deacon 
15796f745f1bSWill Deacon 	if (device)
15806f745f1bSWill Deacon 		prot |= KVM_PGTABLE_PROT_DEVICE;
15816f745f1bSWill Deacon 	else if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC))
15826f745f1bSWill Deacon 		prot |= KVM_PGTABLE_PROT_X;
15836f745f1bSWill Deacon 
15847d894834SYanan Wang 	/*
15857d894834SYanan Wang 	 * When handling an FSC_PERM fault, we only need to relax
15867d894834SYanan Wang 	 * permissions if vma_pagesize equals fault_granule. Otherwise,
15877d894834SYanan Wang 	 * kvm_pgtable_stage2_map() should be called to change the block size.
15887d894834SYanan Wang 	 */
1589b0803ba7SMarc Zyngier 	if (fault_status == ESR_ELx_FSC_PERM && vma_pagesize == fault_granule)
15906f745f1bSWill Deacon 		ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
15911577cb58SOliver Upton 	else
15926f745f1bSWill Deacon 		ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
15936f745f1bSWill Deacon 					     __pfn_to_phys(pfn), prot,
1594ddcadb29SOliver Upton 					     memcache,
1595ddcadb29SOliver Upton 					     KVM_PGTABLE_WALK_HANDLE_FAULT |
1596ddcadb29SOliver Upton 					     KVM_PGTABLE_WALK_SHARED);
15979ed24f4bSMarc Zyngier 
1598509552e6SYanan Wang 	/* Mark the page dirty only if the fault is handled successfully */
1599509552e6SYanan Wang 	if (writable && !ret) {
1600509552e6SYanan Wang 		kvm_set_pfn_dirty(pfn);
160110ba2d17SGavin Shan 		mark_page_dirty_in_slot(kvm, memslot, gfn);
1602509552e6SYanan Wang 	}
1603509552e6SYanan Wang 
16049ed24f4bSMarc Zyngier out_unlock:
1605f783ef1cSJing Zhang 	read_unlock(&kvm->mmu_lock);
16069ed24f4bSMarc Zyngier 	kvm_release_pfn_clean(pfn);
1607509552e6SYanan Wang 	return ret != -EAGAIN ? ret : 0;
16089ed24f4bSMarc Zyngier }
16099ed24f4bSMarc Zyngier 
1610ee8efad7SWill Deacon /* Resolve the access fault by making the page young again. */
16119ed24f4bSMarc Zyngier static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
16129ed24f4bSMarc Zyngier {
16139a7ad19aSOliver Upton 	kvm_pte_t pte;
1614ee8efad7SWill Deacon 	struct kvm_s2_mmu *mmu;
16159ed24f4bSMarc Zyngier 
16169ed24f4bSMarc Zyngier 	trace_kvm_access_fault(fault_ipa);
16179ed24f4bSMarc Zyngier 
1618fc61f554SOliver Upton 	read_lock(&vcpu->kvm->mmu_lock);
1619ee8efad7SWill Deacon 	mmu = vcpu->arch.hw_mmu;
16209a7ad19aSOliver Upton 	pte = kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa);
1621fc61f554SOliver Upton 	read_unlock(&vcpu->kvm->mmu_lock);
1622ee8efad7SWill Deacon 
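	/* If a valid PTE was found, propagate the young bit to the host page. */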
16239a7ad19aSOliver Upton 	if (kvm_pte_valid(pte))
16249a7ad19aSOliver Upton 		kvm_set_pfn_accessed(kvm_pte_to_pfn(pte));
16259ed24f4bSMarc Zyngier }
16269ed24f4bSMarc Zyngier 
16279ed24f4bSMarc Zyngier /**
16289ed24f4bSMarc Zyngier  * kvm_handle_guest_abort - handles all 2nd stage aborts
16299ed24f4bSMarc Zyngier  * @vcpu:	the VCPU pointer
16309ed24f4bSMarc Zyngier  *
16319ed24f4bSMarc Zyngier  * Any abort that gets to the host is almost guaranteed to be caused by a
16329ed24f4bSMarc Zyngier  * missing second stage translation table entry, which means either that the
16339ed24f4bSMarc Zyngier  * guest simply needs more memory and we must allocate an appropriate page, or
16349ed24f4bSMarc Zyngier  * that the guest tried to access I/O memory, which is emulated by user
16359ed24f4bSMarc Zyngier  * space. The distinction is based on the IPA causing the fault and whether this
16369ed24f4bSMarc Zyngier  * memory region has been registered as standard RAM by user space.
16379ed24f4bSMarc Zyngier  */
163974cc7e0cSTianjia Zhang int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
16399ed24f4bSMarc Zyngier {
16409ed24f4bSMarc Zyngier 	unsigned long fault_status;
16419ed24f4bSMarc Zyngier 	phys_addr_t fault_ipa;
16429ed24f4bSMarc Zyngier 	struct kvm_memory_slot *memslot;
16439ed24f4bSMarc Zyngier 	unsigned long hva;
16449ed24f4bSMarc Zyngier 	bool is_iabt, write_fault, writable;
16459ed24f4bSMarc Zyngier 	gfn_t gfn;
16469ed24f4bSMarc Zyngier 	int ret, idx;
16479ed24f4bSMarc Zyngier 
16489ed24f4bSMarc Zyngier 	fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
16499ed24f4bSMarc Zyngier 
16509ed24f4bSMarc Zyngier 	fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
16519ed24f4bSMarc Zyngier 	is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
16529ed24f4bSMarc Zyngier 
1653b0803ba7SMarc Zyngier 	if (fault_status == ESR_ELx_FSC_FAULT) {
165485ea6b1eSMarc Zyngier 		/* Beyond sanitised PARange (which is the IPA limit) */
165585ea6b1eSMarc Zyngier 		if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) {
165685ea6b1eSMarc Zyngier 			kvm_inject_size_fault(vcpu);
165785ea6b1eSMarc Zyngier 			return 1;
165885ea6b1eSMarc Zyngier 		}
165985ea6b1eSMarc Zyngier 
166085ea6b1eSMarc Zyngier 		/* Falls between the IPA range and the PARange? */
166185ea6b1eSMarc Zyngier 		if (fault_ipa >= BIT_ULL(vcpu->arch.hw_mmu->pgt->ia_bits)) {
166285ea6b1eSMarc Zyngier 			fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
166385ea6b1eSMarc Zyngier 
166485ea6b1eSMarc Zyngier 			if (is_iabt)
166585ea6b1eSMarc Zyngier 				kvm_inject_pabt(vcpu, fault_ipa);
166685ea6b1eSMarc Zyngier 			else
166785ea6b1eSMarc Zyngier 				kvm_inject_dabt(vcpu, fault_ipa);
166885ea6b1eSMarc Zyngier 			return 1;
166985ea6b1eSMarc Zyngier 		}
167085ea6b1eSMarc Zyngier 	}
167185ea6b1eSMarc Zyngier 
16729ed24f4bSMarc Zyngier 	/* Synchronous External Abort? */
1673c9a636f2SWill Deacon 	if (kvm_vcpu_abt_issea(vcpu)) {
16749ed24f4bSMarc Zyngier 		/*
16759ed24f4bSMarc Zyngier 		 * For RAS the host kernel may handle this abort.
16769ed24f4bSMarc Zyngier 		 * There is no need to pass the error into the guest.
16779ed24f4bSMarc Zyngier 		 */
167884b951a8SWill Deacon 		if (kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_esr(vcpu)))
16799ed24f4bSMarc Zyngier 			kvm_inject_vabt(vcpu);
168084b951a8SWill Deacon 
16819ed24f4bSMarc Zyngier 		return 1;
16829ed24f4bSMarc Zyngier 	}
16839ed24f4bSMarc Zyngier 
16843a949f4cSGavin Shan 	trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu),
16859ed24f4bSMarc Zyngier 			      kvm_vcpu_get_hfar(vcpu), fault_ipa);
16869ed24f4bSMarc Zyngier 
16879ed24f4bSMarc Zyngier 	/* Check the stage-2 fault is trans. fault or write fault */
1688b0803ba7SMarc Zyngier 	if (fault_status != ESR_ELx_FSC_FAULT &&
1689b0803ba7SMarc Zyngier 	    fault_status != ESR_ELx_FSC_PERM &&
1690b0803ba7SMarc Zyngier 	    fault_status != ESR_ELx_FSC_ACCESS) {
16919ed24f4bSMarc Zyngier 		kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
16929ed24f4bSMarc Zyngier 			kvm_vcpu_trap_get_class(vcpu),
16939ed24f4bSMarc Zyngier 			(unsigned long)kvm_vcpu_trap_get_fault(vcpu),
16943a949f4cSGavin Shan 			(unsigned long)kvm_vcpu_get_esr(vcpu));
16959ed24f4bSMarc Zyngier 		return -EFAULT;
16969ed24f4bSMarc Zyngier 	}
16979ed24f4bSMarc Zyngier 
16989ed24f4bSMarc Zyngier 	idx = srcu_read_lock(&vcpu->kvm->srcu);
16999ed24f4bSMarc Zyngier 
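	/* Resolve the faulting IPA to a memslot and its userspace address. */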
17009ed24f4bSMarc Zyngier 	gfn = fault_ipa >> PAGE_SHIFT;
17019ed24f4bSMarc Zyngier 	memslot = gfn_to_memslot(vcpu->kvm, gfn);
17029ed24f4bSMarc Zyngier 	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
17039ed24f4bSMarc Zyngier 	write_fault = kvm_is_write_fault(vcpu);
17049ed24f4bSMarc Zyngier 	if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
1705022c8328SWill Deacon 		/*
1706022c8328SWill Deacon 		 * The guest has put either its instructions or its page-tables
1707022c8328SWill Deacon 		 * somewhere it shouldn't have. Userspace won't be able to do
1708022c8328SWill Deacon 		 * anything about this (there's no syndrome for a start), so
1709022c8328SWill Deacon 		 * re-inject the abort back into the guest.
1710022c8328SWill Deacon 		 */
17119ed24f4bSMarc Zyngier 		if (is_iabt) {
17129ed24f4bSMarc Zyngier 			ret = -ENOEXEC;
17139ed24f4bSMarc Zyngier 			goto out;
17149ed24f4bSMarc Zyngier 		}
17159ed24f4bSMarc Zyngier 
1716c4ad98e4SMarc Zyngier 		if (kvm_vcpu_abt_iss1tw(vcpu)) {
1717022c8328SWill Deacon 			kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
1718022c8328SWill Deacon 			ret = 1;
1719022c8328SWill Deacon 			goto out_unlock;
1720022c8328SWill Deacon 		}
1721022c8328SWill Deacon 
17229ed24f4bSMarc Zyngier 		/*
17239ed24f4bSMarc Zyngier 		 * Check for a cache maintenance operation. Since we
17249ed24f4bSMarc Zyngier 		 * ended-up here, we know it is outside of any memory
17259ed24f4bSMarc Zyngier 		 * slot. But we can't find out if that is for a device,
17269ed24f4bSMarc Zyngier 		 * or if the guest is just being stupid. The only thing
17279ed24f4bSMarc Zyngier 		 * we know for sure is that this range cannot be cached.
17289ed24f4bSMarc Zyngier 		 *
17299ed24f4bSMarc Zyngier 		 * So let's assume that the guest is just being
17309ed24f4bSMarc Zyngier 		 * cautious, and skip the instruction.
17319ed24f4bSMarc Zyngier 		 */
173254dc0d24SWill Deacon 		if (kvm_is_error_hva(hva) && kvm_vcpu_dabt_is_cm(vcpu)) {
1733cdb5e02eSMarc Zyngier 			kvm_incr_pc(vcpu);
17349ed24f4bSMarc Zyngier 			ret = 1;
17359ed24f4bSMarc Zyngier 			goto out_unlock;
17369ed24f4bSMarc Zyngier 		}
17379ed24f4bSMarc Zyngier 
17389ed24f4bSMarc Zyngier 		/*
17399ed24f4bSMarc Zyngier 		 * The IPA is reported as [MAX:12], so we need to
17409ed24f4bSMarc Zyngier 		 * complement it with the bottom 12 bits from the
17419ed24f4bSMarc Zyngier 		 * faulting VA. This is always 12 bits, irrespective
17429ed24f4bSMarc Zyngier 		 * of the page size.
17439ed24f4bSMarc Zyngier 		 */
17449ed24f4bSMarc Zyngier 		fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
174574cc7e0cSTianjia Zhang 		ret = io_mem_abort(vcpu, fault_ipa);
17469ed24f4bSMarc Zyngier 		goto out_unlock;
17479ed24f4bSMarc Zyngier 	}
17489ed24f4bSMarc Zyngier 
17499ed24f4bSMarc Zyngier 	/* Userspace should not be able to register out-of-bounds IPAs */
17509ed24f4bSMarc Zyngier 	VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm));
17519ed24f4bSMarc Zyngier 
1752b0803ba7SMarc Zyngier 	if (fault_status == ESR_ELx_FSC_ACCESS) {
17539ed24f4bSMarc Zyngier 		handle_access_fault(vcpu, fault_ipa);
17549ed24f4bSMarc Zyngier 		ret = 1;
17559ed24f4bSMarc Zyngier 		goto out_unlock;
17569ed24f4bSMarc Zyngier 	}
17579ed24f4bSMarc Zyngier 
17589ed24f4bSMarc Zyngier 	ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
17599ed24f4bSMarc Zyngier 	if (ret == 0)
17609ed24f4bSMarc Zyngier 		ret = 1;
17619ed24f4bSMarc Zyngier out:
17629ed24f4bSMarc Zyngier 	if (ret == -ENOEXEC) {
17639ed24f4bSMarc Zyngier 		kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
17649ed24f4bSMarc Zyngier 		ret = 1;
17659ed24f4bSMarc Zyngier 	}
17669ed24f4bSMarc Zyngier out_unlock:
17679ed24f4bSMarc Zyngier 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
17689ed24f4bSMarc Zyngier 	return ret;
17699ed24f4bSMarc Zyngier }
17709ed24f4bSMarc Zyngier 
1771cd4c7183SSean Christopherson bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
17729ed24f4bSMarc Zyngier {
1773063deeb1SWill Deacon 	if (!kvm->arch.mmu.pgt)
1774fcb82839Skernel test robot 		return false;
17759ed24f4bSMarc Zyngier 
1776cd4c7183SSean Christopherson 	__unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT,
1777cd4c7183SSean Christopherson 			     (range->end - range->start) << PAGE_SHIFT,
1778cd4c7183SSean Christopherson 			     range->may_block);
1779cd4c7183SSean Christopherson 
1780fcb82839Skernel test robot 	return false;
17819ed24f4bSMarc Zyngier }
17829ed24f4bSMarc Zyngier 
1783cd4c7183SSean Christopherson bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
17849ed24f4bSMarc Zyngier {
17853e1efe2bSSean Christopherson 	kvm_pfn_t pfn = pte_pfn(range->arg.pte);
17869ed24f4bSMarc Zyngier 
1787e9edb17aSWill Deacon 	if (!kvm->arch.mmu.pgt)
1788fcb82839Skernel test robot 		return false;
17899ed24f4bSMarc Zyngier 
1790cd4c7183SSean Christopherson 	WARN_ON(range->end - range->start != 1);
1791cd4c7183SSean Christopherson 
17922dbf12aeSCatalin Marinas 	/*
17932dbf12aeSCatalin Marinas 	 * If the page isn't tagged, defer to user_mem_abort() for sanitising
17942dbf12aeSCatalin Marinas 	 * the MTE tags. The S2 pte should have been unmapped by
17952dbf12aeSCatalin Marinas 	 * mmu_notifier_invalidate_range_end().
17962dbf12aeSCatalin Marinas 	 */
17972dbf12aeSCatalin Marinas 	if (kvm_has_mte(kvm) && !page_mte_tagged(pfn_to_page(pfn)))
1798ea7fc1bbSSteven Price 		return false;
1799ea7fc1bbSSteven Price 
18009ed24f4bSMarc Zyngier 	/*
180125aa2869SYanan Wang 	 * We've moved a page around, probably through CoW, so let's treat
180225aa2869SYanan Wang 	 * it just like a translation fault and the map handler will clean
180325aa2869SYanan Wang 	 * the cache to the PoC.
180425aa2869SYanan Wang 	 *
1805cd4c7183SSean Christopherson 	 * The MMU notifiers will have unmapped a huge PMD before calling
1806cd4c7183SSean Christopherson 	 * ->change_pte() (which in turn calls kvm_set_spte_gfn()) and
1807cd4c7183SSean Christopherson 	 * therefore we never need to clear out a huge PMD through this
1808cd4c7183SSean Christopherson 	 * calling path and a memcache is not required.
1809cd4c7183SSean Christopherson 	 */
1810cd4c7183SSean Christopherson 	kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT,
1811cd4c7183SSean Christopherson 			       PAGE_SIZE, __pfn_to_phys(pfn),
18121577cb58SOliver Upton 			       KVM_PGTABLE_PROT_R, NULL, 0);
1813cd4c7183SSean Christopherson 
1814fcb82839Skernel test robot 	return false;
18159ed24f4bSMarc Zyngier }
18169ed24f4bSMarc Zyngier 
1817cd4c7183SSean Christopherson bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
18189ed24f4bSMarc Zyngier {
1819cd4c7183SSean Christopherson 	u64 size = (range->end - range->start) << PAGE_SHIFT;
1820cd4c7183SSean Christopherson 
1821cd4c7183SSean Christopherson 	if (!kvm->arch.mmu.pgt)
1822fcb82839Skernel test robot 		return false;
18239ed24f4bSMarc Zyngier 
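	/* Test the access flag over the range and clear it (final argument true). */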
1824df6556adSOliver Upton 	return kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt,
1825df6556adSOliver Upton 						   range->start << PAGE_SHIFT,
1826df6556adSOliver Upton 						   size, true);
18279ed24f4bSMarc Zyngier }
18289ed24f4bSMarc Zyngier 
1829cd4c7183SSean Christopherson bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
18309ed24f4bSMarc Zyngier {
1831df6556adSOliver Upton 	u64 size = (range->end - range->start) << PAGE_SHIFT;
1832df6556adSOliver Upton 
1833063deeb1SWill Deacon 	if (!kvm->arch.mmu.pgt)
1834fcb82839Skernel test robot 		return false;
1835501b9185SSean Christopherson 
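	/* Only test the access flag here; the final 'false' leaves it set. */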
1836df6556adSOliver Upton 	return kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt,
1837df6556adSOliver Upton 						   range->start << PAGE_SHIFT,
1838df6556adSOliver Upton 						   size, false);
18399ed24f4bSMarc Zyngier }
18409ed24f4bSMarc Zyngier 
18419ed24f4bSMarc Zyngier phys_addr_t kvm_mmu_get_httbr(void)
18429ed24f4bSMarc Zyngier {
18430f9d09b8SWill Deacon 	return __pa(hyp_pgtable->pgd);
18449ed24f4bSMarc Zyngier }
18459ed24f4bSMarc Zyngier 
18469ed24f4bSMarc Zyngier phys_addr_t kvm_get_idmap_vector(void)
18479ed24f4bSMarc Zyngier {
18489ed24f4bSMarc Zyngier 	return hyp_idmap_vector;
18499ed24f4bSMarc Zyngier }
18509ed24f4bSMarc Zyngier 
18510f9d09b8SWill Deacon static int kvm_map_idmap_text(void)
18529ed24f4bSMarc Zyngier {
18530f9d09b8SWill Deacon 	unsigned long size = hyp_idmap_end - hyp_idmap_start;
18540f9d09b8SWill Deacon 	int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start,
18559ed24f4bSMarc Zyngier 					PAGE_HYP_EXEC);
18569ed24f4bSMarc Zyngier 	if (err)
18579ed24f4bSMarc Zyngier 		kvm_err("Failed to idmap %lx-%lx\n",
18589ed24f4bSMarc Zyngier 			hyp_idmap_start, hyp_idmap_end);
18599ed24f4bSMarc Zyngier 
18609ed24f4bSMarc Zyngier 	return err;
18619ed24f4bSMarc Zyngier }
18629ed24f4bSMarc Zyngier 
18637aef0cbcSQuentin Perret static void *kvm_hyp_zalloc_page(void *arg)
18647aef0cbcSQuentin Perret {
18657aef0cbcSQuentin Perret 	return (void *)get_zeroed_page(GFP_KERNEL);
18667aef0cbcSQuentin Perret }
18677aef0cbcSQuentin Perret 
18687aef0cbcSQuentin Perret static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = {
18697aef0cbcSQuentin Perret 	.zalloc_page		= kvm_hyp_zalloc_page,
18707aef0cbcSQuentin Perret 	.get_page		= kvm_host_get_page,
18717aef0cbcSQuentin Perret 	.put_page		= kvm_host_put_page,
18727aef0cbcSQuentin Perret 	.phys_to_virt		= kvm_host_va,
18737aef0cbcSQuentin Perret 	.virt_to_phys		= kvm_host_pa,
18747aef0cbcSQuentin Perret };
18757aef0cbcSQuentin Perret 
18768d20bd63SSean Christopherson int __init kvm_mmu_init(u32 *hyp_va_bits)
18779ed24f4bSMarc Zyngier {
18789ed24f4bSMarc Zyngier 	int err;
1879579d7ebeSRyan Roberts 	u32 idmap_bits;
1880579d7ebeSRyan Roberts 	u32 kernel_bits;
18819ed24f4bSMarc Zyngier 
18820a78791cSAndrew Scull 	hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
18839ed24f4bSMarc Zyngier 	hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
18840a78791cSAndrew Scull 	hyp_idmap_end = __pa_symbol(__hyp_idmap_text_end);
18859ed24f4bSMarc Zyngier 	hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
18860a78791cSAndrew Scull 	hyp_idmap_vector = __pa_symbol(__kvm_hyp_init);
18879ed24f4bSMarc Zyngier 
18889ed24f4bSMarc Zyngier 	/*
18899ed24f4bSMarc Zyngier 	 * We rely on the linker script to ensure at build time that the HYP
18909ed24f4bSMarc Zyngier 	 * init code does not cross a page boundary.
18919ed24f4bSMarc Zyngier 	 */
18929ed24f4bSMarc Zyngier 	BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
18939ed24f4bSMarc Zyngier 
1894579d7ebeSRyan Roberts 	/*
1895579d7ebeSRyan Roberts 	 * The ID map may be configured to use an extended virtual address
1896579d7ebeSRyan Roberts 	 * range. This is only the case if system RAM is out of range for the
1897579d7ebeSRyan Roberts 	 * currently configured page size and VA_BITS_MIN, in which case we will
1898579d7ebeSRyan Roberts 	 * also need the extended virtual range for the HYP ID map, or we won't
1899579d7ebeSRyan Roberts 	 * be able to enable the EL2 MMU.
1900579d7ebeSRyan Roberts 	 *
1901579d7ebeSRyan Roberts 	 * However, in some cases the ID map may be configured for fewer than
1902579d7ebeSRyan Roberts 	 * the number of VA bits used by the regular kernel stage 1. This
1903579d7ebeSRyan Roberts 	 * happens when VA_BITS=52 and the kernel image is placed in PA space
1904579d7ebeSRyan Roberts 	 * below 48 bits.
1905579d7ebeSRyan Roberts 	 *
1906579d7ebeSRyan Roberts 	 * At EL2, there is only one TTBR register, and we can't switch between
1907579d7ebeSRyan Roberts 	 * translation tables *and* update TCR_EL2.T0SZ at the same time. Bottom
1908579d7ebeSRyan Roberts 	 * line: we need to use the extended range with *both* our translation
1909579d7ebeSRyan Roberts 	 * tables.
1910579d7ebeSRyan Roberts 	 *
1911579d7ebeSRyan Roberts 	 * So use the maximum of the idmap VA bits and the regular kernel stage
1912579d7ebeSRyan Roberts 	 * 1 VA bits to assure that the hypervisor can both ID map its code page
1913579d7ebeSRyan Roberts 	 * and map any kernel memory.
1914579d7ebeSRyan Roberts 	 */
1915579d7ebeSRyan Roberts 	idmap_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET);
1916579d7ebeSRyan Roberts 	kernel_bits = vabits_actual;
1917579d7ebeSRyan Roberts 	*hyp_va_bits = max(idmap_bits, kernel_bits);
1918579d7ebeSRyan Roberts 
1919bfa79a80SQuentin Perret 	kvm_debug("Using %u-bit virtual addresses at EL2\n", *hyp_va_bits);
19209ed24f4bSMarc Zyngier 	kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
19219ed24f4bSMarc Zyngier 	kvm_debug("HYP VA range: %lx:%lx\n",
19229ed24f4bSMarc Zyngier 		  kern_hyp_va(PAGE_OFFSET),
19239ed24f4bSMarc Zyngier 		  kern_hyp_va((unsigned long)high_memory - 1));
19249ed24f4bSMarc Zyngier 
19259ed24f4bSMarc Zyngier 	if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
19269ed24f4bSMarc Zyngier 	    hyp_idmap_start <  kern_hyp_va((unsigned long)high_memory - 1) &&
19279ed24f4bSMarc Zyngier 	    hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
19289ed24f4bSMarc Zyngier 		/*
19299ed24f4bSMarc Zyngier 		 * The idmap page intersects with the VA space;
19309ed24f4bSMarc Zyngier 		 * it is not safe to continue further.
19319ed24f4bSMarc Zyngier 		 */
19329ed24f4bSMarc Zyngier 		kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
19339ed24f4bSMarc Zyngier 		err = -EINVAL;
19349ed24f4bSMarc Zyngier 		goto out;
19359ed24f4bSMarc Zyngier 	}
19369ed24f4bSMarc Zyngier 
19370f9d09b8SWill Deacon 	hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL);
19380f9d09b8SWill Deacon 	if (!hyp_pgtable) {
19390f9d09b8SWill Deacon 		kvm_err("Hyp mode page-table not allocated\n");
19409ed24f4bSMarc Zyngier 		err = -ENOMEM;
19419ed24f4bSMarc Zyngier 		goto out;
19429ed24f4bSMarc Zyngier 	}
19439ed24f4bSMarc Zyngier 
1944bfa79a80SQuentin Perret 	err = kvm_pgtable_hyp_init(hyp_pgtable, *hyp_va_bits, &kvm_hyp_mm_ops);
19459ed24f4bSMarc Zyngier 	if (err)
19460f9d09b8SWill Deacon 		goto out_free_pgtable;
19479ed24f4bSMarc Zyngier 
19480f9d09b8SWill Deacon 	err = kvm_map_idmap_text();
19499ed24f4bSMarc Zyngier 	if (err)
19500f9d09b8SWill Deacon 		goto out_destroy_pgtable;
19519ed24f4bSMarc Zyngier 
19529ed24f4bSMarc Zyngier 	io_map_base = hyp_idmap_start;
19539ed24f4bSMarc Zyngier 	return 0;
19540f9d09b8SWill Deacon 
19550f9d09b8SWill Deacon out_destroy_pgtable:
19560f9d09b8SWill Deacon 	kvm_pgtable_hyp_destroy(hyp_pgtable);
19570f9d09b8SWill Deacon out_free_pgtable:
19580f9d09b8SWill Deacon 	kfree(hyp_pgtable);
19590f9d09b8SWill Deacon 	hyp_pgtable = NULL;
19609ed24f4bSMarc Zyngier out:
19619ed24f4bSMarc Zyngier 	return err;
19629ed24f4bSMarc Zyngier }
19639ed24f4bSMarc Zyngier 
19649ed24f4bSMarc Zyngier void kvm_arch_commit_memory_region(struct kvm *kvm,
19659ed24f4bSMarc Zyngier 				   struct kvm_memory_slot *old,
19669ed24f4bSMarc Zyngier 				   const struct kvm_memory_slot *new,
19679ed24f4bSMarc Zyngier 				   enum kvm_mr_change change)
19689ed24f4bSMarc Zyngier {
19696bd92b9dSRicardo Koller 	bool log_dirty_pages = new && new->flags & KVM_MEM_LOG_DIRTY_PAGES;
19706bd92b9dSRicardo Koller 
19719ed24f4bSMarc Zyngier 	/*
19729ed24f4bSMarc Zyngier 	 * At this point memslot has been committed and there is an
1973656012c7SFuad Tabba 	 * allocated dirty_bitmap[]; dirty pages will be tracked while the
19749ed24f4bSMarc Zyngier 	 * memory slot is write protected.
19759ed24f4bSMarc Zyngier 	 */
19766bd92b9dSRicardo Koller 	if (log_dirty_pages) {
19776bd92b9dSRicardo Koller 
19786bd92b9dSRicardo Koller 		if (change == KVM_MR_DELETE)
19796bd92b9dSRicardo Koller 			return;
19806bd92b9dSRicardo Koller 
1981c862626eSKeqian Zhu 		/*
1982e7bf7a49SRicardo Koller 		 * Huge and normal pages are write-protected and split
1983e7bf7a49SRicardo Koller 		 * in either of these two cases:
19846bd92b9dSRicardo Koller 		 *
19856bd92b9dSRicardo Koller 		 * 1. with initial-all-set: gradually with CLEAR ioctls,
1986c862626eSKeqian Zhu 		 */
19876bd92b9dSRicardo Koller 		if (kvm_dirty_log_manual_protect_and_init_set(kvm))
19886bd92b9dSRicardo Koller 			return;
19896bd92b9dSRicardo Koller 		/*
19906bd92b9dSRicardo Koller 		 * or
19916bd92b9dSRicardo Koller 		 * 2. without initial-all-set: all in one shot when
19926bd92b9dSRicardo Koller 		 *    enabling dirty logging.
19936bd92b9dSRicardo Koller 		 */
1994509c594cSSean Christopherson 		kvm_mmu_wp_memory_region(kvm, new->id);
1995e7bf7a49SRicardo Koller 		kvm_mmu_split_memory_region(kvm, new->id);
1996e7bf7a49SRicardo Koller 	} else {
1997e7bf7a49SRicardo Koller 		/*
1998e7bf7a49SRicardo Koller 		 * Free any leftovers from the eager page splitting cache. Do
1999e7bf7a49SRicardo Koller 		 * this when deleting, moving, disabling dirty logging, or
2000e7bf7a49SRicardo Koller 		 * creating the memslot (a nop). Doing it for deletes makes
2001e7bf7a49SRicardo Koller 		 * sure we don't leak memory, and there's no need to keep the
2002e7bf7a49SRicardo Koller 		 * cache around for any of the other cases.
2003e7bf7a49SRicardo Koller 		 */
2004e7bf7a49SRicardo Koller 		kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
20059ed24f4bSMarc Zyngier 	}
2006c862626eSKeqian Zhu }
20079ed24f4bSMarc Zyngier 
20089ed24f4bSMarc Zyngier int kvm_arch_prepare_memory_region(struct kvm *kvm,
2009537a17b3SSean Christopherson 				   const struct kvm_memory_slot *old,
2010537a17b3SSean Christopherson 				   struct kvm_memory_slot *new,
20119ed24f4bSMarc Zyngier 				   enum kvm_mr_change change)
20129ed24f4bSMarc Zyngier {
2013509c594cSSean Christopherson 	hva_t hva, reg_end;
20149ed24f4bSMarc Zyngier 	int ret = 0;
20159ed24f4bSMarc Zyngier 
20169ed24f4bSMarc Zyngier 	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
20179ed24f4bSMarc Zyngier 			change != KVM_MR_FLAGS_ONLY)
20189ed24f4bSMarc Zyngier 		return 0;
20199ed24f4bSMarc Zyngier 
20209ed24f4bSMarc Zyngier 	/*
20219ed24f4bSMarc Zyngier 	 * Prevent userspace from creating a memory region outside of the IPA
20229ed24f4bSMarc Zyngier 	 * space addressable by the KVM guest.
20239ed24f4bSMarc Zyngier 	 */
2024537a17b3SSean Christopherson 	if ((new->base_gfn + new->npages) > (kvm_phys_size(kvm) >> PAGE_SHIFT))
20259ed24f4bSMarc Zyngier 		return -EFAULT;
20269ed24f4bSMarc Zyngier 
2027509c594cSSean Christopherson 	hva = new->userspace_addr;
2028509c594cSSean Christopherson 	reg_end = hva + (new->npages << PAGE_SHIFT);
2029509c594cSSean Christopherson 
203089154dd5SMichel Lespinasse 	mmap_read_lock(current->mm);
20319ed24f4bSMarc Zyngier 	/*
20329ed24f4bSMarc Zyngier 	 * A memory region could potentially cover multiple VMAs, and any holes
2033fd6f17baSKeqian Zhu 	 * between them, so iterate over all of them.
20349ed24f4bSMarc Zyngier 	 *
20359ed24f4bSMarc Zyngier 	 *     +--------------------------------------------+
20369ed24f4bSMarc Zyngier 	 * +---------------+----------------+   +----------------+
20379ed24f4bSMarc Zyngier 	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
20389ed24f4bSMarc Zyngier 	 * +---------------+----------------+   +----------------+
20399ed24f4bSMarc Zyngier 	 *     |               memory region                |
20409ed24f4bSMarc Zyngier 	 *     +--------------------------------------------+
20419ed24f4bSMarc Zyngier 	 */
20429ed24f4bSMarc Zyngier 	do {
2043c728fd4cSGavin Shan 		struct vm_area_struct *vma;
20449ed24f4bSMarc Zyngier 
2045c728fd4cSGavin Shan 		vma = find_vma_intersection(current->mm, hva, reg_end);
2046c728fd4cSGavin Shan 		if (!vma)
20479ed24f4bSMarc Zyngier 			break;
20489ed24f4bSMarc Zyngier 
2049d89585fbSPeter Collingbourne 		if (kvm_has_mte(kvm) && !kvm_vma_mte_allowed(vma)) {
20506e6a8ef0SQuentin Perret 			ret = -EINVAL;
20516e6a8ef0SQuentin Perret 			break;
20526e6a8ef0SQuentin Perret 		}
2053ea7fc1bbSSteven Price 
20549ed24f4bSMarc Zyngier 		if (vma->vm_flags & VM_PFNMAP) {
20559ed24f4bSMarc Zyngier 			/* IO region dirty page logging not allowed */
2056537a17b3SSean Christopherson 			if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
20579ed24f4bSMarc Zyngier 				ret = -EINVAL;
20589ed24f4bSMarc Zyngier 				break;
20599ed24f4bSMarc Zyngier 			}
2060fd6f17baSKeqian Zhu 		}
2061fd6f17baSKeqian Zhu 		hva = min(reg_end, vma->vm_end);
20629ed24f4bSMarc Zyngier 	} while (hva < reg_end);
20639ed24f4bSMarc Zyngier 
206489154dd5SMichel Lespinasse 	mmap_read_unlock(current->mm);
20659ed24f4bSMarc Zyngier 	return ret;
20669ed24f4bSMarc Zyngier }
20679ed24f4bSMarc Zyngier 
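For context, the checks above are what a userspace VMM runs into when registering memory: the slot must fit inside the guest's IPA space, every byte of it must be covered by the checks on the VMAs backing it, and dirty logging cannot be requested for a VM_PFNMAP region. The sketch below registers a single anonymous-memory slot the way a minimal VMM might; the addresses and sizes are arbitrary examples and error handling is left out.

#include <fcntl.h>
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int kvm_fd = open("/dev/kvm", O_RDWR);
	int vm_fd  = ioctl(kvm_fd, KVM_CREATE_VM, 0);	/* 0: default IPA size */
	size_t size = 2UL << 20;			/* 2MiB region */
	void *mem = mmap(NULL, size, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	struct kvm_userspace_memory_region region = {
		.slot            = 0,
		.guest_phys_addr = 1UL << 30,	/* must stay inside the guest IPA space */
		.memory_size     = size,
		.userspace_addr  = (unsigned long)mem,
	};

	/*
	 * Rejected with -EFAULT if base_gfn + npages would exceed
	 * kvm_phys_size(kvm) >> PAGE_SHIFT, as checked above.
	 */
	int ret = ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);

	close(vm_fd);
	close(kvm_fd);
	return ret ? 1 : 0;
}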
20689ed24f4bSMarc Zyngier void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
20699ed24f4bSMarc Zyngier {
20709ed24f4bSMarc Zyngier }
20719ed24f4bSMarc Zyngier 
20729ed24f4bSMarc Zyngier void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
20739ed24f4bSMarc Zyngier {
20749ed24f4bSMarc Zyngier }
20759ed24f4bSMarc Zyngier 
20769ed24f4bSMarc Zyngier void kvm_arch_flush_shadow_all(struct kvm *kvm)
20779ed24f4bSMarc Zyngier {
2078ce2b6022SRicardo Koller 	kvm_uninit_stage2_mmu(kvm);
20799ed24f4bSMarc Zyngier }
20809ed24f4bSMarc Zyngier 
20819ed24f4bSMarc Zyngier void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
20829ed24f4bSMarc Zyngier 				   struct kvm_memory_slot *slot)
20839ed24f4bSMarc Zyngier {
20849ed24f4bSMarc Zyngier 	gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
20859ed24f4bSMarc Zyngier 	phys_addr_t size = slot->npages << PAGE_SHIFT;
20869ed24f4bSMarc Zyngier 
2087fcc5bf89SJing Zhang 	write_lock(&kvm->mmu_lock);
2088a0e50aa3SChristoffer Dall 	unmap_stage2_range(&kvm->arch.mmu, gpa, size);
2089fcc5bf89SJing Zhang 	write_unlock(&kvm->mmu_lock);
20909ed24f4bSMarc Zyngier }
20919ed24f4bSMarc Zyngier 
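The unmap above is reached, among other paths, when userspace deletes a slot; deletion is requested by re-issuing KVM_SET_USER_MEMORY_REGION for the same slot number with memory_size set to 0. A hedged sketch (helper name invented, vm_fd and slot assumed from context):

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Hypothetical helper: delete a previously registered memslot. */
static int delete_memslot(int vm_fd, __u32 slot)
{
	struct kvm_userspace_memory_region region = {
		.slot        = slot,
		.memory_size = 0,	/* zero size requests slot deletion */
	};

	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}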
20929ed24f4bSMarc Zyngier /*
20939ed24f4bSMarc Zyngier  * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
20949ed24f4bSMarc Zyngier  *
20959ed24f4bSMarc Zyngier  * Main problems:
20969ed24f4bSMarc Zyngier  * - S/W ops are local to a CPU (not broadcast)
20979ed24f4bSMarc Zyngier  * - We have line migration behind our back (speculation)
20989ed24f4bSMarc Zyngier  * - System caches don't support S/W at all (damn!)
20999ed24f4bSMarc Zyngier  *
21009ed24f4bSMarc Zyngier  * In the face of the above, the best we can do is to try and convert
21019ed24f4bSMarc Zyngier  * S/W ops to VA ops. Because the guest is not allowed to infer the
21029ed24f4bSMarc Zyngier  * S/W to PA mapping, it can only use S/W to nuke the whole cache,
21039ed24f4bSMarc Zyngier  * which is a rather good thing for us.
21049ed24f4bSMarc Zyngier  *
21059ed24f4bSMarc Zyngier  * Also, it is only used when turning caches on/off ("The expected
21069ed24f4bSMarc Zyngier  * usage of the cache maintenance instructions that operate by set/way
21079ed24f4bSMarc Zyngier  * is associated with the cache maintenance instructions associated
21089ed24f4bSMarc Zyngier  * with the powerdown and powerup of caches, if this is required by
21099ed24f4bSMarc Zyngier  * the implementation.").
21109ed24f4bSMarc Zyngier  *
21119ed24f4bSMarc Zyngier  * We use the following policy:
21129ed24f4bSMarc Zyngier  *
21139ed24f4bSMarc Zyngier  * - If we trap a S/W operation, we enable VM trapping to detect
21149ed24f4bSMarc Zyngier  *   caches being turned on/off, and do a full clean.
21159ed24f4bSMarc Zyngier  *
21169ed24f4bSMarc Zyngier  * - We flush the caches both when they are turned on and when they are turned off.
21179ed24f4bSMarc Zyngier  *
21189ed24f4bSMarc Zyngier  * - Once the caches are enabled, we stop trapping VM ops.
21199ed24f4bSMarc Zyngier  */
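The policy spelled out above reduces to a small two-state machine. The standalone model below is an illustration only (all names are invented, and flush_whole_guest() merely stands in for stage2_flush_vm()); the real logic lives in kvm_set_way_flush() and kvm_toggle_cache() right after it.

#include <stdbool.h>

/* Illustrative model only, not kernel code. */
static void flush_whole_guest(void) { /* would clean+invalidate everything */ }

struct sw_policy {
	bool trapping_vm_ops;	/* models HCR_TVM */
	bool caches_enabled;	/* the guest's view of its caches */
};

/* Guest issued a set/way op: flush once, start watching cache toggles. */
static void model_sw_op(struct sw_policy *p)
{
	if (!p->trapping_vm_ops) {
		flush_whole_guest();
		p->trapping_vm_ops = true;
	}
}

/* Guest toggled MMU+caches while VM ops are trapped. */
static void model_cache_toggle(struct sw_policy *p, bool now_enabled)
{
	if (now_enabled != p->caches_enabled)
		flush_whole_guest();	/* clean+invalidate covers both directions */

	if (now_enabled)
		p->trapping_vm_ops = false;	/* stop trapping until the next S/W op */

	p->caches_enabled = now_enabled;
}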
21209ed24f4bSMarc Zyngier void kvm_set_way_flush(struct kvm_vcpu *vcpu)
21219ed24f4bSMarc Zyngier {
21229ed24f4bSMarc Zyngier 	unsigned long hcr = *vcpu_hcr(vcpu);
21239ed24f4bSMarc Zyngier 
21249ed24f4bSMarc Zyngier 	/*
21259ed24f4bSMarc Zyngier 	 * If this is the first time we trap a S/W operation
21269ed24f4bSMarc Zyngier 	 * (i.e. HCR_TVM not set), flush the whole of guest memory and
21279ed24f4bSMarc Zyngier 	 * enable VM trapping.
21289ed24f4bSMarc Zyngier 	 *
21299ed24f4bSMarc Zyngier 	 * Otherwise, rely on the VM trapping to wait for the MMU +
21309ed24f4bSMarc Zyngier 	 * Caches to be turned off. At that point, we'll be able to
21319ed24f4bSMarc Zyngier 	 * clean the caches again.
21329ed24f4bSMarc Zyngier 	 */
21339ed24f4bSMarc Zyngier 	if (!(hcr & HCR_TVM)) {
21349ed24f4bSMarc Zyngier 		trace_kvm_set_way_flush(*vcpu_pc(vcpu),
21359ed24f4bSMarc Zyngier 					vcpu_has_cache_enabled(vcpu));
21369ed24f4bSMarc Zyngier 		stage2_flush_vm(vcpu->kvm);
21379ed24f4bSMarc Zyngier 		*vcpu_hcr(vcpu) = hcr | HCR_TVM;
21389ed24f4bSMarc Zyngier 	}
21399ed24f4bSMarc Zyngier }
21409ed24f4bSMarc Zyngier 
21419ed24f4bSMarc Zyngier void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
21429ed24f4bSMarc Zyngier {
21439ed24f4bSMarc Zyngier 	bool now_enabled = vcpu_has_cache_enabled(vcpu);
21449ed24f4bSMarc Zyngier 
21459ed24f4bSMarc Zyngier 	/*
21469ed24f4bSMarc Zyngier 	 * If switching the MMU+caches on, we need to invalidate the caches.
21479ed24f4bSMarc Zyngier 	 * If switching them off, we need to clean the caches.
21489ed24f4bSMarc Zyngier 	 * Clean + invalidate does the trick always.
21499ed24f4bSMarc Zyngier 	 */
21509ed24f4bSMarc Zyngier 	if (now_enabled != was_enabled)
21519ed24f4bSMarc Zyngier 		stage2_flush_vm(vcpu->kvm);
21529ed24f4bSMarc Zyngier 
21539ed24f4bSMarc Zyngier 	/* Caches are now on, stop trapping VM ops (until a S/W op) */
21549ed24f4bSMarc Zyngier 	if (now_enabled)
21559ed24f4bSMarc Zyngier 		*vcpu_hcr(vcpu) &= ~HCR_TVM;
21569ed24f4bSMarc Zyngier 
21579ed24f4bSMarc Zyngier 	trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
21589ed24f4bSMarc Zyngier }
2159