xref: /openbmc/linux/arch/x86/kvm/mmu/tdp_mmu.c (revision 9a77daac)
1fe5db27dSBen Gardon // SPDX-License-Identifier: GPL-2.0
2fe5db27dSBen Gardon 
302c00b3aSBen Gardon #include "mmu.h"
402c00b3aSBen Gardon #include "mmu_internal.h"
5bb18842eSBen Gardon #include "mmutrace.h"
62f2fad08SBen Gardon #include "tdp_iter.h"
7fe5db27dSBen Gardon #include "tdp_mmu.h"
802c00b3aSBen Gardon #include "spte.h"
9fe5db27dSBen Gardon 
10*9a77daacSBen Gardon #include <asm/cmpxchg.h>
1133dd3574SBen Gardon #include <trace/events/kvm.h>
1233dd3574SBen Gardon 
1395fb5b02SBen Gardon #ifdef CONFIG_X86_64
14fe5db27dSBen Gardon static bool __read_mostly tdp_mmu_enabled = false;
1595fb5b02SBen Gardon module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
1695fb5b02SBen Gardon #endif
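
/*
 * Usage sketch: with the parameter registered above (mode 0644), the TDP MMU
 * can be requested at module load time, e.g. "modprobe kvm tdp_mmu=Y", or
 * toggled later via /sys/module/kvm/parameters/tdp_mmu (both assume the
 * standard "kvm" module name). is_tdp_mmu_enabled() below additionally
 * requires TDP (EPT/NPT) itself to be enabled, and kvm_mmu_init_tdp_mmu()
 * latches the result per VM, so a runtime toggle only affects VMs created
 * afterwards.
 */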
17fe5db27dSBen Gardon 
18fe5db27dSBen Gardon static bool is_tdp_mmu_enabled(void)
19fe5db27dSBen Gardon {
20fe5db27dSBen Gardon #ifdef CONFIG_X86_64
21fe5db27dSBen Gardon 	return tdp_enabled && READ_ONCE(tdp_mmu_enabled);
22fe5db27dSBen Gardon #else
23fe5db27dSBen Gardon 	return false;
24fe5db27dSBen Gardon #endif /* CONFIG_X86_64 */
25fe5db27dSBen Gardon }
26fe5db27dSBen Gardon 
27fe5db27dSBen Gardon /* Initializes the TDP MMU for the VM, if enabled. */
28fe5db27dSBen Gardon void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
29fe5db27dSBen Gardon {
30fe5db27dSBen Gardon 	if (!is_tdp_mmu_enabled())
31fe5db27dSBen Gardon 		return;
32fe5db27dSBen Gardon 
33fe5db27dSBen Gardon 	/* This should not be changed for the lifetime of the VM. */
34fe5db27dSBen Gardon 	kvm->arch.tdp_mmu_enabled = true;
3502c00b3aSBen Gardon 
3602c00b3aSBen Gardon 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
37*9a77daacSBen Gardon 	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
3889c0fd49SBen Gardon 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
39fe5db27dSBen Gardon }
40fe5db27dSBen Gardon 
41fe5db27dSBen Gardon void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
42fe5db27dSBen Gardon {
43fe5db27dSBen Gardon 	if (!kvm->arch.tdp_mmu_enabled)
44fe5db27dSBen Gardon 		return;
4502c00b3aSBen Gardon 
4602c00b3aSBen Gardon 	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
477cca2d0bSBen Gardon 
487cca2d0bSBen Gardon 	/*
497cca2d0bSBen Gardon 	 * Ensure that all the outstanding RCU callbacks to free shadow pages
507cca2d0bSBen Gardon 	 * can run before the VM is torn down.
517cca2d0bSBen Gardon 	 */
527cca2d0bSBen Gardon 	rcu_barrier();
5302c00b3aSBen Gardon }
5402c00b3aSBen Gardon 
55a889ea54SBen Gardon static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
56a889ea54SBen Gardon {
57a889ea54SBen Gardon 	if (kvm_mmu_put_root(kvm, root))
58a889ea54SBen Gardon 		kvm_tdp_mmu_free_root(kvm, root);
59a889ea54SBen Gardon }
60a889ea54SBen Gardon 
61a889ea54SBen Gardon static inline bool tdp_mmu_next_root_valid(struct kvm *kvm,
62a889ea54SBen Gardon 					   struct kvm_mmu_page *root)
63a889ea54SBen Gardon {
64531810caSBen Gardon 	lockdep_assert_held_write(&kvm->mmu_lock);
65a889ea54SBen Gardon 
66a889ea54SBen Gardon 	if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link))
67a889ea54SBen Gardon 		return false;
68a889ea54SBen Gardon 
69a889ea54SBen Gardon 	kvm_mmu_get_root(kvm, root);
70a889ea54SBen Gardon 	return true;
72a889ea54SBen Gardon }
73a889ea54SBen Gardon 
74a889ea54SBen Gardon static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
75a889ea54SBen Gardon 						     struct kvm_mmu_page *root)
76a889ea54SBen Gardon {
77a889ea54SBen Gardon 	struct kvm_mmu_page *next_root;
78a889ea54SBen Gardon 
79a889ea54SBen Gardon 	next_root = list_next_entry(root, link);
80a889ea54SBen Gardon 	tdp_mmu_put_root(kvm, root);
81a889ea54SBen Gardon 	return next_root;
82a889ea54SBen Gardon }
83a889ea54SBen Gardon 
84a889ea54SBen Gardon /*
85a889ea54SBen Gardon  * Note: this iterator gets and puts references to the roots it iterates over.
86a889ea54SBen Gardon  * This makes it safe to release the MMU lock and yield within the loop, but
87a889ea54SBen Gardon  * if exiting the loop early, the caller must drop the reference to the most
88a889ea54SBen Gardon  * recent root. (Unless keeping a live reference is desirable.)
89a889ea54SBen Gardon  */
90a889ea54SBen Gardon #define for_each_tdp_mmu_root_yield_safe(_kvm, _root)				\
91a889ea54SBen Gardon 	for (_root = list_first_entry(&_kvm->arch.tdp_mmu_roots,	\
92a889ea54SBen Gardon 				      typeof(*_root), link);		\
93a889ea54SBen Gardon 	     tdp_mmu_next_root_valid(_kvm, _root);			\
94a889ea54SBen Gardon 	     _root = tdp_mmu_next_root(_kvm, _root))
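
/*
 * A minimal usage sketch, mirroring kvm_tdp_mmu_zap_gfn_range() later in this
 * file; the iterator holds a reference on each root, so the loop body may
 * drop the MMU lock and yield:
 *
 *	for_each_tdp_mmu_root_yield_safe(kvm, root)
 *		flush |= zap_gfn_range(kvm, root, start, end, true);
 *
 * A caller that breaks out of the loop early is responsible for dropping the
 * reference on the most recent root, as noted above.
 */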
95a889ea54SBen Gardon 
9602c00b3aSBen Gardon #define for_each_tdp_mmu_root(_kvm, _root)				\
9702c00b3aSBen Gardon 	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
9802c00b3aSBen Gardon 
9902c00b3aSBen Gardon bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa)
10002c00b3aSBen Gardon {
10102c00b3aSBen Gardon 	struct kvm_mmu_page *sp;
10202c00b3aSBen Gardon 
103c887c9b9SPaolo Bonzini 	if (!kvm->arch.tdp_mmu_enabled)
104c887c9b9SPaolo Bonzini 		return false;
105c887c9b9SPaolo Bonzini 	if (WARN_ON(!VALID_PAGE(hpa)))
106c887c9b9SPaolo Bonzini 		return false;
107c887c9b9SPaolo Bonzini 
10802c00b3aSBen Gardon 	sp = to_shadow_page(hpa);
109c887c9b9SPaolo Bonzini 	if (WARN_ON(!sp))
110c887c9b9SPaolo Bonzini 		return false;
11102c00b3aSBen Gardon 
11202c00b3aSBen Gardon 	return sp->tdp_mmu_page && sp->root_count;
11302c00b3aSBen Gardon }
11402c00b3aSBen Gardon 
115faaf05b0SBen Gardon static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
116063afacdSBen Gardon 			  gfn_t start, gfn_t end, bool can_yield);
117faaf05b0SBen Gardon 
11802c00b3aSBen Gardon void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
11902c00b3aSBen Gardon {
120339f5a7fSRick Edgecombe 	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
121faaf05b0SBen Gardon 
122531810caSBen Gardon 	lockdep_assert_held_write(&kvm->mmu_lock);
12302c00b3aSBen Gardon 
12402c00b3aSBen Gardon 	WARN_ON(root->root_count);
12502c00b3aSBen Gardon 	WARN_ON(!root->tdp_mmu_page);
12602c00b3aSBen Gardon 
12702c00b3aSBen Gardon 	list_del(&root->link);
12802c00b3aSBen Gardon 
129063afacdSBen Gardon 	zap_gfn_range(kvm, root, 0, max_gfn, false);
130faaf05b0SBen Gardon 
13102c00b3aSBen Gardon 	free_page((unsigned long)root->spt);
13202c00b3aSBen Gardon 	kmem_cache_free(mmu_page_header_cache, root);
13302c00b3aSBen Gardon }
13402c00b3aSBen Gardon 
13502c00b3aSBen Gardon static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
13602c00b3aSBen Gardon 						   int level)
13702c00b3aSBen Gardon {
13802c00b3aSBen Gardon 	union kvm_mmu_page_role role;
13902c00b3aSBen Gardon 
14002c00b3aSBen Gardon 	role = vcpu->arch.mmu->mmu_role.base;
14102c00b3aSBen Gardon 	role.level = level;
14202c00b3aSBen Gardon 	role.direct = true;
14302c00b3aSBen Gardon 	role.gpte_is_8_bytes = true;
14402c00b3aSBen Gardon 	role.access = ACC_ALL;
14502c00b3aSBen Gardon 
14602c00b3aSBen Gardon 	return role;
14702c00b3aSBen Gardon }
14802c00b3aSBen Gardon 
14902c00b3aSBen Gardon static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
15002c00b3aSBen Gardon 					       int level)
15102c00b3aSBen Gardon {
15202c00b3aSBen Gardon 	struct kvm_mmu_page *sp;
15302c00b3aSBen Gardon 
15402c00b3aSBen Gardon 	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
15502c00b3aSBen Gardon 	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
15602c00b3aSBen Gardon 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
15702c00b3aSBen Gardon 
15802c00b3aSBen Gardon 	sp->role.word = page_role_for_level(vcpu, level).word;
15902c00b3aSBen Gardon 	sp->gfn = gfn;
16002c00b3aSBen Gardon 	sp->tdp_mmu_page = true;
16102c00b3aSBen Gardon 
16233dd3574SBen Gardon 	trace_kvm_mmu_get_page(sp, true);
16333dd3574SBen Gardon 
16402c00b3aSBen Gardon 	return sp;
16502c00b3aSBen Gardon }
16602c00b3aSBen Gardon 
16702c00b3aSBen Gardon static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
16802c00b3aSBen Gardon {
16902c00b3aSBen Gardon 	union kvm_mmu_page_role role;
17002c00b3aSBen Gardon 	struct kvm *kvm = vcpu->kvm;
17102c00b3aSBen Gardon 	struct kvm_mmu_page *root;
17202c00b3aSBen Gardon 
17302c00b3aSBen Gardon 	role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
17402c00b3aSBen Gardon 
175531810caSBen Gardon 	write_lock(&kvm->mmu_lock);
17602c00b3aSBen Gardon 
17702c00b3aSBen Gardon 	/* Check for an existing root before allocating a new one. */
17802c00b3aSBen Gardon 	for_each_tdp_mmu_root(kvm, root) {
17902c00b3aSBen Gardon 		if (root->role.word == role.word) {
18002c00b3aSBen Gardon 			kvm_mmu_get_root(kvm, root);
181531810caSBen Gardon 			write_unlock(&kvm->mmu_lock);
18202c00b3aSBen Gardon 			return root;
18302c00b3aSBen Gardon 		}
18402c00b3aSBen Gardon 	}
18502c00b3aSBen Gardon 
18602c00b3aSBen Gardon 	root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
18702c00b3aSBen Gardon 	root->root_count = 1;
18802c00b3aSBen Gardon 
18902c00b3aSBen Gardon 	list_add(&root->link, &kvm->arch.tdp_mmu_roots);
19002c00b3aSBen Gardon 
191531810caSBen Gardon 	write_unlock(&kvm->mmu_lock);
19202c00b3aSBen Gardon 
19302c00b3aSBen Gardon 	return root;
19402c00b3aSBen Gardon }
19502c00b3aSBen Gardon 
19602c00b3aSBen Gardon hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
19702c00b3aSBen Gardon {
19802c00b3aSBen Gardon 	struct kvm_mmu_page *root;
19902c00b3aSBen Gardon 
20002c00b3aSBen Gardon 	root = get_tdp_mmu_vcpu_root(vcpu);
20102c00b3aSBen Gardon 	if (!root)
20202c00b3aSBen Gardon 		return INVALID_PAGE;
20302c00b3aSBen Gardon 
20402c00b3aSBen Gardon 	return __pa(root->spt);
205fe5db27dSBen Gardon }
2062f2fad08SBen Gardon 
2077cca2d0bSBen Gardon static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
2087cca2d0bSBen Gardon {
2097cca2d0bSBen Gardon 	free_page((unsigned long)sp->spt);
2107cca2d0bSBen Gardon 	kmem_cache_free(mmu_page_header_cache, sp);
2117cca2d0bSBen Gardon }
2127cca2d0bSBen Gardon 
2137cca2d0bSBen Gardon /*
2147cca2d0bSBen Gardon  * This is called through call_rcu in order to free TDP page table memory
2157cca2d0bSBen Gardon  * safely with respect to other kernel threads that may be operating on
2167cca2d0bSBen Gardon  * the memory.
2177cca2d0bSBen Gardon  * Because TDP MMU page table memory is only accessed in RCU read-side
2187cca2d0bSBen Gardon  * critical sections and is only freed after a grace period, lockless
2197cca2d0bSBen Gardon  * walkers will never use the memory after it has been freed.
2207cca2d0bSBen Gardon  */
2217cca2d0bSBen Gardon static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
2227cca2d0bSBen Gardon {
2237cca2d0bSBen Gardon 	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
2247cca2d0bSBen Gardon 					       rcu_head);
2257cca2d0bSBen Gardon 
2267cca2d0bSBen Gardon 	tdp_mmu_free_sp(sp);
2277cca2d0bSBen Gardon }
2287cca2d0bSBen Gardon 
2292f2fad08SBen Gardon static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
230*9a77daacSBen Gardon 				u64 old_spte, u64 new_spte, int level,
231*9a77daacSBen Gardon 				bool shared);
2322f2fad08SBen Gardon 
233faaf05b0SBen Gardon static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
234faaf05b0SBen Gardon {
235faaf05b0SBen Gardon 	return sp->role.smm ? 1 : 0;
236faaf05b0SBen Gardon }
237faaf05b0SBen Gardon 
238f8e14497SBen Gardon static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
239f8e14497SBen Gardon {
240f8e14497SBen Gardon 	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
241f8e14497SBen Gardon 
242f8e14497SBen Gardon 	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
243f8e14497SBen Gardon 		return;
244f8e14497SBen Gardon 
245f8e14497SBen Gardon 	if (is_accessed_spte(old_spte) &&
246f8e14497SBen Gardon 	    (!is_accessed_spte(new_spte) || pfn_changed))
247f8e14497SBen Gardon 		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
248f8e14497SBen Gardon }
249f8e14497SBen Gardon 
250a6a0b05dSBen Gardon static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
251a6a0b05dSBen Gardon 					  u64 old_spte, u64 new_spte, int level)
252a6a0b05dSBen Gardon {
253a6a0b05dSBen Gardon 	bool pfn_changed;
254a6a0b05dSBen Gardon 	struct kvm_memory_slot *slot;
255a6a0b05dSBen Gardon 
256a6a0b05dSBen Gardon 	if (level > PG_LEVEL_4K)
257a6a0b05dSBen Gardon 		return;
258a6a0b05dSBen Gardon 
259a6a0b05dSBen Gardon 	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
260a6a0b05dSBen Gardon 
261a6a0b05dSBen Gardon 	if ((!is_writable_pte(old_spte) || pfn_changed) &&
262a6a0b05dSBen Gardon 	    is_writable_pte(new_spte)) {
263a6a0b05dSBen Gardon 		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
264fb04a1edSPeter Xu 		mark_page_dirty_in_slot(kvm, slot, gfn);
265a6a0b05dSBen Gardon 	}
266a6a0b05dSBen Gardon }
267a6a0b05dSBen Gardon 
2682f2fad08SBen Gardon /**
269a9442f59SBen Gardon  * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
270a9442f59SBen Gardon  *
271a9442f59SBen Gardon  * @kvm: kvm instance
272a9442f59SBen Gardon  * @sp: the new page
273*9a77daacSBen Gardon  * @shared: This operation may not be running under the exclusive use of
274*9a77daacSBen Gardon  *	    the MMU lock and the operation must synchronize with other
275*9a77daacSBen Gardon  *	    threads that might be adding or removing pages.
276a9442f59SBen Gardon  * @account_nx: This page replaces a NX large page and should be marked for
277a9442f59SBen Gardon  *		eventual reclaim.
278a9442f59SBen Gardon  */
279a9442f59SBen Gardon static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
280*9a77daacSBen Gardon 			      bool shared, bool account_nx)
281a9442f59SBen Gardon {
282*9a77daacSBen Gardon 	if (shared)
283*9a77daacSBen Gardon 		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
284*9a77daacSBen Gardon 	else
285a9442f59SBen Gardon 		lockdep_assert_held_write(&kvm->mmu_lock);
286a9442f59SBen Gardon 
287a9442f59SBen Gardon 	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
288a9442f59SBen Gardon 	if (account_nx)
289a9442f59SBen Gardon 		account_huge_nx_page(kvm, sp);
290*9a77daacSBen Gardon 
291*9a77daacSBen Gardon 	if (shared)
292*9a77daacSBen Gardon 		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
293a9442f59SBen Gardon }
294a9442f59SBen Gardon 
295a9442f59SBen Gardon /**
296a9442f59SBen Gardon  * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
297a9442f59SBen Gardon  *
298a9442f59SBen Gardon  * @kvm: kvm instance
299a9442f59SBen Gardon  * @sp: the page to be removed
300*9a77daacSBen Gardon  * @shared: This operation may not be running under the exclusive use of
301*9a77daacSBen Gardon  *	    the MMU lock and the operation must synchronize with other
302*9a77daacSBen Gardon  *	    threads that might be adding or removing pages.
303a9442f59SBen Gardon  */
304*9a77daacSBen Gardon static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
305*9a77daacSBen Gardon 				bool shared)
306a9442f59SBen Gardon {
307*9a77daacSBen Gardon 	if (shared)
308*9a77daacSBen Gardon 		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
309*9a77daacSBen Gardon 	else
310a9442f59SBen Gardon 		lockdep_assert_held_write(&kvm->mmu_lock);
311a9442f59SBen Gardon 
312a9442f59SBen Gardon 	list_del(&sp->link);
313a9442f59SBen Gardon 	if (sp->lpage_disallowed)
314a9442f59SBen Gardon 		unaccount_huge_nx_page(kvm, sp);
315*9a77daacSBen Gardon 
316*9a77daacSBen Gardon 	if (shared)
317*9a77daacSBen Gardon 		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
318a9442f59SBen Gardon }
319a9442f59SBen Gardon 
320a9442f59SBen Gardon /**
321a066e61fSBen Gardon  * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
322a066e61fSBen Gardon  *
323a066e61fSBen Gardon  * @kvm: kvm instance
324a066e61fSBen Gardon  * @pt: the page removed from the paging structure
325*9a77daacSBen Gardon  * @shared: This operation may not be running under the exclusive use
326*9a77daacSBen Gardon  *	    of the MMU lock and the operation must synchronize with other
327*9a77daacSBen Gardon  *	    threads that might be modifying SPTEs.
328a066e61fSBen Gardon  *
329a066e61fSBen Gardon  * Given a page table that has been removed from the TDP paging structure,
330a066e61fSBen Gardon  * iterates through the page table to clear SPTEs and free child page tables.
331a066e61fSBen Gardon  */
332*9a77daacSBen Gardon static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt,
333*9a77daacSBen Gardon 					bool shared)
334a066e61fSBen Gardon {
335a066e61fSBen Gardon 	struct kvm_mmu_page *sp = sptep_to_sp(pt);
336a066e61fSBen Gardon 	int level = sp->role.level;
337a066e61fSBen Gardon 	gfn_t gfn = sp->gfn;
338a066e61fSBen Gardon 	u64 old_child_spte;
339*9a77daacSBen Gardon 	u64 *sptep;
340a066e61fSBen Gardon 	int i;
341a066e61fSBen Gardon 
342a066e61fSBen Gardon 	trace_kvm_mmu_prepare_zap_page(sp);
343a066e61fSBen Gardon 
344*9a77daacSBen Gardon 	tdp_mmu_unlink_page(kvm, sp, shared);
345a066e61fSBen Gardon 
346a066e61fSBen Gardon 	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
347*9a77daacSBen Gardon 		sptep = pt + i;
348*9a77daacSBen Gardon 
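		/*
		 * Under a shared (read-held) MMU lock, zap the child SPTE
		 * with an atomic exchange so its old value is read and
		 * cleared in a single step and cannot be lost to a racing
		 * writer; with the lock held exclusively, a plain
		 * READ_ONCE()/WRITE_ONCE() pair is sufficient.
		 */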
349*9a77daacSBen Gardon 		if (shared) {
350*9a77daacSBen Gardon 			old_child_spte = xchg(sptep, 0);
351*9a77daacSBen Gardon 		} else {
352*9a77daacSBen Gardon 			old_child_spte = READ_ONCE(*sptep);
353*9a77daacSBen Gardon 			WRITE_ONCE(*sptep, 0);
354*9a77daacSBen Gardon 		}
355a066e61fSBen Gardon 		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp),
356a066e61fSBen Gardon 			gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)),
357*9a77daacSBen Gardon 			old_child_spte, 0, level - 1, shared);
358a066e61fSBen Gardon 	}
359a066e61fSBen Gardon 
360a066e61fSBen Gardon 	kvm_flush_remote_tlbs_with_address(kvm, gfn,
361a066e61fSBen Gardon 					   KVM_PAGES_PER_HPAGE(level));
362a066e61fSBen Gardon 
3637cca2d0bSBen Gardon 	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
364a066e61fSBen Gardon }
365a066e61fSBen Gardon 
366a066e61fSBen Gardon /**
3672f2fad08SBen Gardon  * handle_changed_spte - handle bookkeeping associated with an SPTE change
3682f2fad08SBen Gardon  * @kvm: kvm instance
3692f2fad08SBen Gardon  * @as_id: the address space of the paging structure the SPTE was a part of
3702f2fad08SBen Gardon  * @gfn: the base GFN that was mapped by the SPTE
3712f2fad08SBen Gardon  * @old_spte: The value of the SPTE before the change
3722f2fad08SBen Gardon  * @new_spte: The value of the SPTE after the change
3732f2fad08SBen Gardon  * @level: the level of the PT the SPTE is part of in the paging structure
374*9a77daacSBen Gardon  * @shared: This operation may not be running under the exclusive use of
375*9a77daacSBen Gardon  *	    the MMU lock and the operation must synchronize with other
376*9a77daacSBen Gardon  *	    threads that might be modifying SPTEs.
3772f2fad08SBen Gardon  *
3782f2fad08SBen Gardon  * Handle bookkeeping that might result from the modification of a SPTE.
3792f2fad08SBen Gardon  * This function must be called for all TDP SPTE modifications.
3802f2fad08SBen Gardon  */
3812f2fad08SBen Gardon static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
382*9a77daacSBen Gardon 				  u64 old_spte, u64 new_spte, int level,
383*9a77daacSBen Gardon 				  bool shared)
3842f2fad08SBen Gardon {
3852f2fad08SBen Gardon 	bool was_present = is_shadow_present_pte(old_spte);
3862f2fad08SBen Gardon 	bool is_present = is_shadow_present_pte(new_spte);
3872f2fad08SBen Gardon 	bool was_leaf = was_present && is_last_spte(old_spte, level);
3882f2fad08SBen Gardon 	bool is_leaf = is_present && is_last_spte(new_spte, level);
3892f2fad08SBen Gardon 	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
3902f2fad08SBen Gardon 
3912f2fad08SBen Gardon 	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
3922f2fad08SBen Gardon 	WARN_ON(level < PG_LEVEL_4K);
393764388ceSSean Christopherson 	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
3942f2fad08SBen Gardon 
3952f2fad08SBen Gardon 	/*
3962f2fad08SBen Gardon 	 * If this warning were to trigger it would indicate that there was a
3972f2fad08SBen Gardon 	 * missing MMU notifier or a race with some notifier handler.
3982f2fad08SBen Gardon 	 * A present, leaf SPTE should never be directly replaced with another
3992f2fad08SBen Gardon 	 * present leaf SPTE pointing to a different PFN. A notifier handler
4002f2fad08SBen Gardon 	 * should be zapping the SPTE before the main MM's page table is
4012f2fad08SBen Gardon 	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
4022f2fad08SBen Gardon 	 * thread before replacement.
4032f2fad08SBen Gardon 	 */
4042f2fad08SBen Gardon 	if (was_leaf && is_leaf && pfn_changed) {
4052f2fad08SBen Gardon 		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
4062f2fad08SBen Gardon 		       "SPTE with another present leaf SPTE mapping a\n"
4072f2fad08SBen Gardon 		       "different PFN!\n"
4082f2fad08SBen Gardon 		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
4092f2fad08SBen Gardon 		       as_id, gfn, old_spte, new_spte, level);
4102f2fad08SBen Gardon 
4112f2fad08SBen Gardon 		/*
4122f2fad08SBen Gardon 		 * Crash the host to prevent error propagation and guest data
4132f2fad08SBen Gardon 		 * corruption.
4142f2fad08SBen Gardon 		 */
4152f2fad08SBen Gardon 		BUG();
4162f2fad08SBen Gardon 	}
4172f2fad08SBen Gardon 
4182f2fad08SBen Gardon 	if (old_spte == new_spte)
4192f2fad08SBen Gardon 		return;
4202f2fad08SBen Gardon 
421b9a98c34SBen Gardon 	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
422b9a98c34SBen Gardon 
4232f2fad08SBen Gardon 	/*
4242f2fad08SBen Gardon 	 * The only time an SPTE should be changed from a non-present to a
4252f2fad08SBen Gardon 	 * non-present state is when an MMIO entry is installed/modified/
4262f2fad08SBen Gardon 	 * removed. In that case, there is nothing to do here.
4272f2fad08SBen Gardon 	 */
4282f2fad08SBen Gardon 	if (!was_present && !is_present) {
4292f2fad08SBen Gardon 		/*
4302f2fad08SBen Gardon 		 * If this change does not involve a MMIO SPTE, it is
4312f2fad08SBen Gardon 		 * unexpected. Log the change, though it should not impact the
4322f2fad08SBen Gardon 		 * guest since both the former and current SPTEs are nonpresent.
4332f2fad08SBen Gardon 		 */
4342f2fad08SBen Gardon 		if (WARN_ON(!is_mmio_spte(old_spte) && !is_mmio_spte(new_spte)))
4352f2fad08SBen Gardon 			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
4362f2fad08SBen Gardon 			       "should not be replaced with another,\n"
4372f2fad08SBen Gardon 			       "different nonpresent SPTE, unless one or both\n"
4382f2fad08SBen Gardon 			       "are MMIO SPTEs.\n"
4392f2fad08SBen Gardon 			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
4402f2fad08SBen Gardon 			       as_id, gfn, old_spte, new_spte, level);
4412f2fad08SBen Gardon 		return;
4422f2fad08SBen Gardon 	}
4432f2fad08SBen Gardon 
4452f2fad08SBen Gardon 	if (was_leaf && is_dirty_spte(old_spte) &&
4462f2fad08SBen Gardon 	    (!is_dirty_spte(new_spte) || pfn_changed))
4472f2fad08SBen Gardon 		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
4482f2fad08SBen Gardon 
4492f2fad08SBen Gardon 	/*
4502f2fad08SBen Gardon 	 * Recursively handle child PTs if the change removed a subtree from
4512f2fad08SBen Gardon 	 * the paging structure.
4522f2fad08SBen Gardon 	 */
453a066e61fSBen Gardon 	if (was_present && !was_leaf && (pfn_changed || !is_present))
454a066e61fSBen Gardon 		handle_removed_tdp_mmu_page(kvm,
455*9a77daacSBen Gardon 				spte_to_child_pt(old_spte, level), shared);
4562f2fad08SBen Gardon }
4572f2fad08SBen Gardon 
4582f2fad08SBen Gardon static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
459*9a77daacSBen Gardon 				u64 old_spte, u64 new_spte, int level,
460*9a77daacSBen Gardon 				bool shared)
4612f2fad08SBen Gardon {
462*9a77daacSBen Gardon 	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
463*9a77daacSBen Gardon 			      shared);
464f8e14497SBen Gardon 	handle_changed_spte_acc_track(old_spte, new_spte, level);
465a6a0b05dSBen Gardon 	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
466a6a0b05dSBen Gardon 				      new_spte, level);
4672f2fad08SBen Gardon }
468faaf05b0SBen Gardon 
469fe43fa2fSBen Gardon /*
470*9a77daacSBen Gardon  * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the
471*9a77daacSBen Gardon  * associated bookkeeping
472*9a77daacSBen Gardon  *
473*9a77daacSBen Gardon  * @kvm: kvm instance
474*9a77daacSBen Gardon  * @iter: a tdp_iter instance currently on the SPTE that should be set
475*9a77daacSBen Gardon  * @new_spte: The value the SPTE should be set to
476*9a77daacSBen Gardon  * Returns: true if the SPTE was set, false if it was not. If false is returned,
477*9a77daacSBen Gardon  *	    this function will have no side-effects.
478*9a77daacSBen Gardon  */
479*9a77daacSBen Gardon static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
480*9a77daacSBen Gardon 					   struct tdp_iter *iter,
481*9a77daacSBen Gardon 					   u64 new_spte)
482*9a77daacSBen Gardon {
483*9a77daacSBen Gardon 	u64 *root_pt = tdp_iter_root_pt(iter);
484*9a77daacSBen Gardon 	struct kvm_mmu_page *root = sptep_to_sp(root_pt);
485*9a77daacSBen Gardon 	int as_id = kvm_mmu_page_as_id(root);
486*9a77daacSBen Gardon 
487*9a77daacSBen Gardon 	lockdep_assert_held_read(&kvm->mmu_lock);
488*9a77daacSBen Gardon 
489*9a77daacSBen Gardon 	if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
490*9a77daacSBen Gardon 		      new_spte) != iter->old_spte)
491*9a77daacSBen Gardon 		return false;
492*9a77daacSBen Gardon 
493*9a77daacSBen Gardon 	handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
494*9a77daacSBen Gardon 			    iter->level, true);
495*9a77daacSBen Gardon 
496*9a77daacSBen Gardon 	return true;
497*9a77daacSBen Gardon }
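
/*
 * A typical caller pattern, as used by the page fault path later in this
 * file: retry the fault (or abandon the walk) when the cmpxchg loses a race,
 * since a false return guarantees no side effects were applied.
 *
 *	if (!tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, new_spte))
 *		return RET_PF_RETRY;
 */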
498*9a77daacSBen Gardon 
500*9a77daacSBen Gardon /*
501fe43fa2fSBen Gardon  * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
502fe43fa2fSBen Gardon  * @kvm: kvm instance
503fe43fa2fSBen Gardon  * @iter: a tdp_iter instance currently on the SPTE that should be set
504fe43fa2fSBen Gardon  * @new_spte: The value the SPTE should be set to
505fe43fa2fSBen Gardon  * @record_acc_track: Notify the MM subsystem of changes to the accessed state
506fe43fa2fSBen Gardon  *		      of the page. Should be set unless handling an MMU
507fe43fa2fSBen Gardon  *		      notifier for access tracking. Leaving record_acc_track
508fe43fa2fSBen Gardon  *		      unset in that case prevents page accesses from being
509fe43fa2fSBen Gardon  *		      double counted.
510fe43fa2fSBen Gardon  * @record_dirty_log: Record the page as dirty in the dirty bitmap if
511fe43fa2fSBen Gardon  *		      appropriate for the change being made. Should be set
512fe43fa2fSBen Gardon  *		      unless performing certain dirty logging operations.
513fe43fa2fSBen Gardon  *		      Leaving record_dirty_log unset in that case prevents page
514fe43fa2fSBen Gardon  *		      writes from being double counted.
515fe43fa2fSBen Gardon  */
516f8e14497SBen Gardon static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
517a6a0b05dSBen Gardon 				      u64 new_spte, bool record_acc_track,
518a6a0b05dSBen Gardon 				      bool record_dirty_log)
519faaf05b0SBen Gardon {
5207cca2d0bSBen Gardon 	tdp_ptep_t root_pt = tdp_iter_root_pt(iter);
521faaf05b0SBen Gardon 	struct kvm_mmu_page *root = sptep_to_sp(root_pt);
522faaf05b0SBen Gardon 	int as_id = kvm_mmu_page_as_id(root);
523faaf05b0SBen Gardon 
524531810caSBen Gardon 	lockdep_assert_held_write(&kvm->mmu_lock);
5253a9a4aa5SBen Gardon 
5267cca2d0bSBen Gardon 	WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
527faaf05b0SBen Gardon 
528f8e14497SBen Gardon 	__handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
529*9a77daacSBen Gardon 			      iter->level, false);
530f8e14497SBen Gardon 	if (record_acc_track)
531f8e14497SBen Gardon 		handle_changed_spte_acc_track(iter->old_spte, new_spte,
532f8e14497SBen Gardon 					      iter->level);
533a6a0b05dSBen Gardon 	if (record_dirty_log)
534a6a0b05dSBen Gardon 		handle_changed_spte_dirty_log(kvm, as_id, iter->gfn,
535a6a0b05dSBen Gardon 					      iter->old_spte, new_spte,
536a6a0b05dSBen Gardon 					      iter->level);
537f8e14497SBen Gardon }
538f8e14497SBen Gardon 
539f8e14497SBen Gardon static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
540f8e14497SBen Gardon 				    u64 new_spte)
541f8e14497SBen Gardon {
542a6a0b05dSBen Gardon 	__tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
543f8e14497SBen Gardon }
544f8e14497SBen Gardon 
545f8e14497SBen Gardon static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
546f8e14497SBen Gardon 						 struct tdp_iter *iter,
547f8e14497SBen Gardon 						 u64 new_spte)
548f8e14497SBen Gardon {
549a6a0b05dSBen Gardon 	__tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
550a6a0b05dSBen Gardon }
551a6a0b05dSBen Gardon 
552a6a0b05dSBen Gardon static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
553a6a0b05dSBen Gardon 						 struct tdp_iter *iter,
554a6a0b05dSBen Gardon 						 u64 new_spte)
555a6a0b05dSBen Gardon {
556a6a0b05dSBen Gardon 	__tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
557faaf05b0SBen Gardon }
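
/*
 * Wrapper summary, derived from the callers in this file: tdp_mmu_set_spte()
 * records both accessed and dirty state and is the default; the
 * _no_acc_track variant is used by the access-tracking/aging MMU notifier
 * paths (e.g. age_gfn_range()) to avoid double counting accesses; the
 * _no_dirty_log variant is used by the dirty-logging paths (e.g.
 * wrprot_gfn_range() and clear_dirty_gfn_range()) to avoid double counting
 * writes.
 */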
558faaf05b0SBen Gardon 
559faaf05b0SBen Gardon #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
560faaf05b0SBen Gardon 	for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)
561faaf05b0SBen Gardon 
562f8e14497SBen Gardon #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
563f8e14497SBen Gardon 	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
564f8e14497SBen Gardon 		if (!is_shadow_present_pte(_iter.old_spte) ||		\
565f8e14497SBen Gardon 		    !is_last_spte(_iter.old_spte, _iter.level))		\
566f8e14497SBen Gardon 			continue;					\
567f8e14497SBen Gardon 		else
568f8e14497SBen Gardon 
569bb18842eSBen Gardon #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
570bb18842eSBen Gardon 	for_each_tdp_pte(_iter, __va(_mmu->root_hpa),		\
571bb18842eSBen Gardon 			 _mmu->shadow_root_level, _start, _end)
572bb18842eSBen Gardon 
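
/*
 * Iterator summary, from the uses in this file: tdp_root_for_each_pte()
 * visits every SPTE of a given root, tdp_root_for_each_leaf_pte()
 * additionally skips non-present and non-leaf entries, and
 * tdp_mmu_for_each_pte() walks from the vCPU's current root (mmu->root_hpa),
 * as used by kvm_tdp_mmu_map().
 */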
573faaf05b0SBen Gardon /*
574e28a436cSBen Gardon  * Yield if the MMU lock is contended or this thread needs to return control
575e28a436cSBen Gardon  * to the scheduler.
576e28a436cSBen Gardon  *
577e139a34eSBen Gardon  * If this function should yield and flush is set, it will perform a remote
578e139a34eSBen Gardon  * TLB flush before yielding.
579e139a34eSBen Gardon  *
580e28a436cSBen Gardon  * If this function yields, it will also reset the tdp_iter's walk over the
581ed5e484bSBen Gardon  * paging structure and the calling function should skip to the next
582ed5e484bSBen Gardon  * iteration to allow the iterator to continue its traversal from the
583ed5e484bSBen Gardon  * paging structure root.
584e28a436cSBen Gardon  *
585e28a436cSBen Gardon  * Return true if this function yielded and the iterator's traversal was reset.
586e28a436cSBen Gardon  * Return false if a yield was not needed.
587e28a436cSBen Gardon  */
588e139a34eSBen Gardon static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
589e139a34eSBen Gardon 					     struct tdp_iter *iter, bool flush)
590a6a0b05dSBen Gardon {
591ed5e484bSBen Gardon 	/* Ensure forward progress has been made before yielding. */
592ed5e484bSBen Gardon 	if (iter->next_last_level_gfn == iter->yielded_gfn)
593ed5e484bSBen Gardon 		return false;
594ed5e484bSBen Gardon 
595531810caSBen Gardon 	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
5967cca2d0bSBen Gardon 		rcu_read_unlock();
5977cca2d0bSBen Gardon 
598e139a34eSBen Gardon 		if (flush)
599e139a34eSBen Gardon 			kvm_flush_remote_tlbs(kvm);
600e139a34eSBen Gardon 
601531810caSBen Gardon 		cond_resched_rwlock_write(&kvm->mmu_lock);
6027cca2d0bSBen Gardon 		rcu_read_lock();
603ed5e484bSBen Gardon 
604ed5e484bSBen Gardon 		WARN_ON(iter->gfn > iter->next_last_level_gfn);
605ed5e484bSBen Gardon 
606ed5e484bSBen Gardon 		tdp_iter_start(iter, iter->pt_path[iter->root_level - 1],
607ed5e484bSBen Gardon 			       iter->root_level, iter->min_level,
608ed5e484bSBen Gardon 			       iter->next_last_level_gfn);
609ed5e484bSBen Gardon 
610e28a436cSBen Gardon 		return true;
611a6a0b05dSBen Gardon 	}
612e28a436cSBen Gardon 
613e28a436cSBen Gardon 	return false;
614a6a0b05dSBen Gardon }
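
/*
 * A minimal caller sketch, following zap_gfn_range() below: a yield resets
 * the walk, so the caller must "continue" rather than act on the current
 * iterator position, and must clear any pending-flush state the helper has
 * already handled.
 *
 *	if (can_yield &&
 *	    tdp_mmu_iter_cond_resched(kvm, &iter, flush_needed)) {
 *		flush_needed = false;
 *		continue;
 *	}
 */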
615a6a0b05dSBen Gardon 
616faaf05b0SBen Gardon /*
617faaf05b0SBen Gardon  * Tears down the mappings for the range of gfns, [start, end), and frees the
618faaf05b0SBen Gardon  * non-root pages mapping GFNs strictly within that range. Returns true if
619faaf05b0SBen Gardon  * SPTEs have been cleared and a TLB flush is needed before releasing the
620faaf05b0SBen Gardon  * MMU lock.
621063afacdSBen Gardon  * If can_yield is true, will release the MMU lock and reschedule if the
622063afacdSBen Gardon  * scheduler needs the CPU or there is contention on the MMU lock. If this
623063afacdSBen Gardon  * function cannot yield, it will not release the MMU lock or reschedule and
624063afacdSBen Gardon  * the caller must ensure it does not supply too large a GFN range, or the
625063afacdSBen Gardon  * operation can cause a soft lockup.
626faaf05b0SBen Gardon  */
627faaf05b0SBen Gardon static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
628063afacdSBen Gardon 			  gfn_t start, gfn_t end, bool can_yield)
629faaf05b0SBen Gardon {
630faaf05b0SBen Gardon 	struct tdp_iter iter;
631faaf05b0SBen Gardon 	bool flush_needed = false;
632faaf05b0SBen Gardon 
6337cca2d0bSBen Gardon 	rcu_read_lock();
6347cca2d0bSBen Gardon 
635faaf05b0SBen Gardon 	tdp_root_for_each_pte(iter, root, start, end) {
6361af4a960SBen Gardon 		if (can_yield &&
6371af4a960SBen Gardon 		    tdp_mmu_iter_cond_resched(kvm, &iter, flush_needed)) {
6381af4a960SBen Gardon 			flush_needed = false;
6391af4a960SBen Gardon 			continue;
6401af4a960SBen Gardon 		}
6411af4a960SBen Gardon 
642faaf05b0SBen Gardon 		if (!is_shadow_present_pte(iter.old_spte))
643faaf05b0SBen Gardon 			continue;
644faaf05b0SBen Gardon 
645faaf05b0SBen Gardon 		/*
646faaf05b0SBen Gardon 		 * If this is a non-last-level SPTE that covers a larger range
647faaf05b0SBen Gardon 		 * than should be zapped, continue, and zap the mappings at a
648faaf05b0SBen Gardon 		 * lower level.
649faaf05b0SBen Gardon 		 */
650faaf05b0SBen Gardon 		if ((iter.gfn < start ||
651faaf05b0SBen Gardon 		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
652faaf05b0SBen Gardon 		    !is_last_spte(iter.old_spte, iter.level))
653faaf05b0SBen Gardon 			continue;
654faaf05b0SBen Gardon 
655faaf05b0SBen Gardon 		tdp_mmu_set_spte(kvm, &iter, 0);
6561af4a960SBen Gardon 		flush_needed = true;
657faaf05b0SBen Gardon 	}
6587cca2d0bSBen Gardon 
6597cca2d0bSBen Gardon 	rcu_read_unlock();
660faaf05b0SBen Gardon 	return flush_needed;
661faaf05b0SBen Gardon }
662faaf05b0SBen Gardon 
663faaf05b0SBen Gardon /*
664faaf05b0SBen Gardon  * Tears down the mappings for the range of gfns, [start, end), and frees the
665faaf05b0SBen Gardon  * non-root pages mapping GFNs strictly within that range. Returns true if
666faaf05b0SBen Gardon  * SPTEs have been cleared and a TLB flush is needed before releasing the
667faaf05b0SBen Gardon  * MMU lock.
668faaf05b0SBen Gardon  */
669faaf05b0SBen Gardon bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end)
670faaf05b0SBen Gardon {
671faaf05b0SBen Gardon 	struct kvm_mmu_page *root;
672faaf05b0SBen Gardon 	bool flush = false;
673faaf05b0SBen Gardon 
674a889ea54SBen Gardon 	for_each_tdp_mmu_root_yield_safe(kvm, root)
675063afacdSBen Gardon 		flush |= zap_gfn_range(kvm, root, start, end, true);
676faaf05b0SBen Gardon 
677faaf05b0SBen Gardon 	return flush;
678faaf05b0SBen Gardon }
679faaf05b0SBen Gardon 
680faaf05b0SBen Gardon void kvm_tdp_mmu_zap_all(struct kvm *kvm)
681faaf05b0SBen Gardon {
682339f5a7fSRick Edgecombe 	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
683faaf05b0SBen Gardon 	bool flush;
684faaf05b0SBen Gardon 
685faaf05b0SBen Gardon 	flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn);
686faaf05b0SBen Gardon 	if (flush)
687faaf05b0SBen Gardon 		kvm_flush_remote_tlbs(kvm);
688faaf05b0SBen Gardon }
689bb18842eSBen Gardon 
690bb18842eSBen Gardon /*
691bb18842eSBen Gardon  * Installs a last-level SPTE to handle a TDP page fault.
692bb18842eSBen Gardon  * (NPT/EPT violation/misconfiguration)
693bb18842eSBen Gardon  */
694bb18842eSBen Gardon static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
695bb18842eSBen Gardon 					  int map_writable,
696bb18842eSBen Gardon 					  struct tdp_iter *iter,
697bb18842eSBen Gardon 					  kvm_pfn_t pfn, bool prefault)
698bb18842eSBen Gardon {
699bb18842eSBen Gardon 	u64 new_spte;
700bb18842eSBen Gardon 	int ret = 0;
701bb18842eSBen Gardon 	int make_spte_ret = 0;
702bb18842eSBen Gardon 
703*9a77daacSBen Gardon 	if (unlikely(is_noslot_pfn(pfn)))
704bb18842eSBen Gardon 		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
705*9a77daacSBen Gardon 	else
706bb18842eSBen Gardon 		make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
707bb18842eSBen Gardon 					 pfn, iter->old_spte, prefault, true,
708bb18842eSBen Gardon 					 map_writable, !shadow_accessed_mask,
709bb18842eSBen Gardon 					 &new_spte);
710bb18842eSBen Gardon 
711bb18842eSBen Gardon 	if (new_spte == iter->old_spte)
712bb18842eSBen Gardon 		ret = RET_PF_SPURIOUS;
713*9a77daacSBen Gardon 	else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
714*9a77daacSBen Gardon 		return RET_PF_RETRY;
715bb18842eSBen Gardon 
716bb18842eSBen Gardon 	/*
717bb18842eSBen Gardon 	 * If the page fault was caused by a write but the page is write
718bb18842eSBen Gardon 	 * protected, emulation is needed. If the emulation was skipped,
719bb18842eSBen Gardon 	 * the vCPU would have the same fault again.
720bb18842eSBen Gardon 	 */
721bb18842eSBen Gardon 	if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
722bb18842eSBen Gardon 		if (write)
723bb18842eSBen Gardon 			ret = RET_PF_EMULATE;
724bb18842eSBen Gardon 		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
725bb18842eSBen Gardon 	}
726bb18842eSBen Gardon 
727bb18842eSBen Gardon 	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
728*9a77daacSBen Gardon 	if (unlikely(is_mmio_spte(new_spte))) {
729*9a77daacSBen Gardon 		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
730*9a77daacSBen Gardon 				     new_spte);
731bb18842eSBen Gardon 		ret = RET_PF_EMULATE;
732*9a77daacSBen Gardon 	} else
733*9a77daacSBen Gardon 		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
734*9a77daacSBen Gardon 				       rcu_dereference(iter->sptep));
735bb18842eSBen Gardon 
738bb18842eSBen Gardon 	if (!prefault)
739bb18842eSBen Gardon 		vcpu->stat.pf_fixed++;
740bb18842eSBen Gardon 
741bb18842eSBen Gardon 	return ret;
742bb18842eSBen Gardon }
743bb18842eSBen Gardon 
744bb18842eSBen Gardon /*
745bb18842eSBen Gardon  * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
746bb18842eSBen Gardon  * page tables and SPTEs to translate the faulting guest physical address.
747bb18842eSBen Gardon  */
748bb18842eSBen Gardon int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
749bb18842eSBen Gardon 		    int map_writable, int max_level, kvm_pfn_t pfn,
750bb18842eSBen Gardon 		    bool prefault)
751bb18842eSBen Gardon {
752bb18842eSBen Gardon 	bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
753bb18842eSBen Gardon 	bool write = error_code & PFERR_WRITE_MASK;
754bb18842eSBen Gardon 	bool exec = error_code & PFERR_FETCH_MASK;
755bb18842eSBen Gardon 	bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
756bb18842eSBen Gardon 	struct kvm_mmu *mmu = vcpu->arch.mmu;
757bb18842eSBen Gardon 	struct tdp_iter iter;
75889c0fd49SBen Gardon 	struct kvm_mmu_page *sp;
759bb18842eSBen Gardon 	u64 *child_pt;
760bb18842eSBen Gardon 	u64 new_spte;
761bb18842eSBen Gardon 	int ret;
762bb18842eSBen Gardon 	gfn_t gfn = gpa >> PAGE_SHIFT;
763bb18842eSBen Gardon 	int level;
764bb18842eSBen Gardon 	int req_level;
765bb18842eSBen Gardon 
766bb18842eSBen Gardon 	if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
767bb18842eSBen Gardon 		return RET_PF_RETRY;
768bb18842eSBen Gardon 	if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
769bb18842eSBen Gardon 		return RET_PF_RETRY;
770bb18842eSBen Gardon 
771bb18842eSBen Gardon 	level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
772bb18842eSBen Gardon 					huge_page_disallowed, &req_level);
773bb18842eSBen Gardon 
774bb18842eSBen Gardon 	trace_kvm_mmu_spte_requested(gpa, level, pfn);
7757cca2d0bSBen Gardon 
7767cca2d0bSBen Gardon 	rcu_read_lock();
7777cca2d0bSBen Gardon 
778bb18842eSBen Gardon 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
779bb18842eSBen Gardon 		if (nx_huge_page_workaround_enabled)
780bb18842eSBen Gardon 			disallowed_hugepage_adjust(iter.old_spte, gfn,
781bb18842eSBen Gardon 						   iter.level, &pfn, &level);
782bb18842eSBen Gardon 
783bb18842eSBen Gardon 		if (iter.level == level)
784bb18842eSBen Gardon 			break;
785bb18842eSBen Gardon 
786bb18842eSBen Gardon 		/*
787bb18842eSBen Gardon 		 * If there is an SPTE mapping a large page at a higher level
788bb18842eSBen Gardon 		 * than the target, that SPTE must be cleared and replaced
789bb18842eSBen Gardon 		 * with a non-leaf SPTE.
790bb18842eSBen Gardon 		 */
791bb18842eSBen Gardon 		if (is_shadow_present_pte(iter.old_spte) &&
792bb18842eSBen Gardon 		    is_large_pte(iter.old_spte)) {
793*9a77daacSBen Gardon 			if (!tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, 0))
794*9a77daacSBen Gardon 				break;
795bb18842eSBen Gardon 
796bb18842eSBen Gardon 			kvm_flush_remote_tlbs_with_address(vcpu->kvm, iter.gfn,
797bb18842eSBen Gardon 					KVM_PAGES_PER_HPAGE(iter.level));
798bb18842eSBen Gardon 
799bb18842eSBen Gardon 			/*
800bb18842eSBen Gardon 			 * The iter must explicitly re-read the spte here
801bb18842eSBen Gardon 			 * because the new value informs the !present
802bb18842eSBen Gardon 			 * path below.
803bb18842eSBen Gardon 			 */
8047cca2d0bSBen Gardon 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
805bb18842eSBen Gardon 		}
806bb18842eSBen Gardon 
807bb18842eSBen Gardon 		if (!is_shadow_present_pte(iter.old_spte)) {
80889c0fd49SBen Gardon 			sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
80989c0fd49SBen Gardon 			child_pt = sp->spt;
810a9442f59SBen Gardon 
811bb18842eSBen Gardon 			new_spte = make_nonleaf_spte(child_pt,
812bb18842eSBen Gardon 						     !shadow_accessed_mask);
813bb18842eSBen Gardon 
814*9a77daacSBen Gardon 			if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
815*9a77daacSBen Gardon 						    new_spte)) {
816*9a77daacSBen Gardon 				tdp_mmu_link_page(vcpu->kvm, sp, true,
817*9a77daacSBen Gardon 						  huge_page_disallowed &&
818*9a77daacSBen Gardon 						  req_level >= iter.level);
819*9a77daacSBen Gardon 
820bb18842eSBen Gardon 				trace_kvm_mmu_get_page(sp, true);
821*9a77daacSBen Gardon 			} else {
822*9a77daacSBen Gardon 				tdp_mmu_free_sp(sp);
823*9a77daacSBen Gardon 				break;
824*9a77daacSBen Gardon 			}
825bb18842eSBen Gardon 		}
826bb18842eSBen Gardon 	}
827bb18842eSBen Gardon 
828*9a77daacSBen Gardon 	if (iter.level != level) {
8297cca2d0bSBen Gardon 		rcu_read_unlock();
830bb18842eSBen Gardon 		return RET_PF_RETRY;
8317cca2d0bSBen Gardon 	}
832bb18842eSBen Gardon 
833bb18842eSBen Gardon 	ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
834bb18842eSBen Gardon 					      pfn, prefault);
8357cca2d0bSBen Gardon 	rcu_read_unlock();
836bb18842eSBen Gardon 
837bb18842eSBen Gardon 	return ret;
838bb18842eSBen Gardon }
839063afacdSBen Gardon 
840063afacdSBen Gardon static int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, unsigned long start,
841063afacdSBen Gardon 		unsigned long end, unsigned long data,
842063afacdSBen Gardon 		int (*handler)(struct kvm *kvm, struct kvm_memory_slot *slot,
843063afacdSBen Gardon 			       struct kvm_mmu_page *root, gfn_t start,
844063afacdSBen Gardon 			       gfn_t end, unsigned long data))
845063afacdSBen Gardon {
846063afacdSBen Gardon 	struct kvm_memslots *slots;
847063afacdSBen Gardon 	struct kvm_memory_slot *memslot;
848063afacdSBen Gardon 	struct kvm_mmu_page *root;
849063afacdSBen Gardon 	int ret = 0;
850063afacdSBen Gardon 	int as_id;
851063afacdSBen Gardon 
852a889ea54SBen Gardon 	for_each_tdp_mmu_root_yield_safe(kvm, root) {
853063afacdSBen Gardon 		as_id = kvm_mmu_page_as_id(root);
854063afacdSBen Gardon 		slots = __kvm_memslots(kvm, as_id);
855063afacdSBen Gardon 		kvm_for_each_memslot(memslot, slots) {
856063afacdSBen Gardon 			unsigned long hva_start, hva_end;
857063afacdSBen Gardon 			gfn_t gfn_start, gfn_end;
858063afacdSBen Gardon 
859063afacdSBen Gardon 			hva_start = max(start, memslot->userspace_addr);
860063afacdSBen Gardon 			hva_end = min(end, memslot->userspace_addr +
861063afacdSBen Gardon 				      (memslot->npages << PAGE_SHIFT));
862063afacdSBen Gardon 			if (hva_start >= hva_end)
863063afacdSBen Gardon 				continue;
864063afacdSBen Gardon 			/*
865063afacdSBen Gardon 			 * {gfn(page) | page intersects with [hva_start, hva_end)} =
866063afacdSBen Gardon 			 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
867063afacdSBen Gardon 			 */
868063afacdSBen Gardon 			gfn_start = hva_to_gfn_memslot(hva_start, memslot);
869063afacdSBen Gardon 			gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
870063afacdSBen Gardon 
871063afacdSBen Gardon 			ret |= handler(kvm, memslot, root, gfn_start,
872063afacdSBen Gardon 				       gfn_end, data);
873063afacdSBen Gardon 		}
874063afacdSBen Gardon 	}
875063afacdSBen Gardon 
876063afacdSBen Gardon 	return ret;
877063afacdSBen Gardon }
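
/*
 * Handler sketch: the wrappers below (zap_gfn_range_hva_wrapper(),
 * age_gfn_range(), test_age_gfn() and set_tdp_spte()) all share this shape,
 * with the HVA range already translated to a GFN range within a single
 * memslot:
 *
 *	static int handler(struct kvm *kvm, struct kvm_memory_slot *slot,
 *			   struct kvm_mmu_page *root, gfn_t start, gfn_t end,
 *			   unsigned long data);
 *
 * The handlers' return values are OR'd together and passed back to the MMU
 * notifier caller.
 */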
878063afacdSBen Gardon 
879063afacdSBen Gardon static int zap_gfn_range_hva_wrapper(struct kvm *kvm,
880063afacdSBen Gardon 				     struct kvm_memory_slot *slot,
881063afacdSBen Gardon 				     struct kvm_mmu_page *root, gfn_t start,
882063afacdSBen Gardon 				     gfn_t end, unsigned long unused)
883063afacdSBen Gardon {
884063afacdSBen Gardon 	return zap_gfn_range(kvm, root, start, end, false);
885063afacdSBen Gardon }
886063afacdSBen Gardon 
887063afacdSBen Gardon int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
888063afacdSBen Gardon 			      unsigned long end)
889063afacdSBen Gardon {
890063afacdSBen Gardon 	return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
891063afacdSBen Gardon 					    zap_gfn_range_hva_wrapper);
892063afacdSBen Gardon }
893f8e14497SBen Gardon 
894f8e14497SBen Gardon /*
895f8e14497SBen Gardon  * Mark the SPTEs mapping GFNs in the range [start, end) as unaccessed and
896f8e14497SBen Gardon  * return non-zero if any of the GFNs in the range have been accessed.
897f8e14497SBen Gardon  */
898f8e14497SBen Gardon static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
899f8e14497SBen Gardon 			 struct kvm_mmu_page *root, gfn_t start, gfn_t end,
900f8e14497SBen Gardon 			 unsigned long unused)
901f8e14497SBen Gardon {
902f8e14497SBen Gardon 	struct tdp_iter iter;
903f8e14497SBen Gardon 	int young = 0;
904f8e14497SBen Gardon 	u64 new_spte = 0;
905f8e14497SBen Gardon 
9067cca2d0bSBen Gardon 	rcu_read_lock();
9077cca2d0bSBen Gardon 
908f8e14497SBen Gardon 	tdp_root_for_each_leaf_pte(iter, root, start, end) {
909f8e14497SBen Gardon 		/*
910f8e14497SBen Gardon 		 * If we have a non-accessed entry we don't need to change the
911f8e14497SBen Gardon 		 * pte.
912f8e14497SBen Gardon 		 */
913f8e14497SBen Gardon 		if (!is_accessed_spte(iter.old_spte))
914f8e14497SBen Gardon 			continue;
915f8e14497SBen Gardon 
916f8e14497SBen Gardon 		new_spte = iter.old_spte;
917f8e14497SBen Gardon 
918f8e14497SBen Gardon 		if (spte_ad_enabled(new_spte)) {
919f8e14497SBen Gardon 			clear_bit((ffs(shadow_accessed_mask) - 1),
920f8e14497SBen Gardon 				  (unsigned long *)&new_spte);
921f8e14497SBen Gardon 		} else {
922f8e14497SBen Gardon 			/*
923f8e14497SBen Gardon 			 * Capture the dirty status of the page, so that it doesn't get
924f8e14497SBen Gardon 			 * lost when the SPTE is marked for access tracking.
925f8e14497SBen Gardon 			 */
926f8e14497SBen Gardon 			if (is_writable_pte(new_spte))
927f8e14497SBen Gardon 				kvm_set_pfn_dirty(spte_to_pfn(new_spte));
928f8e14497SBen Gardon 
929f8e14497SBen Gardon 			new_spte = mark_spte_for_access_track(new_spte);
930f8e14497SBen Gardon 		}
931a6a0b05dSBen Gardon 		new_spte &= ~shadow_dirty_mask;
932f8e14497SBen Gardon 
933f8e14497SBen Gardon 		tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte);
934f8e14497SBen Gardon 		young = 1;
93533dd3574SBen Gardon 
93633dd3574SBen Gardon 		trace_kvm_age_page(iter.gfn, iter.level, slot, young);
937f8e14497SBen Gardon 	}
938f8e14497SBen Gardon 
9397cca2d0bSBen Gardon 	rcu_read_unlock();
9407cca2d0bSBen Gardon 
941f8e14497SBen Gardon 	return young;
942f8e14497SBen Gardon }
943f8e14497SBen Gardon 
944f8e14497SBen Gardon int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
945f8e14497SBen Gardon 			      unsigned long end)
946f8e14497SBen Gardon {
947f8e14497SBen Gardon 	return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
948f8e14497SBen Gardon 					    age_gfn_range);
949f8e14497SBen Gardon }
950f8e14497SBen Gardon 
951f8e14497SBen Gardon static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
952f8e14497SBen Gardon 			struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
953f8e14497SBen Gardon 			unsigned long unused2)
954f8e14497SBen Gardon {
955f8e14497SBen Gardon 	struct tdp_iter iter;
956f8e14497SBen Gardon 
957f8e14497SBen Gardon 	tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1)
958f8e14497SBen Gardon 		if (is_accessed_spte(iter.old_spte))
959f8e14497SBen Gardon 			return 1;
960f8e14497SBen Gardon 
961f8e14497SBen Gardon 	return 0;
962f8e14497SBen Gardon }
963f8e14497SBen Gardon 
964f8e14497SBen Gardon int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva)
965f8e14497SBen Gardon {
966f8e14497SBen Gardon 	return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0,
967f8e14497SBen Gardon 					    test_age_gfn);
968f8e14497SBen Gardon }
9691d8dd6b3SBen Gardon 
9701d8dd6b3SBen Gardon /*
9711d8dd6b3SBen Gardon  * Handle the changed_pte MMU notifier for the TDP MMU.
9721d8dd6b3SBen Gardon  * data is a pointer to the new pte_t mapping the HVA specified by the MMU
9731d8dd6b3SBen Gardon  * notifier.
9741d8dd6b3SBen Gardon  * Returns non-zero if a flush is needed before releasing the MMU lock.
9751d8dd6b3SBen Gardon  */
9761d8dd6b3SBen Gardon static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
9771d8dd6b3SBen Gardon 			struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
9781d8dd6b3SBen Gardon 			unsigned long data)
9791d8dd6b3SBen Gardon {
9801d8dd6b3SBen Gardon 	struct tdp_iter iter;
9811d8dd6b3SBen Gardon 	pte_t *ptep = (pte_t *)data;
9821d8dd6b3SBen Gardon 	kvm_pfn_t new_pfn;
9831d8dd6b3SBen Gardon 	u64 new_spte;
9841d8dd6b3SBen Gardon 	int need_flush = 0;
9851d8dd6b3SBen Gardon 
9867cca2d0bSBen Gardon 	rcu_read_lock();
9877cca2d0bSBen Gardon 
9881d8dd6b3SBen Gardon 	WARN_ON(pte_huge(*ptep));
9891d8dd6b3SBen Gardon 
9901d8dd6b3SBen Gardon 	new_pfn = pte_pfn(*ptep);
9911d8dd6b3SBen Gardon 
9921d8dd6b3SBen Gardon 	tdp_root_for_each_pte(iter, root, gfn, gfn + 1) {
9931d8dd6b3SBen Gardon 		if (iter.level != PG_LEVEL_4K)
9941d8dd6b3SBen Gardon 			continue;
9951d8dd6b3SBen Gardon 
9961d8dd6b3SBen Gardon 		if (!is_shadow_present_pte(iter.old_spte))
9971d8dd6b3SBen Gardon 			break;
9981d8dd6b3SBen Gardon 
9991d8dd6b3SBen Gardon 		tdp_mmu_set_spte(kvm, &iter, 0);
10001d8dd6b3SBen Gardon 
10011d8dd6b3SBen Gardon 		kvm_flush_remote_tlbs_with_address(kvm, iter.gfn, 1);
10021d8dd6b3SBen Gardon 
10031d8dd6b3SBen Gardon 		if (!pte_write(*ptep)) {
10041d8dd6b3SBen Gardon 			new_spte = kvm_mmu_changed_pte_notifier_make_spte(
10051d8dd6b3SBen Gardon 					iter.old_spte, new_pfn);
10061d8dd6b3SBen Gardon 
10071d8dd6b3SBen Gardon 			tdp_mmu_set_spte(kvm, &iter, new_spte);
10081d8dd6b3SBen Gardon 		}
10091d8dd6b3SBen Gardon 
10101d8dd6b3SBen Gardon 		need_flush = 1;
10111d8dd6b3SBen Gardon 	}
10121d8dd6b3SBen Gardon 
10131d8dd6b3SBen Gardon 	if (need_flush)
10141d8dd6b3SBen Gardon 		kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
10151d8dd6b3SBen Gardon 
10167cca2d0bSBen Gardon 	rcu_read_unlock();
10177cca2d0bSBen Gardon 
10181d8dd6b3SBen Gardon 	return 0;
10191d8dd6b3SBen Gardon }
10201d8dd6b3SBen Gardon 
10211d8dd6b3SBen Gardon int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
10221d8dd6b3SBen Gardon 			     pte_t *host_ptep)
10231d8dd6b3SBen Gardon {
10241d8dd6b3SBen Gardon 	return kvm_tdp_mmu_handle_hva_range(kvm, address, address + 1,
10251d8dd6b3SBen Gardon 					    (unsigned long)host_ptep,
10261d8dd6b3SBen Gardon 					    set_tdp_spte);
10271d8dd6b3SBen Gardon }
10281d8dd6b3SBen Gardon 
1029a6a0b05dSBen Gardon /*
1030a6a0b05dSBen Gardon  * Remove write access from all the SPTEs at or above min_level that map
1031a6a0b05dSBen Gardon  * GFNs [start, end); SPTEs below min_level are left untouched.
1032a6a0b05dSBen Gardon  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1033a6a0b05dSBen Gardon  */
1034a6a0b05dSBen Gardon static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1035a6a0b05dSBen Gardon 			     gfn_t start, gfn_t end, int min_level)
1036a6a0b05dSBen Gardon {
1037a6a0b05dSBen Gardon 	struct tdp_iter iter;
1038a6a0b05dSBen Gardon 	u64 new_spte;
1039a6a0b05dSBen Gardon 	bool spte_set = false;
1040a6a0b05dSBen Gardon 
10417cca2d0bSBen Gardon 	rcu_read_lock();
10427cca2d0bSBen Gardon 
1043a6a0b05dSBen Gardon 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1044a6a0b05dSBen Gardon 
1045a6a0b05dSBen Gardon 	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
1046a6a0b05dSBen Gardon 				   min_level, start, end) {
10471af4a960SBen Gardon 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
10481af4a960SBen Gardon 			continue;
10491af4a960SBen Gardon 
1050a6a0b05dSBen Gardon 		if (!is_shadow_present_pte(iter.old_spte) ||
10510f99ee2cSBen Gardon 		    !is_last_spte(iter.old_spte, iter.level) ||
10520f99ee2cSBen Gardon 		    !(iter.old_spte & PT_WRITABLE_MASK))
1053a6a0b05dSBen Gardon 			continue;
1054a6a0b05dSBen Gardon 
1055a6a0b05dSBen Gardon 		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1056a6a0b05dSBen Gardon 
1057a6a0b05dSBen Gardon 		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1058a6a0b05dSBen Gardon 		spte_set = true;
1059a6a0b05dSBen Gardon 	}
10607cca2d0bSBen Gardon 
10617cca2d0bSBen Gardon 	rcu_read_unlock();
1062a6a0b05dSBen Gardon 	return spte_set;
1063a6a0b05dSBen Gardon }
1064a6a0b05dSBen Gardon 
1065a6a0b05dSBen Gardon /*
1066a6a0b05dSBen Gardon  * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1067a6a0b05dSBen Gardon  * only affect leaf SPTEs down to min_level.
1068a6a0b05dSBen Gardon  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1069a6a0b05dSBen Gardon  */
1070a6a0b05dSBen Gardon bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
1071a6a0b05dSBen Gardon 			     int min_level)
1072a6a0b05dSBen Gardon {
1073a6a0b05dSBen Gardon 	struct kvm_mmu_page *root;
1074a6a0b05dSBen Gardon 	int root_as_id;
1075a6a0b05dSBen Gardon 	bool spte_set = false;
1076a6a0b05dSBen Gardon 
1077a889ea54SBen Gardon 	for_each_tdp_mmu_root_yield_safe(kvm, root) {
1078a6a0b05dSBen Gardon 		root_as_id = kvm_mmu_page_as_id(root);
1079a6a0b05dSBen Gardon 		if (root_as_id != slot->as_id)
1080a6a0b05dSBen Gardon 			continue;
1081a6a0b05dSBen Gardon 
1082a6a0b05dSBen Gardon 		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1083a6a0b05dSBen Gardon 			     slot->base_gfn + slot->npages, min_level);
1084a6a0b05dSBen Gardon 	}
1085a6a0b05dSBen Gardon 
1086a6a0b05dSBen Gardon 	return spte_set;
1087a6a0b05dSBen Gardon }
1088a6a0b05dSBen Gardon 
1089a6a0b05dSBen Gardon /*
1090a6a0b05dSBen Gardon  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1091a6a0b05dSBen Gardon  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1092a6a0b05dSBen Gardon  * If AD bits are not enabled, this will require clearing the writable bit on
1093a6a0b05dSBen Gardon  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1094a6a0b05dSBen Gardon  * be flushed.
1095a6a0b05dSBen Gardon  */
1096a6a0b05dSBen Gardon static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1097a6a0b05dSBen Gardon 			   gfn_t start, gfn_t end)
1098a6a0b05dSBen Gardon {
1099a6a0b05dSBen Gardon 	struct tdp_iter iter;
1100a6a0b05dSBen Gardon 	u64 new_spte;
1101a6a0b05dSBen Gardon 	bool spte_set = false;
1102a6a0b05dSBen Gardon 
11037cca2d0bSBen Gardon 	rcu_read_lock();
11047cca2d0bSBen Gardon 
1105a6a0b05dSBen Gardon 	tdp_root_for_each_leaf_pte(iter, root, start, end) {
11061af4a960SBen Gardon 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
11071af4a960SBen Gardon 			continue;
11081af4a960SBen Gardon 
1109a6a0b05dSBen Gardon 		if (spte_ad_need_write_protect(iter.old_spte)) {
1110a6a0b05dSBen Gardon 			if (is_writable_pte(iter.old_spte))
1111a6a0b05dSBen Gardon 				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1112a6a0b05dSBen Gardon 			else
1113a6a0b05dSBen Gardon 				continue;
1114a6a0b05dSBen Gardon 		} else {
1115a6a0b05dSBen Gardon 			if (iter.old_spte & shadow_dirty_mask)
1116a6a0b05dSBen Gardon 				new_spte = iter.old_spte & ~shadow_dirty_mask;
1117a6a0b05dSBen Gardon 			else
1118a6a0b05dSBen Gardon 				continue;
1119a6a0b05dSBen Gardon 		}
1120a6a0b05dSBen Gardon 
1121a6a0b05dSBen Gardon 		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1122a6a0b05dSBen Gardon 		spte_set = true;
1123a6a0b05dSBen Gardon 	}
11247cca2d0bSBen Gardon 
11257cca2d0bSBen Gardon 	rcu_read_unlock();
1126a6a0b05dSBen Gardon 	return spte_set;
1127a6a0b05dSBen Gardon }
1128a6a0b05dSBen Gardon 
1129a6a0b05dSBen Gardon /*
1130a6a0b05dSBen Gardon  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1131a6a0b05dSBen Gardon  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1132a6a0b05dSBen Gardon  * If AD bits are not enabled, this will require clearing the writable bit on
1133a6a0b05dSBen Gardon  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1134a6a0b05dSBen Gardon  * be flushed.
1135a6a0b05dSBen Gardon  */
1136a6a0b05dSBen Gardon bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
1137a6a0b05dSBen Gardon {
1138a6a0b05dSBen Gardon 	struct kvm_mmu_page *root;
1139a6a0b05dSBen Gardon 	int root_as_id;
1140a6a0b05dSBen Gardon 	bool spte_set = false;
1141a6a0b05dSBen Gardon 
1142a889ea54SBen Gardon 	for_each_tdp_mmu_root_yield_safe(kvm, root) {
1143a6a0b05dSBen Gardon 		root_as_id = kvm_mmu_page_as_id(root);
1144a6a0b05dSBen Gardon 		if (root_as_id != slot->as_id)
1145a6a0b05dSBen Gardon 			continue;
1146a6a0b05dSBen Gardon 
1147a6a0b05dSBen Gardon 		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1148a6a0b05dSBen Gardon 				slot->base_gfn + slot->npages);
1149a6a0b05dSBen Gardon 	}
1150a6a0b05dSBen Gardon 
1151a6a0b05dSBen Gardon 	return spte_set;
1152a6a0b05dSBen Gardon }
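
/*
 * Illustrative sketch only: a hypothetical caller that clears the dirty
 * status of a whole slot when dirty logging is (re)enabled. When a SPTE can
 * use its dirty bit for logging, only that bit is cleared, so the mapping
 * stays writable and the guest takes no write fault; hardware simply sets
 * the dirty bit again on the next write. The function name is an assumption
 * for illustration.
 */
static void __maybe_unused example_clear_slot_dirty(struct kvm *kvm,
						    struct kvm_memory_slot *slot)
{
	bool flush;

	write_lock(&kvm->mmu_lock);
	flush = kvm_tdp_mmu_clear_dirty_slot(kvm, slot);
	write_unlock(&kvm->mmu_lock);

	if (flush)
		kvm_flush_remote_tlbs(kvm);
}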
1153a6a0b05dSBen Gardon 
1154a6a0b05dSBen Gardon /*
1155a6a0b05dSBen Gardon  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1156a6a0b05dSBen Gardon  * set in mask, starting at gfn. The given memslot is expected to contain all
1157a6a0b05dSBen Gardon  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1158a6a0b05dSBen Gardon  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1159a6a0b05dSBen Gardon  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1160a6a0b05dSBen Gardon  */
1161a6a0b05dSBen Gardon static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1162a6a0b05dSBen Gardon 				  gfn_t gfn, unsigned long mask, bool wrprot)
1163a6a0b05dSBen Gardon {
1164a6a0b05dSBen Gardon 	struct tdp_iter iter;
1165a6a0b05dSBen Gardon 	u64 new_spte;
1166a6a0b05dSBen Gardon 
11677cca2d0bSBen Gardon 	rcu_read_lock();
11687cca2d0bSBen Gardon 
1169a6a0b05dSBen Gardon 	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1170a6a0b05dSBen Gardon 				    gfn + BITS_PER_LONG) {
1171a6a0b05dSBen Gardon 		if (!mask)
1172a6a0b05dSBen Gardon 			break;
1173a6a0b05dSBen Gardon 
1174a6a0b05dSBen Gardon 		if (iter.level > PG_LEVEL_4K ||
1175a6a0b05dSBen Gardon 		    !(mask & (1UL << (iter.gfn - gfn))))
1176a6a0b05dSBen Gardon 			continue;
1177a6a0b05dSBen Gardon 
1178f1b3b06aSBen Gardon 		mask &= ~(1UL << (iter.gfn - gfn));
1179f1b3b06aSBen Gardon 
1180a6a0b05dSBen Gardon 		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1181a6a0b05dSBen Gardon 			if (is_writable_pte(iter.old_spte))
1182a6a0b05dSBen Gardon 				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1183a6a0b05dSBen Gardon 			else
1184a6a0b05dSBen Gardon 				continue;
1185a6a0b05dSBen Gardon 		} else {
1186a6a0b05dSBen Gardon 			if (iter.old_spte & shadow_dirty_mask)
1187a6a0b05dSBen Gardon 				new_spte = iter.old_spte & ~shadow_dirty_mask;
1188a6a0b05dSBen Gardon 			else
1189a6a0b05dSBen Gardon 				continue;
1190a6a0b05dSBen Gardon 		}
1191a6a0b05dSBen Gardon 
1192a6a0b05dSBen Gardon 		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1193a6a0b05dSBen Gardon 	}
11947cca2d0bSBen Gardon 
11957cca2d0bSBen Gardon 	rcu_read_unlock();
1196a6a0b05dSBen Gardon }
1197a6a0b05dSBen Gardon 
1198a6a0b05dSBen Gardon /*
1199a6a0b05dSBen Gardon  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1200a6a0b05dSBen Gardon  * set in mask, starting at gfn. The given memslot is expected to contain all
1201a6a0b05dSBen Gardon  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1202a6a0b05dSBen Gardon  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1203a6a0b05dSBen Gardon  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1204a6a0b05dSBen Gardon  */
1205a6a0b05dSBen Gardon void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1206a6a0b05dSBen Gardon 				       struct kvm_memory_slot *slot,
1207a6a0b05dSBen Gardon 				       gfn_t gfn, unsigned long mask,
1208a6a0b05dSBen Gardon 				       bool wrprot)
1209a6a0b05dSBen Gardon {
1210a6a0b05dSBen Gardon 	struct kvm_mmu_page *root;
1211a6a0b05dSBen Gardon 	int root_as_id;
1212a6a0b05dSBen Gardon 
1213531810caSBen Gardon 	lockdep_assert_held_write(&kvm->mmu_lock);
1214a6a0b05dSBen Gardon 	for_each_tdp_mmu_root(kvm, root) {
1215a6a0b05dSBen Gardon 		root_as_id = kvm_mmu_page_as_id(root);
1216a6a0b05dSBen Gardon 		if (root_as_id != slot->as_id)
1217a6a0b05dSBen Gardon 			continue;
1218a6a0b05dSBen Gardon 
1219a6a0b05dSBen Gardon 		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1220a6a0b05dSBen Gardon 	}
1221a6a0b05dSBen Gardon }
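
/*
 * Illustrative sketch only: a hypothetical caller that rearms dirty tracking
 * for one word of a memslot's dirty bitmap. Bit i of @mask corresponds to
 * gfn_offset + i, so a single call covers at most BITS_PER_LONG GFNs. How the
 * mask is harvested from the bitmap, and the function name, are assumptions
 * for illustration; any required TLB flush is assumed to happen later in the
 * caller's dirty-log path.
 */
static void __maybe_unused example_rearm_dirty_word(struct kvm *kvm,
						    struct kvm_memory_slot *slot,
						    gfn_t gfn_offset,
						    unsigned long mask)
{
	write_lock(&kvm->mmu_lock);
	/*
	 * wrprot=false clears only the dirty state; passing true would
	 * instead strip write access so that the next write faults.
	 */
	kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
					  slot->base_gfn + gfn_offset,
					  mask, false);
	write_unlock(&kvm->mmu_lock);
}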
1222a6a0b05dSBen Gardon 
1223a6a0b05dSBen Gardon /*
1224a6a0b05dSBen Gardon  * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is
1225a6a0b05dSBen Gardon  * only used for PML, and so will involve setting the dirty bit on each SPTE.
1226a6a0b05dSBen Gardon  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1227a6a0b05dSBen Gardon  */
1228a6a0b05dSBen Gardon static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1229a6a0b05dSBen Gardon 				gfn_t start, gfn_t end)
1230a6a0b05dSBen Gardon {
1231a6a0b05dSBen Gardon 	struct tdp_iter iter;
1232a6a0b05dSBen Gardon 	u64 new_spte;
1233a6a0b05dSBen Gardon 	bool spte_set = false;
1234a6a0b05dSBen Gardon 
12357cca2d0bSBen Gardon 	rcu_read_lock();
12367cca2d0bSBen Gardon 
1237a6a0b05dSBen Gardon 	tdp_root_for_each_pte(iter, root, start, end) {
12381af4a960SBen Gardon 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
12391af4a960SBen Gardon 			continue;
12401af4a960SBen Gardon 
12410f99ee2cSBen Gardon 		if (!is_shadow_present_pte(iter.old_spte) ||
12420f99ee2cSBen Gardon 		    iter.old_spte & shadow_dirty_mask)
1243a6a0b05dSBen Gardon 			continue;
1244a6a0b05dSBen Gardon 
1245a6a0b05dSBen Gardon 		new_spte = iter.old_spte | shadow_dirty_mask;
1246a6a0b05dSBen Gardon 
1247a6a0b05dSBen Gardon 		tdp_mmu_set_spte(kvm, &iter, new_spte);
1248a6a0b05dSBen Gardon 		spte_set = true;
1249a6a0b05dSBen Gardon 	}
1250a6a0b05dSBen Gardon 
12517cca2d0bSBen Gardon 	rcu_read_unlock();
1252a6a0b05dSBen Gardon 	return spte_set;
1253a6a0b05dSBen Gardon }
1254a6a0b05dSBen Gardon 
1255a6a0b05dSBen Gardon /*
1256a6a0b05dSBen Gardon  * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is
1257a6a0b05dSBen Gardon  * only used for PML, and so will involve setting the dirty bit on each SPTE.
1258a6a0b05dSBen Gardon  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1259a6a0b05dSBen Gardon  */
1260a6a0b05dSBen Gardon bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot)
1261a6a0b05dSBen Gardon {
1262a6a0b05dSBen Gardon 	struct kvm_mmu_page *root;
1263a6a0b05dSBen Gardon 	int root_as_id;
1264a6a0b05dSBen Gardon 	bool spte_set = false;
1265a6a0b05dSBen Gardon 
1266a889ea54SBen Gardon 	for_each_tdp_mmu_root_yield_safe(kvm, root) {
1267a6a0b05dSBen Gardon 		root_as_id = kvm_mmu_page_as_id(root);
1268a6a0b05dSBen Gardon 		if (root_as_id != slot->as_id)
1269a6a0b05dSBen Gardon 			continue;
1270a6a0b05dSBen Gardon 
1271a6a0b05dSBen Gardon 		spte_set |= set_dirty_gfn_range(kvm, root, slot->base_gfn,
1272a6a0b05dSBen Gardon 				slot->base_gfn + slot->npages);
1273a6a0b05dSBen Gardon 	}
1274a6a0b05dSBen Gardon 	return spte_set;
1275a6a0b05dSBen Gardon }
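
/*
 * Illustrative sketch only: a hypothetical caller for the PML-only helper
 * above. Setting the dirty bit on every SPTE in the slot means subsequent
 * guest writes to those pages are not logged again; whether and when an arch
 * path actually does this is an assumption for illustration, as is the
 * function name.
 */
static void __maybe_unused example_set_slot_dirty(struct kvm *kvm,
						  struct kvm_memory_slot *slot)
{
	write_lock(&kvm->mmu_lock);
	if (kvm_tdp_mmu_slot_set_dirty(kvm, slot))
		kvm_flush_remote_tlbs(kvm);
	write_unlock(&kvm->mmu_lock);
}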
1276a6a0b05dSBen Gardon 
127714881998SBen Gardon /*
127887aa9ec9SBen Gardon  * Clear leaf entries which could be replaced by large mappings, for
127987aa9ec9SBen Gardon  * GFNs within the slot.
128014881998SBen Gardon  */
128114881998SBen Gardon static void zap_collapsible_spte_range(struct kvm *kvm,
128214881998SBen Gardon 				       struct kvm_mmu_page *root,
128314881998SBen Gardon 				       gfn_t start, gfn_t end)
128414881998SBen Gardon {
128514881998SBen Gardon 	struct tdp_iter iter;
128614881998SBen Gardon 	kvm_pfn_t pfn;
128714881998SBen Gardon 	bool spte_set = false;
128814881998SBen Gardon 
12897cca2d0bSBen Gardon 	rcu_read_lock();
12907cca2d0bSBen Gardon 
129114881998SBen Gardon 	tdp_root_for_each_pte(iter, root, start, end) {
12921af4a960SBen Gardon 		if (tdp_mmu_iter_cond_resched(kvm, &iter, spte_set)) {
12931af4a960SBen Gardon 			spte_set = false;
12941af4a960SBen Gardon 			continue;
12951af4a960SBen Gardon 		}
12961af4a960SBen Gardon 
129714881998SBen Gardon 		if (!is_shadow_present_pte(iter.old_spte) ||
129887aa9ec9SBen Gardon 		    !is_last_spte(iter.old_spte, iter.level))
129914881998SBen Gardon 			continue;
130014881998SBen Gardon 
130114881998SBen Gardon 		pfn = spte_to_pfn(iter.old_spte);
130214881998SBen Gardon 		if (kvm_is_reserved_pfn(pfn) ||
130314881998SBen Gardon 		    !PageTransCompoundMap(pfn_to_page(pfn)))
130414881998SBen Gardon 			continue;
130514881998SBen Gardon 
130614881998SBen Gardon 		tdp_mmu_set_spte(kvm, &iter, 0);
130714881998SBen Gardon 
13081af4a960SBen Gardon 		spte_set = true;
130914881998SBen Gardon 	}
131014881998SBen Gardon 
13117cca2d0bSBen Gardon 	rcu_read_unlock();
131214881998SBen Gardon 	if (spte_set)
131314881998SBen Gardon 		kvm_flush_remote_tlbs(kvm);
131414881998SBen Gardon }
131514881998SBen Gardon 
131614881998SBen Gardon /*
131714881998SBen Gardon  * Clear leaf entries which could be replaced by large mappings, for GFNs
131814881998SBen Gardon  * within the slot.
131914881998SBen Gardon  */
132014881998SBen Gardon void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
132114881998SBen Gardon 				       const struct kvm_memory_slot *slot)
132214881998SBen Gardon {
132314881998SBen Gardon 	struct kvm_mmu_page *root;
132414881998SBen Gardon 	int root_as_id;
132514881998SBen Gardon 
1326a889ea54SBen Gardon 	for_each_tdp_mmu_root_yield_safe(kvm, root) {
132714881998SBen Gardon 		root_as_id = kvm_mmu_page_as_id(root);
132814881998SBen Gardon 		if (root_as_id != slot->as_id)
132914881998SBen Gardon 			continue;
133014881998SBen Gardon 
133114881998SBen Gardon 		zap_collapsible_spte_range(kvm, root, slot->base_gfn,
133214881998SBen Gardon 					   slot->base_gfn + slot->npages);
133314881998SBen Gardon 	}
133414881998SBen Gardon }
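
/*
 * Illustrative sketch only: a hypothetical caller that lets a slot's
 * mappings collapse back into huge pages after dirty logging is disabled.
 * Zapping the small leaf SPTEs forces the next fault to re-map the GFNs, at
 * which point a large mapping can be created; zap_collapsible_spte_range()
 * already flushes remote TLBs, so no extra flush is needed here. The
 * function name is an assumption for illustration.
 */
static void __maybe_unused example_collapse_slot(struct kvm *kvm,
						 const struct kvm_memory_slot *slot)
{
	write_lock(&kvm->mmu_lock);
	kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
	write_unlock(&kvm->mmu_lock);
}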
133546044f72SBen Gardon 
133646044f72SBen Gardon /*
133746044f72SBen Gardon  * Removes write access on the last level SPTE mapping this GFN and unsets the
133846044f72SBen Gardon  * SPTE_MMU_WRITEABLE bit to ensure future writes continue to be intercepted.
133946044f72SBen Gardon  * Returns true if an SPTE was set and a TLB flush is needed.
134046044f72SBen Gardon  */
134146044f72SBen Gardon static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
134246044f72SBen Gardon 			      gfn_t gfn)
134346044f72SBen Gardon {
134446044f72SBen Gardon 	struct tdp_iter iter;
134546044f72SBen Gardon 	u64 new_spte;
134646044f72SBen Gardon 	bool spte_set = false;
134746044f72SBen Gardon 
13487cca2d0bSBen Gardon 	rcu_read_lock();
13497cca2d0bSBen Gardon 
135046044f72SBen Gardon 	tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) {
135146044f72SBen Gardon 		if (!is_writable_pte(iter.old_spte))
135246044f72SBen Gardon 			break;
135346044f72SBen Gardon 
135446044f72SBen Gardon 		new_spte = iter.old_spte &
135546044f72SBen Gardon 			~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
135646044f72SBen Gardon 
135746044f72SBen Gardon 		tdp_mmu_set_spte(kvm, &iter, new_spte);
135846044f72SBen Gardon 		spte_set = true;
135946044f72SBen Gardon 	}
136046044f72SBen Gardon 
13617cca2d0bSBen Gardon 	rcu_read_unlock();
13627cca2d0bSBen Gardon 
136346044f72SBen Gardon 	return spte_set;
136446044f72SBen Gardon }
136546044f72SBen Gardon 
136646044f72SBen Gardon /*
136746044f72SBen Gardon  * Removes write access on the last level SPTE mapping this GFN and unsets the
136846044f72SBen Gardon  * SPTE_MMU_WRITEABLE bit to ensure future writes continue to be intercepted.
136946044f72SBen Gardon  * Returns true if an SPTE was set and a TLB flush is needed.
137046044f72SBen Gardon  */
137146044f72SBen Gardon bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
137246044f72SBen Gardon 				   struct kvm_memory_slot *slot, gfn_t gfn)
137346044f72SBen Gardon {
137446044f72SBen Gardon 	struct kvm_mmu_page *root;
137546044f72SBen Gardon 	int root_as_id;
137646044f72SBen Gardon 	bool spte_set = false;
137746044f72SBen Gardon 
1378531810caSBen Gardon 	lockdep_assert_held_write(&kvm->mmu_lock);
137946044f72SBen Gardon 	for_each_tdp_mmu_root(kvm, root) {
138046044f72SBen Gardon 		root_as_id = kvm_mmu_page_as_id(root);
138146044f72SBen Gardon 		if (root_as_id != slot->as_id)
138246044f72SBen Gardon 			continue;
138346044f72SBen Gardon 
138446044f72SBen Gardon 		spte_set |= write_protect_gfn(kvm, root, gfn);
138546044f72SBen Gardon 	}
138646044f72SBen Gardon 	return spte_set;
138746044f72SBen Gardon }
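
/*
 * Illustrative sketch only: a hypothetical caller that write-protects a
 * single GFN, e.g. so that guest writes to that page trap and can be
 * intercepted. The function name and the use case wording are assumptions
 * for illustration.
 */
static void __maybe_unused example_write_protect_gfn(struct kvm *kvm,
						     struct kvm_memory_slot *slot,
						     gfn_t gfn)
{
	write_lock(&kvm->mmu_lock);
	if (kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn))
		kvm_flush_remote_tlbs(kvm);
	write_unlock(&kvm->mmu_lock);
}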
138846044f72SBen Gardon 
138995fb5b02SBen Gardon /*
139095fb5b02SBen Gardon  * Return the level of the lowest level SPTE added to sptes.
139195fb5b02SBen Gardon  * That SPTE may be non-present.
139295fb5b02SBen Gardon  */
139339b4d43eSSean Christopherson int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
139439b4d43eSSean Christopherson 			 int *root_level)
139595fb5b02SBen Gardon {
139695fb5b02SBen Gardon 	struct tdp_iter iter;
139795fb5b02SBen Gardon 	struct kvm_mmu *mmu = vcpu->arch.mmu;
139895fb5b02SBen Gardon 	gfn_t gfn = addr >> PAGE_SHIFT;
13992aa07893SSean Christopherson 	int leaf = -1;
140095fb5b02SBen Gardon 
140139b4d43eSSean Christopherson 	*root_level = vcpu->arch.mmu->shadow_root_level;
140295fb5b02SBen Gardon 
14037cca2d0bSBen Gardon 	rcu_read_lock();
14047cca2d0bSBen Gardon 
140595fb5b02SBen Gardon 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
140695fb5b02SBen Gardon 		leaf = iter.level;
1407dde81f94SSean Christopherson 		sptes[leaf] = iter.old_spte;
140895fb5b02SBen Gardon 	}
140995fb5b02SBen Gardon 
14107cca2d0bSBen Gardon 	rcu_read_unlock();
14117cca2d0bSBen Gardon 
141295fb5b02SBen Gardon 	return leaf;
141395fb5b02SBen Gardon }
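
/*
 * Illustrative sketch only: a hypothetical user of kvm_tdp_mmu_get_walk()
 * that dumps the SPTEs visited while walking @addr. sptes[] is indexed by
 * level (sptes[1] holds a 4K leaf, sptes[root_level] the root entry), and a
 * negative return means no SPTEs were found. The function name and the
 * PT64_ROOT_MAX_LEVEL sizing are assumptions for illustration.
 */
static void __maybe_unused example_dump_walk(struct kvm_vcpu *vcpu, u64 addr)
{
	u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
	int root_level, leaf, level;

	leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root_level);
	if (leaf < 0)
		return;

	for (level = root_level; level >= leaf; level--)
		pr_debug("level %d: spte = 0x%llx\n", level, sptes[level]);
}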
1414