xref: /openbmc/linux/arch/x86/kvm/mmu/tdp_mmu.c (revision cdf811a9)
// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>

/* Initializes the TDP MMU for the VM, if enabled. */
void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
}

/* Arbitrarily returns true so that this may be used in if statements. */
static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
							     bool shared)
{
	if (shared)
		lockdep_assert_held_read(&kvm->mmu_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	return true;
}

void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
	/*
	 * Invalidate all roots, which besides the obvious, schedules all roots
	 * for zapping and thus puts the TDP MMU's reference to each root, i.e.
	 * ultimately frees all roots.
	 */
	kvm_tdp_mmu_invalidate_all_roots(kvm);
	kvm_tdp_mmu_zap_invalidated_roots(kvm);

	WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

	/*
	 * Ensure that all the outstanding RCU callbacks to free shadow pages
	 * can run before the VM is torn down.  Putting the last reference to
	 * zapped roots will create new callbacks.
	 */
	rcu_barrier();
}

static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
	free_page((unsigned long)sp->spt);
	kmem_cache_free(mmu_page_header_cache, sp);
}

/*
 * This is called through call_rcu in order to free TDP page table memory
 * safely with respect to other kernel threads that may be operating on
 * the memory.
 * Because TDP MMU page table memory is only accessed in an RCU read
 * critical section, and is only freed after a grace period has elapsed,
 * lockless walkers can never use the memory after it has been freed.
 */
static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
{
	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
					       rcu_head);

	tdp_mmu_free_sp(sp);
}

void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
			  bool shared)
{
	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
		return;

	/*
	 * The TDP MMU itself holds a reference to each root until the root is
	 * explicitly invalidated, i.e. the final reference should never be
	 * put for a valid root.
	 */
	KVM_BUG_ON(!is_tdp_mmu_page(root) || !root->role.invalid, kvm);

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_del_rcu(&root->link);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/*
 * Returns the next root after @prev_root (or the first root if @prev_root is
 * NULL).  A reference to the returned root is acquired, and the reference to
 * @prev_root is released (the caller obviously must hold a reference to
 * @prev_root if it's non-NULL).
 *
 * If @only_valid is true, invalid roots are skipped.
 *
 * Returns NULL if the end of tdp_mmu_roots was reached.
 */
static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
					      struct kvm_mmu_page *prev_root,
					      bool shared, bool only_valid)
{
	struct kvm_mmu_page *next_root;

	rcu_read_lock();

	if (prev_root)
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						  &prev_root->link,
						  typeof(*prev_root), link);
	else
		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						   typeof(*next_root), link);

	while (next_root) {
		if ((!only_valid || !next_root->role.invalid) &&
		    kvm_tdp_mmu_get_root(next_root))
			break;

		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
				&next_root->link, typeof(*next_root), link);
	}

	rcu_read_unlock();

	if (prev_root)
		kvm_tdp_mmu_put_root(kvm, prev_root, shared);

	return next_root;
}

/*
 * Note: this iterator gets and puts references to the roots it iterates over.
 * This makes it safe to release the MMU lock and yield within the loop, but
 * if exiting the loop early, the caller must drop the reference to the most
 * recent root. (Unless keeping a live reference is desirable.)
 *
 * If shared is set, this function is operating under the MMU lock in read
 * mode. In the unlikely event that this thread must free a root, the lock
 * will be temporarily dropped and reacquired in write mode.
 */
#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid);	\
	     _root;								\
	     _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid))	\
		if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) &&		\
		    kvm_mmu_page_as_id(_root) != _as_id) {			\
		} else

#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)

#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _shared)			\
	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, false);		\
	     _root;								\
	     _root = tdp_mmu_next_root(_kvm, _root, _shared, false))		\
		if (!kvm_lockdep_assert_mmu_lock_held(_kvm, _shared)) {		\
		} else
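
/*
 * Example usage (a minimal sketch; zap_root_somehow() is a hypothetical
 * helper, not part of this file):
 *
 *	struct kvm_mmu_page *root;
 *
 *	for_each_tdp_mmu_root_yield_safe(kvm, root, shared)
 *		zap_root_somehow(kvm, root, shared);
 *
 * The iterator takes a reference on each root and drops it when advancing,
 * so a caller that breaks out of the loop early must put the last root
 * itself.
 */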

/*
 * Iterate over all TDP MMU roots.  Requires that mmu_lock be held for write,
 * the implication being that any flow that holds mmu_lock for read is
 * inherently yield-friendly and should use the yield-safe variant above.
 * Holding mmu_lock for write obviates the need for RCU protection as the list
 * is guaranteed to be stable.
 */
#define for_each_tdp_mmu_root(_kvm, _root, _as_id)			\
	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)	\
		if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&	\
		    kvm_mmu_page_as_id(_root) != _as_id) {		\
		} else

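/* Allocate a new shadow page, including its page table, from the vCPU caches. */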
static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_page *sp;

	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);

	return sp;
}

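/* Initialize a newly allocated shadow page to map @gfn at @sptep with @role. */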
static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
			    gfn_t gfn, union kvm_mmu_page_role role)
{
	INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);

	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

	sp->role = role;
	sp->gfn = gfn;
	sp->ptep = sptep;
	sp->tdp_mmu_page = true;

	trace_kvm_mmu_get_page(sp, true);
}

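/*
 * Initialize a child shadow page that will be installed at @iter->sptep, one
 * level below the parent page table that contains @iter->sptep.
 */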
static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
				  struct tdp_iter *iter)
{
	struct kvm_mmu_page *parent_sp;
	union kvm_mmu_page_role role;

	parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));

	role = parent_sp->role;
	role.level--;

	tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
}

hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
	union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * Check for an existing root before allocating a new one.  Note, the
	 * role check prevents consuming an invalid root.
	 */
	for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
		if (root->role.word == role.word &&
		    kvm_tdp_mmu_get_root(root))
			goto out;
	}

	root = tdp_mmu_alloc_sp(vcpu);
	tdp_mmu_init_sp(root, NULL, 0, role);

	/*
	 * TDP MMU roots are kept until they are explicitly invalidated, either
	 * by a memslot update or by the destruction of the VM.  Initialize the
	 * refcount to two; one reference for the vCPU, and one reference for
	 * the TDP MMU itself, which is held until the root is invalidated and
	 * is ultimately put by kvm_tdp_mmu_zap_invalidated_roots().
	 */
	refcount_set(&root->tdp_mmu_root_count, 2);

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

out:
	return __pa(root->spt);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared);

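/*
 * Account and unaccount a TDP MMU page table page, both in the per-VM pgtable
 * stats and in the VM's count of TDP MMU pages (kvm->arch.tdp_mmu_pages).
 */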
static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	kvm_account_pgtable_pages((void *)sp->spt, +1);
	atomic64_inc(&kvm->arch.tdp_mmu_pages);
}

static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	kvm_account_pgtable_pages((void *)sp->spt, -1);
	atomic64_dec(&kvm->arch.tdp_mmu_pages);
}

/**
 * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
 *
 * @kvm: kvm instance
 * @sp: the page to be removed
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be adding or removing pages.
 */
static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
			      bool shared)
{
	tdp_unaccount_mmu_page(kvm, sp);

	if (!sp->nx_huge_page_disallowed)
		return;

	if (shared)
		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	sp->nx_huge_page_disallowed = false;
	untrack_possible_nx_huge_page(kvm, sp);

	if (shared)
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

/**
 * handle_removed_pt() - handle a page table removed from the TDP structure
 *
 * @kvm: kvm instance
 * @pt: the page removed from the paging structure
 * @shared: This operation may not be running under the exclusive use
 *	    of the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Given a page table that has been removed from the TDP paging structure,
 * iterates through the page table to clear SPTEs and free child page tables.
 *
 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
 * protection. Since this thread removed it from the paging structure,
 * this thread will be responsible for ensuring the page is freed. Hence the
 * early rcu_dereferences in the function.
 */
static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
	int level = sp->role.level;
	gfn_t base_gfn = sp->gfn;
	int i;

	trace_kvm_mmu_prepare_zap_page(sp);

	tdp_mmu_unlink_sp(kvm, sp, shared);

	for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
		tdp_ptep_t sptep = pt + i;
		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
		u64 old_spte;

		if (shared) {
			/*
			 * Set the SPTE to a nonpresent value that other
			 * threads will not overwrite. If the SPTE was
			 * already marked as removed then another thread
			 * handling a page fault could overwrite it, so
			 * retry setting the SPTE until it transitions from
			 * some other value to the removed SPTE value.
			 */
			for (;;) {
				old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
				if (!is_removed_spte(old_spte))
					break;
				cpu_relax();
			}
		} else {
			/*
			 * If the SPTE is not MMU-present, there is no backing
			 * page associated with the SPTE and so no side effects
			 * that need to be recorded, and exclusive ownership of
			 * mmu_lock ensures the SPTE can't be made present.
			 * Note, zapping MMIO SPTEs is also unnecessary as they
			 * are guarded by the memslots generation, not by being
			 * unreachable.
			 */
			old_spte = kvm_tdp_mmu_read_spte(sptep);
			if (!is_shadow_present_pte(old_spte))
				continue;

			/*
			 * Use the common helper instead of a raw WRITE_ONCE as
			 * the SPTE needs to be updated atomically if it can be
			 * modified by a different vCPU outside of mmu_lock.
			 * Even though the parent SPTE is !PRESENT, the TLB
			 * hasn't yet been flushed, and both Intel and AMD
			 * document that A/D assists can use upper-level PxE
			 * entries that are cached in the TLB, i.e. the CPU can
			 * still access the page and mark it dirty.
			 *
			 * No retry is needed in the atomic update path as the
			 * sole concern is dropping a Dirty bit, i.e. no other
			 * task can zap/remove the SPTE as mmu_lock is held for
			 * write.  Marking the SPTE as a removed SPTE is not
			 * strictly necessary for the same reason, but using
			 * the removed SPTE value keeps the shared/exclusive
			 * paths consistent and allows the handle_changed_spte()
			 * call below to hardcode the new value to REMOVED_SPTE.
			 *
			 * Note, even though dropping a Dirty bit is the only
			 * scenario where a non-atomic update could result in a
			 * functional bug, simply checking the Dirty bit isn't
			 * sufficient as a fast page fault could read the upper
			 * level SPTE before it is zapped, and then make this
			 * target SPTE writable, resume the guest, and set the
			 * Dirty bit between reading the SPTE above and writing
			 * it here.
			 */
			old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
							  REMOVED_SPTE, level);
		}
		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
				    old_spte, REMOVED_SPTE, level, shared);
	}

	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/**
 * handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Handle bookkeeping that might result from the modification of a SPTE.  Note,
 * dirty logging updates are handled in common code, not here (see make_spte()
 * and fast_pf_fix_direct_spte()).
 */
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);
	bool was_leaf = was_present && is_last_spte(old_spte, level);
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	WARN_ON_ONCE(level > PT64_ROOT_MAX_LEVEL);
	WARN_ON_ONCE(level < PG_LEVEL_4K);
	WARN_ON_ONCE(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));

	/*
	 * If this warning were to trigger it would indicate that there was a
	 * missing MMU notifier or a race with some notifier handler.
	 * A present, leaf SPTE should never be directly replaced with another
	 * present leaf SPTE pointing to a different PFN. A notifier handler
	 * should be zapping the SPTE before the main MM's page table is
	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
	 * thread before replacement.
	 */
	if (was_leaf && is_leaf && pfn_changed) {
		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
		       "SPTE with another present leaf SPTE mapping a\n"
		       "different PFN!\n"
		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
		       as_id, gfn, old_spte, new_spte, level);

		/*
		 * Crash the host to prevent error propagation and guest data
		 * corruption.
		 */
		BUG();
	}

	if (old_spte == new_spte)
		return;

	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);

	if (is_leaf)
		check_spte_writable_invariants(new_spte);

	/*
	 * The only times a SPTE should be changed from a non-present to
	 * non-present state is when an MMIO entry is installed/modified/
	 * removed. In that case, there is nothing to do here.
	 */
	if (!was_present && !is_present) {
		/*
		 * If this change does not involve a MMIO SPTE or removed SPTE,
		 * it is unexpected. Log the change, though it should not
		 * impact the guest since both the former and current SPTEs
		 * are nonpresent.
		 */
		if (WARN_ON_ONCE(!is_mmio_spte(old_spte) &&
				 !is_mmio_spte(new_spte) &&
				 !is_removed_spte(new_spte)))
			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
			       "should not be replaced with another,\n"
			       "different nonpresent SPTE, unless one or both\n"
			       "are MMIO SPTEs, or the new SPTE is\n"
			       "a temporary removed SPTE.\n"
			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
			       as_id, gfn, old_spte, new_spte, level);
		return;
	}

	if (is_leaf != was_leaf)
		kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);

	if (was_leaf && is_dirty_spte(old_spte) &&
	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));

	/*
	 * Recursively handle child PTs if the change removed a subtree from
	 * the paging structure.  Note the WARN on the PFN changing without the
	 * SPTE being converted to a hugepage (leaf) or being zapped.  Shadow
	 * pages are kernel allocations and should never be migrated.
	 */
	if (was_present && !was_leaf &&
	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);

	if (was_leaf && is_accessed_spte(old_spte) &&
	    (!is_present || !is_accessed_spte(new_spte) || pfn_changed))
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}

/*
 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
 * and handle the associated bookkeeping.  Do not mark the page dirty
 * in KVM's dirty bitmaps.
 *
 * If setting the SPTE fails because it has changed, iter->old_spte will be
 * refreshed to the current value of the spte.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * Return:
 * * 0      - If the SPTE was set.
 * * -EBUSY - If the SPTE cannot be set. In this case this function will have
 *            no side-effects other than setting iter->old_spte to the last
 *            known value of the spte.
 */
static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
					  struct tdp_iter *iter,
					  u64 new_spte)
{
	u64 *sptep = rcu_dereference(iter->sptep);

	/*
	 * The caller is responsible for ensuring the old SPTE is not a REMOVED
	 * SPTE.  KVM should never attempt to zap or manipulate a REMOVED SPTE,
	 * and pre-checking before inserting a new SPTE is advantageous as it
	 * avoids unnecessary work.
	 */
	WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));

	lockdep_assert_held_read(&kvm->mmu_lock);

	/*
	 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
	 * does not hold the mmu_lock.  On failure, i.e. if a different logical
	 * CPU modified the SPTE, try_cmpxchg64() updates iter->old_spte with
	 * the current value, so the caller operates on fresh data, e.g. if it
	 * retries tdp_mmu_set_spte_atomic()
	 */
	if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
		return -EBUSY;

	handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
			    new_spte, iter->level, true);

	return 0;
}

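/*
 * Zap a SPTE while holding mmu_lock for read: freeze the SPTE by writing the
 * special REMOVED_SPTE value, flush remote TLBs, and only then clear the SPTE.
 */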
static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
					  struct tdp_iter *iter)
{
	int ret;

	/*
	 * Freeze the SPTE by setting it to a special,
	 * non-present value. This will stop other threads from
	 * immediately installing a present entry in its place
	 * before the TLBs are flushed.
	 */
	ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
	if (ret)
		return ret;

	kvm_flush_remote_tlbs_gfn(kvm, iter->gfn, iter->level);

	/*
	 * No other thread can overwrite the removed SPTE as they must either
	 * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
	 * overwrite the special removed SPTE value. No bookkeeping is needed
	 * here since the SPTE is going from non-present to non-present.  Use
	 * the raw write helper to avoid an unnecessary check on volatile bits.
	 */
	__kvm_tdp_mmu_write_spte(iter->sptep, 0);

	return 0;
}


/*
 * tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
 * @kvm:	      KVM instance
 * @as_id:	      Address space ID, i.e. regular vs. SMM
 * @sptep:	      Pointer to the SPTE
 * @old_spte:	      The current value of the SPTE
 * @new_spte:	      The new value that will be set for the SPTE
 * @gfn:	      The base GFN that was (or will be) mapped by the SPTE
 * @level:	      The level _containing_ the SPTE (its parent PT's level)
 *
 * Returns the old SPTE value, which _may_ be different than @old_spte if the
 * SPTE had volatile bits.
 */
static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
			    u64 old_spte, u64 new_spte, gfn_t gfn, int level)
{
	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * No thread should be using this function to set SPTEs to or from the
	 * temporary removed SPTE value.
	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
	 * should be used. If operating under the MMU lock in write mode, the
	 * use of the removed SPTE should not be necessary.
	 */
	WARN_ON_ONCE(is_removed_spte(old_spte) || is_removed_spte(new_spte));

	old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);

	handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
	return old_spte;
}

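/* Set the SPTE the iterator is currently at and refresh iter->old_spte. */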
static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
					 u64 new_spte)
{
	WARN_ON_ONCE(iter->yielded);
	iter->old_spte = tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
					  iter->old_spte, new_spte,
					  iter->gfn, iter->level);
}

#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
	for_each_tdp_pte(_iter, _root, _start, _end)

#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
		if (!is_shadow_present_pte(_iter.old_spte) ||		\
		    !is_last_spte(_iter.old_spte, _iter.level))		\
			continue;					\
		else

#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
	for_each_tdp_pte(_iter, root_to_sp(_mmu->root.hpa), _start, _end)
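
/*
 * Example usage (a minimal sketch; handle_leaf() is a hypothetical helper,
 * not part of this file):
 *
 *	struct tdp_iter iter;
 *
 *	tdp_root_for_each_leaf_pte(iter, root, start, end)
 *		handle_leaf(kvm, &iter);
 *
 * Only shadow-present, last-level SPTEs are visited; non-present and non-leaf
 * entries are skipped by the macro itself.
 */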

/*
 * Yield if the MMU lock is contended or this thread needs to return control
 * to the scheduler.
 *
 * If this function should yield and flush is set, it will perform a remote
 * TLB flush before yielding.
 *
 * If this function yields, iter->yielded is set and the caller must skip to
 * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
 * over the paging structures to allow the iterator to continue its traversal
 * from the paging structure root.
 *
 * Returns true if this function yielded.
 */
static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
							  struct tdp_iter *iter,
							  bool flush, bool shared)
{
	WARN_ON_ONCE(iter->yielded);

	/* Ensure forward progress has been made before yielding. */
	if (iter->next_last_level_gfn == iter->yielded_gfn)
		return false;

	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
		if (flush)
			kvm_flush_remote_tlbs(kvm);

		rcu_read_unlock();

		if (shared)
			cond_resched_rwlock_read(&kvm->mmu_lock);
		else
			cond_resched_rwlock_write(&kvm->mmu_lock);

		rcu_read_lock();

		WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn);

		iter->yielded = true;
	}

	return iter->yielded;
}

static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
{
	/*
	 * Bound TDP MMU walks at host.MAXPHYADDR.  KVM disallows memslots with
	 * a gpa range that would exceed the max gfn, and KVM does not create
	 * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
	 * the slow emulation path every time.
	 */
	return kvm_mmu_max_gfn() + 1;
}

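/* Zap all SPTEs at or below @zap_level in @root, yielding as needed. */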
static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			       bool shared, int zap_level)
{
	struct tdp_iter iter;

	gfn_t end = tdp_mmu_max_gfn_exclusive();
	gfn_t start = 0;

	for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
			continue;

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		if (iter.level > zap_level)
			continue;

		if (!shared)
			tdp_mmu_iter_set_spte(kvm, &iter, 0);
		else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
			goto retry;
	}
}

static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			     bool shared)
{

	/*
	 * The root must have an elevated refcount so that it's reachable via
	 * mmu_notifier callbacks, which allows this path to yield and drop
	 * mmu_lock.  When handling an unmap/release mmu_notifier command, KVM
	 * must drop all references to relevant pages prior to completing the
	 * callback.  Dropping mmu_lock with an unreachable root would result
	 * in zapping SPTEs after a relevant mmu_notifier callback completes
	 * and lead to use-after-free as zapping a SPTE triggers "writeback" of
	 * dirty accessed bits to the SPTE's associated struct page.
	 */
	WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	rcu_read_lock();

	/*
	 * To avoid RCU stalls due to recursively removing huge swaths of SPs,
	 * split the zap into two passes.  On the first pass, zap at the 1gb
	 * level, and then zap top-level SPs on the second pass.  "1gb" is not
	 * arbitrary, as KVM must be able to zap a 1gb shadow page without
	 * inducing a stall to allow in-place replacement with a 1gb hugepage.
	 *
	 * Because zapping a SP recurses on its children, stepping down to
	 * PG_LEVEL_4K in the iterator itself is unnecessary.
	 */
	__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
	__tdp_mmu_zap_root(kvm, root, shared, root->role.level);

	rcu_read_unlock();
}

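/*
 * Zap @sp by clearing the parent SPTE that points at it.  Returns false if
 * @sp is a root page or was already zapped by another task.
 */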
bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	u64 old_spte;

	/*
	 * This helper intentionally doesn't allow zapping a root shadow page,
	 * which doesn't have a parent page table and thus no associated entry.
	 */
	if (WARN_ON_ONCE(!sp->ptep))
		return false;

	old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
	if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
		return false;

	tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
			 sp->gfn, sp->role.level + 1);

	return true;
}

/*
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock. If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
 * operation can cause a soft lockup.
 */
static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t start, gfn_t end, bool can_yield, bool flush)
{
	struct tdp_iter iter;

	end = min(end, tdp_mmu_max_gfn_exclusive());

	lockdep_assert_held_write(&kvm->mmu_lock);

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
		if (can_yield &&
		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
			flush = false;
			continue;
		}

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		tdp_mmu_iter_set_spte(kvm, &iter, 0);
		flush = true;
	}

	rcu_read_unlock();

	/*
	 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
	 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
	 */
	return flush;
}

/*
 * Zap leaf SPTEs for the range of gfns, [start, end), for all roots. Returns
 * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or
 * more SPTEs were zapped since the MMU lock was last acquired.
 */
bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush)
{
	struct kvm_mmu_page *root;

	for_each_tdp_mmu_root_yield_safe(kvm, root, false)
		flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);

	return flush;
}

void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
	struct kvm_mmu_page *root;

	/*
	 * Zap all roots, including invalid roots, as all SPTEs must be dropped
	 * before returning to the caller.  Zap directly even if the root is
	 * also being zapped by a worker.  Walking zapped top-level SPTEs isn't
	 * all that expensive and mmu_lock is already held, which means the
	 * worker has yielded, i.e. flushing the work instead of zapping here
	 * isn't guaranteed to be any faster.
	 *
	 * A TLB flush is unnecessary, KVM zaps everything if and only if the VM
	 * is being destroyed or the userspace VMM has exited.  In both cases,
	 * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
	 */
	for_each_tdp_mmu_root_yield_safe(kvm, root, false)
		tdp_mmu_zap_root(kvm, root, false);
}

/*
 * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
 * zap" completes.
 */
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
{
	struct kvm_mmu_page *root;

	read_lock(&kvm->mmu_lock);

	for_each_tdp_mmu_root_yield_safe(kvm, root, true) {
		if (!root->tdp_mmu_scheduled_root_to_zap)
			continue;

		root->tdp_mmu_scheduled_root_to_zap = false;
		KVM_BUG_ON(!root->role.invalid, kvm);

		/*
		 * A TLB flush is not necessary as KVM performs a local TLB
		 * flush when allocating a new root (see kvm_mmu_load()), and
		 * when migrating a vCPU to a different pCPU.  Note, the local
		 * TLB flush on reuse also invalidates paging-structure-cache
		 * entries, i.e. TLB entries for intermediate paging structures,
		 * that may be zapped, as such entries are associated with the
		 * ASID on both VMX and SVM.
		 */
		tdp_mmu_zap_root(kvm, root, true);

		/*
		 * The reference needs to be put *after* zapping the root, as
		 * the root must be reachable by mmu_notifiers while it's being
		 * zapped.
		 */
		kvm_tdp_mmu_put_root(kvm, root, true);
	}

	read_unlock(&kvm->mmu_lock);
}

/*
 * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
 * is about to be zapped, e.g. in response to a memslots update.  The actual
 * zapping is done separately so that it happens with mmu_lock held for read,
 * whereas invalidating roots must be done with mmu_lock held for write (unless
 * the VM is being destroyed).
 *
 * Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference.
 * See kvm_tdp_mmu_get_vcpu_root_hpa().
 */
void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
{
	struct kvm_mmu_page *root;

	/*
	 * mmu_lock must be held for write to ensure that a root doesn't become
	 * invalid while there are active readers (invalidating a root while
	 * there are active readers may or may not be problematic in practice,
	 * but it's uncharted territory and not supported).
	 *
	 * Waive the assertion if there are no users of @kvm, i.e. the VM is
	 * being destroyed after all references have been put, or if no vCPUs
	 * have been created (which means there are no roots), i.e. the VM is
	 * being destroyed in an error path of KVM_CREATE_VM.
	 */
	if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
	    refcount_read(&kvm->users_count) && kvm->created_vcpus)
		lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * As above, mmu_lock isn't held when destroying the VM!  There can't
	 * be other references to @kvm, i.e. nothing else can invalidate roots
	 * or get/put references to roots.
	 */
	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
		/*
		 * Note, invalid roots can outlive a memslot update!  Invalid
		 * roots must be *zapped* before the memslot update completes,
		 * but a different task can acquire a reference and keep the
9390df9dab8SSean Christopherson 		 * root alive after it's been zapped.
9400df9dab8SSean Christopherson 		 */
941edbdb43fSSean Christopherson 		if (!root->role.invalid) {
9420df9dab8SSean Christopherson 			root->tdp_mmu_scheduled_root_to_zap = true;
943b7cccd39SBen Gardon 			root->role.invalid = true;
94422b94c4bSPaolo Bonzini 		}
945b7cccd39SBen Gardon 	}
946f28e9c7fSSean Christopherson }
947b7cccd39SBen Gardon 
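/*
 * Illustrative sketch only (not part of the kernel source): a hypothetical
 * caller of the two-phase flow described above, in which roots are first
 * invalidated with mmu_lock held for write and then zapped with mmu_lock
 * held for read inside kvm_tdp_mmu_zap_invalidated_roots().
 */
static void example_invalidate_then_zap_roots(struct kvm *kvm)
{
	/* Phase 1: mark every root invalid so vCPUs stop (re)using them. */
	write_lock(&kvm->mmu_lock);
	kvm_tdp_mmu_invalidate_all_roots(kvm);
	write_unlock(&kvm->mmu_lock);

	/*
	 * Phase 2: zap the scheduled roots; the helper takes mmu_lock for
	 * read itself, so vCPUs can concurrently load fresh roots.
	 */
	kvm_tdp_mmu_zap_invalidated_roots(kvm);
}
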
948bb18842eSBen Gardon /*
949bb18842eSBen Gardon  * Installs a last-level SPTE to handle a TDP page fault.
950bb18842eSBen Gardon  * (NPT/EPT violation/misconfiguration)
951bb18842eSBen Gardon  */
tdp_mmu_map_handle_target_level(struct kvm_vcpu * vcpu,struct kvm_page_fault * fault,struct tdp_iter * iter)952cdc47767SPaolo Bonzini static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
953cdc47767SPaolo Bonzini 					  struct kvm_page_fault *fault,
954cdc47767SPaolo Bonzini 					  struct tdp_iter *iter)
955bb18842eSBen Gardon {
956c435d4b7SSean Christopherson 	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
957bb18842eSBen Gardon 	u64 new_spte;
95857a3e96dSKai Huang 	int ret = RET_PF_FIXED;
959ad67e480SPaolo Bonzini 	bool wrprot = false;
960bb18842eSBen Gardon 
96150a9ac25SSean Christopherson 	if (WARN_ON_ONCE(sp->role.level != fault->goal_level))
96250a9ac25SSean Christopherson 		return RET_PF_RETRY;
96350a9ac25SSean Christopherson 
964e710c5f6SDavid Matlack 	if (unlikely(!fault->slot))
965bb18842eSBen Gardon 		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
9669a77daacSBen Gardon 	else
96753597858SDavid Matlack 		wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
9682839180cSPaolo Bonzini 					 fault->pfn, iter->old_spte, fault->prefetch, true,
9697158bee4SPaolo Bonzini 					 fault->map_writable, &new_spte);
970bb18842eSBen Gardon 
971bb18842eSBen Gardon 	if (new_spte == iter->old_spte)
972bb18842eSBen Gardon 		ret = RET_PF_SPURIOUS;
9733e72c791SDavid Matlack 	else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
9749a77daacSBen Gardon 		return RET_PF_RETRY;
975bb95dfb9SSean Christopherson 	else if (is_shadow_present_pte(iter->old_spte) &&
976bb95dfb9SSean Christopherson 		 !is_last_spte(iter->old_spte, iter->level))
9771e203847SHou Wenlong 		kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level);
978bb18842eSBen Gardon 
979bb18842eSBen Gardon 	/*
980bb18842eSBen Gardon 	 * If the page fault was caused by a write but the page is write
981bb18842eSBen Gardon 	 * protected, emulation is needed. If the emulation was skipped,
982bb18842eSBen Gardon 	 * the vCPU would have the same fault again.
983bb18842eSBen Gardon 	 */
984ad67e480SPaolo Bonzini 	if (wrprot) {
985cdc47767SPaolo Bonzini 		if (fault->write)
986bb18842eSBen Gardon 			ret = RET_PF_EMULATE;
987bb18842eSBen Gardon 	}
988bb18842eSBen Gardon 
989bb18842eSBen Gardon 	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
9909a77daacSBen Gardon 	if (unlikely(is_mmio_spte(new_spte))) {
9911075d41eSSean Christopherson 		vcpu->stat.pf_mmio_spte_created++;
9929a77daacSBen Gardon 		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
9939a77daacSBen Gardon 				     new_spte);
994bb18842eSBen Gardon 		ret = RET_PF_EMULATE;
9953849e092SSean Christopherson 	} else {
9969a77daacSBen Gardon 		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
9979a77daacSBen Gardon 				       rcu_dereference(iter->sptep));
9983849e092SSean Christopherson 	}
999bb18842eSBen Gardon 
1000bb18842eSBen Gardon 	return ret;
1001bb18842eSBen Gardon }
1002bb18842eSBen Gardon 
1003bb18842eSBen Gardon /*
1004cb00a70bSDavid Matlack  * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
1005cb00a70bSDavid Matlack  * provided page table.
10067b7e1ab6SDavid Matlack  *
10077b7e1ab6SDavid Matlack  * @kvm: kvm instance
10087b7e1ab6SDavid Matlack  * @iter: a tdp_iter instance currently on the SPTE that should be set
10097b7e1ab6SDavid Matlack  * @sp: The new TDP page table to install.
1010cb00a70bSDavid Matlack  * @shared: This operation is running under the MMU lock in read mode.
10117b7e1ab6SDavid Matlack  *
10127b7e1ab6SDavid Matlack  * Returns: 0 if the new page table was installed. Non-0 if the page table
10137b7e1ab6SDavid Matlack  *          could not be installed (e.g. the atomic compare-exchange failed).
10147b7e1ab6SDavid Matlack  */
tdp_mmu_link_sp(struct kvm * kvm,struct tdp_iter * iter,struct kvm_mmu_page * sp,bool shared)1015cb00a70bSDavid Matlack static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
101661f94478SSean Christopherson 			   struct kvm_mmu_page *sp, bool shared)
10177b7e1ab6SDavid Matlack {
101854275f74SSean Christopherson 	u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled());
1019cb00a70bSDavid Matlack 	int ret = 0;
10207b7e1ab6SDavid Matlack 
1021cb00a70bSDavid Matlack 	if (shared) {
10227b7e1ab6SDavid Matlack 		ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
10237b7e1ab6SDavid Matlack 		if (ret)
10247b7e1ab6SDavid Matlack 			return ret;
1025cb00a70bSDavid Matlack 	} else {
10260b7cc254SVipin Sharma 		tdp_mmu_iter_set_spte(kvm, iter, spte);
1027cb00a70bSDavid Matlack 	}
10287b7e1ab6SDavid Matlack 
102943a063caSYosry Ahmed 	tdp_account_mmu_page(kvm, sp);
10307b7e1ab6SDavid Matlack 
10317b7e1ab6SDavid Matlack 	return 0;
10327b7e1ab6SDavid Matlack }
10337b7e1ab6SDavid Matlack 
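/*
 * Illustrative sketch only (not part of the kernel source): how a fault
 * handler running with mmu_lock held for read (shared == true) allocates a
 * child page table and links it with tdp_mmu_link_sp(), freeing the page if
 * the atomic compare-exchange loses to another task.  The function name and
 * return convention are hypothetical.
 */
static int example_install_child_sp(struct kvm_vcpu *vcpu, struct tdp_iter *iter)
{
	struct kvm_mmu_page *sp = tdp_mmu_alloc_sp(vcpu);
	int r;

	tdp_mmu_init_child_sp(sp, iter);

	r = tdp_mmu_link_sp(vcpu->kvm, iter, sp, true);
	if (r)
		tdp_mmu_free_sp(sp);

	return r;
}
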
1034c4b33d28SDavid Matlack static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1035c4b33d28SDavid Matlack 				   struct kvm_mmu_page *sp, bool shared);
1036c4b33d28SDavid Matlack 
10377b7e1ab6SDavid Matlack /*
1038bb18842eSBen Gardon  * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
1039bb18842eSBen Gardon  * page tables and SPTEs to translate the faulting guest physical address.
1040bb18842eSBen Gardon  */
kvm_tdp_mmu_map(struct kvm_vcpu * vcpu,struct kvm_page_fault * fault)10412f6305ddSPaolo Bonzini int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
1042bb18842eSBen Gardon {
1043bb18842eSBen Gardon 	struct kvm_mmu *mmu = vcpu->arch.mmu;
104461f94478SSean Christopherson 	struct kvm *kvm = vcpu->kvm;
1045bb18842eSBen Gardon 	struct tdp_iter iter;
104689c0fd49SBen Gardon 	struct kvm_mmu_page *sp;
104763d28a25SPaolo Bonzini 	int ret = RET_PF_RETRY;
1048bb18842eSBen Gardon 
104973a3c659SPaolo Bonzini 	kvm_mmu_hugepage_adjust(vcpu, fault);
1050bb18842eSBen Gardon 
1051f0066d94SPaolo Bonzini 	trace_kvm_mmu_spte_requested(fault);
10527cca2d0bSBen Gardon 
10537cca2d0bSBen Gardon 	rcu_read_lock();
10547cca2d0bSBen Gardon 
10552f6305ddSPaolo Bonzini 	tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
105663d28a25SPaolo Bonzini 		int r;
105763d28a25SPaolo Bonzini 
105873a3c659SPaolo Bonzini 		if (fault->nx_huge_page_workaround_enabled)
1059536f0e6aSPaolo Bonzini 			disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
1060bb18842eSBen Gardon 
1061bb18842eSBen Gardon 		/*
1062c4b33d28SDavid Matlack 		 * If SPTE has been frozen by another thread, just give up and
1063c4b33d28SDavid Matlack 		 * retry, avoiding unnecessary page table allocation and free.
1064ff76d506SKai Huang 		 */
1065ff76d506SKai Huang 		if (is_removed_spte(iter.old_spte))
106663d28a25SPaolo Bonzini 			goto retry;
106763d28a25SPaolo Bonzini 
1068f5d16bb9SSean Christopherson 		if (iter.level == fault->goal_level)
106980a3e4aeSSean Christopherson 			goto map_target_level;
1070f5d16bb9SSean Christopherson 
107163d28a25SPaolo Bonzini 		/* Step down into the lower level page table if it exists. */
107263d28a25SPaolo Bonzini 		if (is_shadow_present_pte(iter.old_spte) &&
107363d28a25SPaolo Bonzini 		    !is_large_pte(iter.old_spte))
107463d28a25SPaolo Bonzini 			continue;
1075ff76d506SKai Huang 
1076c4b33d28SDavid Matlack 		/*
1077c4b33d28SDavid Matlack 		 * The SPTE is either non-present or points to a huge page that
1078c4b33d28SDavid Matlack 		 * needs to be split.
1079c4b33d28SDavid Matlack 		 */
1080a82070b6SDavid Matlack 		sp = tdp_mmu_alloc_sp(vcpu);
1081a82070b6SDavid Matlack 		tdp_mmu_init_child_sp(sp, &iter);
1082a82070b6SDavid Matlack 
108361f94478SSean Christopherson 		sp->nx_huge_page_disallowed = fault->huge_page_disallowed;
108461f94478SSean Christopherson 
1085c4b33d28SDavid Matlack 		if (is_shadow_present_pte(iter.old_spte))
108663d28a25SPaolo Bonzini 			r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
1087c4b33d28SDavid Matlack 		else
108863d28a25SPaolo Bonzini 			r = tdp_mmu_link_sp(kvm, &iter, sp, true);
1089c4b33d28SDavid Matlack 
109063d28a25SPaolo Bonzini 		/*
109180a3e4aeSSean Christopherson 		 * Force the guest to retry if installing an upper level SPTE
109280a3e4aeSSean Christopherson 		 * failed, e.g. because a different task modified the SPTE.
109363d28a25SPaolo Bonzini 		 */
109463d28a25SPaolo Bonzini 		if (r) {
10959a77daacSBen Gardon 			tdp_mmu_free_sp(sp);
109663d28a25SPaolo Bonzini 			goto retry;
10979a77daacSBen Gardon 		}
109861f94478SSean Christopherson 
109961f94478SSean Christopherson 		if (fault->huge_page_disallowed &&
110061f94478SSean Christopherson 		    fault->req_level >= iter.level) {
110161f94478SSean Christopherson 			spin_lock(&kvm->arch.tdp_mmu_pages_lock);
110221a36ac6SSean Christopherson 			if (sp->nx_huge_page_disallowed)
110361f94478SSean Christopherson 				track_possible_nx_huge_page(kvm, sp);
110461f94478SSean Christopherson 			spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
110561f94478SSean Christopherson 		}
1106bb18842eSBen Gardon 	}
1107bb18842eSBen Gardon 
110880a3e4aeSSean Christopherson 	/*
110980a3e4aeSSean Christopherson 	 * The walk aborted before reaching the target level, e.g. because the
111080a3e4aeSSean Christopherson 	 * iterator detected an upper level SPTE was frozen during traversal.
111180a3e4aeSSean Christopherson 	 */
111280a3e4aeSSean Christopherson 	WARN_ON_ONCE(iter.level == fault->goal_level);
111380a3e4aeSSean Christopherson 	goto retry;
111480a3e4aeSSean Christopherson 
111580a3e4aeSSean Christopherson map_target_level:
1116cdc47767SPaolo Bonzini 	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
1117bb18842eSBen Gardon 
111863d28a25SPaolo Bonzini retry:
111963d28a25SPaolo Bonzini 	rcu_read_unlock();
1120bb18842eSBen Gardon 	return ret;
1121bb18842eSBen Gardon }
1122063afacdSBen Gardon 
kvm_tdp_mmu_unmap_gfn_range(struct kvm * kvm,struct kvm_gfn_range * range,bool flush)11233039bcc7SSean Christopherson bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
11243039bcc7SSean Christopherson 				 bool flush)
1125063afacdSBen Gardon {
112650107e8bSSean Christopherson 	struct kvm_mmu_page *root;
112750107e8bSSean Christopherson 
1128441a5dfcSPaolo Bonzini 	__for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false, false)
112950107e8bSSean Christopherson 		flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end,
113050107e8bSSean Christopherson 					  range->may_block, flush);
113150107e8bSSean Christopherson 
113250107e8bSSean Christopherson 	return flush;
11333039bcc7SSean Christopherson }
11343039bcc7SSean Christopherson 
11353039bcc7SSean Christopherson typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
11363039bcc7SSean Christopherson 			      struct kvm_gfn_range *range);
11373039bcc7SSean Christopherson 
kvm_tdp_mmu_handle_gfn(struct kvm * kvm,struct kvm_gfn_range * range,tdp_handler_t handler)11383039bcc7SSean Christopherson static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
11393039bcc7SSean Christopherson 						   struct kvm_gfn_range *range,
1140c1b91493SSean Christopherson 						   tdp_handler_t handler)
1141063afacdSBen Gardon {
1142063afacdSBen Gardon 	struct kvm_mmu_page *root;
11433039bcc7SSean Christopherson 	struct tdp_iter iter;
11443039bcc7SSean Christopherson 	bool ret = false;
1145063afacdSBen Gardon 
1146063afacdSBen Gardon 	/*
1147e1eed584SSean Christopherson 	 * Don't support rescheduling; none of the MMU notifiers that funnel
1148e1eed584SSean Christopherson 	 * into this helper allow blocking; it'd be dead, wasteful code.
1149063afacdSBen Gardon 	 */
11503039bcc7SSean Christopherson 	for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
1151a151acecSSean Christopherson 		rcu_read_lock();
1152a151acecSSean Christopherson 
11533039bcc7SSean Christopherson 		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
11543039bcc7SSean Christopherson 			ret |= handler(kvm, &iter, range);
1155063afacdSBen Gardon 
11563039bcc7SSean Christopherson 		rcu_read_unlock();
1157a151acecSSean Christopherson 	}
1158063afacdSBen Gardon 
1159063afacdSBen Gardon 	return ret;
1160063afacdSBen Gardon }
1161063afacdSBen Gardon 
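/*
 * Illustrative sketch only (not part of the kernel source): how a new
 * range-based notifier hook would plug into kvm_tdp_mmu_handle_gfn() above.
 * Both functions below are hypothetical; the handler simply reports whether
 * any leaf SPTE in the range is currently writable.
 */
static bool example_gfn_is_writable(struct kvm *kvm, struct tdp_iter *iter,
				    struct kvm_gfn_range *range)
{
	return is_writable_pte(iter->old_spte);
}

static bool example_range_has_writable_spte(struct kvm *kvm,
					    struct kvm_gfn_range *range)
{
	return kvm_tdp_mmu_handle_gfn(kvm, range, example_gfn_is_writable);
}
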
1162f8e14497SBen Gardon /*
1163f8e14497SBen Gardon  * Mark the SPTEs in the range of GFNs [start, end) unaccessed and return non-zero
1164f8e14497SBen Gardon  * if any of the GFNs in the range have been accessed.
11657ee131e3SVipin Sharma  *
11667ee131e3SVipin Sharma  * No need to mark the corresponding PFN as accessed as this call is coming
11677ee131e3SVipin Sharma  * from the clear_young() or clear_flush_young() notifier, which uses the
11687ee131e3SVipin Sharma  * return value to determine if the page has been accessed.
1169f8e14497SBen Gardon  */
age_gfn_range(struct kvm * kvm,struct tdp_iter * iter,struct kvm_gfn_range * range)11703039bcc7SSean Christopherson static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
11713039bcc7SSean Christopherson 			  struct kvm_gfn_range *range)
1172f8e14497SBen Gardon {
11737ee131e3SVipin Sharma 	u64 new_spte;
1174f8e14497SBen Gardon 
11753039bcc7SSean Christopherson 	/* If we have a non-accessed entry we don't need to change the pte. */
11763039bcc7SSean Christopherson 	if (!is_accessed_spte(iter->old_spte))
11773039bcc7SSean Christopherson 		return false;
11787cca2d0bSBen Gardon 
11797ee131e3SVipin Sharma 	if (spte_ad_enabled(iter->old_spte)) {
11807ee131e3SVipin Sharma 		iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep,
11817ee131e3SVipin Sharma 							 iter->old_spte,
11827ee131e3SVipin Sharma 							 shadow_accessed_mask,
11837ee131e3SVipin Sharma 							 iter->level);
11847ee131e3SVipin Sharma 		new_spte = iter->old_spte & ~shadow_accessed_mask;
1185f8e14497SBen Gardon 	} else {
1186f8e14497SBen Gardon 		/*
1187f8e14497SBen Gardon 		 * Capture the dirty status of the page, so that it doesn't get
1188f8e14497SBen Gardon 		 * lost when the SPTE is marked for access tracking.
1189f8e14497SBen Gardon 		 */
11907ee131e3SVipin Sharma 		if (is_writable_pte(iter->old_spte))
11917ee131e3SVipin Sharma 			kvm_set_pfn_dirty(spte_to_pfn(iter->old_spte));
1192f8e14497SBen Gardon 
11937ee131e3SVipin Sharma 		new_spte = mark_spte_for_access_track(iter->old_spte);
11947ee131e3SVipin Sharma 		iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep,
11957ee131e3SVipin Sharma 							iter->old_spte, new_spte,
11967ee131e3SVipin Sharma 							iter->level);
1197f8e14497SBen Gardon 	}
1198f8e14497SBen Gardon 
1199891f1159SVipin Sharma 	trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
1200891f1159SVipin Sharma 				       iter->old_spte, new_spte);
12013039bcc7SSean Christopherson 	return true;
1202f8e14497SBen Gardon }
1203f8e14497SBen Gardon 
kvm_tdp_mmu_age_gfn_range(struct kvm * kvm,struct kvm_gfn_range * range)12043039bcc7SSean Christopherson bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1205f8e14497SBen Gardon {
12063039bcc7SSean Christopherson 	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
1207f8e14497SBen Gardon }
1208f8e14497SBen Gardon 
test_age_gfn(struct kvm * kvm,struct tdp_iter * iter,struct kvm_gfn_range * range)12093039bcc7SSean Christopherson static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
12103039bcc7SSean Christopherson 			 struct kvm_gfn_range *range)
1211f8e14497SBen Gardon {
12123039bcc7SSean Christopherson 	return is_accessed_spte(iter->old_spte);
1213f8e14497SBen Gardon }
1214f8e14497SBen Gardon 
kvm_tdp_mmu_test_age_gfn(struct kvm * kvm,struct kvm_gfn_range * range)12153039bcc7SSean Christopherson bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1216f8e14497SBen Gardon {
12173039bcc7SSean Christopherson 	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
12183039bcc7SSean Christopherson }
12193039bcc7SSean Christopherson 
set_spte_gfn(struct kvm * kvm,struct tdp_iter * iter,struct kvm_gfn_range * range)12203039bcc7SSean Christopherson static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
12213039bcc7SSean Christopherson 			 struct kvm_gfn_range *range)
12223039bcc7SSean Christopherson {
12233039bcc7SSean Christopherson 	u64 new_spte;
12243039bcc7SSean Christopherson 
12253039bcc7SSean Christopherson 	/* Huge pages aren't expected to be modified without first being zapped. */
122620ba462dSSean Christopherson 	WARN_ON_ONCE(pte_huge(range->arg.pte) || range->start + 1 != range->end);
12273039bcc7SSean Christopherson 
12283039bcc7SSean Christopherson 	if (iter->level != PG_LEVEL_4K ||
12293039bcc7SSean Christopherson 	    !is_shadow_present_pte(iter->old_spte))
12303039bcc7SSean Christopherson 		return false;
12313039bcc7SSean Christopherson 
12323039bcc7SSean Christopherson 	/*
12333039bcc7SSean Christopherson 	 * Note, when changing a read-only SPTE, it's not strictly necessary to
12343039bcc7SSean Christopherson 	 * zero the SPTE before setting the new PFN, but doing so preserves the
12353039bcc7SSean Christopherson 	 * invariant that the PFN of a present leaf SPTE can never change.
123640fa907eSVipin Sharma 	 * See handle_changed_spte().
12373039bcc7SSean Christopherson 	 */
12380b7cc254SVipin Sharma 	tdp_mmu_iter_set_spte(kvm, iter, 0);
12393039bcc7SSean Christopherson 
12403e1efe2bSSean Christopherson 	if (!pte_write(range->arg.pte)) {
12413039bcc7SSean Christopherson 		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
12423e1efe2bSSean Christopherson 								  pte_pfn(range->arg.pte));
12433039bcc7SSean Christopherson 
12440b7cc254SVipin Sharma 		tdp_mmu_iter_set_spte(kvm, iter, new_spte);
12453039bcc7SSean Christopherson 	}
12463039bcc7SSean Christopherson 
12473039bcc7SSean Christopherson 	return true;
1248f8e14497SBen Gardon }
12491d8dd6b3SBen Gardon 
12501d8dd6b3SBen Gardon /*
12511d8dd6b3SBen Gardon  * Handle the changed_pte MMU notifier for the TDP MMU.
12521d8dd6b3SBen Gardon  * data is a pointer to the new pte_t mapping the HVA specified by the MMU
12531d8dd6b3SBen Gardon  * notifier.
12541d8dd6b3SBen Gardon  * Returns non-zero if a flush is needed before releasing the MMU lock.
12551d8dd6b3SBen Gardon  */
kvm_tdp_mmu_set_spte_gfn(struct kvm * kvm,struct kvm_gfn_range * range)12563039bcc7SSean Christopherson bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
12571d8dd6b3SBen Gardon {
125893fa50f6SSean Christopherson 	/*
125993fa50f6SSean Christopherson 	 * No need to handle the remote TLB flush under RCU protection, the
126093fa50f6SSean Christopherson 	 * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
126140fa907eSVipin Sharma 	 * shadow page. See the WARN on pfn_changed in handle_changed_spte().
126293fa50f6SSean Christopherson 	 */
126393fa50f6SSean Christopherson 	return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
12641d8dd6b3SBen Gardon }
12651d8dd6b3SBen Gardon 
1266a6a0b05dSBen Gardon /*
1267bedd9195SDavid Matlack  * Remove write access from all SPTEs at or above min_level that map GFNs
1268bedd9195SDavid Matlack  * [start, end). Returns true if an SPTE has been changed and the TLBs need to
1269bedd9195SDavid Matlack  * be flushed.
1270a6a0b05dSBen Gardon  */
wrprot_gfn_range(struct kvm * kvm,struct kvm_mmu_page * root,gfn_t start,gfn_t end,int min_level)1271a6a0b05dSBen Gardon static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1272a6a0b05dSBen Gardon 			     gfn_t start, gfn_t end, int min_level)
1273a6a0b05dSBen Gardon {
1274a6a0b05dSBen Gardon 	struct tdp_iter iter;
1275a6a0b05dSBen Gardon 	u64 new_spte;
1276a6a0b05dSBen Gardon 	bool spte_set = false;
1277a6a0b05dSBen Gardon 
12787cca2d0bSBen Gardon 	rcu_read_lock();
12797cca2d0bSBen Gardon 
1280a6a0b05dSBen Gardon 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1281a6a0b05dSBen Gardon 
128277aa6075SDavid Matlack 	for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
128324ae4cfaSBen Gardon retry:
128424ae4cfaSBen Gardon 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
12851af4a960SBen Gardon 			continue;
12861af4a960SBen Gardon 
1287a6a0b05dSBen Gardon 		if (!is_shadow_present_pte(iter.old_spte) ||
12880f99ee2cSBen Gardon 		    !is_last_spte(iter.old_spte, iter.level) ||
12890f99ee2cSBen Gardon 		    !(iter.old_spte & PT_WRITABLE_MASK))
1290a6a0b05dSBen Gardon 			continue;
1291a6a0b05dSBen Gardon 
1292a6a0b05dSBen Gardon 		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1293a6a0b05dSBen Gardon 
12943e72c791SDavid Matlack 		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
129524ae4cfaSBen Gardon 			goto retry;
12963255530aSDavid Matlack 
1297a6a0b05dSBen Gardon 		spte_set = true;
1298a6a0b05dSBen Gardon 	}
12997cca2d0bSBen Gardon 
13007cca2d0bSBen Gardon 	rcu_read_unlock();
1301a6a0b05dSBen Gardon 	return spte_set;
1302a6a0b05dSBen Gardon }
1303a6a0b05dSBen Gardon 
1304a6a0b05dSBen Gardon /*
1305a6a0b05dSBen Gardon  * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1306a6a0b05dSBen Gardon  * only affect leaf SPTEs down to min_level.
1307a6a0b05dSBen Gardon  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1308a6a0b05dSBen Gardon  */
kvm_tdp_mmu_wrprot_slot(struct kvm * kvm,const struct kvm_memory_slot * slot,int min_level)1309269e9552SHamza Mahfooz bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
1310269e9552SHamza Mahfooz 			     const struct kvm_memory_slot *slot, int min_level)
1311a6a0b05dSBen Gardon {
1312a6a0b05dSBen Gardon 	struct kvm_mmu_page *root;
1313a6a0b05dSBen Gardon 	bool spte_set = false;
1314a6a0b05dSBen Gardon 
131524ae4cfaSBen Gardon 	lockdep_assert_held_read(&kvm->mmu_lock);
1316a6a0b05dSBen Gardon 
1317d62007edSSean Christopherson 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1318a6a0b05dSBen Gardon 		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1319a6a0b05dSBen Gardon 			     slot->base_gfn + slot->npages, min_level);
1320a6a0b05dSBen Gardon 
1321a6a0b05dSBen Gardon 	return spte_set;
1322a6a0b05dSBen Gardon }
1323a6a0b05dSBen Gardon 
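/*
 * Illustrative sketch only (not part of the kernel source): a hypothetical
 * caller of kvm_tdp_mmu_wrprot_slot() that write-protects an entire memslot
 * down to 4K granularity under mmu_lock held for read, flushing TLBs only if
 * at least one SPTE was changed.
 */
static void example_wrprot_memslot(struct kvm *kvm,
				   const struct kvm_memory_slot *slot)
{
	bool flush;

	read_lock(&kvm->mmu_lock);
	flush = kvm_tdp_mmu_wrprot_slot(kvm, slot, PG_LEVEL_4K);
	read_unlock(&kvm->mmu_lock);

	if (flush)
		kvm_flush_remote_tlbs(kvm);
}
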
__tdp_mmu_alloc_sp_for_split(gfp_t gfp)1324a3fe5dbdSDavid Matlack static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
1325a3fe5dbdSDavid Matlack {
1326a3fe5dbdSDavid Matlack 	struct kvm_mmu_page *sp;
1327a3fe5dbdSDavid Matlack 
1328a3fe5dbdSDavid Matlack 	gfp |= __GFP_ZERO;
1329a3fe5dbdSDavid Matlack 
1330a3fe5dbdSDavid Matlack 	sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
1331a3fe5dbdSDavid Matlack 	if (!sp)
1332a3fe5dbdSDavid Matlack 		return NULL;
1333a3fe5dbdSDavid Matlack 
1334a3fe5dbdSDavid Matlack 	sp->spt = (void *)__get_free_page(gfp);
1335a3fe5dbdSDavid Matlack 	if (!sp->spt) {
1336a3fe5dbdSDavid Matlack 		kmem_cache_free(mmu_page_header_cache, sp);
1337a3fe5dbdSDavid Matlack 		return NULL;
1338a3fe5dbdSDavid Matlack 	}
1339a3fe5dbdSDavid Matlack 
1340a3fe5dbdSDavid Matlack 	return sp;
1341a3fe5dbdSDavid Matlack }
1342a3fe5dbdSDavid Matlack 
tdp_mmu_alloc_sp_for_split(struct kvm * kvm,struct tdp_iter * iter,bool shared)1343a3fe5dbdSDavid Matlack static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
1344cb00a70bSDavid Matlack 						       struct tdp_iter *iter,
1345cb00a70bSDavid Matlack 						       bool shared)
1346a3fe5dbdSDavid Matlack {
1347a3fe5dbdSDavid Matlack 	struct kvm_mmu_page *sp;
1348a3fe5dbdSDavid Matlack 
1349a3fe5dbdSDavid Matlack 	/*
1350a3fe5dbdSDavid Matlack 	 * Since we are allocating while under the MMU lock we have to be
1351a3fe5dbdSDavid Matlack 	 * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
1352a3fe5dbdSDavid Matlack 	 * reclaim and to avoid making any filesystem callbacks (which can end
1353a3fe5dbdSDavid Matlack 	 * up invoking KVM MMU notifiers, resulting in a deadlock).
1354a3fe5dbdSDavid Matlack 	 *
1355a3fe5dbdSDavid Matlack 	 * If this allocation fails we drop the lock and retry with reclaim
1356a3fe5dbdSDavid Matlack 	 * allowed.
1357a3fe5dbdSDavid Matlack 	 */
1358a3fe5dbdSDavid Matlack 	sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
1359a3fe5dbdSDavid Matlack 	if (sp)
1360a3fe5dbdSDavid Matlack 		return sp;
1361a3fe5dbdSDavid Matlack 
1362a3fe5dbdSDavid Matlack 	rcu_read_unlock();
1363cb00a70bSDavid Matlack 
1364cb00a70bSDavid Matlack 	if (shared)
1365a3fe5dbdSDavid Matlack 		read_unlock(&kvm->mmu_lock);
1366cb00a70bSDavid Matlack 	else
1367cb00a70bSDavid Matlack 		write_unlock(&kvm->mmu_lock);
1368a3fe5dbdSDavid Matlack 
1369a3fe5dbdSDavid Matlack 	iter->yielded = true;
1370a3fe5dbdSDavid Matlack 	sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
1371a3fe5dbdSDavid Matlack 
1372cb00a70bSDavid Matlack 	if (shared)
1373a3fe5dbdSDavid Matlack 		read_lock(&kvm->mmu_lock);
1374cb00a70bSDavid Matlack 	else
1375cb00a70bSDavid Matlack 		write_lock(&kvm->mmu_lock);
1376cb00a70bSDavid Matlack 
1377a3fe5dbdSDavid Matlack 	rcu_read_lock();
1378a3fe5dbdSDavid Matlack 
1379a3fe5dbdSDavid Matlack 	return sp;
1380a3fe5dbdSDavid Matlack }
1381a3fe5dbdSDavid Matlack 
1382c4b33d28SDavid Matlack /* Note, the caller is responsible for initializing @sp. */
tdp_mmu_split_huge_page(struct kvm * kvm,struct tdp_iter * iter,struct kvm_mmu_page * sp,bool shared)1383cb00a70bSDavid Matlack static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1384cb00a70bSDavid Matlack 				   struct kvm_mmu_page *sp, bool shared)
1385a3fe5dbdSDavid Matlack {
1386a3fe5dbdSDavid Matlack 	const u64 huge_spte = iter->old_spte;
1387a3fe5dbdSDavid Matlack 	const int level = iter->level;
1388a3fe5dbdSDavid Matlack 	int ret, i;
1389a3fe5dbdSDavid Matlack 
1390a3fe5dbdSDavid Matlack 	/*
1391a3fe5dbdSDavid Matlack 	 * No need for atomics when writing to sp->spt since the page table has
1392a3fe5dbdSDavid Matlack 	 * not been linked in yet and thus is not reachable from any other CPU.
1393a3fe5dbdSDavid Matlack 	 */
13942ca3129eSSean Christopherson 	for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
139547855da0SDavid Matlack 		sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i);
1396a3fe5dbdSDavid Matlack 
1397a3fe5dbdSDavid Matlack 	/*
1398a3fe5dbdSDavid Matlack 	 * Replace the huge spte with a pointer to the populated lower level
1399a3fe5dbdSDavid Matlack 	 * page table. Since we are making this change without a TLB flush, vCPUs
1400a3fe5dbdSDavid Matlack 	 * will see a mix of the split mappings and the original huge mapping,
1401a3fe5dbdSDavid Matlack 	 * depending on what's currently in their TLB. This is fine from a
1402a3fe5dbdSDavid Matlack 	 * correctness standpoint since the translation will be the same either
1403a3fe5dbdSDavid Matlack 	 * way.
1404a3fe5dbdSDavid Matlack 	 */
140561f94478SSean Christopherson 	ret = tdp_mmu_link_sp(kvm, iter, sp, shared);
1406a3fe5dbdSDavid Matlack 	if (ret)
1407e0b728b1SDavid Matlack 		goto out;
1408a3fe5dbdSDavid Matlack 
1409a3fe5dbdSDavid Matlack 	/*
1410a3fe5dbdSDavid Matlack 	 * tdp_mmu_link_sp() will handle subtracting the huge page we
1411a3fe5dbdSDavid Matlack 	 * are overwriting from the page stats. But we have to manually update
1412a3fe5dbdSDavid Matlack 	 * the page stats with the new present child pages.
1413a3fe5dbdSDavid Matlack 	 */
14142ca3129eSSean Christopherson 	kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE);
1415a3fe5dbdSDavid Matlack 
1416e0b728b1SDavid Matlack out:
1417e0b728b1SDavid Matlack 	trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
1418e0b728b1SDavid Matlack 	return ret;
1419a3fe5dbdSDavid Matlack }
1420a3fe5dbdSDavid Matlack 
tdp_mmu_split_huge_pages_root(struct kvm * kvm,struct kvm_mmu_page * root,gfn_t start,gfn_t end,int target_level,bool shared)1421a3fe5dbdSDavid Matlack static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
1422a3fe5dbdSDavid Matlack 					 struct kvm_mmu_page *root,
1423a3fe5dbdSDavid Matlack 					 gfn_t start, gfn_t end,
1424cb00a70bSDavid Matlack 					 int target_level, bool shared)
1425a3fe5dbdSDavid Matlack {
1426a3fe5dbdSDavid Matlack 	struct kvm_mmu_page *sp = NULL;
1427a3fe5dbdSDavid Matlack 	struct tdp_iter iter;
1428a3fe5dbdSDavid Matlack 	int ret = 0;
1429a3fe5dbdSDavid Matlack 
1430a3fe5dbdSDavid Matlack 	rcu_read_lock();
1431a3fe5dbdSDavid Matlack 
1432a3fe5dbdSDavid Matlack 	/*
1433a3fe5dbdSDavid Matlack 	 * Traverse the page table splitting all huge pages above the target
1434a3fe5dbdSDavid Matlack 	 * level into one lower level. For example, if we encounter a 1GB page
1435a3fe5dbdSDavid Matlack 	 * we split it into 512 2MB pages.
1436a3fe5dbdSDavid Matlack 	 *
1437a3fe5dbdSDavid Matlack 	 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
1438a3fe5dbdSDavid Matlack 	 * to visit an SPTE before ever visiting its children, which means we
1439a3fe5dbdSDavid Matlack 	 * will correctly recursively split huge pages that are more than one
1440a3fe5dbdSDavid Matlack 	 * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
1441a3fe5dbdSDavid Matlack 	 * and then splitting each of those to 512 4KB pages).
1442a3fe5dbdSDavid Matlack 	 */
1443a3fe5dbdSDavid Matlack 	for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
1444a3fe5dbdSDavid Matlack retry:
1445cb00a70bSDavid Matlack 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
1446a3fe5dbdSDavid Matlack 			continue;
1447a3fe5dbdSDavid Matlack 
1448a3fe5dbdSDavid Matlack 		if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
1449a3fe5dbdSDavid Matlack 			continue;
1450a3fe5dbdSDavid Matlack 
1451a3fe5dbdSDavid Matlack 		if (!sp) {
1452cb00a70bSDavid Matlack 			sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
1453a3fe5dbdSDavid Matlack 			if (!sp) {
1454a3fe5dbdSDavid Matlack 				ret = -ENOMEM;
1455e0b728b1SDavid Matlack 				trace_kvm_mmu_split_huge_page(iter.gfn,
1456e0b728b1SDavid Matlack 							      iter.old_spte,
1457e0b728b1SDavid Matlack 							      iter.level, ret);
1458a3fe5dbdSDavid Matlack 				break;
1459a3fe5dbdSDavid Matlack 			}
1460a3fe5dbdSDavid Matlack 
1461a3fe5dbdSDavid Matlack 			if (iter.yielded)
1462a3fe5dbdSDavid Matlack 				continue;
1463a3fe5dbdSDavid Matlack 		}
1464a3fe5dbdSDavid Matlack 
1465c4b33d28SDavid Matlack 		tdp_mmu_init_child_sp(sp, &iter);
1466c4b33d28SDavid Matlack 
1467cb00a70bSDavid Matlack 		if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
1468a3fe5dbdSDavid Matlack 			goto retry;
1469a3fe5dbdSDavid Matlack 
1470a3fe5dbdSDavid Matlack 		sp = NULL;
1471a3fe5dbdSDavid Matlack 	}
1472a3fe5dbdSDavid Matlack 
1473a3fe5dbdSDavid Matlack 	rcu_read_unlock();
1474a3fe5dbdSDavid Matlack 
1475a3fe5dbdSDavid Matlack 	/*
1476a3fe5dbdSDavid Matlack 	 * It's possible to exit the loop having never used the last sp if, for
1477a3fe5dbdSDavid Matlack 	 * example, a vCPU doing HugePage NX splitting wins the race and
1478a3fe5dbdSDavid Matlack 	 * installs its own sp in place of the last sp we tried to split.
1479a3fe5dbdSDavid Matlack 	 */
1480a3fe5dbdSDavid Matlack 	if (sp)
1481a3fe5dbdSDavid Matlack 		tdp_mmu_free_sp(sp);
1482a3fe5dbdSDavid Matlack 
1483a3fe5dbdSDavid Matlack 	return ret;
1484a3fe5dbdSDavid Matlack }
1485a3fe5dbdSDavid Matlack 
1486cb00a70bSDavid Matlack 
1487a3fe5dbdSDavid Matlack /*
1488a3fe5dbdSDavid Matlack  * Try to split all huge pages mapped by the TDP MMU down to the target level.
1489a3fe5dbdSDavid Matlack  */
kvm_tdp_mmu_try_split_huge_pages(struct kvm * kvm,const struct kvm_memory_slot * slot,gfn_t start,gfn_t end,int target_level,bool shared)1490a3fe5dbdSDavid Matlack void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
1491a3fe5dbdSDavid Matlack 				      const struct kvm_memory_slot *slot,
1492a3fe5dbdSDavid Matlack 				      gfn_t start, gfn_t end,
1493cb00a70bSDavid Matlack 				      int target_level, bool shared)
1494a3fe5dbdSDavid Matlack {
1495a3fe5dbdSDavid Matlack 	struct kvm_mmu_page *root;
1496a3fe5dbdSDavid Matlack 	int r = 0;
1497a3fe5dbdSDavid Matlack 
1498cb00a70bSDavid Matlack 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
1499a3fe5dbdSDavid Matlack 
15007c554d8eSPaolo Bonzini 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
1501cb00a70bSDavid Matlack 		r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
1502a3fe5dbdSDavid Matlack 		if (r) {
1503cb00a70bSDavid Matlack 			kvm_tdp_mmu_put_root(kvm, root, shared);
1504a3fe5dbdSDavid Matlack 			break;
1505a3fe5dbdSDavid Matlack 		}
1506a3fe5dbdSDavid Matlack 	}
1507a3fe5dbdSDavid Matlack }
1508a3fe5dbdSDavid Matlack 
tdp_mmu_need_write_protect(struct kvm_mmu_page * sp)1509cdf811a9SDavid Matlack static bool tdp_mmu_need_write_protect(struct kvm_mmu_page *sp)
1510cdf811a9SDavid Matlack {
1511cdf811a9SDavid Matlack 	/*
1512cdf811a9SDavid Matlack 	 * All TDP MMU shadow pages share the same role as their root, aside
1513cdf811a9SDavid Matlack 	 * from level, so it is valid to key off any shadow page to determine if
1514cdf811a9SDavid Matlack 	 * write protection is needed for an entire tree.
1515cdf811a9SDavid Matlack 	 */
1516cdf811a9SDavid Matlack 	return kvm_mmu_page_ad_need_write_protect(sp) || !kvm_ad_enabled();
1517cdf811a9SDavid Matlack }
1518cdf811a9SDavid Matlack 
1519a6a0b05dSBen Gardon /*
1520a6a0b05dSBen Gardon  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1521a6a0b05dSBen Gardon  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1522a6a0b05dSBen Gardon  * If AD bits are not enabled, this will require clearing the writable bit on
1523a6a0b05dSBen Gardon  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1524a6a0b05dSBen Gardon  * be flushed.
1525a6a0b05dSBen Gardon  */
clear_dirty_gfn_range(struct kvm * kvm,struct kvm_mmu_page * root,gfn_t start,gfn_t end)1526a6a0b05dSBen Gardon static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1527a6a0b05dSBen Gardon 			   gfn_t start, gfn_t end)
1528a6a0b05dSBen Gardon {
1529cdf811a9SDavid Matlack 	const u64 dbit = tdp_mmu_need_write_protect(root) ? PT_WRITABLE_MASK :
1530cdf811a9SDavid Matlack 							    shadow_dirty_mask;
1531a6a0b05dSBen Gardon 	struct tdp_iter iter;
1532a6a0b05dSBen Gardon 	bool spte_set = false;
1533a6a0b05dSBen Gardon 
15347cca2d0bSBen Gardon 	rcu_read_lock();
15357cca2d0bSBen Gardon 
1536a6a0b05dSBen Gardon 	tdp_root_for_each_leaf_pte(iter, root, start, end) {
153724ae4cfaSBen Gardon retry:
153824ae4cfaSBen Gardon 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
15391af4a960SBen Gardon 			continue;
15401af4a960SBen Gardon 
15413354ef5aSSean Christopherson 		if (!is_shadow_present_pte(iter.old_spte))
15423354ef5aSSean Christopherson 			continue;
15433354ef5aSSean Christopherson 
1544cdf811a9SDavid Matlack 		KVM_MMU_WARN_ON(dbit == shadow_dirty_mask &&
15455982a539SVipin Sharma 				spte_ad_need_write_protect(iter.old_spte));
15465982a539SVipin Sharma 
1547697c89beSVipin Sharma 		if (!(iter.old_spte & dbit))
1548a6a0b05dSBen Gardon 			continue;
1549a6a0b05dSBen Gardon 
1550697c89beSVipin Sharma 		if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit))
155124ae4cfaSBen Gardon 			goto retry;
15523255530aSDavid Matlack 
1553a6a0b05dSBen Gardon 		spte_set = true;
1554a6a0b05dSBen Gardon 	}
15557cca2d0bSBen Gardon 
15567cca2d0bSBen Gardon 	rcu_read_unlock();
1557a6a0b05dSBen Gardon 	return spte_set;
1558a6a0b05dSBen Gardon }
1559a6a0b05dSBen Gardon 
1560a6a0b05dSBen Gardon /*
1561a6a0b05dSBen Gardon  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1562a6a0b05dSBen Gardon  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1563a6a0b05dSBen Gardon  * If AD bits are not enabled, this will require clearing the writable bit on
1564a6a0b05dSBen Gardon  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1565a6a0b05dSBen Gardon  * be flushed.
1566a6a0b05dSBen Gardon  */
kvm_tdp_mmu_clear_dirty_slot(struct kvm * kvm,const struct kvm_memory_slot * slot)1567269e9552SHamza Mahfooz bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
1568269e9552SHamza Mahfooz 				  const struct kvm_memory_slot *slot)
1569a6a0b05dSBen Gardon {
1570a6a0b05dSBen Gardon 	struct kvm_mmu_page *root;
1571a6a0b05dSBen Gardon 	bool spte_set = false;
1572a6a0b05dSBen Gardon 
157324ae4cfaSBen Gardon 	lockdep_assert_held_read(&kvm->mmu_lock);
1574a6a0b05dSBen Gardon 
1575d62007edSSean Christopherson 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1576a6a0b05dSBen Gardon 		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1577a6a0b05dSBen Gardon 				slot->base_gfn + slot->npages);
1578a6a0b05dSBen Gardon 
1579a6a0b05dSBen Gardon 	return spte_set;
1580a6a0b05dSBen Gardon }
1581a6a0b05dSBen Gardon 
1582a6a0b05dSBen Gardon /*
1583a6a0b05dSBen Gardon  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1584a6a0b05dSBen Gardon  * set in mask, starting at gfn. The given memslot is expected to contain all
1585a6a0b05dSBen Gardon  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1586a6a0b05dSBen Gardon  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1587a6a0b05dSBen Gardon  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1588a6a0b05dSBen Gardon  */
clear_dirty_pt_masked(struct kvm * kvm,struct kvm_mmu_page * root,gfn_t gfn,unsigned long mask,bool wrprot)1589a6a0b05dSBen Gardon static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1590a6a0b05dSBen Gardon 				  gfn_t gfn, unsigned long mask, bool wrprot)
1591a6a0b05dSBen Gardon {
1592cdf811a9SDavid Matlack 	const u64 dbit = (wrprot || tdp_mmu_need_write_protect(root)) ? PT_WRITABLE_MASK :
1593697c89beSVipin Sharma 									shadow_dirty_mask;
1594a6a0b05dSBen Gardon 	struct tdp_iter iter;
1595a6a0b05dSBen Gardon 
159691303f80SLike Xu 	lockdep_assert_held_write(&kvm->mmu_lock);
159791303f80SLike Xu 
15987cca2d0bSBen Gardon 	rcu_read_lock();
15997cca2d0bSBen Gardon 
1600a6a0b05dSBen Gardon 	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1601a6a0b05dSBen Gardon 				    gfn + BITS_PER_LONG) {
1602a6a0b05dSBen Gardon 		if (!mask)
1603a6a0b05dSBen Gardon 			break;
1604a6a0b05dSBen Gardon 
1605cdf811a9SDavid Matlack 		KVM_MMU_WARN_ON(dbit == shadow_dirty_mask &&
16065982a539SVipin Sharma 				spte_ad_need_write_protect(iter.old_spte));
16075982a539SVipin Sharma 
1608a6a0b05dSBen Gardon 		if (iter.level > PG_LEVEL_4K ||
1609a6a0b05dSBen Gardon 		    !(mask & (1UL << (iter.gfn - gfn))))
1610a6a0b05dSBen Gardon 			continue;
1611a6a0b05dSBen Gardon 
1612f1b3b06aSBen Gardon 		mask &= ~(1UL << (iter.gfn - gfn));
1613f1b3b06aSBen Gardon 
1614697c89beSVipin Sharma 		if (!(iter.old_spte & dbit))
1615a6a0b05dSBen Gardon 			continue;
1616a6a0b05dSBen Gardon 
161789c313f2SVipin Sharma 		iter.old_spte = tdp_mmu_clear_spte_bits(iter.sptep,
161889c313f2SVipin Sharma 							iter.old_spte, dbit,
161989c313f2SVipin Sharma 							iter.level);
162089c313f2SVipin Sharma 
16211e0f4298SVipin Sharma 		trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level,
16221e0f4298SVipin Sharma 					       iter.old_spte,
16231e0f4298SVipin Sharma 					       iter.old_spte & ~dbit);
16241e0f4298SVipin Sharma 		kvm_set_pfn_dirty(spte_to_pfn(iter.old_spte));
1625a6a0b05dSBen Gardon 	}
16267cca2d0bSBen Gardon 
16277cca2d0bSBen Gardon 	rcu_read_unlock();
1628a6a0b05dSBen Gardon }
1629a6a0b05dSBen Gardon 
1630a6a0b05dSBen Gardon /*
1631a6a0b05dSBen Gardon  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1632a6a0b05dSBen Gardon  * set in mask, starting at gfn. The given memslot is expected to contain all
1633a6a0b05dSBen Gardon  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1634a6a0b05dSBen Gardon  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1635a6a0b05dSBen Gardon  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1636a6a0b05dSBen Gardon  */
kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm * kvm,struct kvm_memory_slot * slot,gfn_t gfn,unsigned long mask,bool wrprot)1637a6a0b05dSBen Gardon void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1638a6a0b05dSBen Gardon 				       struct kvm_memory_slot *slot,
1639a6a0b05dSBen Gardon 				       gfn_t gfn, unsigned long mask,
1640a6a0b05dSBen Gardon 				       bool wrprot)
1641a6a0b05dSBen Gardon {
1642a6a0b05dSBen Gardon 	struct kvm_mmu_page *root;
1643a6a0b05dSBen Gardon 
1644a3f15bdaSSean Christopherson 	for_each_tdp_mmu_root(kvm, root, slot->as_id)
1645a6a0b05dSBen Gardon 		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1646a6a0b05dSBen Gardon }
1647a6a0b05dSBen Gardon 
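/*
 * Illustrative sketch only (not part of the kernel source): the mask
 * convention used by kvm_tdp_mmu_clear_dirty_pt_masked() above, i.e. bit N
 * of @mask covers @gfn + N.  This hypothetical helper clears the dirty state
 * of a single gfn by anchoring a one-bit mask at the start of its
 * BITS_PER_LONG-sized chunk, mirroring how the dirty-log code builds masks.
 */
static void example_clear_dirty_for_gfn(struct kvm *kvm,
					struct kvm_memory_slot *slot,
					gfn_t gfn, bool wrprot)
{
	gfn_t base = gfn & ~(gfn_t)(BITS_PER_LONG - 1);
	unsigned long mask = 1UL << (gfn - base);

	/* The helper asserts that mmu_lock is held for write. */
	write_lock(&kvm->mmu_lock);
	kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, base, mask, wrprot);
	write_unlock(&kvm->mmu_lock);
}
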
zap_collapsible_spte_range(struct kvm * kvm,struct kvm_mmu_page * root,const struct kvm_memory_slot * slot)16484b85c921SSean Christopherson static void zap_collapsible_spte_range(struct kvm *kvm,
164914881998SBen Gardon 				       struct kvm_mmu_page *root,
16504b85c921SSean Christopherson 				       const struct kvm_memory_slot *slot)
165114881998SBen Gardon {
16529eba50f8SSean Christopherson 	gfn_t start = slot->base_gfn;
16539eba50f8SSean Christopherson 	gfn_t end = start + slot->npages;
165414881998SBen Gardon 	struct tdp_iter iter;
16555ba7c4c6SBen Gardon 	int max_mapping_level;
165614881998SBen Gardon 
16577cca2d0bSBen Gardon 	rcu_read_lock();
16587cca2d0bSBen Gardon 
165985f44f8cSSean Christopherson 	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) {
166085f44f8cSSean Christopherson retry:
16614b85c921SSean Christopherson 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
16621af4a960SBen Gardon 			continue;
16631af4a960SBen Gardon 
166485f44f8cSSean Christopherson 		if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
166585f44f8cSSean Christopherson 		    !is_shadow_present_pte(iter.old_spte))
166685f44f8cSSean Christopherson 			continue;
166785f44f8cSSean Christopherson 
166885f44f8cSSean Christopherson 		/*
166985f44f8cSSean Christopherson 		 * Don't zap leaf SPTEs; if a leaf SPTE could be replaced with
167085f44f8cSSean Christopherson 		 * a large page size, then its parent would have been zapped
167185f44f8cSSean Christopherson 		 * instead of stepping down.
167285f44f8cSSean Christopherson 		 */
167385f44f8cSSean Christopherson 		if (is_last_spte(iter.old_spte, iter.level))
167485f44f8cSSean Christopherson 			continue;
167585f44f8cSSean Christopherson 
167685f44f8cSSean Christopherson 		/*
167785f44f8cSSean Christopherson 		 * If iter.gfn resides outside of the slot, i.e. the page for
167885f44f8cSSean Christopherson 		 * the current level overlaps but is not contained by the slot,
167985f44f8cSSean Christopherson 		 * then the SPTE can't be made huge.  More importantly, trying
168085f44f8cSSean Christopherson 		 * to query that info from slot->arch.lpage_info will cause an
168185f44f8cSSean Christopherson 		 * out-of-bounds access.
168285f44f8cSSean Christopherson 		 */
168385f44f8cSSean Christopherson 		if (iter.gfn < start || iter.gfn >= end)
168414881998SBen Gardon 			continue;
168514881998SBen Gardon 
16865ba7c4c6SBen Gardon 		max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
1687a8ac499bSSean Christopherson 							      iter.gfn, PG_LEVEL_NUM);
168885f44f8cSSean Christopherson 		if (max_mapping_level < iter.level)
16895ba7c4c6SBen Gardon 			continue;
16905ba7c4c6SBen Gardon 
16914b85c921SSean Christopherson 		/* Note, a successful atomic zap also does a remote TLB flush. */
169285f44f8cSSean Christopherson 		if (tdp_mmu_zap_spte_atomic(kvm, &iter))
169385f44f8cSSean Christopherson 			goto retry;
16942db6f772SBen Gardon 	}
169514881998SBen Gardon 
16967cca2d0bSBen Gardon 	rcu_read_unlock();
169714881998SBen Gardon }
169814881998SBen Gardon 
169914881998SBen Gardon /*
170085f44f8cSSean Christopherson  * Zap non-leaf SPTEs (and free their associated page tables) which could
170185f44f8cSSean Christopherson  * be replaced by huge pages, for GFNs within the slot.
170214881998SBen Gardon  */
kvm_tdp_mmu_zap_collapsible_sptes(struct kvm * kvm,const struct kvm_memory_slot * slot)17034b85c921SSean Christopherson void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
17044b85c921SSean Christopherson 				       const struct kvm_memory_slot *slot)
170514881998SBen Gardon {
170614881998SBen Gardon 	struct kvm_mmu_page *root;
170714881998SBen Gardon 
17082db6f772SBen Gardon 	lockdep_assert_held_read(&kvm->mmu_lock);
170914881998SBen Gardon 
1710d62007edSSean Christopherson 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
17114b85c921SSean Christopherson 		zap_collapsible_spte_range(kvm, root, slot);
171214881998SBen Gardon }
171346044f72SBen Gardon 
171446044f72SBen Gardon /*
171546044f72SBen Gardon  * Removes write access on the last level SPTE mapping this GFN and unsets the
17165fc3424fSSean Christopherson  * MMU-writable bit to ensure future writes continue to be intercepted.
171746044f72SBen Gardon  * Returns true if an SPTE was set and a TLB flush is needed.
171846044f72SBen Gardon  */
write_protect_gfn(struct kvm * kvm,struct kvm_mmu_page * root,gfn_t gfn,int min_level)171946044f72SBen Gardon static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
17203ad93562SKeqian Zhu 			      gfn_t gfn, int min_level)
172146044f72SBen Gardon {
172246044f72SBen Gardon 	struct tdp_iter iter;
172346044f72SBen Gardon 	u64 new_spte;
172446044f72SBen Gardon 	bool spte_set = false;
172546044f72SBen Gardon 
17263ad93562SKeqian Zhu 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
17273ad93562SKeqian Zhu 
17287cca2d0bSBen Gardon 	rcu_read_lock();
17297cca2d0bSBen Gardon 
173077aa6075SDavid Matlack 	for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
17313ad93562SKeqian Zhu 		if (!is_shadow_present_pte(iter.old_spte) ||
17323ad93562SKeqian Zhu 		    !is_last_spte(iter.old_spte, iter.level))
17333ad93562SKeqian Zhu 			continue;
17343ad93562SKeqian Zhu 
173546044f72SBen Gardon 		new_spte = iter.old_spte &
17365fc3424fSSean Christopherson 			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
173746044f72SBen Gardon 
17387c8a4742SDavid Matlack 		if (new_spte == iter.old_spte)
17397c8a4742SDavid Matlack 			break;
17407c8a4742SDavid Matlack 
17410b7cc254SVipin Sharma 		tdp_mmu_iter_set_spte(kvm, &iter, new_spte);
174246044f72SBen Gardon 		spte_set = true;
174346044f72SBen Gardon 	}
174446044f72SBen Gardon 
17457cca2d0bSBen Gardon 	rcu_read_unlock();
17467cca2d0bSBen Gardon 
174746044f72SBen Gardon 	return spte_set;
174846044f72SBen Gardon }
174946044f72SBen Gardon 
175046044f72SBen Gardon /*
175146044f72SBen Gardon  * Removes write access on the last level SPTE mapping this GFN and unsets the
17525fc3424fSSean Christopherson  * MMU-writable bit to ensure future writes continue to be intercepted.
175346044f72SBen Gardon  * Returns true if an SPTE was set and a TLB flush is needed.
175446044f72SBen Gardon  */
kvm_tdp_mmu_write_protect_gfn(struct kvm * kvm,struct kvm_memory_slot * slot,gfn_t gfn,int min_level)175546044f72SBen Gardon bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
17563ad93562SKeqian Zhu 				   struct kvm_memory_slot *slot, gfn_t gfn,
17573ad93562SKeqian Zhu 				   int min_level)
175846044f72SBen Gardon {
175946044f72SBen Gardon 	struct kvm_mmu_page *root;
176046044f72SBen Gardon 	bool spte_set = false;
176146044f72SBen Gardon 
1762531810caSBen Gardon 	lockdep_assert_held_write(&kvm->mmu_lock);
1763a3f15bdaSSean Christopherson 	for_each_tdp_mmu_root(kvm, root, slot->as_id)
17643ad93562SKeqian Zhu 		spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
1765a3f15bdaSSean Christopherson 
176646044f72SBen Gardon 	return spte_set;
176746044f72SBen Gardon }
176846044f72SBen Gardon 
176995fb5b02SBen Gardon /*
177095fb5b02SBen Gardon  * Return the level of the lowest level SPTE added to sptes.
177195fb5b02SBen Gardon  * That SPTE may be non-present.
1772c5c8c7c5SDavid Matlack  *
1773c5c8c7c5SDavid Matlack  * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
177495fb5b02SBen Gardon  */
kvm_tdp_mmu_get_walk(struct kvm_vcpu * vcpu,u64 addr,u64 * sptes,int * root_level)177539b4d43eSSean Christopherson int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
177639b4d43eSSean Christopherson 			 int *root_level)
177795fb5b02SBen Gardon {
177895fb5b02SBen Gardon 	struct tdp_iter iter;
177995fb5b02SBen Gardon 	struct kvm_mmu *mmu = vcpu->arch.mmu;
178095fb5b02SBen Gardon 	gfn_t gfn = addr >> PAGE_SHIFT;
17812aa07893SSean Christopherson 	int leaf = -1;
178295fb5b02SBen Gardon 
1783a972e29cSPaolo Bonzini 	*root_level = vcpu->arch.mmu->root_role.level;
178495fb5b02SBen Gardon 
178595fb5b02SBen Gardon 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
178695fb5b02SBen Gardon 		leaf = iter.level;
1787dde81f94SSean Christopherson 		sptes[leaf] = iter.old_spte;
178895fb5b02SBen Gardon 	}
178995fb5b02SBen Gardon 
179095fb5b02SBen Gardon 	return leaf;
179195fb5b02SBen Gardon }
17926e8eb206SDavid Matlack 
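/*
 * Illustrative sketch only (not part of the kernel source): a hypothetical
 * caller of kvm_tdp_mmu_get_walk() honoring the lockless-walk contract noted
 * above.  The sptes[] sizing assumes PT64_ROOT_MAX_LEVEL bounds the root
 * level, as in the existing shadow-walk callers.
 */
static int example_get_leaf_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *spte)
{
	u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
	int root_level, leaf;

	kvm_tdp_mmu_walk_lockless_begin();
	leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root_level);
	kvm_tdp_mmu_walk_lockless_end();

	/* A negative return means no walk was possible, e.g. no valid root. */
	if (leaf < 0)
		return leaf;

	*spte = sptes[leaf];
	return leaf;
}
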
17936e8eb206SDavid Matlack /*
17946e8eb206SDavid Matlack  * Returns the last level spte pointer of the shadow page walk for the given
17956e8eb206SDavid Matlack  * gpa, and sets *spte to the spte value. This spte may be non-present. If no
17966e8eb206SDavid Matlack  * walk could be performed, returns NULL and *spte does not contain valid data.
17976e8eb206SDavid Matlack  *
17986e8eb206SDavid Matlack  * Contract:
17996e8eb206SDavid Matlack  *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
18006e8eb206SDavid Matlack  *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
18016e8eb206SDavid Matlack  *
18026e8eb206SDavid Matlack  * WARNING: This function is only intended to be called during fast_page_fault.
18036e8eb206SDavid Matlack  */
kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu * vcpu,u64 addr,u64 * spte)18046e8eb206SDavid Matlack u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
18056e8eb206SDavid Matlack 					u64 *spte)
18066e8eb206SDavid Matlack {
18076e8eb206SDavid Matlack 	struct tdp_iter iter;
18086e8eb206SDavid Matlack 	struct kvm_mmu *mmu = vcpu->arch.mmu;
18096e8eb206SDavid Matlack 	gfn_t gfn = addr >> PAGE_SHIFT;
18106e8eb206SDavid Matlack 	tdp_ptep_t sptep = NULL;
18116e8eb206SDavid Matlack 
18126e8eb206SDavid Matlack 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
18136e8eb206SDavid Matlack 		*spte = iter.old_spte;
18146e8eb206SDavid Matlack 		sptep = iter.sptep;
18156e8eb206SDavid Matlack 	}
18166e8eb206SDavid Matlack 
18176e8eb206SDavid Matlack 	/*
18186e8eb206SDavid Matlack 	 * Perform the rcu_dereference to get the raw spte pointer value since
18196e8eb206SDavid Matlack 	 * we are passing it up to fast_page_fault, which is shared with the
18206e8eb206SDavid Matlack 	 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
18216e8eb206SDavid Matlack 	 * annotation.
18226e8eb206SDavid Matlack 	 *
18236e8eb206SDavid Matlack 	 * This is safe since fast_page_fault obeys the contracts of this
18246e8eb206SDavid Matlack 	 * function as well as all TDP MMU contracts around modifying SPTEs
18256e8eb206SDavid Matlack 	 * outside of mmu_lock.
18266e8eb206SDavid Matlack 	 */
18276e8eb206SDavid Matlack 	return rcu_dereference(sptep);
18286e8eb206SDavid Matlack }
1829