xref: /openbmc/linux/arch/x86/kvm/mmu/tdp_mmu.c (revision c5f2d564)
1fe5db27dSBen Gardon // SPDX-License-Identifier: GPL-2.0
28d20bd63SSean Christopherson #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
3fe5db27dSBen Gardon 
402c00b3aSBen Gardon #include "mmu.h"
502c00b3aSBen Gardon #include "mmu_internal.h"
6bb18842eSBen Gardon #include "mmutrace.h"
72f2fad08SBen Gardon #include "tdp_iter.h"
8fe5db27dSBen Gardon #include "tdp_mmu.h"
902c00b3aSBen Gardon #include "spte.h"
10fe5db27dSBen Gardon 
119a77daacSBen Gardon #include <asm/cmpxchg.h>
1233dd3574SBen Gardon #include <trace/events/kvm.h>
1333dd3574SBen Gardon 
14fe5db27dSBen Gardon /* Initializes the TDP MMU for the VM, if enabled. */
15a1a39128SPaolo Bonzini int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
16fe5db27dSBen Gardon {
17a1a39128SPaolo Bonzini 	struct workqueue_struct *wq;
18a1a39128SPaolo Bonzini 
19a1a39128SPaolo Bonzini 	wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);
20a1a39128SPaolo Bonzini 	if (!wq)
21a1a39128SPaolo Bonzini 		return -ENOMEM;
22fe5db27dSBen Gardon 
2302c00b3aSBen Gardon 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
249a77daacSBen Gardon 	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
25a1a39128SPaolo Bonzini 	kvm->arch.tdp_mmu_zap_wq = wq;
26a1a39128SPaolo Bonzini 	return 1;
27fe5db27dSBen Gardon }
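/*
 * Note on the workqueue allocated above: it backs tdp_mmu_schedule_zap_root(),
 * which queues tdp_mmu_zap_root_work() items when roots are invalidated, and
 * it is flushed and destroyed in kvm_mmu_uninit_tdp_mmu() below.
 */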
28fe5db27dSBen Gardon 
29226b8c8fSSean Christopherson /* Arbitrarily returns true so that this may be used in if statements. */
30226b8c8fSSean Christopherson static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
316103bc07SBen Gardon 							     bool shared)
326103bc07SBen Gardon {
336103bc07SBen Gardon 	if (shared)
346103bc07SBen Gardon 		lockdep_assert_held_read(&kvm->mmu_lock);
356103bc07SBen Gardon 	else
366103bc07SBen Gardon 		lockdep_assert_held_write(&kvm->mmu_lock);
37226b8c8fSSean Christopherson 
38226b8c8fSSean Christopherson 	return true;
396103bc07SBen Gardon }
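/*
 * For instance, the root-iteration macros below splice this assertion into
 * their filter condition, relying on the unconditional "return true":
 *
 *	if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) &&
 *	    kvm_mmu_page_as_id(_root) != _as_id) {
 *	} else
 */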
406103bc07SBen Gardon 
41fe5db27dSBen Gardon void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
42fe5db27dSBen Gardon {
43edbdb43fSSean Christopherson 	/*
44edbdb43fSSean Christopherson 	 * Invalidate all roots, which, besides the obvious, schedules all roots
45edbdb43fSSean Christopherson 	 * for zapping and thus puts the TDP MMU's reference to each root, i.e.
46edbdb43fSSean Christopherson 	 * ultimately frees all roots.
47edbdb43fSSean Christopherson 	 */
48edbdb43fSSean Christopherson 	kvm_tdp_mmu_invalidate_all_roots(kvm);
49edbdb43fSSean Christopherson 
50edbdb43fSSean Christopherson 	/*
51edbdb43fSSean Christopherson 	 * Destroying a workqueue also first flushes the workqueue, i.e. no
52edbdb43fSSean Christopherson 	 * need to invoke kvm_tdp_mmu_zap_invalidated_roots().
53edbdb43fSSean Christopherson 	 */
5422b94c4bSPaolo Bonzini 	destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
5522b94c4bSPaolo Bonzini 
56d25ceb92SSean Christopherson 	WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
5702c00b3aSBen Gardon 	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
587cca2d0bSBen Gardon 
597cca2d0bSBen Gardon 	/*
607cca2d0bSBen Gardon 	 * Ensure that all the outstanding RCU callbacks to free shadow pages
6122b94c4bSPaolo Bonzini 	 * can run before the VM is torn down.  Work items on tdp_mmu_zap_wq
6222b94c4bSPaolo Bonzini 	 * can call kvm_tdp_mmu_put_root and create new callbacks.
637cca2d0bSBen Gardon 	 */
647cca2d0bSBen Gardon 	rcu_barrier();
6502c00b3aSBen Gardon }
6602c00b3aSBen Gardon 
672bdb3d84SBen Gardon static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
68a889ea54SBen Gardon {
692bdb3d84SBen Gardon 	free_page((unsigned long)sp->spt);
702bdb3d84SBen Gardon 	kmem_cache_free(mmu_page_header_cache, sp);
71a889ea54SBen Gardon }
72a889ea54SBen Gardon 
73c0e64238SBen Gardon /*
74c0e64238SBen Gardon  * This is called through call_rcu in order to free TDP page table memory
75c0e64238SBen Gardon  * safely with respect to other kernel threads that may be operating on
76c0e64238SBen Gardon  * the memory.
77c0e64238SBen Gardon  * By only accessing TDP MMU page table memory in an RCU read critical
78c0e64238SBen Gardon  * section, and freeing it after a grace period, lockless access to that
79c0e64238SBen Gardon  * memory won't use it after it is freed.
80c0e64238SBen Gardon  */
81c0e64238SBen Gardon static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
82a889ea54SBen Gardon {
83c0e64238SBen Gardon 	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
84c0e64238SBen Gardon 					       rcu_head);
85a889ea54SBen Gardon 
86c0e64238SBen Gardon 	tdp_mmu_free_sp(sp);
87a889ea54SBen Gardon }
88a889ea54SBen Gardon 
89e2b5b21dSSean Christopherson static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
90e2b5b21dSSean Christopherson 			     bool shared);
91e2b5b21dSSean Christopherson 
9222b94c4bSPaolo Bonzini static void tdp_mmu_zap_root_work(struct work_struct *work)
9322b94c4bSPaolo Bonzini {
9422b94c4bSPaolo Bonzini 	struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page,
9522b94c4bSPaolo Bonzini 						 tdp_mmu_async_work);
9622b94c4bSPaolo Bonzini 	struct kvm *kvm = root->tdp_mmu_async_data;
9722b94c4bSPaolo Bonzini 
9822b94c4bSPaolo Bonzini 	read_lock(&kvm->mmu_lock);
9922b94c4bSPaolo Bonzini 
10022b94c4bSPaolo Bonzini 	/*
10122b94c4bSPaolo Bonzini 	 * A TLB flush is not necessary as KVM performs a local TLB flush when
10222b94c4bSPaolo Bonzini 	 * allocating a new root (see kvm_mmu_load()), and when migrating a vCPU
10322b94c4bSPaolo Bonzini 	 * to a different pCPU.  Note, the local TLB flush on reuse also
10422b94c4bSPaolo Bonzini 	 * invalidates any paging-structure-cache entries, i.e. TLB entries for
10522b94c4bSPaolo Bonzini 	 * intermediate paging structures, that may be zapped, as such entries
10622b94c4bSPaolo Bonzini 	 * are associated with the ASID on both VMX and SVM.
10722b94c4bSPaolo Bonzini 	 */
10822b94c4bSPaolo Bonzini 	tdp_mmu_zap_root(kvm, root, true);
10922b94c4bSPaolo Bonzini 
11022b94c4bSPaolo Bonzini 	/*
11122b94c4bSPaolo Bonzini 	 * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for
11222b94c4bSPaolo Bonzini 	 * avoiding an infinite loop.  By design, the root is reachable while
11322b94c4bSPaolo Bonzini 	 * it's being asynchronously zapped, thus a different task can put its
11422b94c4bSPaolo Bonzini 	 * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an
11522b94c4bSPaolo Bonzini 	 * asynchronously zapped root is unavoidable.
11622b94c4bSPaolo Bonzini 	 */
11722b94c4bSPaolo Bonzini 	kvm_tdp_mmu_put_root(kvm, root, true);
11822b94c4bSPaolo Bonzini 
11922b94c4bSPaolo Bonzini 	read_unlock(&kvm->mmu_lock);
12022b94c4bSPaolo Bonzini }
12122b94c4bSPaolo Bonzini 
12222b94c4bSPaolo Bonzini static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root)
12322b94c4bSPaolo Bonzini {
12422b94c4bSPaolo Bonzini 	root->tdp_mmu_async_data = kvm;
12522b94c4bSPaolo Bonzini 	INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work);
12622b94c4bSPaolo Bonzini 	queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
12722b94c4bSPaolo Bonzini }
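/*
 * Typical flow (see kvm_tdp_mmu_invalidate_all_roots()): a root is marked
 * invalid under mmu_lock and then passed to this helper; the queued work zaps
 * the root's SPTEs and drops the TDP MMU's reference via kvm_tdp_mmu_put_root().
 */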
12822b94c4bSPaolo Bonzini 
1296103bc07SBen Gardon void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
1306103bc07SBen Gardon 			  bool shared)
1312bdb3d84SBen Gardon {
1326103bc07SBen Gardon 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
1332bdb3d84SBen Gardon 
13411cccf5cSBen Gardon 	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
1352bdb3d84SBen Gardon 		return;
1362bdb3d84SBen Gardon 
1378351779cSPaolo Bonzini 	/*
138edbdb43fSSean Christopherson 	 * The TDP MMU itself holds a reference to each root until the root is
139edbdb43fSSean Christopherson 	 * explicitly invalidated, i.e. the final reference should never be
140edbdb43fSSean Christopherson 	 * put for a valid root.
1418351779cSPaolo Bonzini 	 */
142edbdb43fSSean Christopherson 	KVM_BUG_ON(!is_tdp_mmu_page(root) || !root->role.invalid, kvm);
1438351779cSPaolo Bonzini 
144c0e64238SBen Gardon 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
145c0e64238SBen Gardon 	list_del_rcu(&root->link);
146c0e64238SBen Gardon 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
147c0e64238SBen Gardon 	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
148a889ea54SBen Gardon }
149a889ea54SBen Gardon 
150cfc10997SBen Gardon /*
151d62007edSSean Christopherson  * Returns the next root after @prev_root (or the first root if @prev_root is
152d62007edSSean Christopherson  * NULL).  A reference to the returned root is acquired, and the reference to
153d62007edSSean Christopherson  * @prev_root is released (the caller obviously must hold a reference to
154d62007edSSean Christopherson  * @prev_root if it's non-NULL).
155d62007edSSean Christopherson  *
156d62007edSSean Christopherson  * If @only_valid is true, invalid roots are skipped.
157d62007edSSean Christopherson  *
158d62007edSSean Christopherson  * Returns NULL if the end of tdp_mmu_roots was reached.
159cfc10997SBen Gardon  */
160cfc10997SBen Gardon static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
1616103bc07SBen Gardon 					      struct kvm_mmu_page *prev_root,
162d62007edSSean Christopherson 					      bool shared, bool only_valid)
163a889ea54SBen Gardon {
164a889ea54SBen Gardon 	struct kvm_mmu_page *next_root;
165a889ea54SBen Gardon 
166c0e64238SBen Gardon 	rcu_read_lock();
167c0e64238SBen Gardon 
168cfc10997SBen Gardon 	if (prev_root)
169c0e64238SBen Gardon 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
170c0e64238SBen Gardon 						  &prev_root->link,
171c0e64238SBen Gardon 						  typeof(*prev_root), link);
172cfc10997SBen Gardon 	else
173c0e64238SBen Gardon 		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
174cfc10997SBen Gardon 						   typeof(*next_root), link);
175cfc10997SBen Gardon 
17604dc4e6cSSean Christopherson 	while (next_root) {
177d62007edSSean Christopherson 		if ((!only_valid || !next_root->role.invalid) &&
178ad6d6b94SJinrong Liang 		    kvm_tdp_mmu_get_root(next_root))
17904dc4e6cSSean Christopherson 			break;
18004dc4e6cSSean Christopherson 
181c0e64238SBen Gardon 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
182c0e64238SBen Gardon 				&next_root->link, typeof(*next_root), link);
18304dc4e6cSSean Christopherson 	}
184fb101293SBen Gardon 
185c0e64238SBen Gardon 	rcu_read_unlock();
186cfc10997SBen Gardon 
187cfc10997SBen Gardon 	if (prev_root)
1886103bc07SBen Gardon 		kvm_tdp_mmu_put_root(kvm, prev_root, shared);
189cfc10997SBen Gardon 
190a889ea54SBen Gardon 	return next_root;
191a889ea54SBen Gardon }
192a889ea54SBen Gardon 
193a889ea54SBen Gardon /*
194a889ea54SBen Gardon  * Note: this iterator gets and puts references to the roots it iterates over.
195a889ea54SBen Gardon  * This makes it safe to release the MMU lock and yield within the loop, but
196a889ea54SBen Gardon  * if exiting the loop early, the caller must drop the reference to the most
197a889ea54SBen Gardon  * recent root. (Unless keeping a live reference is desirable.)
1986103bc07SBen Gardon  *
1996103bc07SBen Gardon  * If shared is set, this function is operating under the MMU lock in read
2006103bc07SBen Gardon  * mode. In the unlikely event that this thread must free a root, the lock
2016103bc07SBen Gardon  * will be temporarily dropped and reacquired in write mode.
202a889ea54SBen Gardon  */
203d62007edSSean Christopherson #define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
204d62007edSSean Christopherson 	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid);	\
205cfc10997SBen Gardon 	     _root;								\
206d62007edSSean Christopherson 	     _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid))	\
207614f6970SPaolo Bonzini 		if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) &&		\
208614f6970SPaolo Bonzini 		    kvm_mmu_page_as_id(_root) != _as_id) {			\
209a3f15bdaSSean Christopherson 		} else
210a889ea54SBen Gardon 
211d62007edSSean Christopherson #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
212d62007edSSean Christopherson 	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
213d62007edSSean Christopherson 
214614f6970SPaolo Bonzini #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)			\
215614f6970SPaolo Bonzini 	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false)
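/*
 * Usage sketch, mirroring kvm_tdp_mmu_zap_leafs() below; the iterator puts the
 * reference to each root as it advances, so a root must not be touched after
 * the loop has moved past it:
 *
 *	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
 *		flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush);
 */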
216d62007edSSean Christopherson 
217226b8c8fSSean Christopherson /*
218226b8c8fSSean Christopherson  * Iterate over all TDP MMU roots.  Requires that mmu_lock be held for write,
219226b8c8fSSean Christopherson  * the implication being that any flow that holds mmu_lock for read is
220226b8c8fSSean Christopherson  * inherently yield-friendly and should use the yield-safe variant above.
221226b8c8fSSean Christopherson  * Holding mmu_lock for write obviates the need for RCU protection as the list
222226b8c8fSSean Christopherson  * is guaranteed to be stable.
223226b8c8fSSean Christopherson  */
224a3f15bdaSSean Christopherson #define for_each_tdp_mmu_root(_kvm, _root, _as_id)			\
225226b8c8fSSean Christopherson 	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)	\
226226b8c8fSSean Christopherson 		if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&	\
227226b8c8fSSean Christopherson 		    kvm_mmu_page_as_id(_root) != _as_id) {		\
228a3f15bdaSSean Christopherson 		} else
22902c00b3aSBen Gardon 
230a82070b6SDavid Matlack static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
23102c00b3aSBen Gardon {
23202c00b3aSBen Gardon 	struct kvm_mmu_page *sp;
23302c00b3aSBen Gardon 
23402c00b3aSBen Gardon 	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
23502c00b3aSBen Gardon 	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
236a82070b6SDavid Matlack 
237a82070b6SDavid Matlack 	return sp;
238a82070b6SDavid Matlack }
239a82070b6SDavid Matlack 
240c10743a1SSean Christopherson static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
241c10743a1SSean Christopherson 			    gfn_t gfn, union kvm_mmu_page_role role)
242a82070b6SDavid Matlack {
24355c510e2SSean Christopherson 	INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);
244428e9216SSean Christopherson 
24502c00b3aSBen Gardon 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
24602c00b3aSBen Gardon 
247a3aca4deSDavid Matlack 	sp->role = role;
24802c00b3aSBen Gardon 	sp->gfn = gfn;
249c10743a1SSean Christopherson 	sp->ptep = sptep;
25002c00b3aSBen Gardon 	sp->tdp_mmu_page = true;
25102c00b3aSBen Gardon 
25233dd3574SBen Gardon 	trace_kvm_mmu_get_page(sp, true);
25302c00b3aSBen Gardon }
25402c00b3aSBen Gardon 
255a82070b6SDavid Matlack static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
256a3aca4deSDavid Matlack 				  struct tdp_iter *iter)
257a3aca4deSDavid Matlack {
258a3aca4deSDavid Matlack 	struct kvm_mmu_page *parent_sp;
259a3aca4deSDavid Matlack 	union kvm_mmu_page_role role;
260a3aca4deSDavid Matlack 
261a3aca4deSDavid Matlack 	parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
262a3aca4deSDavid Matlack 
263a3aca4deSDavid Matlack 	role = parent_sp->role;
264a3aca4deSDavid Matlack 	role.level--;
265a3aca4deSDavid Matlack 
266c10743a1SSean Christopherson 	tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
267a3aca4deSDavid Matlack }
268a3aca4deSDavid Matlack 
2696e6ec584SSean Christopherson hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
27002c00b3aSBen Gardon {
2717a458f0eSPaolo Bonzini 	union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
27202c00b3aSBen Gardon 	struct kvm *kvm = vcpu->kvm;
27302c00b3aSBen Gardon 	struct kvm_mmu_page *root;
27402c00b3aSBen Gardon 
2756e6ec584SSean Christopherson 	lockdep_assert_held_write(&kvm->mmu_lock);
27602c00b3aSBen Gardon 
27704dc4e6cSSean Christopherson 	/*
27804dc4e6cSSean Christopherson 	 * Check for an existing root before allocating a new one.  Note, the
27904dc4e6cSSean Christopherson 	 * role check prevents consuming an invalid root.
28004dc4e6cSSean Christopherson 	 */
281a3f15bdaSSean Christopherson 	for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
282fb101293SBen Gardon 		if (root->role.word == role.word &&
283ad6d6b94SJinrong Liang 		    kvm_tdp_mmu_get_root(root))
2846e6ec584SSean Christopherson 			goto out;
28502c00b3aSBen Gardon 	}
28602c00b3aSBen Gardon 
287a82070b6SDavid Matlack 	root = tdp_mmu_alloc_sp(vcpu);
288c10743a1SSean Christopherson 	tdp_mmu_init_sp(root, NULL, 0, role);
289a82070b6SDavid Matlack 
290edbdb43fSSean Christopherson 	/*
291edbdb43fSSean Christopherson 	 * TDP MMU roots are kept until they are explicitly invalidated, either
292edbdb43fSSean Christopherson 	 * by a memslot update or by the destruction of the VM.  Initialize the
293edbdb43fSSean Christopherson 	 * refcount to two; one reference for the vCPU, and one reference for
294edbdb43fSSean Christopherson 	 * the TDP MMU itself, which is held until the root is invalidated and
295edbdb43fSSean Christopherson 	 * is ultimately put by tdp_mmu_zap_root_work().
296edbdb43fSSean Christopherson 	 */
297edbdb43fSSean Christopherson 	refcount_set(&root->tdp_mmu_root_count, 2);
29802c00b3aSBen Gardon 
299c0e64238SBen Gardon 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
300c0e64238SBen Gardon 	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
301c0e64238SBen Gardon 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
30202c00b3aSBen Gardon 
3036e6ec584SSean Christopherson out:
30402c00b3aSBen Gardon 	return __pa(root->spt);
305fe5db27dSBen Gardon }
3062f2fad08SBen Gardon 
3072f2fad08SBen Gardon static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
3089a77daacSBen Gardon 				u64 old_spte, u64 new_spte, int level,
3099a77daacSBen Gardon 				bool shared);
3102f2fad08SBen Gardon 
31143a063caSYosry Ahmed static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
31243a063caSYosry Ahmed {
31343a063caSYosry Ahmed 	kvm_account_pgtable_pages((void *)sp->spt, +1);
314d25ceb92SSean Christopherson 	atomic64_inc(&kvm->arch.tdp_mmu_pages);
31543a063caSYosry Ahmed }
31643a063caSYosry Ahmed 
31743a063caSYosry Ahmed static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
31843a063caSYosry Ahmed {
31943a063caSYosry Ahmed 	kvm_account_pgtable_pages((void *)sp->spt, -1);
320d25ceb92SSean Christopherson 	atomic64_dec(&kvm->arch.tdp_mmu_pages);
32143a063caSYosry Ahmed }
32243a063caSYosry Ahmed 
3232f2fad08SBen Gardon /**
324c298a30cSDavid Matlack  * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
325a9442f59SBen Gardon  *
326a9442f59SBen Gardon  * @kvm: kvm instance
327a9442f59SBen Gardon  * @sp: the page to be removed
3289a77daacSBen Gardon  * @shared: This operation may not be running under the exclusive use of
3299a77daacSBen Gardon  *	    the MMU lock and the operation must synchronize with other
3309a77daacSBen Gardon  *	    threads that might be adding or removing pages.
331a9442f59SBen Gardon  */
332c298a30cSDavid Matlack static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
3339a77daacSBen Gardon 			      bool shared)
334a9442f59SBen Gardon {
33543a063caSYosry Ahmed 	tdp_unaccount_mmu_page(kvm, sp);
336d25ceb92SSean Christopherson 
337d25ceb92SSean Christopherson 	if (!sp->nx_huge_page_disallowed)
338d25ceb92SSean Christopherson 		return;
339d25ceb92SSean Christopherson 
3409a77daacSBen Gardon 	if (shared)
3419a77daacSBen Gardon 		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
3429a77daacSBen Gardon 	else
343a9442f59SBen Gardon 		lockdep_assert_held_write(&kvm->mmu_lock);
344a9442f59SBen Gardon 
34561f94478SSean Christopherson 	sp->nx_huge_page_disallowed = false;
34661f94478SSean Christopherson 	untrack_possible_nx_huge_page(kvm, sp);
3479a77daacSBen Gardon 
3489a77daacSBen Gardon 	if (shared)
3499a77daacSBen Gardon 		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
350a9442f59SBen Gardon }
351a9442f59SBen Gardon 
352a9442f59SBen Gardon /**
3530f53dfa3SDavid Matlack  * handle_removed_pt() - handle a page table removed from the TDP structure
354a066e61fSBen Gardon  *
355a066e61fSBen Gardon  * @kvm: kvm instance
356a066e61fSBen Gardon  * @pt: the page removed from the paging structure
3579a77daacSBen Gardon  * @shared: This operation may not be running under the exclusive use
3589a77daacSBen Gardon  *	    of the MMU lock and the operation must synchronize with other
3599a77daacSBen Gardon  *	    threads that might be modifying SPTEs.
360a066e61fSBen Gardon  *
361a066e61fSBen Gardon  * Given a page table that has been removed from the TDP paging structure,
362a066e61fSBen Gardon  * iterates through the page table to clear SPTEs and free child page tables.
36370fb3e41SBen Gardon  *
36470fb3e41SBen Gardon  * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
36570fb3e41SBen Gardon  * protection. Since this thread removed it from the paging structure,
36670fb3e41SBen Gardon  * this thread will be responsible for ensuring the page is freed. Hence the
36770fb3e41SBen Gardon  * early rcu_dereferences in the function.
368a066e61fSBen Gardon  */
3690f53dfa3SDavid Matlack static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
370a066e61fSBen Gardon {
37170fb3e41SBen Gardon 	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
372a066e61fSBen Gardon 	int level = sp->role.level;
373e25f0e0cSBen Gardon 	gfn_t base_gfn = sp->gfn;
374a066e61fSBen Gardon 	int i;
375a066e61fSBen Gardon 
376a066e61fSBen Gardon 	trace_kvm_mmu_prepare_zap_page(sp);
377a066e61fSBen Gardon 
378c298a30cSDavid Matlack 	tdp_mmu_unlink_sp(kvm, sp, shared);
379a066e61fSBen Gardon 
3802ca3129eSSean Christopherson 	for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
381ba3a6120SSean Christopherson 		tdp_ptep_t sptep = pt + i;
382574c3c55SBen Gardon 		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
383ba3a6120SSean Christopherson 		u64 old_spte;
3849a77daacSBen Gardon 
3859a77daacSBen Gardon 		if (shared) {
386e25f0e0cSBen Gardon 			/*
387e25f0e0cSBen Gardon 			 * Set the SPTE to a nonpresent value that other
388e25f0e0cSBen Gardon 			 * threads will not overwrite. If the SPTE was
389e25f0e0cSBen Gardon 			 * already marked as removed then another thread
390e25f0e0cSBen Gardon 			 * handling a page fault could overwrite it, so
391e25f0e0cSBen Gardon 			 * set the SPTE until it is set from some other
391e25f0e0cSBen Gardon 			 * retry setting the SPTE until it transitions from some
392e25f0e0cSBen Gardon 			 * other value to the removed SPTE value.
394e25f0e0cSBen Gardon 			for (;;) {
395ba3a6120SSean Christopherson 				old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
396ba3a6120SSean Christopherson 				if (!is_removed_spte(old_spte))
397e25f0e0cSBen Gardon 					break;
398e25f0e0cSBen Gardon 				cpu_relax();
399e25f0e0cSBen Gardon 			}
4009a77daacSBen Gardon 		} else {
4018df9f1afSSean Christopherson 			/*
4028df9f1afSSean Christopherson 			 * If the SPTE is not MMU-present, there is no backing
4038df9f1afSSean Christopherson 			 * page associated with the SPTE and so no side effects
4048df9f1afSSean Christopherson 			 * that need to be recorded, and exclusive ownership of
4058df9f1afSSean Christopherson 			 * mmu_lock ensures the SPTE can't be made present.
4068df9f1afSSean Christopherson 			 * Note, zapping MMIO SPTEs is also unnecessary as they
4078df9f1afSSean Christopherson 			 * are guarded by the memslots generation, not by being
4088df9f1afSSean Christopherson 			 * unreachable.
4098df9f1afSSean Christopherson 			 */
410ba3a6120SSean Christopherson 			old_spte = kvm_tdp_mmu_read_spte(sptep);
411ba3a6120SSean Christopherson 			if (!is_shadow_present_pte(old_spte))
4128df9f1afSSean Christopherson 				continue;
413e25f0e0cSBen Gardon 
414e25f0e0cSBen Gardon 			/*
415ba3a6120SSean Christopherson 			 * Use the common helper instead of a raw WRITE_ONCE as
416ba3a6120SSean Christopherson 			 * the SPTE needs to be updated atomically if it can be
417ba3a6120SSean Christopherson 			 * modified by a different vCPU outside of mmu_lock.
418ba3a6120SSean Christopherson 			 * Even though the parent SPTE is !PRESENT, the TLB
419ba3a6120SSean Christopherson 			 * hasn't yet been flushed, and both Intel and AMD
420ba3a6120SSean Christopherson 			 * document that A/D assists can use upper-level PxE
421ba3a6120SSean Christopherson 			 * entries that are cached in the TLB, i.e. the CPU can
422ba3a6120SSean Christopherson 			 * still access the page and mark it dirty.
423ba3a6120SSean Christopherson 			 *
424ba3a6120SSean Christopherson 			 * No retry is needed in the atomic update path as the
425ba3a6120SSean Christopherson 			 * sole concern is dropping a Dirty bit, i.e. no other
426ba3a6120SSean Christopherson 			 * task can zap/remove the SPTE as mmu_lock is held for
427ba3a6120SSean Christopherson 			 * write.  Marking the SPTE as a removed SPTE is not
428ba3a6120SSean Christopherson 			 * strictly necessary for the same reason, but using
429ba3a6120SSean Christopherson 			 * the remove SPTE value keeps the shared/exclusive
430ba3a6120SSean Christopherson 			 * the removed SPTE value keeps the shared/exclusive
431ba3a6120SSean Christopherson 			 * call below to hardcode the new value to REMOVED_SPTE.
432ba3a6120SSean Christopherson 			 *
433ba3a6120SSean Christopherson 			 * Note, even though dropping a Dirty bit is the only
434ba3a6120SSean Christopherson 			 * scenario where a non-atomic update could result in a
435ba3a6120SSean Christopherson 			 * functional bug, simply checking the Dirty bit isn't
436ba3a6120SSean Christopherson 			 * sufficient as a fast page fault could read the upper
437ba3a6120SSean Christopherson 			 * level SPTE before it is zapped, and then make this
438ba3a6120SSean Christopherson 			 * target SPTE writable, resume the guest, and set the
439ba3a6120SSean Christopherson 			 * Dirty bit between reading the SPTE above and writing
440ba3a6120SSean Christopherson 			 * it here.
441e25f0e0cSBen Gardon 			 */
442ba3a6120SSean Christopherson 			old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
443ba3a6120SSean Christopherson 							  REMOVED_SPTE, level);
4449a77daacSBen Gardon 		}
445e25f0e0cSBen Gardon 		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
446ba3a6120SSean Christopherson 				    old_spte, REMOVED_SPTE, level, shared);
447a066e61fSBen Gardon 	}
448a066e61fSBen Gardon 
4497cca2d0bSBen Gardon 	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
450a066e61fSBen Gardon }
451a066e61fSBen Gardon 
452a066e61fSBen Gardon /**
45340fa907eSVipin Sharma  * handle_changed_spte - handle bookkeeping associated with an SPTE change
4542f2fad08SBen Gardon  * @kvm: kvm instance
4552f2fad08SBen Gardon  * @as_id: the address space of the paging structure the SPTE was a part of
4562f2fad08SBen Gardon  * @gfn: the base GFN that was mapped by the SPTE
4572f2fad08SBen Gardon  * @old_spte: The value of the SPTE before the change
4582f2fad08SBen Gardon  * @new_spte: The value of the SPTE after the change
4592f2fad08SBen Gardon  * @level: the level of the PT the SPTE is part of in the paging structure
4609a77daacSBen Gardon  * @shared: This operation may not be running under the exclusive use of
4619a77daacSBen Gardon  *	    the MMU lock and the operation must synchronize with other
4629a77daacSBen Gardon  *	    threads that might be modifying SPTEs.
4632f2fad08SBen Gardon  *
4641f997345SVipin Sharma  * Handle bookkeeping that might result from the modification of a SPTE.  Note,
4651f997345SVipin Sharma  * dirty logging updates are handled in common code, not here (see make_spte()
4661f997345SVipin Sharma  * and fast_pf_fix_direct_spte()).
4672f2fad08SBen Gardon  */
46840fa907eSVipin Sharma static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
4699a77daacSBen Gardon 				u64 old_spte, u64 new_spte, int level,
4709a77daacSBen Gardon 				bool shared)
4712f2fad08SBen Gardon {
4722f2fad08SBen Gardon 	bool was_present = is_shadow_present_pte(old_spte);
4732f2fad08SBen Gardon 	bool is_present = is_shadow_present_pte(new_spte);
4742f2fad08SBen Gardon 	bool was_leaf = was_present && is_last_spte(old_spte, level);
4752f2fad08SBen Gardon 	bool is_leaf = is_present && is_last_spte(new_spte, level);
4762f2fad08SBen Gardon 	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
4772f2fad08SBen Gardon 
47820ba462dSSean Christopherson 	WARN_ON_ONCE(level > PT64_ROOT_MAX_LEVEL);
47920ba462dSSean Christopherson 	WARN_ON_ONCE(level < PG_LEVEL_4K);
48020ba462dSSean Christopherson 	WARN_ON_ONCE(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
4812f2fad08SBen Gardon 
4822f2fad08SBen Gardon 	/*
4832f2fad08SBen Gardon 	 * If this warning were to trigger it would indicate that there was a
4842f2fad08SBen Gardon 	 * missing MMU notifier or a race with some notifier handler.
4852f2fad08SBen Gardon 	 * A present, leaf SPTE should never be directly replaced with another
486d9f6e12fSIngo Molnar 	 * present leaf SPTE pointing to a different PFN. A notifier handler
4872f2fad08SBen Gardon 	 * should be zapping the SPTE before the main MM's page table is
4882f2fad08SBen Gardon 	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
4892f2fad08SBen Gardon 	 * thread before replacement.
4902f2fad08SBen Gardon 	 */
4912f2fad08SBen Gardon 	if (was_leaf && is_leaf && pfn_changed) {
4922f2fad08SBen Gardon 		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
4932f2fad08SBen Gardon 		       "SPTE with another present leaf SPTE mapping a\n"
4942f2fad08SBen Gardon 		       "different PFN!\n"
4952f2fad08SBen Gardon 		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
4962f2fad08SBen Gardon 		       as_id, gfn, old_spte, new_spte, level);
4972f2fad08SBen Gardon 
4982f2fad08SBen Gardon 		/*
4992f2fad08SBen Gardon 		 * Crash the host to prevent error propagation and guest data
500d9f6e12fSIngo Molnar 		 * corruption.
5012f2fad08SBen Gardon 		 */
5022f2fad08SBen Gardon 		BUG();
5032f2fad08SBen Gardon 	}
5042f2fad08SBen Gardon 
5052f2fad08SBen Gardon 	if (old_spte == new_spte)
5062f2fad08SBen Gardon 		return;
5072f2fad08SBen Gardon 
508b9a98c34SBen Gardon 	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
509b9a98c34SBen Gardon 
510115111efSDavid Matlack 	if (is_leaf)
511115111efSDavid Matlack 		check_spte_writable_invariants(new_spte);
512115111efSDavid Matlack 
5132f2fad08SBen Gardon 	/*
5142f2fad08SBen Gardon 	 * The only times a SPTE should be changed from a non-present to
5152f2fad08SBen Gardon 	 * non-present state is when an MMIO entry is installed/modified/
5162f2fad08SBen Gardon 	 * removed. In that case, there is nothing to do here.
5172f2fad08SBen Gardon 	 */
5182f2fad08SBen Gardon 	if (!was_present && !is_present) {
5192f2fad08SBen Gardon 		/*
52008f07c80SBen Gardon 		 * If this change does not involve a MMIO SPTE or removed SPTE,
52108f07c80SBen Gardon 		 * it is unexpected. Log the change, though it should not
52208f07c80SBen Gardon 		 * impact the guest since both the former and current SPTEs
52308f07c80SBen Gardon 		 * are nonpresent.
5242f2fad08SBen Gardon 		 */
52520ba462dSSean Christopherson 		if (WARN_ON_ONCE(!is_mmio_spte(old_spte) &&
52608f07c80SBen Gardon 				 !is_mmio_spte(new_spte) &&
52708f07c80SBen Gardon 				 !is_removed_spte(new_spte)))
5282f2fad08SBen Gardon 			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
5292f2fad08SBen Gardon 			       "should not be replaced with another,\n"
5302f2fad08SBen Gardon 			       "different nonpresent SPTE, unless one or both\n"
53108f07c80SBen Gardon 			       "are MMIO SPTEs, or the new SPTE is\n"
53208f07c80SBen Gardon 			       "a temporary removed SPTE.\n"
5332f2fad08SBen Gardon 			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
5342f2fad08SBen Gardon 			       as_id, gfn, old_spte, new_spte, level);
5352f2fad08SBen Gardon 		return;
5362f2fad08SBen Gardon 	}
5372f2fad08SBen Gardon 
53871f51d2cSMingwei Zhang 	if (is_leaf != was_leaf)
53971f51d2cSMingwei Zhang 		kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
5402f2fad08SBen Gardon 
5412f2fad08SBen Gardon 	if (was_leaf && is_dirty_spte(old_spte) &&
54264bb2769SSean Christopherson 	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
5432f2fad08SBen Gardon 		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
5442f2fad08SBen Gardon 
5452f2fad08SBen Gardon 	/*
5462f2fad08SBen Gardon 	 * Recursively handle child PTs if the change removed a subtree from
547c8e5a0d0SSean Christopherson 	 * the paging structure.  Note the WARN on the PFN changing without the
548c8e5a0d0SSean Christopherson 	 * SPTE being converted to a hugepage (leaf) or being zapped.  Shadow
549c8e5a0d0SSean Christopherson 	 * pages are kernel allocations and should never be migrated.
5502f2fad08SBen Gardon 	 */
551c8e5a0d0SSean Christopherson 	if (was_present && !was_leaf &&
552c8e5a0d0SSean Christopherson 	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
5530f53dfa3SDavid Matlack 		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
5542f2fad08SBen Gardon 
55540fa907eSVipin Sharma 	if (was_leaf && is_accessed_spte(old_spte) &&
55640fa907eSVipin Sharma 	    (!is_present || !is_accessed_spte(new_spte) || pfn_changed))
55740fa907eSVipin Sharma 		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
5582f2fad08SBen Gardon }
559faaf05b0SBen Gardon 
560fe43fa2fSBen Gardon /*
5616ccf4438SPaolo Bonzini  * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
5626ccf4438SPaolo Bonzini  * and handle the associated bookkeeping.  Do not mark the page dirty
56324ae4cfaSBen Gardon  * in KVM's dirty bitmaps.
5649a77daacSBen Gardon  *
5653255530aSDavid Matlack  * If setting the SPTE fails because it has changed, iter->old_spte will be
5663255530aSDavid Matlack  * refreshed to the current value of the spte.
5673255530aSDavid Matlack  *
5689a77daacSBen Gardon  * @kvm: kvm instance
5699a77daacSBen Gardon  * @iter: a tdp_iter instance currently on the SPTE that should be set
5709a77daacSBen Gardon  * @new_spte: The value the SPTE should be set to
5713e72c791SDavid Matlack  * Return:
5723e72c791SDavid Matlack  * * 0      - If the SPTE was set.
5733e72c791SDavid Matlack  * * -EBUSY - If the SPTE cannot be set. In this case this function will have
5743e72c791SDavid Matlack  *            no side-effects other than setting iter->old_spte to the last
5753e72c791SDavid Matlack  *            known value of the spte.
5769a77daacSBen Gardon  */
5773e72c791SDavid Matlack static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
5789a77daacSBen Gardon 					  struct tdp_iter *iter,
5799a77daacSBen Gardon 					  u64 new_spte)
5809a77daacSBen Gardon {
5813255530aSDavid Matlack 	u64 *sptep = rcu_dereference(iter->sptep);
5823255530aSDavid Matlack 
583396fd74dSSean Christopherson 	/*
584396fd74dSSean Christopherson 	 * The caller is responsible for ensuring the old SPTE is not a REMOVED
585396fd74dSSean Christopherson 	 * SPTE.  KVM should never attempt to zap or manipulate a REMOVED SPTE,
586396fd74dSSean Christopherson 	 * and pre-checking before inserting a new SPTE is advantageous as it
587396fd74dSSean Christopherson 	 * avoids unnecessary work.
588396fd74dSSean Christopherson 	 */
589396fd74dSSean Christopherson 	WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));
5903a0f64deSSean Christopherson 
5919a77daacSBen Gardon 	lockdep_assert_held_read(&kvm->mmu_lock);
5929a77daacSBen Gardon 
59308f07c80SBen Gardon 	/*
5946e8eb206SDavid Matlack 	 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
59512ced095SUros Bizjak 	 * does not hold the mmu_lock.  On failure, i.e. if a different logical
59612ced095SUros Bizjak 	 * CPU modified the SPTE, try_cmpxchg64() updates iter->old_spte with
59712ced095SUros Bizjak 	 * the current value, so the caller operates on fresh data, e.g. if it
59812ced095SUros Bizjak 	 * retries tdp_mmu_set_spte_atomic().
5996e8eb206SDavid Matlack 	 */
600aee98a68SUros Bizjak 	if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
6013e72c791SDavid Matlack 		return -EBUSY;
6029a77daacSBen Gardon 
60340fa907eSVipin Sharma 	handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
60408889894SSean Christopherson 			    new_spte, iter->level, true);
6059a77daacSBen Gardon 
6063e72c791SDavid Matlack 	return 0;
6079a77daacSBen Gardon }
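/*
 * Callers holding mmu_lock for read typically retry on -EBUSY with the
 * refreshed iter->old_spte, e.g. as __tdp_mmu_zap_root() does:
 *
 * retry:
 *	...
 *	else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
 *		goto retry;
 */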
6089a77daacSBen Gardon 
6093e72c791SDavid Matlack static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
61008f07c80SBen Gardon 					  struct tdp_iter *iter)
61108f07c80SBen Gardon {
6123e72c791SDavid Matlack 	int ret;
6133e72c791SDavid Matlack 
61408f07c80SBen Gardon 	/*
61508f07c80SBen Gardon 	 * Freeze the SPTE by setting it to a special,
61608f07c80SBen Gardon 	 * non-present value. This will stop other threads from
61708f07c80SBen Gardon 	 * immediately installing a present entry in its place
61808f07c80SBen Gardon 	 * before the TLBs are flushed.
61908f07c80SBen Gardon 	 */
6203e72c791SDavid Matlack 	ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
6213e72c791SDavid Matlack 	if (ret)
6223e72c791SDavid Matlack 		return ret;
62308f07c80SBen Gardon 
6244ad980aeSHou Wenlong 	kvm_flush_remote_tlbs_gfn(kvm, iter->gfn, iter->level);
62508f07c80SBen Gardon 
62608f07c80SBen Gardon 	/*
627ba3a6120SSean Christopherson 	 * No other thread can overwrite the removed SPTE as they must either
628ba3a6120SSean Christopherson 	 * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
629ba3a6120SSean Christopherson 	 * overwrite the special removed SPTE value. No bookkeeping is needed
630ba3a6120SSean Christopherson 	 * here since the SPTE is going from non-present to non-present.  Use
631ba3a6120SSean Christopherson 	 * the raw write helper to avoid an unnecessary check on volatile bits.
63208f07c80SBen Gardon 	 */
633ba3a6120SSean Christopherson 	__kvm_tdp_mmu_write_spte(iter->sptep, 0);
63408f07c80SBen Gardon 
6353e72c791SDavid Matlack 	return 0;
63608f07c80SBen Gardon }
63708f07c80SBen Gardon 
6389a77daacSBen Gardon 
6399a77daacSBen Gardon /*
6400b7cc254SVipin Sharma  * tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
641626808d1SSean Christopherson  * @kvm:	      KVM instance
642626808d1SSean Christopherson  * @as_id:	      Address space ID, i.e. regular vs. SMM
643626808d1SSean Christopherson  * @sptep:	      Pointer to the SPTE
644626808d1SSean Christopherson  * @old_spte:	      The current value of the SPTE
645626808d1SSean Christopherson  * @new_spte:	      The new value that will be set for the SPTE
646626808d1SSean Christopherson  * @gfn:	      The base GFN that was (or will be) mapped by the SPTE
647626808d1SSean Christopherson  * @level:	      The level _containing_ the SPTE (its parent PT's level)
648ba3a6120SSean Christopherson  *
649ba3a6120SSean Christopherson  * Returns the old SPTE value, which _may_ be different than @old_spte if the
650ba3a6120SSean Christopherson  * SPTE had volatile bits.
651fe43fa2fSBen Gardon  */
6520b7cc254SVipin Sharma static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
6530b7cc254SVipin Sharma 			    u64 old_spte, u64 new_spte, gfn_t gfn, int level)
654faaf05b0SBen Gardon {
655531810caSBen Gardon 	lockdep_assert_held_write(&kvm->mmu_lock);
6563a9a4aa5SBen Gardon 
65708f07c80SBen Gardon 	/*
658966da62aSSean Christopherson 	 * No thread should be using this function to set SPTEs to or from the
65908f07c80SBen Gardon 	 * temporary removed SPTE value.
66008f07c80SBen Gardon 	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
66108f07c80SBen Gardon 	 * should be used. If operating under the MMU lock in write mode, the
66208f07c80SBen Gardon 	 * use of the removed SPTE should not be necessary.
66308f07c80SBen Gardon 	 */
66420ba462dSSean Christopherson 	WARN_ON_ONCE(is_removed_spte(old_spte) || is_removed_spte(new_spte));
66508f07c80SBen Gardon 
666ba3a6120SSean Christopherson 	old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
667faaf05b0SBen Gardon 
66840fa907eSVipin Sharma 	handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
669ba3a6120SSean Christopherson 	return old_spte;
670626808d1SSean Christopherson }
671626808d1SSean Christopherson 
6720b7cc254SVipin Sharma static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
673f8e14497SBen Gardon 					 u64 new_spte)
674f8e14497SBen Gardon {
6750b7cc254SVipin Sharma 	WARN_ON_ONCE(iter->yielded);
6760b7cc254SVipin Sharma 	iter->old_spte = tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
6770b7cc254SVipin Sharma 					  iter->old_spte, new_spte,
6780b7cc254SVipin Sharma 					  iter->gfn, iter->level);
679f8e14497SBen Gardon }
680f8e14497SBen Gardon 
681faaf05b0SBen Gardon #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
68277aa6075SDavid Matlack 	for_each_tdp_pte(_iter, _root, _start, _end)
683faaf05b0SBen Gardon 
684f8e14497SBen Gardon #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
685f8e14497SBen Gardon 	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
686f8e14497SBen Gardon 		if (!is_shadow_present_pte(_iter.old_spte) ||		\
687f8e14497SBen Gardon 		    !is_last_spte(_iter.old_spte, _iter.level))		\
688f8e14497SBen Gardon 			continue;					\
689f8e14497SBen Gardon 		else
690f8e14497SBen Gardon 
691bb18842eSBen Gardon #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
692*c5f2d564SSean Christopherson 	for_each_tdp_pte(_iter, root_to_sp(_mmu->root.hpa), _start, _end)
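/*
 * All of the iteration macros above walk SPTEs under RCU protection; callers
 * in this file bracket the walks with rcu_read_lock()/rcu_read_unlock(), see
 * e.g. tdp_mmu_zap_leafs() and tdp_mmu_zap_root().
 */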
693bb18842eSBen Gardon 
694faaf05b0SBen Gardon /*
695e28a436cSBen Gardon  * Yield if the MMU lock is contended or this thread needs to return control
696e28a436cSBen Gardon  * to the scheduler.
697e28a436cSBen Gardon  *
698e139a34eSBen Gardon  * If this function should yield and flush is set, it will perform a remote
699e139a34eSBen Gardon  * TLB flush before yielding.
700e139a34eSBen Gardon  *
7013a0f64deSSean Christopherson  * If this function yields, iter->yielded is set and the caller must skip to
7023a0f64deSSean Christopherson  * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
7033a0f64deSSean Christopherson  * over the paging structures to allow the iterator to continue its traversal
7043a0f64deSSean Christopherson  * from the paging structure root.
705e28a436cSBen Gardon  *
7063a0f64deSSean Christopherson  * Returns true if this function yielded.
707e28a436cSBen Gardon  */
7083a0f64deSSean Christopherson static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
7093a0f64deSSean Christopherson 							  struct tdp_iter *iter,
7103a0f64deSSean Christopherson 							  bool flush, bool shared)
711a6a0b05dSBen Gardon {
71220ba462dSSean Christopherson 	WARN_ON_ONCE(iter->yielded);
7133a0f64deSSean Christopherson 
714ed5e484bSBen Gardon 	/* Ensure forward progress has been made before yielding. */
715ed5e484bSBen Gardon 	if (iter->next_last_level_gfn == iter->yielded_gfn)
716ed5e484bSBen Gardon 		return false;
717ed5e484bSBen Gardon 
718531810caSBen Gardon 	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
719e139a34eSBen Gardon 		if (flush)
720e139a34eSBen Gardon 			kvm_flush_remote_tlbs(kvm);
721e139a34eSBen Gardon 
722bd296779SSean Christopherson 		rcu_read_unlock();
723bd296779SSean Christopherson 
7246103bc07SBen Gardon 		if (shared)
7256103bc07SBen Gardon 			cond_resched_rwlock_read(&kvm->mmu_lock);
7266103bc07SBen Gardon 		else
727531810caSBen Gardon 			cond_resched_rwlock_write(&kvm->mmu_lock);
7286103bc07SBen Gardon 
7297cca2d0bSBen Gardon 		rcu_read_lock();
730ed5e484bSBen Gardon 
73120ba462dSSean Christopherson 		WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn);
732ed5e484bSBen Gardon 
7333a0f64deSSean Christopherson 		iter->yielded = true;
734a6a0b05dSBen Gardon 	}
735e28a436cSBen Gardon 
7363a0f64deSSean Christopherson 	return iter->yielded;
737a6a0b05dSBen Gardon }
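/*
 * Callers invoke this at the top of each loop iteration and "continue" when it
 * yields, as iter->yielded makes tdp_iter_next() restart the walk, e.g. as in
 * tdp_mmu_zap_leafs():
 *
 *	if (can_yield &&
 *	    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
 *		flush = false;
 *		continue;
 *	}
 */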
738a6a0b05dSBen Gardon 
73986931ff7SSean Christopherson static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
740e2b5b21dSSean Christopherson {
741e2b5b21dSSean Christopherson 	/*
74286931ff7SSean Christopherson 	 * Bound TDP MMU walks at host.MAXPHYADDR.  KVM disallows memslots with
74386931ff7SSean Christopherson 	 * a gpa range that would exceed the max gfn, and KVM does not create
74486931ff7SSean Christopherson 	 * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
74586931ff7SSean Christopherson 	 * the slow emulation path every time.
746e2b5b21dSSean Christopherson 	 */
74786931ff7SSean Christopherson 	return kvm_mmu_max_gfn() + 1;
748e2b5b21dSSean Christopherson }
749e2b5b21dSSean Christopherson 
7501b6043e8SSean Christopherson static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
7511b6043e8SSean Christopherson 			       bool shared, int zap_level)
752e2b5b21dSSean Christopherson {
753e2b5b21dSSean Christopherson 	struct tdp_iter iter;
754e2b5b21dSSean Christopherson 
75586931ff7SSean Christopherson 	gfn_t end = tdp_mmu_max_gfn_exclusive();
756e2b5b21dSSean Christopherson 	gfn_t start = 0;
757e2b5b21dSSean Christopherson 
7581b6043e8SSean Christopherson 	for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
7591b6043e8SSean Christopherson retry:
7601b6043e8SSean Christopherson 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
7611b6043e8SSean Christopherson 			continue;
7621b6043e8SSean Christopherson 
7631b6043e8SSean Christopherson 		if (!is_shadow_present_pte(iter.old_spte))
7641b6043e8SSean Christopherson 			continue;
7651b6043e8SSean Christopherson 
7661b6043e8SSean Christopherson 		if (iter.level > zap_level)
7671b6043e8SSean Christopherson 			continue;
7681b6043e8SSean Christopherson 
7691b6043e8SSean Christopherson 		if (!shared)
7700b7cc254SVipin Sharma 			tdp_mmu_iter_set_spte(kvm, &iter, 0);
7711b6043e8SSean Christopherson 		else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
7721b6043e8SSean Christopherson 			goto retry;
7731b6043e8SSean Christopherson 	}
7741b6043e8SSean Christopherson }
7751b6043e8SSean Christopherson 
7761b6043e8SSean Christopherson static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
7771b6043e8SSean Christopherson 			     bool shared)
7781b6043e8SSean Christopherson {
7791b6043e8SSean Christopherson 
7808351779cSPaolo Bonzini 	/*
7818351779cSPaolo Bonzini 	 * The root must have an elevated refcount so that it's reachable via
7828351779cSPaolo Bonzini 	 * mmu_notifier callbacks, which allows this path to yield and drop
7838351779cSPaolo Bonzini 	 * mmu_lock.  When handling an unmap/release mmu_notifier command, KVM
7848351779cSPaolo Bonzini 	 * must drop all references to relevant pages prior to completing the
7858351779cSPaolo Bonzini 	 * callback.  Dropping mmu_lock with an unreachable root would result
7868351779cSPaolo Bonzini 	 * in zapping SPTEs after a relevant mmu_notifier callback completes
7878351779cSPaolo Bonzini 	 * and lead to use-after-free as zapping a SPTE triggers "writeback" of
7888351779cSPaolo Bonzini 	 * dirty/accessed bits to the SPTE's associated struct page.
7898351779cSPaolo Bonzini 	 */
7908351779cSPaolo Bonzini 	WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));
7918351779cSPaolo Bonzini 
792e2b5b21dSSean Christopherson 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
793e2b5b21dSSean Christopherson 
794e2b5b21dSSean Christopherson 	rcu_read_lock();
795e2b5b21dSSean Christopherson 
796e2b5b21dSSean Christopherson 	/*
7971b6043e8SSean Christopherson 	 * To avoid RCU stalls due to recursively removing huge swaths of SPs,
7981b6043e8SSean Christopherson 	 * split the zap into two passes.  On the first pass, zap at the 1gb
7991b6043e8SSean Christopherson 	 * level, and then zap top-level SPs on the second pass.  "1gb" is not
8001b6043e8SSean Christopherson 	 * arbitrary, as KVM must be able to zap a 1gb shadow page without
8011b6043e8SSean Christopherson 	 * inducing a stall to allow in-place replacement with a 1gb hugepage.
8021b6043e8SSean Christopherson 	 *
8031b6043e8SSean Christopherson 	 * Because zapping a SP recurses on its children, stepping down to
8041b6043e8SSean Christopherson 	 * PG_LEVEL_4K in the iterator itself is unnecessary.
805e2b5b21dSSean Christopherson 	 */
8061b6043e8SSean Christopherson 	__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
8071b6043e8SSean Christopherson 	__tdp_mmu_zap_root(kvm, root, shared, root->role.level);
808e2b5b21dSSean Christopherson 
809e2b5b21dSSean Christopherson 	rcu_read_unlock();
810e2b5b21dSSean Christopherson }
811e2b5b21dSSean Christopherson 
812c10743a1SSean Christopherson bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
813c10743a1SSean Christopherson {
814c10743a1SSean Christopherson 	u64 old_spte;
815c10743a1SSean Christopherson 
816c10743a1SSean Christopherson 	/*
817c10743a1SSean Christopherson 	 * This helper intentionally doesn't allow zapping a root shadow page,
818c10743a1SSean Christopherson 	 * which doesn't have a parent page table and thus no associated entry.
819c10743a1SSean Christopherson 	 */
820c10743a1SSean Christopherson 	if (WARN_ON_ONCE(!sp->ptep))
821c10743a1SSean Christopherson 		return false;
822c10743a1SSean Christopherson 
823c10743a1SSean Christopherson 	old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
824bb95dfb9SSean Christopherson 	if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
825c10743a1SSean Christopherson 		return false;
826c10743a1SSean Christopherson 
8270b7cc254SVipin Sharma 	tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
8280b7cc254SVipin Sharma 			 sp->gfn, sp->role.level + 1);
829c10743a1SSean Christopherson 
830c10743a1SSean Christopherson 	return true;
831c10743a1SSean Christopherson }
832c10743a1SSean Christopherson 
833faaf05b0SBen Gardon /*
834063afacdSBen Gardon  * If can_yield is true, will release the MMU lock and reschedule if the
835063afacdSBen Gardon  * scheduler needs the CPU or there is contention on the MMU lock. If this
836063afacdSBen Gardon  * function cannot yield, it will not release the MMU lock or reschedule and
837063afacdSBen Gardon  * the caller must ensure it does not supply too large a GFN range, or the
8386103bc07SBen Gardon  * operation can cause a soft lockup.
839faaf05b0SBen Gardon  */
840f47e5bbbSSean Christopherson static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
841acbda82aSSean Christopherson 			      gfn_t start, gfn_t end, bool can_yield, bool flush)
842faaf05b0SBen Gardon {
843faaf05b0SBen Gardon 	struct tdp_iter iter;
844faaf05b0SBen Gardon 
84586931ff7SSean Christopherson 	end = min(end, tdp_mmu_max_gfn_exclusive());
846524a1e4eSSean Christopherson 
847acbda82aSSean Christopherson 	lockdep_assert_held_write(&kvm->mmu_lock);
8486103bc07SBen Gardon 
8497cca2d0bSBen Gardon 	rcu_read_lock();
8507cca2d0bSBen Gardon 
851f47e5bbbSSean Christopherson 	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
8521af4a960SBen Gardon 		if (can_yield &&
853acbda82aSSean Christopherson 		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
854a835429cSSean Christopherson 			flush = false;
8551af4a960SBen Gardon 			continue;
8561af4a960SBen Gardon 		}
8571af4a960SBen Gardon 
858f47e5bbbSSean Christopherson 		if (!is_shadow_present_pte(iter.old_spte) ||
859faaf05b0SBen Gardon 		    !is_last_spte(iter.old_spte, iter.level))
860faaf05b0SBen Gardon 			continue;
861faaf05b0SBen Gardon 
8620b7cc254SVipin Sharma 		tdp_mmu_iter_set_spte(kvm, &iter, 0);
863a835429cSSean Christopherson 		flush = true;
864faaf05b0SBen Gardon 	}
8657cca2d0bSBen Gardon 
8667cca2d0bSBen Gardon 	rcu_read_unlock();
867bb95dfb9SSean Christopherson 
868f47e5bbbSSean Christopherson 	/*
869f47e5bbbSSean Christopherson 	 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
870f47e5bbbSSean Christopherson 	 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
871f47e5bbbSSean Christopherson 	 */
872f47e5bbbSSean Christopherson 	return flush;
873faaf05b0SBen Gardon }
874faaf05b0SBen Gardon 
875faaf05b0SBen Gardon /*
8767edc3a68SKai Huang  * Zap leaf SPTEs for the range of gfns, [start, end), for all roots. Returns
8777edc3a68SKai Huang  * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or
8787edc3a68SKai Huang  * more SPTEs were zapped since the MMU lock was last acquired.
879faaf05b0SBen Gardon  */
880f47e5bbbSSean Christopherson bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
881f47e5bbbSSean Christopherson 			   bool can_yield, bool flush)
882faaf05b0SBen Gardon {
883faaf05b0SBen Gardon 	struct kvm_mmu_page *root;
884faaf05b0SBen Gardon 
885614f6970SPaolo Bonzini 	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
886f47e5bbbSSean Christopherson 		flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush);
887faaf05b0SBen Gardon 
888faaf05b0SBen Gardon 	return flush;
889faaf05b0SBen Gardon }
890faaf05b0SBen Gardon 
891faaf05b0SBen Gardon void kvm_tdp_mmu_zap_all(struct kvm *kvm)
892faaf05b0SBen Gardon {
893e2b5b21dSSean Christopherson 	struct kvm_mmu_page *root;
8942b9663d8SSean Christopherson 	int i;
895faaf05b0SBen Gardon 
89677c8cd6bSSean Christopherson 	/*
89722b94c4bSPaolo Bonzini 	 * Zap all roots, including invalid roots, as all SPTEs must be dropped
89822b94c4bSPaolo Bonzini 	 * before returning to the caller.  Zap directly even if the root is
89922b94c4bSPaolo Bonzini 	 * also being zapped by a worker.  Walking zapped top-level SPTEs isn't
90022b94c4bSPaolo Bonzini 	 * all that expensive and mmu_lock is already held, which means the
90122b94c4bSPaolo Bonzini 	 * worker has yielded, i.e. flushing the work instead of zapping here
90222b94c4bSPaolo Bonzini 	 * isn't guaranteed to be any faster.
90322b94c4bSPaolo Bonzini 	 *
90477c8cd6bSSean Christopherson 	 * A TLB flush is unnecessary, KVM zaps everything if and only if the VM
90577c8cd6bSSean Christopherson 	 * is being destroyed or the userspace VMM has exited.  In both cases,
90677c8cd6bSSean Christopherson 	 * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
90777c8cd6bSSean Christopherson 	 */
908e2b5b21dSSean Christopherson 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
909e2b5b21dSSean Christopherson 		for_each_tdp_mmu_root_yield_safe(kvm, root, i)
910e2b5b21dSSean Christopherson 			tdp_mmu_zap_root(kvm, root, false);
911e2b5b21dSSean Christopherson 	}
912faaf05b0SBen Gardon }
913bb18842eSBen Gardon 
9144c6654bdSBen Gardon /*
915f28e9c7fSSean Christopherson  * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
91622b94c4bSPaolo Bonzini  * zap" completes.
9174c6654bdSBen Gardon  */
9184c6654bdSBen Gardon void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
9194c6654bdSBen Gardon {
92022b94c4bSPaolo Bonzini 	flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
9214c6654bdSBen Gardon }
9224c6654bdSBen Gardon 
923bb18842eSBen Gardon /*
924f28e9c7fSSean Christopherson  * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
92522b94c4bSPaolo Bonzini  * is about to be zapped, e.g. in response to a memslots update.  The actual
926edbdb43fSSean Christopherson  * zapping is performed asynchronously.  Using a separate workqueue makes it
927edbdb43fSSean Christopherson  * easy to ensure that the destruction is performed before the "fast zap"
928edbdb43fSSean Christopherson  * completes, without keeping a separate list of invalidated roots; the list is
929edbdb43fSSean Christopherson  * effectively the list of work items in the workqueue.
930b7cccd39SBen Gardon  *
931edbdb43fSSean Christopherson  * Note, the asynchronous worker is gifted the TDP MMU's reference.
932edbdb43fSSean Christopherson  * See kvm_tdp_mmu_get_vcpu_root_hpa().
933b7cccd39SBen Gardon  */
934b7cccd39SBen Gardon void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
935b7cccd39SBen Gardon {
936b7cccd39SBen Gardon 	struct kvm_mmu_page *root;
937b7cccd39SBen Gardon 
938edbdb43fSSean Christopherson 	/*
939edbdb43fSSean Christopherson 	 * mmu_lock must be held for write to ensure that a root doesn't become
940edbdb43fSSean Christopherson 	 * invalid while there are active readers (invalidating a root while
941edbdb43fSSean Christopherson 	 * there are active readers may or may not be problematic in practice,
942edbdb43fSSean Christopherson 	 * but it's uncharted territory and not supported).
943edbdb43fSSean Christopherson 	 *
944edbdb43fSSean Christopherson 	 * Waive the assertion if there are no users of @kvm, i.e. the VM is
945edbdb43fSSean Christopherson 	 * being destroyed after all references have been put, or if no vCPUs
946edbdb43fSSean Christopherson 	 * have been created (which means there are no roots), i.e. the VM is
947edbdb43fSSean Christopherson 	 * being destroyed in an error path of KVM_CREATE_VM.
948edbdb43fSSean Christopherson 	 */
949edbdb43fSSean Christopherson 	if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
950edbdb43fSSean Christopherson 	    refcount_read(&kvm->users_count) && kvm->created_vcpus)
951b7cccd39SBen Gardon 		lockdep_assert_held_write(&kvm->mmu_lock);
952edbdb43fSSean Christopherson 
953edbdb43fSSean Christopherson 	/*
954edbdb43fSSean Christopherson 	 * As above, mmu_lock isn't held when destroying the VM!  There can't
955edbdb43fSSean Christopherson 	 * be other references to @kvm, i.e. nothing else can invalidate roots
956edbdb43fSSean Christopherson 	 * or be consuming roots, but walking the list of roots does need to be
957edbdb43fSSean Christopherson 	 * guarded against roots being deleted by the asynchronous zap worker.
958edbdb43fSSean Christopherson 	 */
959edbdb43fSSean Christopherson 	rcu_read_lock();
960edbdb43fSSean Christopherson 
961edbdb43fSSean Christopherson 	list_for_each_entry_rcu(root, &kvm->arch.tdp_mmu_roots, link) {
962edbdb43fSSean Christopherson 		if (!root->role.invalid) {
963b7cccd39SBen Gardon 			root->role.invalid = true;
96422b94c4bSPaolo Bonzini 			tdp_mmu_schedule_zap_root(kvm, root);
96522b94c4bSPaolo Bonzini 		}
966b7cccd39SBen Gardon 	}
967edbdb43fSSean Christopherson 
968edbdb43fSSean Christopherson 	rcu_read_unlock();
969f28e9c7fSSean Christopherson }
970b7cccd39SBen Gardon 
971bb18842eSBen Gardon /*
972bb18842eSBen Gardon  * Installs a last-level SPTE to handle a TDP page fault.
973bb18842eSBen Gardon  * (NPT/EPT violation/misconfiguration)
974bb18842eSBen Gardon  */
975cdc47767SPaolo Bonzini static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
976cdc47767SPaolo Bonzini 					  struct kvm_page_fault *fault,
977cdc47767SPaolo Bonzini 					  struct tdp_iter *iter)
978bb18842eSBen Gardon {
979c435d4b7SSean Christopherson 	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
980bb18842eSBen Gardon 	u64 new_spte;
98157a3e96dSKai Huang 	int ret = RET_PF_FIXED;
982ad67e480SPaolo Bonzini 	bool wrprot = false;
983bb18842eSBen Gardon 
98450a9ac25SSean Christopherson 	if (WARN_ON_ONCE(sp->role.level != fault->goal_level))
98550a9ac25SSean Christopherson 		return RET_PF_RETRY;
98650a9ac25SSean Christopherson 
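	/*
	 * If the faulting GFN has no backing memslot, install an MMIO SPTE so
	 * that the access is emulated as MMIO (see the RET_PF_EMULATE handling
	 * below); otherwise build a normal leaf SPTE for the resolved PFN.
	 */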
987e710c5f6SDavid Matlack 	if (unlikely(!fault->slot))
988bb18842eSBen Gardon 		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
9899a77daacSBen Gardon 	else
99053597858SDavid Matlack 		wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
9912839180cSPaolo Bonzini 					 fault->pfn, iter->old_spte, fault->prefetch, true,
9927158bee4SPaolo Bonzini 					 fault->map_writable, &new_spte);
993bb18842eSBen Gardon 
994bb18842eSBen Gardon 	if (new_spte == iter->old_spte)
995bb18842eSBen Gardon 		ret = RET_PF_SPURIOUS;
9963e72c791SDavid Matlack 	else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
9979a77daacSBen Gardon 		return RET_PF_RETRY;
998bb95dfb9SSean Christopherson 	else if (is_shadow_present_pte(iter->old_spte) &&
999bb95dfb9SSean Christopherson 		 !is_last_spte(iter->old_spte, iter->level))
10001e203847SHou Wenlong 		kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level);
1001bb18842eSBen Gardon 
1002bb18842eSBen Gardon 	/*
1003bb18842eSBen Gardon 	 * If the page fault was caused by a write but the page is write
1004bb18842eSBen Gardon 	 * protected, emulation is needed. If the emulation was skipped,
1005bb18842eSBen Gardon 	 * the vCPU would have the same fault again.
1006bb18842eSBen Gardon 	 */
1007ad67e480SPaolo Bonzini 	if (wrprot) {
1008cdc47767SPaolo Bonzini 		if (fault->write)
1009bb18842eSBen Gardon 			ret = RET_PF_EMULATE;
1010bb18842eSBen Gardon 	}
1011bb18842eSBen Gardon 
1012bb18842eSBen Gardon 	/* If an MMIO SPTE is installed, the MMIO will need to be emulated. */
10139a77daacSBen Gardon 	if (unlikely(is_mmio_spte(new_spte))) {
10141075d41eSSean Christopherson 		vcpu->stat.pf_mmio_spte_created++;
10159a77daacSBen Gardon 		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
10169a77daacSBen Gardon 				     new_spte);
1017bb18842eSBen Gardon 		ret = RET_PF_EMULATE;
10183849e092SSean Christopherson 	} else {
10199a77daacSBen Gardon 		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
10209a77daacSBen Gardon 				       rcu_dereference(iter->sptep));
10213849e092SSean Christopherson 	}
1022bb18842eSBen Gardon 
1023bb18842eSBen Gardon 	return ret;
1024bb18842eSBen Gardon }
1025bb18842eSBen Gardon 
1026bb18842eSBen Gardon /*
1027cb00a70bSDavid Matlack  * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
1028cb00a70bSDavid Matlack  * provided page table.
10297b7e1ab6SDavid Matlack  *
10307b7e1ab6SDavid Matlack  * @kvm: kvm instance
10317b7e1ab6SDavid Matlack  * @iter: a tdp_iter instance currently on the SPTE that should be set
10327b7e1ab6SDavid Matlack  * @sp: The new TDP page table to install.
1033cb00a70bSDavid Matlack  * @shared: This operation is running under the MMU lock in read mode.
10347b7e1ab6SDavid Matlack  *
10357b7e1ab6SDavid Matlack  * Returns: 0 if the new page table was installed. Non-0 if the page table
10367b7e1ab6SDavid Matlack  *          could not be installed (e.g. the atomic compare-exchange failed).
10377b7e1ab6SDavid Matlack  */
1038cb00a70bSDavid Matlack static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
103961f94478SSean Christopherson 			   struct kvm_mmu_page *sp, bool shared)
10407b7e1ab6SDavid Matlack {
104154275f74SSean Christopherson 	u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled());
1042cb00a70bSDavid Matlack 	int ret = 0;
10437b7e1ab6SDavid Matlack 
1044cb00a70bSDavid Matlack 	if (shared) {
10457b7e1ab6SDavid Matlack 		ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
10467b7e1ab6SDavid Matlack 		if (ret)
10477b7e1ab6SDavid Matlack 			return ret;
1048cb00a70bSDavid Matlack 	} else {
10490b7cc254SVipin Sharma 		tdp_mmu_iter_set_spte(kvm, iter, spte);
1050cb00a70bSDavid Matlack 	}
10517b7e1ab6SDavid Matlack 
105243a063caSYosry Ahmed 	tdp_account_mmu_page(kvm, sp);
10537b7e1ab6SDavid Matlack 
10547b7e1ab6SDavid Matlack 	return 0;
10557b7e1ab6SDavid Matlack }
10567b7e1ab6SDavid Matlack 
1057c4b33d28SDavid Matlack static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1058c4b33d28SDavid Matlack 				   struct kvm_mmu_page *sp, bool shared);
1059c4b33d28SDavid Matlack 
10607b7e1ab6SDavid Matlack /*
1061bb18842eSBen Gardon  * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
1062bb18842eSBen Gardon  * page tables and SPTEs to translate the faulting guest physical address.
1063bb18842eSBen Gardon  */
10642f6305ddSPaolo Bonzini int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
1065bb18842eSBen Gardon {
1066bb18842eSBen Gardon 	struct kvm_mmu *mmu = vcpu->arch.mmu;
106761f94478SSean Christopherson 	struct kvm *kvm = vcpu->kvm;
1068bb18842eSBen Gardon 	struct tdp_iter iter;
106989c0fd49SBen Gardon 	struct kvm_mmu_page *sp;
107063d28a25SPaolo Bonzini 	int ret = RET_PF_RETRY;
1071bb18842eSBen Gardon 
107273a3c659SPaolo Bonzini 	kvm_mmu_hugepage_adjust(vcpu, fault);
1073bb18842eSBen Gardon 
1074f0066d94SPaolo Bonzini 	trace_kvm_mmu_spte_requested(fault);
10757cca2d0bSBen Gardon 
10767cca2d0bSBen Gardon 	rcu_read_lock();
10777cca2d0bSBen Gardon 
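	/*
	 * Walk down from the root toward the target level, installing non-leaf
	 * SPTEs and splitting huge pages along the way as needed.
	 */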
10782f6305ddSPaolo Bonzini 	tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
107963d28a25SPaolo Bonzini 		int r;
108063d28a25SPaolo Bonzini 
108173a3c659SPaolo Bonzini 		if (fault->nx_huge_page_workaround_enabled)
1082536f0e6aSPaolo Bonzini 			disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
1083bb18842eSBen Gardon 
1084bb18842eSBen Gardon 		/*
1085c4b33d28SDavid Matlack 		 * If SPTE has been frozen by another thread, just give up and
1086c4b33d28SDavid Matlack 		 * retry, avoiding unnecessary page table allocation and free.
1087ff76d506SKai Huang 		 */
1088ff76d506SKai Huang 		if (is_removed_spte(iter.old_spte))
108963d28a25SPaolo Bonzini 			goto retry;
109063d28a25SPaolo Bonzini 
1091f5d16bb9SSean Christopherson 		if (iter.level == fault->goal_level)
109280a3e4aeSSean Christopherson 			goto map_target_level;
1093f5d16bb9SSean Christopherson 
109463d28a25SPaolo Bonzini 		/* Step down into the lower level page table if it exists. */
109563d28a25SPaolo Bonzini 		if (is_shadow_present_pte(iter.old_spte) &&
109663d28a25SPaolo Bonzini 		    !is_large_pte(iter.old_spte))
109763d28a25SPaolo Bonzini 			continue;
1098ff76d506SKai Huang 
1099c4b33d28SDavid Matlack 		/*
1100c4b33d28SDavid Matlack 		 * The SPTE is either non-present or points to a huge page that
1101c4b33d28SDavid Matlack 		 * needs to be split.
1102c4b33d28SDavid Matlack 		 */
1103a82070b6SDavid Matlack 		sp = tdp_mmu_alloc_sp(vcpu);
1104a82070b6SDavid Matlack 		tdp_mmu_init_child_sp(sp, &iter);
1105a82070b6SDavid Matlack 
110661f94478SSean Christopherson 		sp->nx_huge_page_disallowed = fault->huge_page_disallowed;
110761f94478SSean Christopherson 
1108c4b33d28SDavid Matlack 		if (is_shadow_present_pte(iter.old_spte))
110963d28a25SPaolo Bonzini 			r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
1110c4b33d28SDavid Matlack 		else
111163d28a25SPaolo Bonzini 			r = tdp_mmu_link_sp(kvm, &iter, sp, true);
1112c4b33d28SDavid Matlack 
111363d28a25SPaolo Bonzini 		/*
111480a3e4aeSSean Christopherson 		 * Force the guest to retry if installing an upper level SPTE
111580a3e4aeSSean Christopherson 		 * failed, e.g. because a different task modified the SPTE.
111663d28a25SPaolo Bonzini 		 */
111763d28a25SPaolo Bonzini 		if (r) {
11189a77daacSBen Gardon 			tdp_mmu_free_sp(sp);
111963d28a25SPaolo Bonzini 			goto retry;
11209a77daacSBen Gardon 		}
112161f94478SSean Christopherson 
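		/*
		 * If a huge page at or above this level was disallowed, track
		 * the new shadow page so that it can be zapped later, e.g. by
		 * the NX huge page recovery worker, to re-allow the huge page.
		 */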
112261f94478SSean Christopherson 		if (fault->huge_page_disallowed &&
112361f94478SSean Christopherson 		    fault->req_level >= iter.level) {
112461f94478SSean Christopherson 			spin_lock(&kvm->arch.tdp_mmu_pages_lock);
112521a36ac6SSean Christopherson 			if (sp->nx_huge_page_disallowed)
112661f94478SSean Christopherson 				track_possible_nx_huge_page(kvm, sp);
112761f94478SSean Christopherson 			spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
112861f94478SSean Christopherson 		}
1129bb18842eSBen Gardon 	}
1130bb18842eSBen Gardon 
113180a3e4aeSSean Christopherson 	/*
113280a3e4aeSSean Christopherson 	 * The walk aborted before reaching the target level, e.g. because the
113380a3e4aeSSean Christopherson 	 * iterator detected an upper level SPTE was frozen during traversal.
113480a3e4aeSSean Christopherson 	 */
113580a3e4aeSSean Christopherson 	WARN_ON_ONCE(iter.level == fault->goal_level);
113680a3e4aeSSean Christopherson 	goto retry;
113780a3e4aeSSean Christopherson 
113880a3e4aeSSean Christopherson map_target_level:
1139cdc47767SPaolo Bonzini 	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
1140bb18842eSBen Gardon 
114163d28a25SPaolo Bonzini retry:
114263d28a25SPaolo Bonzini 	rcu_read_unlock();
1143bb18842eSBen Gardon 	return ret;
1144bb18842eSBen Gardon }
1145063afacdSBen Gardon 
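/*
 * Zap the leaf SPTEs covered by the MMU notifier range.  Returns true if a
 * TLB flush is needed.
 */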
11463039bcc7SSean Christopherson bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
11473039bcc7SSean Christopherson 				 bool flush)
1148063afacdSBen Gardon {
1149f47e5bbbSSean Christopherson 	return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start,
115083b83a02SSean Christopherson 				     range->end, range->may_block, flush);
11513039bcc7SSean Christopherson }
11523039bcc7SSean Christopherson 
11533039bcc7SSean Christopherson typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
11543039bcc7SSean Christopherson 			      struct kvm_gfn_range *range);
11553039bcc7SSean Christopherson 
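/*
 * Invoke @handler on each leaf SPTE in [range->start, range->end) for every
 * TDP MMU root in the range's address space.  Returns true if any invocation
 * of the handler returned true.
 */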
11563039bcc7SSean Christopherson static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
11573039bcc7SSean Christopherson 						   struct kvm_gfn_range *range,
1158c1b91493SSean Christopherson 						   tdp_handler_t handler)
1159063afacdSBen Gardon {
1160063afacdSBen Gardon 	struct kvm_mmu_page *root;
11613039bcc7SSean Christopherson 	struct tdp_iter iter;
11623039bcc7SSean Christopherson 	bool ret = false;
1163063afacdSBen Gardon 
1164063afacdSBen Gardon 	/*
1165e1eed584SSean Christopherson 	 * Don't support rescheduling, none of the MMU notifiers that funnel
1166e1eed584SSean Christopherson 	 * into this helper allow blocking; it'd be dead, wasteful code.
1167063afacdSBen Gardon 	 */
11683039bcc7SSean Christopherson 	for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
1169a151acecSSean Christopherson 		rcu_read_lock();
1170a151acecSSean Christopherson 
11713039bcc7SSean Christopherson 		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
11723039bcc7SSean Christopherson 			ret |= handler(kvm, &iter, range);
1173063afacdSBen Gardon 
11743039bcc7SSean Christopherson 		rcu_read_unlock();
1175a151acecSSean Christopherson 	}
1176063afacdSBen Gardon 
1177063afacdSBen Gardon 	return ret;
1178063afacdSBen Gardon }
1179063afacdSBen Gardon 
1180f8e14497SBen Gardon /*
1181f8e14497SBen Gardon  * Mark the SPTEs mapping the range of GFNs [start, end) unaccessed and return
1182f8e14497SBen Gardon  * true if any of the GFNs in the range have been accessed.
11837ee131e3SVipin Sharma  *
11847ee131e3SVipin Sharma  * No need to mark the corresponding PFN as accessed as this call is coming
11857ee131e3SVipin Sharma  * from the clear_young() or clear_flush_young() notifier, which uses the
11867ee131e3SVipin Sharma  * return value to determine if the page has been accessed.
1187f8e14497SBen Gardon  */
11883039bcc7SSean Christopherson static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
11893039bcc7SSean Christopherson 			  struct kvm_gfn_range *range)
1190f8e14497SBen Gardon {
11917ee131e3SVipin Sharma 	u64 new_spte;
1192f8e14497SBen Gardon 
11933039bcc7SSean Christopherson 	/* If we have a non-accessed entry we don't need to change the pte. */
11943039bcc7SSean Christopherson 	if (!is_accessed_spte(iter->old_spte))
11953039bcc7SSean Christopherson 		return false;
11967cca2d0bSBen Gardon 
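	/*
	 * If A/D bits are in use, clearing the Accessed bit is sufficient.
	 * Otherwise the SPTE must be placed in the access-tracked state below.
	 */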
11977ee131e3SVipin Sharma 	if (spte_ad_enabled(iter->old_spte)) {
11987ee131e3SVipin Sharma 		iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep,
11997ee131e3SVipin Sharma 							 iter->old_spte,
12007ee131e3SVipin Sharma 							 shadow_accessed_mask,
12017ee131e3SVipin Sharma 							 iter->level);
12027ee131e3SVipin Sharma 		new_spte = iter->old_spte & ~shadow_accessed_mask;
1203f8e14497SBen Gardon 	} else {
1204f8e14497SBen Gardon 		/*
1205f8e14497SBen Gardon 		 * Capture the dirty status of the page, so that it doesn't get
1206f8e14497SBen Gardon 		 * lost when the SPTE is marked for access tracking.
1207f8e14497SBen Gardon 		 */
12087ee131e3SVipin Sharma 		if (is_writable_pte(iter->old_spte))
12097ee131e3SVipin Sharma 			kvm_set_pfn_dirty(spte_to_pfn(iter->old_spte));
1210f8e14497SBen Gardon 
12117ee131e3SVipin Sharma 		new_spte = mark_spte_for_access_track(iter->old_spte);
12127ee131e3SVipin Sharma 		iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep,
12137ee131e3SVipin Sharma 							iter->old_spte, new_spte,
12147ee131e3SVipin Sharma 							iter->level);
1215f8e14497SBen Gardon 	}
1216f8e14497SBen Gardon 
1217891f1159SVipin Sharma 	trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
1218891f1159SVipin Sharma 				       iter->old_spte, new_spte);
12193039bcc7SSean Christopherson 	return true;
1220f8e14497SBen Gardon }
1221f8e14497SBen Gardon 
12223039bcc7SSean Christopherson bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1223f8e14497SBen Gardon {
12243039bcc7SSean Christopherson 	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
1225f8e14497SBen Gardon }
1226f8e14497SBen Gardon 
12273039bcc7SSean Christopherson static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
12283039bcc7SSean Christopherson 			 struct kvm_gfn_range *range)
1229f8e14497SBen Gardon {
12303039bcc7SSean Christopherson 	return is_accessed_spte(iter->old_spte);
1231f8e14497SBen Gardon }
1232f8e14497SBen Gardon 
12333039bcc7SSean Christopherson bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1234f8e14497SBen Gardon {
12353039bcc7SSean Christopherson 	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
12363039bcc7SSean Christopherson }
12373039bcc7SSean Christopherson 
12383039bcc7SSean Christopherson static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
12393039bcc7SSean Christopherson 			 struct kvm_gfn_range *range)
12403039bcc7SSean Christopherson {
12413039bcc7SSean Christopherson 	u64 new_spte;
12423039bcc7SSean Christopherson 
12433039bcc7SSean Christopherson 	/* Huge pages aren't expected to be modified without first being zapped. */
124420ba462dSSean Christopherson 	WARN_ON_ONCE(pte_huge(range->arg.pte) || range->start + 1 != range->end);
12453039bcc7SSean Christopherson 
12463039bcc7SSean Christopherson 	if (iter->level != PG_LEVEL_4K ||
12473039bcc7SSean Christopherson 	    !is_shadow_present_pte(iter->old_spte))
12483039bcc7SSean Christopherson 		return false;
12493039bcc7SSean Christopherson 
12503039bcc7SSean Christopherson 	/*
12513039bcc7SSean Christopherson 	 * Note, when changing a read-only SPTE, it's not strictly necessary to
12523039bcc7SSean Christopherson 	 * zero the SPTE before setting the new PFN, but doing so preserves the
12533039bcc7SSean Christopherson 	 * invariant that the PFN of a present leaf SPTE can never change.
125440fa907eSVipin Sharma 	 * See handle_changed_spte().
12553039bcc7SSean Christopherson 	 */
12560b7cc254SVipin Sharma 	tdp_mmu_iter_set_spte(kvm, iter, 0);
12573039bcc7SSean Christopherson 
12583e1efe2bSSean Christopherson 	if (!pte_write(range->arg.pte)) {
12593039bcc7SSean Christopherson 		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
12603e1efe2bSSean Christopherson 								  pte_pfn(range->arg.pte));
12613039bcc7SSean Christopherson 
12620b7cc254SVipin Sharma 		tdp_mmu_iter_set_spte(kvm, iter, new_spte);
12633039bcc7SSean Christopherson 	}
12643039bcc7SSean Christopherson 
12653039bcc7SSean Christopherson 	return true;
1266f8e14497SBen Gardon }
12671d8dd6b3SBen Gardon 
12681d8dd6b3SBen Gardon /*
12691d8dd6b3SBen Gardon  * Handle the changed_pte MMU notifier for the TDP MMU.
12701d8dd6b3SBen Gardon  * range->arg.pte holds the new pte_t mapping the HVA specified by the MMU
12711d8dd6b3SBen Gardon  * notifier.
12721d8dd6b3SBen Gardon  * Returns true if a flush is needed before releasing the MMU lock.
12731d8dd6b3SBen Gardon  */
12743039bcc7SSean Christopherson bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
12751d8dd6b3SBen Gardon {
127693fa50f6SSean Christopherson 	/*
127793fa50f6SSean Christopherson 	 * No need to handle the remote TLB flush under RCU protection, the
127893fa50f6SSean Christopherson 	 * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
127940fa907eSVipin Sharma 	 * shadow page. See the WARN on pfn_changed in handle_changed_spte().
128093fa50f6SSean Christopherson 	 */
128193fa50f6SSean Christopherson 	return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
12821d8dd6b3SBen Gardon }
12831d8dd6b3SBen Gardon 
1284a6a0b05dSBen Gardon /*
1285bedd9195SDavid Matlack  * Remove write access from all SPTEs at or above min_level that map GFNs
1286bedd9195SDavid Matlack  * [start, end). Returns true if an SPTE has been changed and the TLBs need to
1287bedd9195SDavid Matlack  * be flushed.
1288a6a0b05dSBen Gardon  */
1289a6a0b05dSBen Gardon static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1290a6a0b05dSBen Gardon 			     gfn_t start, gfn_t end, int min_level)
1291a6a0b05dSBen Gardon {
1292a6a0b05dSBen Gardon 	struct tdp_iter iter;
1293a6a0b05dSBen Gardon 	u64 new_spte;
1294a6a0b05dSBen Gardon 	bool spte_set = false;
1295a6a0b05dSBen Gardon 
12967cca2d0bSBen Gardon 	rcu_read_lock();
12977cca2d0bSBen Gardon 
1298a6a0b05dSBen Gardon 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1299a6a0b05dSBen Gardon 
130077aa6075SDavid Matlack 	for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
130124ae4cfaSBen Gardon retry:
130224ae4cfaSBen Gardon 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
13031af4a960SBen Gardon 			continue;
13041af4a960SBen Gardon 
1305a6a0b05dSBen Gardon 		if (!is_shadow_present_pte(iter.old_spte) ||
13060f99ee2cSBen Gardon 		    !is_last_spte(iter.old_spte, iter.level) ||
13070f99ee2cSBen Gardon 		    !(iter.old_spte & PT_WRITABLE_MASK))
1308a6a0b05dSBen Gardon 			continue;
1309a6a0b05dSBen Gardon 
1310a6a0b05dSBen Gardon 		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1311a6a0b05dSBen Gardon 
13123e72c791SDavid Matlack 		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
131324ae4cfaSBen Gardon 			goto retry;
13143255530aSDavid Matlack 
1315a6a0b05dSBen Gardon 		spte_set = true;
1316a6a0b05dSBen Gardon 	}
13177cca2d0bSBen Gardon 
13187cca2d0bSBen Gardon 	rcu_read_unlock();
1319a6a0b05dSBen Gardon 	return spte_set;
1320a6a0b05dSBen Gardon }
1321a6a0b05dSBen Gardon 
1322a6a0b05dSBen Gardon /*
1323a6a0b05dSBen Gardon  * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1324a6a0b05dSBen Gardon  * only affect leaf SPTEs down to min_level.
1325a6a0b05dSBen Gardon  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1326a6a0b05dSBen Gardon  */
1327269e9552SHamza Mahfooz bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
1328269e9552SHamza Mahfooz 			     const struct kvm_memory_slot *slot, int min_level)
1329a6a0b05dSBen Gardon {
1330a6a0b05dSBen Gardon 	struct kvm_mmu_page *root;
1331a6a0b05dSBen Gardon 	bool spte_set = false;
1332a6a0b05dSBen Gardon 
133324ae4cfaSBen Gardon 	lockdep_assert_held_read(&kvm->mmu_lock);
1334a6a0b05dSBen Gardon 
1335d62007edSSean Christopherson 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1336a6a0b05dSBen Gardon 		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1337a6a0b05dSBen Gardon 			     slot->base_gfn + slot->npages, min_level);
1338a6a0b05dSBen Gardon 
1339a6a0b05dSBen Gardon 	return spte_set;
1340a6a0b05dSBen Gardon }
1341a6a0b05dSBen Gardon 
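/*
 * Allocate a zeroed shadow page (header plus page table page) for huge page
 * splitting, using the caller-provided GFP flags.
 */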
1342a3fe5dbdSDavid Matlack static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
1343a3fe5dbdSDavid Matlack {
1344a3fe5dbdSDavid Matlack 	struct kvm_mmu_page *sp;
1345a3fe5dbdSDavid Matlack 
1346a3fe5dbdSDavid Matlack 	gfp |= __GFP_ZERO;
1347a3fe5dbdSDavid Matlack 
1348a3fe5dbdSDavid Matlack 	sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
1349a3fe5dbdSDavid Matlack 	if (!sp)
1350a3fe5dbdSDavid Matlack 		return NULL;
1351a3fe5dbdSDavid Matlack 
1352a3fe5dbdSDavid Matlack 	sp->spt = (void *)__get_free_page(gfp);
1353a3fe5dbdSDavid Matlack 	if (!sp->spt) {
1354a3fe5dbdSDavid Matlack 		kmem_cache_free(mmu_page_header_cache, sp);
1355a3fe5dbdSDavid Matlack 		return NULL;
1356a3fe5dbdSDavid Matlack 	}
1357a3fe5dbdSDavid Matlack 
1358a3fe5dbdSDavid Matlack 	return sp;
1359a3fe5dbdSDavid Matlack }
1360a3fe5dbdSDavid Matlack 
1361a3fe5dbdSDavid Matlack static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
1362cb00a70bSDavid Matlack 						       struct tdp_iter *iter,
1363cb00a70bSDavid Matlack 						       bool shared)
1364a3fe5dbdSDavid Matlack {
1365a3fe5dbdSDavid Matlack 	struct kvm_mmu_page *sp;
1366a3fe5dbdSDavid Matlack 
1367a3fe5dbdSDavid Matlack 	/*
1368a3fe5dbdSDavid Matlack 	 * Since we are allocating while under the MMU lock we have to be
1369a3fe5dbdSDavid Matlack 	 * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
1370a3fe5dbdSDavid Matlack 	 * reclaim and to avoid making any filesystem callbacks (which can end
1371a3fe5dbdSDavid Matlack 	 * up invoking KVM MMU notifiers, resulting in a deadlock).
1372a3fe5dbdSDavid Matlack 	 *
1373a3fe5dbdSDavid Matlack 	 * If this allocation fails we drop the lock and retry with reclaim
1374a3fe5dbdSDavid Matlack 	 * allowed.
1375a3fe5dbdSDavid Matlack 	 */
1376a3fe5dbdSDavid Matlack 	sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
1377a3fe5dbdSDavid Matlack 	if (sp)
1378a3fe5dbdSDavid Matlack 		return sp;
1379a3fe5dbdSDavid Matlack 
1380a3fe5dbdSDavid Matlack 	rcu_read_unlock();
1381cb00a70bSDavid Matlack 
1382cb00a70bSDavid Matlack 	if (shared)
1383a3fe5dbdSDavid Matlack 		read_unlock(&kvm->mmu_lock);
1384cb00a70bSDavid Matlack 	else
1385cb00a70bSDavid Matlack 		write_unlock(&kvm->mmu_lock);
1386a3fe5dbdSDavid Matlack 
1387a3fe5dbdSDavid Matlack 	iter->yielded = true;
1388a3fe5dbdSDavid Matlack 	sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
1389a3fe5dbdSDavid Matlack 
1390cb00a70bSDavid Matlack 	if (shared)
1391a3fe5dbdSDavid Matlack 		read_lock(&kvm->mmu_lock);
1392cb00a70bSDavid Matlack 	else
1393cb00a70bSDavid Matlack 		write_lock(&kvm->mmu_lock);
1394cb00a70bSDavid Matlack 
1395a3fe5dbdSDavid Matlack 	rcu_read_lock();
1396a3fe5dbdSDavid Matlack 
1397a3fe5dbdSDavid Matlack 	return sp;
1398a3fe5dbdSDavid Matlack }
1399a3fe5dbdSDavid Matlack 
1400c4b33d28SDavid Matlack /* Note, the caller is responsible for initializing @sp. */
1401cb00a70bSDavid Matlack static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1402cb00a70bSDavid Matlack 				   struct kvm_mmu_page *sp, bool shared)
1403a3fe5dbdSDavid Matlack {
1404a3fe5dbdSDavid Matlack 	const u64 huge_spte = iter->old_spte;
1405a3fe5dbdSDavid Matlack 	const int level = iter->level;
1406a3fe5dbdSDavid Matlack 	int ret, i;
1407a3fe5dbdSDavid Matlack 
1408a3fe5dbdSDavid Matlack 	/*
1409a3fe5dbdSDavid Matlack 	 * No need for atomics when writing to sp->spt since the page table has
1410a3fe5dbdSDavid Matlack 	 * not been linked in yet and thus is not reachable from any other CPU.
1411a3fe5dbdSDavid Matlack 	 */
14122ca3129eSSean Christopherson 	for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
141347855da0SDavid Matlack 		sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i);
1414a3fe5dbdSDavid Matlack 
1415a3fe5dbdSDavid Matlack 	/*
1416a3fe5dbdSDavid Matlack 	 * Replace the huge spte with a pointer to the populated lower level
1417a3fe5dbdSDavid Matlack 	 * page table. Since we are making this change without a TLB flush vCPUs
1418a3fe5dbdSDavid Matlack 	 * will see a mix of the split mappings and the original huge mapping,
1419a3fe5dbdSDavid Matlack 	 * depending on what's currently in their TLB. This is fine from a
1420a3fe5dbdSDavid Matlack 	 * correctness standpoint since the translation will be the same either
1421a3fe5dbdSDavid Matlack 	 * way.
1422a3fe5dbdSDavid Matlack 	 */
142361f94478SSean Christopherson 	ret = tdp_mmu_link_sp(kvm, iter, sp, shared);
1424a3fe5dbdSDavid Matlack 	if (ret)
1425e0b728b1SDavid Matlack 		goto out;
1426a3fe5dbdSDavid Matlack 
1427a3fe5dbdSDavid Matlack 	/*
1428a3fe5dbdSDavid Matlack 	 * tdp_mmu_link_sp() will handle subtracting the huge page we
1429a3fe5dbdSDavid Matlack 	 * are overwriting from the page stats. But we have to manually update
1430a3fe5dbdSDavid Matlack 	 * the page stats with the new present child pages.
1431a3fe5dbdSDavid Matlack 	 */
14322ca3129eSSean Christopherson 	kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE);
1433a3fe5dbdSDavid Matlack 
1434e0b728b1SDavid Matlack out:
1435e0b728b1SDavid Matlack 	trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
1436e0b728b1SDavid Matlack 	return ret;
1437a3fe5dbdSDavid Matlack }
1438a3fe5dbdSDavid Matlack 
1439a3fe5dbdSDavid Matlack static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
1440a3fe5dbdSDavid Matlack 					 struct kvm_mmu_page *root,
1441a3fe5dbdSDavid Matlack 					 gfn_t start, gfn_t end,
1442cb00a70bSDavid Matlack 					 int target_level, bool shared)
1443a3fe5dbdSDavid Matlack {
1444a3fe5dbdSDavid Matlack 	struct kvm_mmu_page *sp = NULL;
1445a3fe5dbdSDavid Matlack 	struct tdp_iter iter;
1446a3fe5dbdSDavid Matlack 	int ret = 0;
1447a3fe5dbdSDavid Matlack 
1448a3fe5dbdSDavid Matlack 	rcu_read_lock();
1449a3fe5dbdSDavid Matlack 
1450a3fe5dbdSDavid Matlack 	/*
1451a3fe5dbdSDavid Matlack 	 * Traverse the page table splitting all huge pages above the target
1452a3fe5dbdSDavid Matlack 	 * level into one lower level. For example, if we encounter a 1GB page
1453a3fe5dbdSDavid Matlack 	 * we split it into 512 2MB pages.
1454a3fe5dbdSDavid Matlack 	 *
1455a3fe5dbdSDavid Matlack 	 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
1456a3fe5dbdSDavid Matlack 	 * to visit an SPTE before ever visiting its children, which means we
1457a3fe5dbdSDavid Matlack 	 * will correctly recursively split huge pages that are more than one
1458a3fe5dbdSDavid Matlack 	 * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
1459a3fe5dbdSDavid Matlack 	 * and then splitting each of those to 512 4KB pages).
1460a3fe5dbdSDavid Matlack 	 */
1461a3fe5dbdSDavid Matlack 	for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
1462a3fe5dbdSDavid Matlack retry:
1463cb00a70bSDavid Matlack 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
1464a3fe5dbdSDavid Matlack 			continue;
1465a3fe5dbdSDavid Matlack 
1466a3fe5dbdSDavid Matlack 		if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
1467a3fe5dbdSDavid Matlack 			continue;
1468a3fe5dbdSDavid Matlack 
1469a3fe5dbdSDavid Matlack 		if (!sp) {
1470cb00a70bSDavid Matlack 			sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
1471a3fe5dbdSDavid Matlack 			if (!sp) {
1472a3fe5dbdSDavid Matlack 				ret = -ENOMEM;
1473e0b728b1SDavid Matlack 				trace_kvm_mmu_split_huge_page(iter.gfn,
1474e0b728b1SDavid Matlack 							      iter.old_spte,
1475e0b728b1SDavid Matlack 							      iter.level, ret);
1476a3fe5dbdSDavid Matlack 				break;
1477a3fe5dbdSDavid Matlack 			}
1478a3fe5dbdSDavid Matlack 
1479a3fe5dbdSDavid Matlack 			if (iter.yielded)
1480a3fe5dbdSDavid Matlack 				continue;
1481a3fe5dbdSDavid Matlack 		}
1482a3fe5dbdSDavid Matlack 
1483c4b33d28SDavid Matlack 		tdp_mmu_init_child_sp(sp, &iter);
1484c4b33d28SDavid Matlack 
1485cb00a70bSDavid Matlack 		if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
1486a3fe5dbdSDavid Matlack 			goto retry;
1487a3fe5dbdSDavid Matlack 
1488a3fe5dbdSDavid Matlack 		sp = NULL;
1489a3fe5dbdSDavid Matlack 	}
1490a3fe5dbdSDavid Matlack 
1491a3fe5dbdSDavid Matlack 	rcu_read_unlock();
1492a3fe5dbdSDavid Matlack 
1493a3fe5dbdSDavid Matlack 	/*
1494a3fe5dbdSDavid Matlack 	 * It's possible to exit the loop having never used the last sp if, for
1495a3fe5dbdSDavid Matlack 	 * example, a vCPU doing HugePage NX splitting wins the race and
1496a3fe5dbdSDavid Matlack 	 * installs its own sp in place of the last sp we tried to split.
1497a3fe5dbdSDavid Matlack 	 */
1498a3fe5dbdSDavid Matlack 	if (sp)
1499a3fe5dbdSDavid Matlack 		tdp_mmu_free_sp(sp);
1500a3fe5dbdSDavid Matlack 
1501a3fe5dbdSDavid Matlack 	return ret;
1502a3fe5dbdSDavid Matlack }
1503a3fe5dbdSDavid Matlack 
1504cb00a70bSDavid Matlack 
1505a3fe5dbdSDavid Matlack /*
1506a3fe5dbdSDavid Matlack  * Try to split all huge pages mapped by the TDP MMU down to the target level.
1507a3fe5dbdSDavid Matlack  */
1508a3fe5dbdSDavid Matlack void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
1509a3fe5dbdSDavid Matlack 				      const struct kvm_memory_slot *slot,
1510a3fe5dbdSDavid Matlack 				      gfn_t start, gfn_t end,
1511cb00a70bSDavid Matlack 				      int target_level, bool shared)
1512a3fe5dbdSDavid Matlack {
1513a3fe5dbdSDavid Matlack 	struct kvm_mmu_page *root;
1514a3fe5dbdSDavid Matlack 	int r = 0;
1515a3fe5dbdSDavid Matlack 
1516cb00a70bSDavid Matlack 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
1517a3fe5dbdSDavid Matlack 
15187c554d8eSPaolo Bonzini 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
1519cb00a70bSDavid Matlack 		r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
1520a3fe5dbdSDavid Matlack 		if (r) {
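			/*
			 * The iterator holds a reference to the current root;
			 * drop it manually when breaking out of the walk
			 * early.
			 */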
1521cb00a70bSDavid Matlack 			kvm_tdp_mmu_put_root(kvm, root, shared);
1522a3fe5dbdSDavid Matlack 			break;
1523a3fe5dbdSDavid Matlack 		}
1524a3fe5dbdSDavid Matlack 	}
1525a3fe5dbdSDavid Matlack }
1526a3fe5dbdSDavid Matlack 
1527a6a0b05dSBen Gardon /*
1528a6a0b05dSBen Gardon  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1529a6a0b05dSBen Gardon  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1530a6a0b05dSBen Gardon  * If AD bits are not enabled, this will require clearing the writable bit on
1531a6a0b05dSBen Gardon  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1532a6a0b05dSBen Gardon  * be flushed.
1533a6a0b05dSBen Gardon  */
1534a6a0b05dSBen Gardon static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1535a6a0b05dSBen Gardon 			   gfn_t start, gfn_t end)
1536a6a0b05dSBen Gardon {
1537697c89beSVipin Sharma 	u64 dbit = kvm_ad_enabled() ? shadow_dirty_mask : PT_WRITABLE_MASK;
1538a6a0b05dSBen Gardon 	struct tdp_iter iter;
1539a6a0b05dSBen Gardon 	bool spte_set = false;
1540a6a0b05dSBen Gardon 
15417cca2d0bSBen Gardon 	rcu_read_lock();
15427cca2d0bSBen Gardon 
1543a6a0b05dSBen Gardon 	tdp_root_for_each_leaf_pte(iter, root, start, end) {
154424ae4cfaSBen Gardon retry:
154524ae4cfaSBen Gardon 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
15461af4a960SBen Gardon 			continue;
15471af4a960SBen Gardon 
15483354ef5aSSean Christopherson 		if (!is_shadow_present_pte(iter.old_spte))
15493354ef5aSSean Christopherson 			continue;
15503354ef5aSSean Christopherson 
15510fe6370eSSean Christopherson 		KVM_MMU_WARN_ON(kvm_ad_enabled() &&
15525982a539SVipin Sharma 				spte_ad_need_write_protect(iter.old_spte));
15535982a539SVipin Sharma 
1554697c89beSVipin Sharma 		if (!(iter.old_spte & dbit))
1555a6a0b05dSBen Gardon 			continue;
1556a6a0b05dSBen Gardon 
1557697c89beSVipin Sharma 		if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit))
155824ae4cfaSBen Gardon 			goto retry;
15593255530aSDavid Matlack 
1560a6a0b05dSBen Gardon 		spte_set = true;
1561a6a0b05dSBen Gardon 	}
15627cca2d0bSBen Gardon 
15637cca2d0bSBen Gardon 	rcu_read_unlock();
1564a6a0b05dSBen Gardon 	return spte_set;
1565a6a0b05dSBen Gardon }
1566a6a0b05dSBen Gardon 
1567a6a0b05dSBen Gardon /*
1568a6a0b05dSBen Gardon  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1569a6a0b05dSBen Gardon  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1570a6a0b05dSBen Gardon  * If AD bits are not enabled, this will require clearing the writable bit on
1571a6a0b05dSBen Gardon  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1572a6a0b05dSBen Gardon  * be flushed.
1573a6a0b05dSBen Gardon  */
1574269e9552SHamza Mahfooz bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
1575269e9552SHamza Mahfooz 				  const struct kvm_memory_slot *slot)
1576a6a0b05dSBen Gardon {
1577a6a0b05dSBen Gardon 	struct kvm_mmu_page *root;
1578a6a0b05dSBen Gardon 	bool spte_set = false;
1579a6a0b05dSBen Gardon 
158024ae4cfaSBen Gardon 	lockdep_assert_held_read(&kvm->mmu_lock);
1581a6a0b05dSBen Gardon 
1582d62007edSSean Christopherson 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1583a6a0b05dSBen Gardon 		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1584a6a0b05dSBen Gardon 				slot->base_gfn + slot->npages);
1585a6a0b05dSBen Gardon 
1586a6a0b05dSBen Gardon 	return spte_set;
1587a6a0b05dSBen Gardon }
1588a6a0b05dSBen Gardon 
1589a6a0b05dSBen Gardon /*
1590a6a0b05dSBen Gardon  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1591a6a0b05dSBen Gardon  * set in mask, starting at gfn. The given memslot is expected to contain all
1592a6a0b05dSBen Gardon  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1593a6a0b05dSBen Gardon  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1594a6a0b05dSBen Gardon  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1595a6a0b05dSBen Gardon  */
1596a6a0b05dSBen Gardon static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1597a6a0b05dSBen Gardon 				  gfn_t gfn, unsigned long mask, bool wrprot)
1598a6a0b05dSBen Gardon {
1599697c89beSVipin Sharma 	u64 dbit = (wrprot || !kvm_ad_enabled()) ? PT_WRITABLE_MASK :
1600697c89beSVipin Sharma 						   shadow_dirty_mask;
1601a6a0b05dSBen Gardon 	struct tdp_iter iter;
1602a6a0b05dSBen Gardon 
160391303f80SLike Xu 	lockdep_assert_held_write(&kvm->mmu_lock);
160491303f80SLike Xu 
16057cca2d0bSBen Gardon 	rcu_read_lock();
16067cca2d0bSBen Gardon 
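	/*
	 * The mask covers at most BITS_PER_LONG GFNs starting at @gfn; stop
	 * walking once all of the set bits have been processed.
	 */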
1607a6a0b05dSBen Gardon 	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1608a6a0b05dSBen Gardon 				    gfn + BITS_PER_LONG) {
1609a6a0b05dSBen Gardon 		if (!mask)
1610a6a0b05dSBen Gardon 			break;
1611a6a0b05dSBen Gardon 
16120fe6370eSSean Christopherson 		KVM_MMU_WARN_ON(kvm_ad_enabled() &&
16135982a539SVipin Sharma 				spte_ad_need_write_protect(iter.old_spte));
16145982a539SVipin Sharma 
1615a6a0b05dSBen Gardon 		if (iter.level > PG_LEVEL_4K ||
1616a6a0b05dSBen Gardon 		    !(mask & (1UL << (iter.gfn - gfn))))
1617a6a0b05dSBen Gardon 			continue;
1618a6a0b05dSBen Gardon 
1619f1b3b06aSBen Gardon 		mask &= ~(1UL << (iter.gfn - gfn));
1620f1b3b06aSBen Gardon 
1621697c89beSVipin Sharma 		if (!(iter.old_spte & dbit))
1622a6a0b05dSBen Gardon 			continue;
1623a6a0b05dSBen Gardon 
162489c313f2SVipin Sharma 		iter.old_spte = tdp_mmu_clear_spte_bits(iter.sptep,
162589c313f2SVipin Sharma 							iter.old_spte, dbit,
162689c313f2SVipin Sharma 							iter.level);
162789c313f2SVipin Sharma 
16281e0f4298SVipin Sharma 		trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level,
16291e0f4298SVipin Sharma 					       iter.old_spte,
16301e0f4298SVipin Sharma 					       iter.old_spte & ~dbit);
16311e0f4298SVipin Sharma 		kvm_set_pfn_dirty(spte_to_pfn(iter.old_spte));
1632a6a0b05dSBen Gardon 	}
16337cca2d0bSBen Gardon 
16347cca2d0bSBen Gardon 	rcu_read_unlock();
1635a6a0b05dSBen Gardon }
1636a6a0b05dSBen Gardon 
1637a6a0b05dSBen Gardon /*
1638a6a0b05dSBen Gardon  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1639a6a0b05dSBen Gardon  * set in mask, starting at gfn. The given memslot is expected to contain all
1640a6a0b05dSBen Gardon  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1641a6a0b05dSBen Gardon  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1642a6a0b05dSBen Gardon  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1643a6a0b05dSBen Gardon  */
1644a6a0b05dSBen Gardon void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1645a6a0b05dSBen Gardon 				       struct kvm_memory_slot *slot,
1646a6a0b05dSBen Gardon 				       gfn_t gfn, unsigned long mask,
1647a6a0b05dSBen Gardon 				       bool wrprot)
1648a6a0b05dSBen Gardon {
1649a6a0b05dSBen Gardon 	struct kvm_mmu_page *root;
1650a6a0b05dSBen Gardon 
1651a3f15bdaSSean Christopherson 	for_each_tdp_mmu_root(kvm, root, slot->as_id)
1652a6a0b05dSBen Gardon 		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1653a6a0b05dSBen Gardon }
1654a6a0b05dSBen Gardon 
16554b85c921SSean Christopherson static void zap_collapsible_spte_range(struct kvm *kvm,
165614881998SBen Gardon 				       struct kvm_mmu_page *root,
16574b85c921SSean Christopherson 				       const struct kvm_memory_slot *slot)
165814881998SBen Gardon {
16599eba50f8SSean Christopherson 	gfn_t start = slot->base_gfn;
16609eba50f8SSean Christopherson 	gfn_t end = start + slot->npages;
166114881998SBen Gardon 	struct tdp_iter iter;
16625ba7c4c6SBen Gardon 	int max_mapping_level;
166314881998SBen Gardon 
16647cca2d0bSBen Gardon 	rcu_read_lock();
16657cca2d0bSBen Gardon 
166685f44f8cSSean Christopherson 	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) {
166785f44f8cSSean Christopherson retry:
16684b85c921SSean Christopherson 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
16691af4a960SBen Gardon 			continue;
16701af4a960SBen Gardon 
167185f44f8cSSean Christopherson 		if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
167285f44f8cSSean Christopherson 		    !is_shadow_present_pte(iter.old_spte))
167385f44f8cSSean Christopherson 			continue;
167485f44f8cSSean Christopherson 
167585f44f8cSSean Christopherson 		/*
167685f44f8cSSean Christopherson 		 * Don't zap leaf SPTEs; if a leaf SPTE could be replaced with
167785f44f8cSSean Christopherson 		 * a large page, then its parent would have been zapped
167885f44f8cSSean Christopherson 		 * instead of stepping down.
167985f44f8cSSean Christopherson 		 */
168085f44f8cSSean Christopherson 		if (is_last_spte(iter.old_spte, iter.level))
168185f44f8cSSean Christopherson 			continue;
168285f44f8cSSean Christopherson 
168385f44f8cSSean Christopherson 		/*
168485f44f8cSSean Christopherson 		 * If iter.gfn resides outside of the slot, i.e. the page for
168585f44f8cSSean Christopherson 		 * the current level overlaps but is not contained by the slot,
168685f44f8cSSean Christopherson 		 * then the SPTE can't be made huge.  More importantly, trying
168785f44f8cSSean Christopherson 		 * to query that info from slot->arch.lpage_info will cause an
168885f44f8cSSean Christopherson 		 * out-of-bounds access.
168985f44f8cSSean Christopherson 		 */
169085f44f8cSSean Christopherson 		if (iter.gfn < start || iter.gfn >= end)
169114881998SBen Gardon 			continue;
169214881998SBen Gardon 
16935ba7c4c6SBen Gardon 		max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
1694a8ac499bSSean Christopherson 							      iter.gfn, PG_LEVEL_NUM);
169585f44f8cSSean Christopherson 		if (max_mapping_level < iter.level)
16965ba7c4c6SBen Gardon 			continue;
16975ba7c4c6SBen Gardon 
16984b85c921SSean Christopherson 		/* Note, a successful atomic zap also does a remote TLB flush. */
169985f44f8cSSean Christopherson 		if (tdp_mmu_zap_spte_atomic(kvm, &iter))
170085f44f8cSSean Christopherson 			goto retry;
17012db6f772SBen Gardon 	}
170214881998SBen Gardon 
17037cca2d0bSBen Gardon 	rcu_read_unlock();
170414881998SBen Gardon }
170514881998SBen Gardon 
170614881998SBen Gardon /*
170785f44f8cSSean Christopherson  * Zap non-leaf SPTEs (and free their associated page tables) which could
170885f44f8cSSean Christopherson  * be replaced by huge pages, for GFNs within the slot.
170914881998SBen Gardon  */
17104b85c921SSean Christopherson void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
17114b85c921SSean Christopherson 				       const struct kvm_memory_slot *slot)
171214881998SBen Gardon {
171314881998SBen Gardon 	struct kvm_mmu_page *root;
171414881998SBen Gardon 
17152db6f772SBen Gardon 	lockdep_assert_held_read(&kvm->mmu_lock);
171614881998SBen Gardon 
1717d62007edSSean Christopherson 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
17184b85c921SSean Christopherson 		zap_collapsible_spte_range(kvm, root, slot);
171914881998SBen Gardon }
172046044f72SBen Gardon 
172146044f72SBen Gardon /*
172246044f72SBen Gardon  * Removes write access on the last level SPTE mapping this GFN and unsets the
17235fc3424fSSean Christopherson  * MMU-writable bit to ensure future writes continue to be intercepted.
172446044f72SBen Gardon  * Returns true if an SPTE was set and a TLB flush is needed.
172546044f72SBen Gardon  */
172646044f72SBen Gardon static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
17273ad93562SKeqian Zhu 			      gfn_t gfn, int min_level)
172846044f72SBen Gardon {
172946044f72SBen Gardon 	struct tdp_iter iter;
173046044f72SBen Gardon 	u64 new_spte;
173146044f72SBen Gardon 	bool spte_set = false;
173246044f72SBen Gardon 
17333ad93562SKeqian Zhu 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
17343ad93562SKeqian Zhu 
17357cca2d0bSBen Gardon 	rcu_read_lock();
17367cca2d0bSBen Gardon 
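	/*
	 * Clear both the writable bit and the MMU-writable bit; leaving
	 * MMU-writable set would allow write access to be restored locklessly,
	 * e.g. by the fast page fault path.
	 */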
173777aa6075SDavid Matlack 	for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
17383ad93562SKeqian Zhu 		if (!is_shadow_present_pte(iter.old_spte) ||
17393ad93562SKeqian Zhu 		    !is_last_spte(iter.old_spte, iter.level))
17403ad93562SKeqian Zhu 			continue;
17413ad93562SKeqian Zhu 
174246044f72SBen Gardon 		new_spte = iter.old_spte &
17435fc3424fSSean Christopherson 			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
174446044f72SBen Gardon 
17457c8a4742SDavid Matlack 		if (new_spte == iter.old_spte)
17467c8a4742SDavid Matlack 			break;
17477c8a4742SDavid Matlack 
17480b7cc254SVipin Sharma 		tdp_mmu_iter_set_spte(kvm, &iter, new_spte);
174946044f72SBen Gardon 		spte_set = true;
175046044f72SBen Gardon 	}
175146044f72SBen Gardon 
17527cca2d0bSBen Gardon 	rcu_read_unlock();
17537cca2d0bSBen Gardon 
175446044f72SBen Gardon 	return spte_set;
175546044f72SBen Gardon }
175646044f72SBen Gardon 
175746044f72SBen Gardon /*
175846044f72SBen Gardon  * Removes write access on the last level SPTE mapping this GFN and unsets the
17595fc3424fSSean Christopherson  * MMU-writable bit to ensure future writes continue to be intercepted.
176046044f72SBen Gardon  * Returns true if an SPTE was set and a TLB flush is needed.
176146044f72SBen Gardon  */
176246044f72SBen Gardon bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
17633ad93562SKeqian Zhu 				   struct kvm_memory_slot *slot, gfn_t gfn,
17643ad93562SKeqian Zhu 				   int min_level)
176546044f72SBen Gardon {
176646044f72SBen Gardon 	struct kvm_mmu_page *root;
176746044f72SBen Gardon 	bool spte_set = false;
176846044f72SBen Gardon 
1769531810caSBen Gardon 	lockdep_assert_held_write(&kvm->mmu_lock);
1770a3f15bdaSSean Christopherson 	for_each_tdp_mmu_root(kvm, root, slot->as_id)
17713ad93562SKeqian Zhu 		spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
1772a3f15bdaSSean Christopherson 
177346044f72SBen Gardon 	return spte_set;
177446044f72SBen Gardon }
177546044f72SBen Gardon 
177695fb5b02SBen Gardon /*
177795fb5b02SBen Gardon  * Return the level of the lowest level SPTE added to sptes.
177895fb5b02SBen Gardon  * That SPTE may be non-present.
1779c5c8c7c5SDavid Matlack  *
1780c5c8c7c5SDavid Matlack  * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
178195fb5b02SBen Gardon  */
178239b4d43eSSean Christopherson int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
178339b4d43eSSean Christopherson 			 int *root_level)
178495fb5b02SBen Gardon {
178595fb5b02SBen Gardon 	struct tdp_iter iter;
178695fb5b02SBen Gardon 	struct kvm_mmu *mmu = vcpu->arch.mmu;
178795fb5b02SBen Gardon 	gfn_t gfn = addr >> PAGE_SHIFT;
17882aa07893SSean Christopherson 	int leaf = -1;
178995fb5b02SBen Gardon 
1790a972e29cSPaolo Bonzini 	*root_level = vcpu->arch.mmu->root_role.level;
179195fb5b02SBen Gardon 
179295fb5b02SBen Gardon 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
179395fb5b02SBen Gardon 		leaf = iter.level;
1794dde81f94SSean Christopherson 		sptes[leaf] = iter.old_spte;
179595fb5b02SBen Gardon 	}
179695fb5b02SBen Gardon 
179795fb5b02SBen Gardon 	return leaf;
179895fb5b02SBen Gardon }
17996e8eb206SDavid Matlack 
18006e8eb206SDavid Matlack /*
18016e8eb206SDavid Matlack  * Returns the last level spte pointer of the shadow page walk for the given
18026e8eb206SDavid Matlack  * gpa, and sets *spte to the spte value. This spte may be non-present. If no
18036e8eb206SDavid Matlack  * walk could be performed, returns NULL and *spte does not contain valid data.
18046e8eb206SDavid Matlack  *
18056e8eb206SDavid Matlack  * Contract:
18066e8eb206SDavid Matlack  *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
18076e8eb206SDavid Matlack  *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
18086e8eb206SDavid Matlack  *
18096e8eb206SDavid Matlack  * WARNING: This function is only intended to be called during fast_page_fault.
18106e8eb206SDavid Matlack  */
18116e8eb206SDavid Matlack u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
18126e8eb206SDavid Matlack 					u64 *spte)
18136e8eb206SDavid Matlack {
18146e8eb206SDavid Matlack 	struct tdp_iter iter;
18156e8eb206SDavid Matlack 	struct kvm_mmu *mmu = vcpu->arch.mmu;
18166e8eb206SDavid Matlack 	gfn_t gfn = addr >> PAGE_SHIFT;
18176e8eb206SDavid Matlack 	tdp_ptep_t sptep = NULL;
18186e8eb206SDavid Matlack 
18196e8eb206SDavid Matlack 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
18206e8eb206SDavid Matlack 		*spte = iter.old_spte;
18216e8eb206SDavid Matlack 		sptep = iter.sptep;
18226e8eb206SDavid Matlack 	}
18236e8eb206SDavid Matlack 
18246e8eb206SDavid Matlack 	/*
18256e8eb206SDavid Matlack 	 * Perform the rcu_dereference to get the raw spte pointer value since
18266e8eb206SDavid Matlack 	 * we are passing it up to fast_page_fault, which is shared with the
18276e8eb206SDavid Matlack 	 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
18286e8eb206SDavid Matlack 	 * annotation.
18296e8eb206SDavid Matlack 	 *
18306e8eb206SDavid Matlack 	 * This is safe since fast_page_fault obeys the contracts of this
18316e8eb206SDavid Matlack 	 * function as well as all TDP MMU contracts around modifying SPTEs
18326e8eb206SDavid Matlack 	 * outside of mmu_lock.
18336e8eb206SDavid Matlack 	 */
18346e8eb206SDavid Matlack 	return rcu_dereference(sptep);
18356e8eb206SDavid Matlack }
1836