xref: /openbmc/linux/arch/x86/kvm/mmu/tdp_mmu.c (revision edbdb43f)
1fe5db27dSBen Gardon // SPDX-License-Identifier: GPL-2.0
28d20bd63SSean Christopherson #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
3fe5db27dSBen Gardon 
402c00b3aSBen Gardon #include "mmu.h"
502c00b3aSBen Gardon #include "mmu_internal.h"
6bb18842eSBen Gardon #include "mmutrace.h"
72f2fad08SBen Gardon #include "tdp_iter.h"
8fe5db27dSBen Gardon #include "tdp_mmu.h"
902c00b3aSBen Gardon #include "spte.h"
10fe5db27dSBen Gardon 
119a77daacSBen Gardon #include <asm/cmpxchg.h>
1233dd3574SBen Gardon #include <trace/events/kvm.h>
1333dd3574SBen Gardon 
14fe5db27dSBen Gardon /* Initializes the TDP MMU for the VM, if enabled. */
15a1a39128SPaolo Bonzini int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
16fe5db27dSBen Gardon {
17a1a39128SPaolo Bonzini 	struct workqueue_struct *wq;
18a1a39128SPaolo Bonzini 
19a1a39128SPaolo Bonzini 	wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);
20a1a39128SPaolo Bonzini 	if (!wq)
21a1a39128SPaolo Bonzini 		return -ENOMEM;
22fe5db27dSBen Gardon 
2302c00b3aSBen Gardon 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
249a77daacSBen Gardon 	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
25a1a39128SPaolo Bonzini 	kvm->arch.tdp_mmu_zap_wq = wq;
26a1a39128SPaolo Bonzini 	return 1;
27fe5db27dSBen Gardon }
28fe5db27dSBen Gardon 
29226b8c8fSSean Christopherson /* Arbitrarily returns true so that this may be used in if statements. */
30226b8c8fSSean Christopherson static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
316103bc07SBen Gardon 							     bool shared)
326103bc07SBen Gardon {
336103bc07SBen Gardon 	if (shared)
346103bc07SBen Gardon 		lockdep_assert_held_read(&kvm->mmu_lock);
356103bc07SBen Gardon 	else
366103bc07SBen Gardon 		lockdep_assert_held_write(&kvm->mmu_lock);
37226b8c8fSSean Christopherson 
38226b8c8fSSean Christopherson 	return true;
396103bc07SBen Gardon }
406103bc07SBen Gardon 
41fe5db27dSBen Gardon void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
42fe5db27dSBen Gardon {
43*edbdb43fSSean Christopherson 	/*
44*edbdb43fSSean Christopherson 	 * Invalidate all roots, which, besides the obvious, schedules all roots
45*edbdb43fSSean Christopherson 	 * for zapping and thus puts the TDP MMU's reference to each root, i.e.
46*edbdb43fSSean Christopherson 	 * ultimately frees all roots.
47*edbdb43fSSean Christopherson 	 */
48*edbdb43fSSean Christopherson 	kvm_tdp_mmu_invalidate_all_roots(kvm);
49*edbdb43fSSean Christopherson 
50*edbdb43fSSean Christopherson 	/*
51*edbdb43fSSean Christopherson 	 * Destroying a workqueue also first flushes the workqueue, i.e. no
52*edbdb43fSSean Christopherson 	 * need to invoke kvm_tdp_mmu_zap_invalidated_roots().
53*edbdb43fSSean Christopherson 	 */
5422b94c4bSPaolo Bonzini 	destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
5522b94c4bSPaolo Bonzini 
56d25ceb92SSean Christopherson 	WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
5702c00b3aSBen Gardon 	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
587cca2d0bSBen Gardon 
597cca2d0bSBen Gardon 	/*
607cca2d0bSBen Gardon 	 * Ensure that all the outstanding RCU callbacks to free shadow pages
6122b94c4bSPaolo Bonzini 	 * can run before the VM is torn down.  Work items on tdp_mmu_zap_wq
6222b94c4bSPaolo Bonzini 	 * can call kvm_tdp_mmu_put_root and create new callbacks.
637cca2d0bSBen Gardon 	 */
647cca2d0bSBen Gardon 	rcu_barrier();
6502c00b3aSBen Gardon }
6602c00b3aSBen Gardon 
672bdb3d84SBen Gardon static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
68a889ea54SBen Gardon {
692bdb3d84SBen Gardon 	free_page((unsigned long)sp->spt);
702bdb3d84SBen Gardon 	kmem_cache_free(mmu_page_header_cache, sp);
71a889ea54SBen Gardon }
72a889ea54SBen Gardon 
73c0e64238SBen Gardon /*
74c0e64238SBen Gardon  * This is called through call_rcu in order to free TDP page table memory
75c0e64238SBen Gardon  * safely with respect to other kernel threads that may be operating on
76c0e64238SBen Gardon  * the memory.
77c0e64238SBen Gardon  * Because TDP MMU page table memory is only accessed in RCU read-side
78c0e64238SBen Gardon  * critical sections, and is freed only after a grace period, lockless
79c0e64238SBen Gardon  * walkers will never dereference the memory after it has been freed.
80c0e64238SBen Gardon  */
81c0e64238SBen Gardon static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
82a889ea54SBen Gardon {
83c0e64238SBen Gardon 	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
84c0e64238SBen Gardon 					       rcu_head);
85a889ea54SBen Gardon 
86c0e64238SBen Gardon 	tdp_mmu_free_sp(sp);
87a889ea54SBen Gardon }
88a889ea54SBen Gardon 
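/*
 * Illustrative sketch (not part of the original file): the read side that the
 * RCU-deferred free above pairs with.  Lockless walkers are assumed to wrap
 * their traversal in an RCU read-side critical section so that any page table
 * page they dereference cannot be freed out from under them, roughly:
 *
 *	rcu_read_lock();
 *	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
 *		... iter.old_spte and the tables backing it are safe to read ...
 *	}
 *	rcu_read_unlock();
 *
 * The exact call sites differ; this only shows the assumed locking pattern.
 */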
89e2b5b21dSSean Christopherson static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
90e2b5b21dSSean Christopherson 			     bool shared);
91e2b5b21dSSean Christopherson 
9222b94c4bSPaolo Bonzini static void tdp_mmu_zap_root_work(struct work_struct *work)
9322b94c4bSPaolo Bonzini {
9422b94c4bSPaolo Bonzini 	struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page,
9522b94c4bSPaolo Bonzini 						 tdp_mmu_async_work);
9622b94c4bSPaolo Bonzini 	struct kvm *kvm = root->tdp_mmu_async_data;
9722b94c4bSPaolo Bonzini 
9822b94c4bSPaolo Bonzini 	read_lock(&kvm->mmu_lock);
9922b94c4bSPaolo Bonzini 
10022b94c4bSPaolo Bonzini 	/*
10122b94c4bSPaolo Bonzini 	 * A TLB flush is not necessary as KVM performs a local TLB flush when
10222b94c4bSPaolo Bonzini 	 * allocating a new root (see kvm_mmu_load()), and when migrating vCPU
10322b94c4bSPaolo Bonzini 	 * allocating a new root (see kvm_mmu_load()), and when migrating a vCPU
10422b94c4bSPaolo Bonzini 	 * invalidates any paging-structure-cache entries, i.e. TLB entries for
10522b94c4bSPaolo Bonzini 	 * intermediate paging structures, that may be zapped, as such entries
10622b94c4bSPaolo Bonzini 	 * are associated with the ASID on both VMX and SVM.
10722b94c4bSPaolo Bonzini 	 */
10822b94c4bSPaolo Bonzini 	tdp_mmu_zap_root(kvm, root, true);
10922b94c4bSPaolo Bonzini 
11022b94c4bSPaolo Bonzini 	/*
11122b94c4bSPaolo Bonzini 	 * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for
11222b94c4bSPaolo Bonzini 	 * avoiding an infinite loop.  By design, the root is reachable while
11322b94c4bSPaolo Bonzini 	 * it's being asynchronously zapped, thus a different task can put its
11422b94c4bSPaolo Bonzini 	 * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an
11522b94c4bSPaolo Bonzini 	 * asynchronously zapped root is unavoidable.
11622b94c4bSPaolo Bonzini 	 */
11722b94c4bSPaolo Bonzini 	kvm_tdp_mmu_put_root(kvm, root, true);
11822b94c4bSPaolo Bonzini 
11922b94c4bSPaolo Bonzini 	read_unlock(&kvm->mmu_lock);
12022b94c4bSPaolo Bonzini }
12122b94c4bSPaolo Bonzini 
12222b94c4bSPaolo Bonzini static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root)
12322b94c4bSPaolo Bonzini {
12422b94c4bSPaolo Bonzini 	root->tdp_mmu_async_data = kvm;
12522b94c4bSPaolo Bonzini 	INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work);
12622b94c4bSPaolo Bonzini 	queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
12722b94c4bSPaolo Bonzini }
12822b94c4bSPaolo Bonzini 
1296103bc07SBen Gardon void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
1306103bc07SBen Gardon 			  bool shared)
1312bdb3d84SBen Gardon {
1326103bc07SBen Gardon 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
1332bdb3d84SBen Gardon 
13411cccf5cSBen Gardon 	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
1352bdb3d84SBen Gardon 		return;
1362bdb3d84SBen Gardon 
1378351779cSPaolo Bonzini 	/*
138*edbdb43fSSean Christopherson 	 * The TDP MMU itself holds a reference to each root until the root is
139*edbdb43fSSean Christopherson 	 * explicitly invalidated, i.e. the final reference should never be
140*edbdb43fSSean Christopherson 	 * put for a valid root.
1418351779cSPaolo Bonzini 	 */
142*edbdb43fSSean Christopherson 	KVM_BUG_ON(!is_tdp_mmu_page(root) || !root->role.invalid, kvm);
1438351779cSPaolo Bonzini 
144c0e64238SBen Gardon 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
145c0e64238SBen Gardon 	list_del_rcu(&root->link);
146c0e64238SBen Gardon 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
147c0e64238SBen Gardon 	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
148a889ea54SBen Gardon }
149a889ea54SBen Gardon 
150cfc10997SBen Gardon /*
151d62007edSSean Christopherson  * Returns the next root after @prev_root (or the first root if @prev_root is
152d62007edSSean Christopherson  * NULL).  A reference to the returned root is acquired, and the reference to
153d62007edSSean Christopherson  * @prev_root is released (the caller obviously must hold a reference to
154d62007edSSean Christopherson  * @prev_root if it's non-NULL).
155d62007edSSean Christopherson  *
156d62007edSSean Christopherson  * If @only_valid is true, invalid roots are skipped.
157d62007edSSean Christopherson  *
158d62007edSSean Christopherson  * Returns NULL if the end of tdp_mmu_roots was reached.
159cfc10997SBen Gardon  */
160cfc10997SBen Gardon static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
1616103bc07SBen Gardon 					      struct kvm_mmu_page *prev_root,
162d62007edSSean Christopherson 					      bool shared, bool only_valid)
163a889ea54SBen Gardon {
164a889ea54SBen Gardon 	struct kvm_mmu_page *next_root;
165a889ea54SBen Gardon 
166c0e64238SBen Gardon 	rcu_read_lock();
167c0e64238SBen Gardon 
168cfc10997SBen Gardon 	if (prev_root)
169c0e64238SBen Gardon 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
170c0e64238SBen Gardon 						  &prev_root->link,
171c0e64238SBen Gardon 						  typeof(*prev_root), link);
172cfc10997SBen Gardon 	else
173c0e64238SBen Gardon 		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
174cfc10997SBen Gardon 						   typeof(*next_root), link);
175cfc10997SBen Gardon 
17604dc4e6cSSean Christopherson 	while (next_root) {
177d62007edSSean Christopherson 		if ((!only_valid || !next_root->role.invalid) &&
178ad6d6b94SJinrong Liang 		    kvm_tdp_mmu_get_root(next_root))
17904dc4e6cSSean Christopherson 			break;
18004dc4e6cSSean Christopherson 
181c0e64238SBen Gardon 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
182c0e64238SBen Gardon 				&next_root->link, typeof(*next_root), link);
18304dc4e6cSSean Christopherson 	}
184fb101293SBen Gardon 
185c0e64238SBen Gardon 	rcu_read_unlock();
186cfc10997SBen Gardon 
187cfc10997SBen Gardon 	if (prev_root)
1886103bc07SBen Gardon 		kvm_tdp_mmu_put_root(kvm, prev_root, shared);
189cfc10997SBen Gardon 
190a889ea54SBen Gardon 	return next_root;
191a889ea54SBen Gardon }
192a889ea54SBen Gardon 
193a889ea54SBen Gardon /*
194a889ea54SBen Gardon  * Note: this iterator gets and puts references to the roots it iterates over.
195a889ea54SBen Gardon  * This makes it safe to release the MMU lock and yield within the loop, but
196a889ea54SBen Gardon  * if exiting the loop early, the caller must drop the reference to the most
197a889ea54SBen Gardon  * recent root. (Unless keeping a live reference is desirable.)
1986103bc07SBen Gardon  *
1996103bc07SBen Gardon  * If shared is set, this function is operating under the MMU lock in read
2006103bc07SBen Gardon  * mode. In the unlikely event that this thread must free a root, the lock
2016103bc07SBen Gardon  * will be temporarily dropped and reacquired in write mode.
202a889ea54SBen Gardon  */
203d62007edSSean Christopherson #define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
204d62007edSSean Christopherson 	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid);	\
205cfc10997SBen Gardon 	     _root;								\
206d62007edSSean Christopherson 	     _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid))	\
207614f6970SPaolo Bonzini 		if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) &&		\
208614f6970SPaolo Bonzini 		    kvm_mmu_page_as_id(_root) != _as_id) {			\
209a3f15bdaSSean Christopherson 		} else
210a889ea54SBen Gardon 
211d62007edSSean Christopherson #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
212d62007edSSean Christopherson 	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
213d62007edSSean Christopherson 
214614f6970SPaolo Bonzini #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)			\
215614f6970SPaolo Bonzini 	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false)
216d62007edSSean Christopherson 
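/*
 * Illustrative usage sketch (assumption, not lifted verbatim from a caller):
 * a typical walk over every root for an address space, where the loop body is
 * free to yield via tdp_mmu_iter_cond_resched(), mirrors what
 * kvm_tdp_mmu_zap_leafs() does below:
 *
 *	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
 *		flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);
 *
 * Breaking out of such a loop early leaks the reference on the current root
 * unless the caller drops it with kvm_tdp_mmu_put_root().
 */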
217226b8c8fSSean Christopherson /*
218226b8c8fSSean Christopherson  * Iterate over all TDP MMU roots.  Requires that mmu_lock be held for write,
219226b8c8fSSean Christopherson  * the implication being that any flow that holds mmu_lock for read is
220226b8c8fSSean Christopherson  * inherently yield-friendly and should use the yield-safe variant above.
221226b8c8fSSean Christopherson  * Holding mmu_lock for write obviates the need for RCU protection as the list
222226b8c8fSSean Christopherson  * is guaranteed to be stable.
223226b8c8fSSean Christopherson  */
224a3f15bdaSSean Christopherson #define for_each_tdp_mmu_root(_kvm, _root, _as_id)			\
225226b8c8fSSean Christopherson 	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)	\
226226b8c8fSSean Christopherson 		if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&	\
227226b8c8fSSean Christopherson 		    kvm_mmu_page_as_id(_root) != _as_id) {		\
228a3f15bdaSSean Christopherson 		} else
22902c00b3aSBen Gardon 
230a82070b6SDavid Matlack static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
23102c00b3aSBen Gardon {
23202c00b3aSBen Gardon 	struct kvm_mmu_page *sp;
23302c00b3aSBen Gardon 
23402c00b3aSBen Gardon 	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
23502c00b3aSBen Gardon 	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
236a82070b6SDavid Matlack 
237a82070b6SDavid Matlack 	return sp;
238a82070b6SDavid Matlack }
239a82070b6SDavid Matlack 
240c10743a1SSean Christopherson static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
241c10743a1SSean Christopherson 			    gfn_t gfn, union kvm_mmu_page_role role)
242a82070b6SDavid Matlack {
24355c510e2SSean Christopherson 	INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);
244428e9216SSean Christopherson 
24502c00b3aSBen Gardon 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
24602c00b3aSBen Gardon 
247a3aca4deSDavid Matlack 	sp->role = role;
24802c00b3aSBen Gardon 	sp->gfn = gfn;
249c10743a1SSean Christopherson 	sp->ptep = sptep;
25002c00b3aSBen Gardon 	sp->tdp_mmu_page = true;
25102c00b3aSBen Gardon 
25233dd3574SBen Gardon 	trace_kvm_mmu_get_page(sp, true);
25302c00b3aSBen Gardon }
25402c00b3aSBen Gardon 
255a82070b6SDavid Matlack static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
256a3aca4deSDavid Matlack 				  struct tdp_iter *iter)
257a3aca4deSDavid Matlack {
258a3aca4deSDavid Matlack 	struct kvm_mmu_page *parent_sp;
259a3aca4deSDavid Matlack 	union kvm_mmu_page_role role;
260a3aca4deSDavid Matlack 
261a3aca4deSDavid Matlack 	parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
262a3aca4deSDavid Matlack 
263a3aca4deSDavid Matlack 	role = parent_sp->role;
264a3aca4deSDavid Matlack 	role.level--;
265a3aca4deSDavid Matlack 
266c10743a1SSean Christopherson 	tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
267a3aca4deSDavid Matlack }
268a3aca4deSDavid Matlack 
2696e6ec584SSean Christopherson hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
27002c00b3aSBen Gardon {
2717a458f0eSPaolo Bonzini 	union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
27202c00b3aSBen Gardon 	struct kvm *kvm = vcpu->kvm;
27302c00b3aSBen Gardon 	struct kvm_mmu_page *root;
27402c00b3aSBen Gardon 
2756e6ec584SSean Christopherson 	lockdep_assert_held_write(&kvm->mmu_lock);
27602c00b3aSBen Gardon 
27704dc4e6cSSean Christopherson 	/*
27804dc4e6cSSean Christopherson 	 * Check for an existing root before allocating a new one.  Note, the
27904dc4e6cSSean Christopherson 	 * role check prevents consuming an invalid root.
28004dc4e6cSSean Christopherson 	 */
281a3f15bdaSSean Christopherson 	for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
282fb101293SBen Gardon 		if (root->role.word == role.word &&
283ad6d6b94SJinrong Liang 		    kvm_tdp_mmu_get_root(root))
2846e6ec584SSean Christopherson 			goto out;
28502c00b3aSBen Gardon 	}
28602c00b3aSBen Gardon 
287a82070b6SDavid Matlack 	root = tdp_mmu_alloc_sp(vcpu);
288c10743a1SSean Christopherson 	tdp_mmu_init_sp(root, NULL, 0, role);
289a82070b6SDavid Matlack 
290*edbdb43fSSean Christopherson 	/*
291*edbdb43fSSean Christopherson 	 * TDP MMU roots are kept until they are explicitly invalidated, either
292*edbdb43fSSean Christopherson 	 * by a memslot update or by the destruction of the VM.  Initialize the
293*edbdb43fSSean Christopherson 	 * refcount to two; one reference for the vCPU, and one reference for
294*edbdb43fSSean Christopherson 	 * the TDP MMU itself, which is held until the root is invalidated and
295*edbdb43fSSean Christopherson 	 * is ultimately put by tdp_mmu_zap_root_work().
296*edbdb43fSSean Christopherson 	 */
297*edbdb43fSSean Christopherson 	refcount_set(&root->tdp_mmu_root_count, 2);
29802c00b3aSBen Gardon 
299c0e64238SBen Gardon 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
300c0e64238SBen Gardon 	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
301c0e64238SBen Gardon 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
30202c00b3aSBen Gardon 
3036e6ec584SSean Christopherson out:
30402c00b3aSBen Gardon 	return __pa(root->spt);
305fe5db27dSBen Gardon }
3062f2fad08SBen Gardon 
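/*
 * Reference lifecycle sketch (illustrative, derived from the comments above
 * rather than copied from elsewhere in KVM):
 *
 *	kvm_tdp_mmu_get_vcpu_root_hpa()         refcount == 2
 *	  ... vCPU stops using the root,
 *	      its reference is put ...          refcount == 1
 *	kvm_tdp_mmu_invalidate_all_roots()      role.invalid = true, work queued
 *	tdp_mmu_zap_root_work()                 zaps, puts the final reference
 *	kvm_tdp_mmu_put_root()                  list_del_rcu() + call_rcu()
 *
 * The final put should therefore only ever happen on an invalid root, which
 * is what the KVM_BUG_ON() in kvm_tdp_mmu_put_root() asserts.
 */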
3072f2fad08SBen Gardon static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
3089a77daacSBen Gardon 				u64 old_spte, u64 new_spte, int level,
3099a77daacSBen Gardon 				bool shared);
3102f2fad08SBen Gardon 
31143a063caSYosry Ahmed static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
31243a063caSYosry Ahmed {
31343a063caSYosry Ahmed 	kvm_account_pgtable_pages((void *)sp->spt, +1);
314d25ceb92SSean Christopherson 	atomic64_inc(&kvm->arch.tdp_mmu_pages);
31543a063caSYosry Ahmed }
31643a063caSYosry Ahmed 
31743a063caSYosry Ahmed static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
31843a063caSYosry Ahmed {
31943a063caSYosry Ahmed 	kvm_account_pgtable_pages((void *)sp->spt, -1);
320d25ceb92SSean Christopherson 	atomic64_dec(&kvm->arch.tdp_mmu_pages);
32143a063caSYosry Ahmed }
32243a063caSYosry Ahmed 
3232f2fad08SBen Gardon /**
324c298a30cSDavid Matlack  * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
325a9442f59SBen Gardon  *
326a9442f59SBen Gardon  * @kvm: kvm instance
327a9442f59SBen Gardon  * @sp: the page to be removed
3289a77daacSBen Gardon  * @shared: This operation may not be running under the exclusive use of
3299a77daacSBen Gardon  *	    the MMU lock and the operation must synchronize with other
3309a77daacSBen Gardon  *	    threads that might be adding or removing pages.
331a9442f59SBen Gardon  */
332c298a30cSDavid Matlack static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
3339a77daacSBen Gardon 			      bool shared)
334a9442f59SBen Gardon {
33543a063caSYosry Ahmed 	tdp_unaccount_mmu_page(kvm, sp);
336d25ceb92SSean Christopherson 
337d25ceb92SSean Christopherson 	if (!sp->nx_huge_page_disallowed)
338d25ceb92SSean Christopherson 		return;
339d25ceb92SSean Christopherson 
3409a77daacSBen Gardon 	if (shared)
3419a77daacSBen Gardon 		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
3429a77daacSBen Gardon 	else
343a9442f59SBen Gardon 		lockdep_assert_held_write(&kvm->mmu_lock);
344a9442f59SBen Gardon 
34561f94478SSean Christopherson 	sp->nx_huge_page_disallowed = false;
34661f94478SSean Christopherson 	untrack_possible_nx_huge_page(kvm, sp);
3479a77daacSBen Gardon 
3489a77daacSBen Gardon 	if (shared)
3499a77daacSBen Gardon 		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
350a9442f59SBen Gardon }
351a9442f59SBen Gardon 
352a9442f59SBen Gardon /**
3530f53dfa3SDavid Matlack  * handle_removed_pt() - handle a page table removed from the TDP structure
354a066e61fSBen Gardon  *
355a066e61fSBen Gardon  * @kvm: kvm instance
356a066e61fSBen Gardon  * @pt: the page removed from the paging structure
3579a77daacSBen Gardon  * @shared: This operation may not be running under the exclusive use
3589a77daacSBen Gardon  *	    of the MMU lock and the operation must synchronize with other
3599a77daacSBen Gardon  *	    threads that might be modifying SPTEs.
360a066e61fSBen Gardon  *
361a066e61fSBen Gardon  * Given a page table that has been removed from the TDP paging structure,
362a066e61fSBen Gardon  * iterates through the page table to clear SPTEs and free child page tables.
36370fb3e41SBen Gardon  *
36470fb3e41SBen Gardon  * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
36570fb3e41SBen Gardon  * protection. Since this thread removed it from the paging structure,
36670fb3e41SBen Gardon  * this thread will be responsible for ensuring the page is freed. Hence the
36770fb3e41SBen Gardon  * early rcu_dereferences in the function.
368a066e61fSBen Gardon  */
3690f53dfa3SDavid Matlack static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
370a066e61fSBen Gardon {
37170fb3e41SBen Gardon 	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
372a066e61fSBen Gardon 	int level = sp->role.level;
373e25f0e0cSBen Gardon 	gfn_t base_gfn = sp->gfn;
374a066e61fSBen Gardon 	int i;
375a066e61fSBen Gardon 
376a066e61fSBen Gardon 	trace_kvm_mmu_prepare_zap_page(sp);
377a066e61fSBen Gardon 
378c298a30cSDavid Matlack 	tdp_mmu_unlink_sp(kvm, sp, shared);
379a066e61fSBen Gardon 
3802ca3129eSSean Christopherson 	for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
381ba3a6120SSean Christopherson 		tdp_ptep_t sptep = pt + i;
382574c3c55SBen Gardon 		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
383ba3a6120SSean Christopherson 		u64 old_spte;
3849a77daacSBen Gardon 
3859a77daacSBen Gardon 		if (shared) {
386e25f0e0cSBen Gardon 			/*
387e25f0e0cSBen Gardon 			 * Set the SPTE to a nonpresent value that other
388e25f0e0cSBen Gardon 			 * threads will not overwrite. If the SPTE was
389e25f0e0cSBen Gardon 			 * already marked as removed then another thread
390e25f0e0cSBen Gardon 			 * handling a page fault could overwrite it, so
391e25f0e0cSBen Gardon 			 * keep writing the removed SPTE value until it is
392e25f0e0cSBen Gardon 			 * observed to replace some other, non-removed value.
393e25f0e0cSBen Gardon 			 */
394e25f0e0cSBen Gardon 			for (;;) {
395ba3a6120SSean Christopherson 				old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
396ba3a6120SSean Christopherson 				if (!is_removed_spte(old_spte))
397e25f0e0cSBen Gardon 					break;
398e25f0e0cSBen Gardon 				cpu_relax();
399e25f0e0cSBen Gardon 			}
4009a77daacSBen Gardon 		} else {
4018df9f1afSSean Christopherson 			/*
4028df9f1afSSean Christopherson 			 * If the SPTE is not MMU-present, there is no backing
4038df9f1afSSean Christopherson 			 * page associated with the SPTE and so no side effects
4048df9f1afSSean Christopherson 			 * that need to be recorded, and exclusive ownership of
4058df9f1afSSean Christopherson 			 * mmu_lock ensures the SPTE can't be made present.
4068df9f1afSSean Christopherson 			 * Note, zapping MMIO SPTEs is also unnecessary as they
4078df9f1afSSean Christopherson 			 * are guarded by the memslots generation, not by being
4088df9f1afSSean Christopherson 			 * unreachable.
4098df9f1afSSean Christopherson 			 */
410ba3a6120SSean Christopherson 			old_spte = kvm_tdp_mmu_read_spte(sptep);
411ba3a6120SSean Christopherson 			if (!is_shadow_present_pte(old_spte))
4128df9f1afSSean Christopherson 				continue;
413e25f0e0cSBen Gardon 
414e25f0e0cSBen Gardon 			/*
415ba3a6120SSean Christopherson 			 * Use the common helper instead of a raw WRITE_ONCE as
416ba3a6120SSean Christopherson 			 * the SPTE needs to be updated atomically if it can be
417ba3a6120SSean Christopherson 			 * modified by a different vCPU outside of mmu_lock.
418ba3a6120SSean Christopherson 			 * Even though the parent SPTE is !PRESENT, the TLB
419ba3a6120SSean Christopherson 			 * hasn't yet been flushed, and both Intel and AMD
420ba3a6120SSean Christopherson 			 * document that A/D assists can use upper-level PxE
421ba3a6120SSean Christopherson 			 * entries that are cached in the TLB, i.e. the CPU can
422ba3a6120SSean Christopherson 			 * still access the page and mark it dirty.
423ba3a6120SSean Christopherson 			 *
424ba3a6120SSean Christopherson 			 * No retry is needed in the atomic update path as the
425ba3a6120SSean Christopherson 			 * sole concern is dropping a Dirty bit, i.e. no other
426ba3a6120SSean Christopherson 			 * task can zap/remove the SPTE as mmu_lock is held for
427ba3a6120SSean Christopherson 			 * write.  Marking the SPTE as a removed SPTE is not
428ba3a6120SSean Christopherson 			 * strictly necessary for the same reason, but using
429ba3a6120SSean Christopherson 			 * the removed SPTE value keeps the shared/exclusive
430ba3a6120SSean Christopherson 			 * paths consistent and allows the handle_changed_spte()
431ba3a6120SSean Christopherson 			 * call below to hardcode the new value to REMOVED_SPTE.
432ba3a6120SSean Christopherson 			 *
433ba3a6120SSean Christopherson 			 * Note, even though dropping a Dirty bit is the only
434ba3a6120SSean Christopherson 			 * scenario where a non-atomic update could result in a
435ba3a6120SSean Christopherson 			 * functional bug, simply checking the Dirty bit isn't
436ba3a6120SSean Christopherson 			 * sufficient as a fast page fault could read the upper
437ba3a6120SSean Christopherson 			 * level SPTE before it is zapped, and then make this
438ba3a6120SSean Christopherson 			 * target SPTE writable, resume the guest, and set the
439ba3a6120SSean Christopherson 			 * Dirty bit between reading the SPTE above and writing
440ba3a6120SSean Christopherson 			 * it here.
441e25f0e0cSBen Gardon 			 */
442ba3a6120SSean Christopherson 			old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
443ba3a6120SSean Christopherson 							  REMOVED_SPTE, level);
4449a77daacSBen Gardon 		}
445e25f0e0cSBen Gardon 		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
446ba3a6120SSean Christopherson 				    old_spte, REMOVED_SPTE, level, shared);
447a066e61fSBen Gardon 	}
448a066e61fSBen Gardon 
4497cca2d0bSBen Gardon 	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
450a066e61fSBen Gardon }
451a066e61fSBen Gardon 
452a066e61fSBen Gardon /**
45340fa907eSVipin Sharma  * handle_changed_spte - handle bookkeeping associated with an SPTE change
4542f2fad08SBen Gardon  * @kvm: kvm instance
4552f2fad08SBen Gardon  * @as_id: the address space of the paging structure the SPTE was a part of
4562f2fad08SBen Gardon  * @gfn: the base GFN that was mapped by the SPTE
4572f2fad08SBen Gardon  * @old_spte: The value of the SPTE before the change
4582f2fad08SBen Gardon  * @new_spte: The value of the SPTE after the change
4592f2fad08SBen Gardon  * @level: the level of the PT the SPTE is part of in the paging structure
4609a77daacSBen Gardon  * @shared: This operation may not be running under the exclusive use of
4619a77daacSBen Gardon  *	    the MMU lock and the operation must synchronize with other
4629a77daacSBen Gardon  *	    threads that might be modifying SPTEs.
4632f2fad08SBen Gardon  *
4641f997345SVipin Sharma  * Handle bookkeeping that might result from the modification of a SPTE.  Note,
4651f997345SVipin Sharma  * dirty logging updates are handled in common code, not here (see make_spte()
4661f997345SVipin Sharma  * and fast_pf_fix_direct_spte()).
4672f2fad08SBen Gardon  */
46840fa907eSVipin Sharma static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
4699a77daacSBen Gardon 				u64 old_spte, u64 new_spte, int level,
4709a77daacSBen Gardon 				bool shared)
4712f2fad08SBen Gardon {
4722f2fad08SBen Gardon 	bool was_present = is_shadow_present_pte(old_spte);
4732f2fad08SBen Gardon 	bool is_present = is_shadow_present_pte(new_spte);
4742f2fad08SBen Gardon 	bool was_leaf = was_present && is_last_spte(old_spte, level);
4752f2fad08SBen Gardon 	bool is_leaf = is_present && is_last_spte(new_spte, level);
4762f2fad08SBen Gardon 	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
4772f2fad08SBen Gardon 
4782f2fad08SBen Gardon 	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
4792f2fad08SBen Gardon 	WARN_ON(level < PG_LEVEL_4K);
480764388ceSSean Christopherson 	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
4812f2fad08SBen Gardon 
4822f2fad08SBen Gardon 	/*
4832f2fad08SBen Gardon 	 * If this warning were to trigger it would indicate that there was a
4842f2fad08SBen Gardon 	 * missing MMU notifier or a race with some notifier handler.
4852f2fad08SBen Gardon 	 * A present, leaf SPTE should never be directly replaced with another
486d9f6e12fSIngo Molnar 	 * present leaf SPTE pointing to a different PFN. A notifier handler
4872f2fad08SBen Gardon 	 * should be zapping the SPTE before the main MM's page table is
4882f2fad08SBen Gardon 	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
4892f2fad08SBen Gardon 	 * thread before replacement.
4902f2fad08SBen Gardon 	 */
4912f2fad08SBen Gardon 	if (was_leaf && is_leaf && pfn_changed) {
4922f2fad08SBen Gardon 		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
4932f2fad08SBen Gardon 		       "SPTE with another present leaf SPTE mapping a\n"
4942f2fad08SBen Gardon 		       "different PFN!\n"
4952f2fad08SBen Gardon 		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
4962f2fad08SBen Gardon 		       as_id, gfn, old_spte, new_spte, level);
4972f2fad08SBen Gardon 
4982f2fad08SBen Gardon 		/*
4992f2fad08SBen Gardon 		 * Crash the host to prevent error propagation and guest data
500d9f6e12fSIngo Molnar 		 * corruption.
5012f2fad08SBen Gardon 		 */
5022f2fad08SBen Gardon 		BUG();
5032f2fad08SBen Gardon 	}
5042f2fad08SBen Gardon 
5052f2fad08SBen Gardon 	if (old_spte == new_spte)
5062f2fad08SBen Gardon 		return;
5072f2fad08SBen Gardon 
508b9a98c34SBen Gardon 	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
509b9a98c34SBen Gardon 
510115111efSDavid Matlack 	if (is_leaf)
511115111efSDavid Matlack 		check_spte_writable_invariants(new_spte);
512115111efSDavid Matlack 
5132f2fad08SBen Gardon 	/*
5142f2fad08SBen Gardon 	 * The only times a SPTE should be changed from a non-present to
5152f2fad08SBen Gardon 	 * non-present state is when an MMIO entry is installed/modified/
5162f2fad08SBen Gardon 	 * removed. In that case, there is nothing to do here.
5172f2fad08SBen Gardon 	 */
5182f2fad08SBen Gardon 	if (!was_present && !is_present) {
5192f2fad08SBen Gardon 		/*
52008f07c80SBen Gardon 		 * If this change does not involve an MMIO SPTE or removed SPTE,
52108f07c80SBen Gardon 		 * it is unexpected. Log the change, though it should not
52208f07c80SBen Gardon 		 * impact the guest since both the former and current SPTEs
52308f07c80SBen Gardon 		 * are nonpresent.
5242f2fad08SBen Gardon 		 */
52508f07c80SBen Gardon 		if (WARN_ON(!is_mmio_spte(old_spte) &&
52608f07c80SBen Gardon 			    !is_mmio_spte(new_spte) &&
52708f07c80SBen Gardon 			    !is_removed_spte(new_spte)))
5282f2fad08SBen Gardon 			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
5292f2fad08SBen Gardon 			       "should not be replaced with another,\n"
5302f2fad08SBen Gardon 			       "different nonpresent SPTE, unless one or both\n"
53108f07c80SBen Gardon 			       "are MMIO SPTEs, or the new SPTE is\n"
53208f07c80SBen Gardon 			       "a temporary removed SPTE.\n"
5332f2fad08SBen Gardon 			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
5342f2fad08SBen Gardon 			       as_id, gfn, old_spte, new_spte, level);
5352f2fad08SBen Gardon 		return;
5362f2fad08SBen Gardon 	}
5372f2fad08SBen Gardon 
53871f51d2cSMingwei Zhang 	if (is_leaf != was_leaf)
53971f51d2cSMingwei Zhang 		kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
5402f2fad08SBen Gardon 
5412f2fad08SBen Gardon 	if (was_leaf && is_dirty_spte(old_spte) &&
54264bb2769SSean Christopherson 	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
5432f2fad08SBen Gardon 		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
5442f2fad08SBen Gardon 
5452f2fad08SBen Gardon 	/*
5462f2fad08SBen Gardon 	 * Recursively handle child PTs if the change removed a subtree from
547c8e5a0d0SSean Christopherson 	 * the paging structure.  Note the WARN on the PFN changing without the
548c8e5a0d0SSean Christopherson 	 * SPTE being converted to a hugepage (leaf) or being zapped.  Shadow
549c8e5a0d0SSean Christopherson 	 * pages are kernel allocations and should never be migrated.
5502f2fad08SBen Gardon 	 */
551c8e5a0d0SSean Christopherson 	if (was_present && !was_leaf &&
552c8e5a0d0SSean Christopherson 	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
5530f53dfa3SDavid Matlack 		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
5542f2fad08SBen Gardon 
55540fa907eSVipin Sharma 	if (was_leaf && is_accessed_spte(old_spte) &&
55640fa907eSVipin Sharma 	    (!is_present || !is_accessed_spte(new_spte) || pfn_changed))
55740fa907eSVipin Sharma 		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
5582f2fad08SBen Gardon }
559faaf05b0SBen Gardon 
560fe43fa2fSBen Gardon /*
5616ccf4438SPaolo Bonzini  * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
5626ccf4438SPaolo Bonzini  * and handle the associated bookkeeping.  Do not mark the page dirty
56324ae4cfaSBen Gardon  * in KVM's dirty bitmaps.
5649a77daacSBen Gardon  *
5653255530aSDavid Matlack  * If setting the SPTE fails because it has changed, iter->old_spte will be
5663255530aSDavid Matlack  * refreshed to the current value of the spte.
5673255530aSDavid Matlack  *
5689a77daacSBen Gardon  * @kvm: kvm instance
5699a77daacSBen Gardon  * @iter: a tdp_iter instance currently on the SPTE that should be set
5709a77daacSBen Gardon  * @new_spte: The value the SPTE should be set to
5713e72c791SDavid Matlack  * Return:
5723e72c791SDavid Matlack  * * 0      - If the SPTE was set.
5733e72c791SDavid Matlack  * * -EBUSY - If the SPTE cannot be set. In this case this function will have
5743e72c791SDavid Matlack  *            no side-effects other than setting iter->old_spte to the last
5753e72c791SDavid Matlack  *            known value of the spte.
5769a77daacSBen Gardon  */
5773e72c791SDavid Matlack static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
5789a77daacSBen Gardon 					  struct tdp_iter *iter,
5799a77daacSBen Gardon 					  u64 new_spte)
5809a77daacSBen Gardon {
5813255530aSDavid Matlack 	u64 *sptep = rcu_dereference(iter->sptep);
5823255530aSDavid Matlack 
583396fd74dSSean Christopherson 	/*
584396fd74dSSean Christopherson 	 * The caller is responsible for ensuring the old SPTE is not a REMOVED
585396fd74dSSean Christopherson 	 * SPTE.  KVM should never attempt to zap or manipulate a REMOVED SPTE,
586396fd74dSSean Christopherson 	 * and pre-checking before inserting a new SPTE is advantageous as it
587396fd74dSSean Christopherson 	 * avoids unnecessary work.
588396fd74dSSean Christopherson 	 */
589396fd74dSSean Christopherson 	WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));
5903a0f64deSSean Christopherson 
5919a77daacSBen Gardon 	lockdep_assert_held_read(&kvm->mmu_lock);
5929a77daacSBen Gardon 
59308f07c80SBen Gardon 	/*
5946e8eb206SDavid Matlack 	 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
5956e8eb206SDavid Matlack 	 * does not hold the mmu_lock.
5966e8eb206SDavid Matlack 	 */
597aee98a68SUros Bizjak 	if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
5983e72c791SDavid Matlack 		return -EBUSY;
5999a77daacSBen Gardon 
60040fa907eSVipin Sharma 	handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
60108889894SSean Christopherson 			    new_spte, iter->level, true);
6029a77daacSBen Gardon 
6033e72c791SDavid Matlack 	return 0;
6049a77daacSBen Gardon }
6059a77daacSBen Gardon 
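/*
 * Usage sketch (illustrative): callers running with mmu_lock held for read
 * are expected to retry on -EBUSY, typically by restarting the current
 * iteration so the refreshed iter->old_spte is taken into account, e.g. the
 * pattern used by __tdp_mmu_zap_root() below:
 *
 *	retry:
 *		...
 *		else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
 *			goto retry;
 */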
6063e72c791SDavid Matlack static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
60708f07c80SBen Gardon 					  struct tdp_iter *iter)
60808f07c80SBen Gardon {
6093e72c791SDavid Matlack 	int ret;
6103e72c791SDavid Matlack 
61108f07c80SBen Gardon 	/*
61208f07c80SBen Gardon 	 * Freeze the SPTE by setting it to a special,
61308f07c80SBen Gardon 	 * non-present value. This will stop other threads from
61408f07c80SBen Gardon 	 * immediately installing a present entry in its place
61508f07c80SBen Gardon 	 * before the TLBs are flushed.
61608f07c80SBen Gardon 	 */
6173e72c791SDavid Matlack 	ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
6183e72c791SDavid Matlack 	if (ret)
6193e72c791SDavid Matlack 		return ret;
62008f07c80SBen Gardon 
6214ad980aeSHou Wenlong 	kvm_flush_remote_tlbs_gfn(kvm, iter->gfn, iter->level);
62208f07c80SBen Gardon 
62308f07c80SBen Gardon 	/*
624ba3a6120SSean Christopherson 	 * No other thread can overwrite the removed SPTE as they must either
625ba3a6120SSean Christopherson 	 * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
626ba3a6120SSean Christopherson 	 * overwrite the special removed SPTE value. No bookkeeping is needed
627ba3a6120SSean Christopherson 	 * here since the SPTE is going from non-present to non-present.  Use
628ba3a6120SSean Christopherson 	 * the raw write helper to avoid an unnecessary check on volatile bits.
62908f07c80SBen Gardon 	 */
630ba3a6120SSean Christopherson 	__kvm_tdp_mmu_write_spte(iter->sptep, 0);
63108f07c80SBen Gardon 
6323e72c791SDavid Matlack 	return 0;
63308f07c80SBen Gardon }
63408f07c80SBen Gardon 
6359a77daacSBen Gardon 
6369a77daacSBen Gardon /*
6370b7cc254SVipin Sharma  * tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
638626808d1SSean Christopherson  * @kvm:	      KVM instance
639626808d1SSean Christopherson  * @as_id:	      Address space ID, i.e. regular vs. SMM
640626808d1SSean Christopherson  * @sptep:	      Pointer to the SPTE
641626808d1SSean Christopherson  * @old_spte:	      The current value of the SPTE
642626808d1SSean Christopherson  * @new_spte:	      The new value that will be set for the SPTE
643626808d1SSean Christopherson  * @gfn:	      The base GFN that was (or will be) mapped by the SPTE
644626808d1SSean Christopherson  * @level:	      The level _containing_ the SPTE (its parent PT's level)
645ba3a6120SSean Christopherson  *
646ba3a6120SSean Christopherson  * Returns the old SPTE value, which _may_ be different than @old_spte if the
647ba3a6120SSean Christopherson  * SPTE had volatile bits.
648fe43fa2fSBen Gardon  */
6490b7cc254SVipin Sharma static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
6500b7cc254SVipin Sharma 			    u64 old_spte, u64 new_spte, gfn_t gfn, int level)
651faaf05b0SBen Gardon {
652531810caSBen Gardon 	lockdep_assert_held_write(&kvm->mmu_lock);
6533a9a4aa5SBen Gardon 
65408f07c80SBen Gardon 	/*
655966da62aSSean Christopherson 	 * No thread should be using this function to set SPTEs to or from the
65608f07c80SBen Gardon 	 * temporary removed SPTE value.
65708f07c80SBen Gardon 	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
65808f07c80SBen Gardon 	 * should be used. If operating under the MMU lock in write mode, the
65908f07c80SBen Gardon 	 * use of the removed SPTE should not be necessary.
66008f07c80SBen Gardon 	 */
661626808d1SSean Christopherson 	WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));
66208f07c80SBen Gardon 
663ba3a6120SSean Christopherson 	old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
664faaf05b0SBen Gardon 
66540fa907eSVipin Sharma 	handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
666ba3a6120SSean Christopherson 	return old_spte;
667626808d1SSean Christopherson }
668626808d1SSean Christopherson 
6690b7cc254SVipin Sharma static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
670f8e14497SBen Gardon 					 u64 new_spte)
671f8e14497SBen Gardon {
6720b7cc254SVipin Sharma 	WARN_ON_ONCE(iter->yielded);
6730b7cc254SVipin Sharma 	iter->old_spte = tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
6740b7cc254SVipin Sharma 					  iter->old_spte, new_spte,
6750b7cc254SVipin Sharma 					  iter->gfn, iter->level);
676f8e14497SBen Gardon }
677f8e14497SBen Gardon 
678faaf05b0SBen Gardon #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
67977aa6075SDavid Matlack 	for_each_tdp_pte(_iter, _root, _start, _end)
680faaf05b0SBen Gardon 
681f8e14497SBen Gardon #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
682f8e14497SBen Gardon 	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
683f8e14497SBen Gardon 		if (!is_shadow_present_pte(_iter.old_spte) ||		\
684f8e14497SBen Gardon 		    !is_last_spte(_iter.old_spte, _iter.level))		\
685f8e14497SBen Gardon 			continue;					\
686f8e14497SBen Gardon 		else
687f8e14497SBen Gardon 
688bb18842eSBen Gardon #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
689b9e5603cSPaolo Bonzini 	for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end)
690bb18842eSBen Gardon 
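/*
 * Illustrative usage (assumption; the page fault path that uses this macro is
 * further down in the full file): walking the vCPU's current root for the
 * faulting gfn looks roughly like
 *
 *	tdp_mmu_for_each_pte(iter, vcpu->arch.mmu, fault->gfn, fault->gfn + 1) {
 *		...
 *	}
 */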
691faaf05b0SBen Gardon /*
692e28a436cSBen Gardon  * Yield if the MMU lock is contended or this thread needs to return control
693e28a436cSBen Gardon  * to the scheduler.
694e28a436cSBen Gardon  *
695e139a34eSBen Gardon  * If this function should yield and flush is set, it will perform a remote
696e139a34eSBen Gardon  * TLB flush before yielding.
697e139a34eSBen Gardon  *
6983a0f64deSSean Christopherson  * If this function yields, iter->yielded is set and the caller must skip to
6993a0f64deSSean Christopherson  * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
7003a0f64deSSean Christopherson  * over the paging structures to allow the iterator to continue its traversal
7013a0f64deSSean Christopherson  * from the paging structure root.
702e28a436cSBen Gardon  *
7033a0f64deSSean Christopherson  * Returns true if this function yielded.
704e28a436cSBen Gardon  */
7053a0f64deSSean Christopherson static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
7063a0f64deSSean Christopherson 							  struct tdp_iter *iter,
7073a0f64deSSean Christopherson 							  bool flush, bool shared)
708a6a0b05dSBen Gardon {
7093a0f64deSSean Christopherson 	WARN_ON(iter->yielded);
7103a0f64deSSean Christopherson 
711ed5e484bSBen Gardon 	/* Ensure forward progress has been made before yielding. */
712ed5e484bSBen Gardon 	if (iter->next_last_level_gfn == iter->yielded_gfn)
713ed5e484bSBen Gardon 		return false;
714ed5e484bSBen Gardon 
715531810caSBen Gardon 	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
716e139a34eSBen Gardon 		if (flush)
717e139a34eSBen Gardon 			kvm_flush_remote_tlbs(kvm);
718e139a34eSBen Gardon 
719bd296779SSean Christopherson 		rcu_read_unlock();
720bd296779SSean Christopherson 
7216103bc07SBen Gardon 		if (shared)
7226103bc07SBen Gardon 			cond_resched_rwlock_read(&kvm->mmu_lock);
7236103bc07SBen Gardon 		else
724531810caSBen Gardon 			cond_resched_rwlock_write(&kvm->mmu_lock);
7256103bc07SBen Gardon 
7267cca2d0bSBen Gardon 		rcu_read_lock();
727ed5e484bSBen Gardon 
728ed5e484bSBen Gardon 		WARN_ON(iter->gfn > iter->next_last_level_gfn);
729ed5e484bSBen Gardon 
7303a0f64deSSean Christopherson 		iter->yielded = true;
731a6a0b05dSBen Gardon 	}
732e28a436cSBen Gardon 
7333a0f64deSSean Christopherson 	return iter->yielded;
734a6a0b05dSBen Gardon }
735a6a0b05dSBen Gardon 
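/*
 * Usage sketch (illustrative): walkers that track a pending TLB flush hand it
 * to this helper and clear it when the helper yields (and therefore flushes)
 * on their behalf, as tdp_mmu_zap_leafs() does below:
 *
 *	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
 *		if (can_yield &&
 *		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
 *			flush = false;
 *			continue;
 *		}
 *		...
 *	}
 */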
73686931ff7SSean Christopherson static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
737e2b5b21dSSean Christopherson {
738e2b5b21dSSean Christopherson 	/*
73986931ff7SSean Christopherson 	 * Bound TDP MMU walks at host.MAXPHYADDR.  KVM disallows memslots with
74086931ff7SSean Christopherson 	 * a gpa range that would exceed the max gfn, and KVM does not create
74186931ff7SSean Christopherson 	 * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
74286931ff7SSean Christopherson 	 * the slow emulation path every time.
743e2b5b21dSSean Christopherson 	 */
74486931ff7SSean Christopherson 	return kvm_mmu_max_gfn() + 1;
745e2b5b21dSSean Christopherson }
746e2b5b21dSSean Christopherson 
7471b6043e8SSean Christopherson static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
7481b6043e8SSean Christopherson 			       bool shared, int zap_level)
749e2b5b21dSSean Christopherson {
750e2b5b21dSSean Christopherson 	struct tdp_iter iter;
751e2b5b21dSSean Christopherson 
75286931ff7SSean Christopherson 	gfn_t end = tdp_mmu_max_gfn_exclusive();
753e2b5b21dSSean Christopherson 	gfn_t start = 0;
754e2b5b21dSSean Christopherson 
7551b6043e8SSean Christopherson 	for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
7561b6043e8SSean Christopherson retry:
7571b6043e8SSean Christopherson 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
7581b6043e8SSean Christopherson 			continue;
7591b6043e8SSean Christopherson 
7601b6043e8SSean Christopherson 		if (!is_shadow_present_pte(iter.old_spte))
7611b6043e8SSean Christopherson 			continue;
7621b6043e8SSean Christopherson 
7631b6043e8SSean Christopherson 		if (iter.level > zap_level)
7641b6043e8SSean Christopherson 			continue;
7651b6043e8SSean Christopherson 
7661b6043e8SSean Christopherson 		if (!shared)
7670b7cc254SVipin Sharma 			tdp_mmu_iter_set_spte(kvm, &iter, 0);
7681b6043e8SSean Christopherson 		else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
7691b6043e8SSean Christopherson 			goto retry;
7701b6043e8SSean Christopherson 	}
7711b6043e8SSean Christopherson }
7721b6043e8SSean Christopherson 
7731b6043e8SSean Christopherson static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
7741b6043e8SSean Christopherson 			     bool shared)
7751b6043e8SSean Christopherson {
7761b6043e8SSean Christopherson 
7778351779cSPaolo Bonzini 	/*
7788351779cSPaolo Bonzini 	 * The root must have an elevated refcount so that it's reachable via
7798351779cSPaolo Bonzini 	 * mmu_notifier callbacks, which allows this path to yield and drop
7808351779cSPaolo Bonzini 	 * mmu_lock.  When handling an unmap/release mmu_notifier command, KVM
7818351779cSPaolo Bonzini 	 * must drop all references to relevant pages prior to completing the
7828351779cSPaolo Bonzini 	 * callback.  Dropping mmu_lock with an unreachable root would result
7838351779cSPaolo Bonzini 	 * in zapping SPTEs after a relevant mmu_notifier callback completes
7848351779cSPaolo Bonzini 	 * and lead to use-after-free as zapping a SPTE triggers "writeback" of
7858351779cSPaolo Bonzini 	 * dirty accessed bits to the SPTE's associated struct page.
7868351779cSPaolo Bonzini 	 */
7878351779cSPaolo Bonzini 	WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));
7888351779cSPaolo Bonzini 
789e2b5b21dSSean Christopherson 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
790e2b5b21dSSean Christopherson 
791e2b5b21dSSean Christopherson 	rcu_read_lock();
792e2b5b21dSSean Christopherson 
793e2b5b21dSSean Christopherson 	/*
7941b6043e8SSean Christopherson 	 * To avoid RCU stalls due to recursively removing huge swaths of SPs,
7951b6043e8SSean Christopherson 	 * split the zap into two passes.  On the first pass, zap at the 1gb
7961b6043e8SSean Christopherson 	 * level, and then zap top-level SPs on the second pass.  "1gb" is not
7971b6043e8SSean Christopherson 	 * arbitrary, as KVM must be able to zap a 1gb shadow page without
7981b6043e8SSean Christopherson 	 * inducing a stall to allow in-place replacement with a 1gb hugepage.
7991b6043e8SSean Christopherson 	 *
8001b6043e8SSean Christopherson 	 * Because zapping a SP recurses on its children, stepping down to
8011b6043e8SSean Christopherson 	 * PG_LEVEL_4K in the iterator itself is unnecessary.
802e2b5b21dSSean Christopherson 	 */
8031b6043e8SSean Christopherson 	__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
8041b6043e8SSean Christopherson 	__tdp_mmu_zap_root(kvm, root, shared, root->role.level);
805e2b5b21dSSean Christopherson 
806e2b5b21dSSean Christopherson 	rcu_read_unlock();
807e2b5b21dSSean Christopherson }
808e2b5b21dSSean Christopherson 
809c10743a1SSean Christopherson bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
810c10743a1SSean Christopherson {
811c10743a1SSean Christopherson 	u64 old_spte;
812c10743a1SSean Christopherson 
813c10743a1SSean Christopherson 	/*
814c10743a1SSean Christopherson 	 * This helper intentionally doesn't allow zapping a root shadow page,
815c10743a1SSean Christopherson 	 * which doesn't have a parent page table and thus no associated entry.
816c10743a1SSean Christopherson 	 */
817c10743a1SSean Christopherson 	if (WARN_ON_ONCE(!sp->ptep))
818c10743a1SSean Christopherson 		return false;
819c10743a1SSean Christopherson 
820c10743a1SSean Christopherson 	old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
821bb95dfb9SSean Christopherson 	if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
822c10743a1SSean Christopherson 		return false;
823c10743a1SSean Christopherson 
8240b7cc254SVipin Sharma 	tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
8250b7cc254SVipin Sharma 			 sp->gfn, sp->role.level + 1);
826c10743a1SSean Christopherson 
827c10743a1SSean Christopherson 	return true;
828c10743a1SSean Christopherson }
829c10743a1SSean Christopherson 
830faaf05b0SBen Gardon /*
831063afacdSBen Gardon  * If can_yield is true, will release the MMU lock and reschedule if the
832063afacdSBen Gardon  * scheduler needs the CPU or there is contention on the MMU lock. If this
833063afacdSBen Gardon  * function cannot yield, it will not release the MMU lock or reschedule and
834063afacdSBen Gardon  * the caller must ensure it does not supply too large a GFN range, or the
8356103bc07SBen Gardon  * operation can cause a soft lockup.
836faaf05b0SBen Gardon  */
837f47e5bbbSSean Christopherson static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
838acbda82aSSean Christopherson 			      gfn_t start, gfn_t end, bool can_yield, bool flush)
839faaf05b0SBen Gardon {
840faaf05b0SBen Gardon 	struct tdp_iter iter;
841faaf05b0SBen Gardon 
84286931ff7SSean Christopherson 	end = min(end, tdp_mmu_max_gfn_exclusive());
843524a1e4eSSean Christopherson 
844acbda82aSSean Christopherson 	lockdep_assert_held_write(&kvm->mmu_lock);
8456103bc07SBen Gardon 
8467cca2d0bSBen Gardon 	rcu_read_lock();
8477cca2d0bSBen Gardon 
848f47e5bbbSSean Christopherson 	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
8491af4a960SBen Gardon 		if (can_yield &&
850acbda82aSSean Christopherson 		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
851a835429cSSean Christopherson 			flush = false;
8521af4a960SBen Gardon 			continue;
8531af4a960SBen Gardon 		}
8541af4a960SBen Gardon 
855f47e5bbbSSean Christopherson 		if (!is_shadow_present_pte(iter.old_spte) ||
856faaf05b0SBen Gardon 		    !is_last_spte(iter.old_spte, iter.level))
857faaf05b0SBen Gardon 			continue;
858faaf05b0SBen Gardon 
8590b7cc254SVipin Sharma 		tdp_mmu_iter_set_spte(kvm, &iter, 0);
860a835429cSSean Christopherson 		flush = true;
861faaf05b0SBen Gardon 	}
8627cca2d0bSBen Gardon 
8637cca2d0bSBen Gardon 	rcu_read_unlock();
864bb95dfb9SSean Christopherson 
865f47e5bbbSSean Christopherson 	/*
866f47e5bbbSSean Christopherson 	 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
867f47e5bbbSSean Christopherson 	 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
868f47e5bbbSSean Christopherson 	 */
869f47e5bbbSSean Christopherson 	return flush;
870faaf05b0SBen Gardon }
871faaf05b0SBen Gardon 
872faaf05b0SBen Gardon /*
8737edc3a68SKai Huang  * Zap leaf SPTEs for the range of gfns, [start, end), for all roots. Returns
8747edc3a68SKai Huang  * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or
8757edc3a68SKai Huang  * more SPTEs were zapped since the MMU lock was last acquired.
876faaf05b0SBen Gardon  */
877f47e5bbbSSean Christopherson bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
878f47e5bbbSSean Christopherson 			   bool can_yield, bool flush)
879faaf05b0SBen Gardon {
880faaf05b0SBen Gardon 	struct kvm_mmu_page *root;
881faaf05b0SBen Gardon 
882614f6970SPaolo Bonzini 	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
883f47e5bbbSSean Christopherson 		flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush);
884faaf05b0SBen Gardon 
885faaf05b0SBen Gardon 	return flush;
886faaf05b0SBen Gardon }
887faaf05b0SBen Gardon 
888faaf05b0SBen Gardon void kvm_tdp_mmu_zap_all(struct kvm *kvm)
889faaf05b0SBen Gardon {
890e2b5b21dSSean Christopherson 	struct kvm_mmu_page *root;
8912b9663d8SSean Christopherson 	int i;
892faaf05b0SBen Gardon 
89377c8cd6bSSean Christopherson 	/*
89422b94c4bSPaolo Bonzini 	 * Zap all roots, including invalid roots, as all SPTEs must be dropped
89522b94c4bSPaolo Bonzini 	 * before returning to the caller.  Zap directly even if the root is
89622b94c4bSPaolo Bonzini 	 * also being zapped by a worker.  Walking zapped top-level SPTEs isn't
89722b94c4bSPaolo Bonzini 	 * all that expensive and mmu_lock is already held, which means the
89822b94c4bSPaolo Bonzini 	 * worker has yielded, i.e. flushing the work instead of zapping here
89922b94c4bSPaolo Bonzini 	 * isn't guaranteed to be any faster.
90022b94c4bSPaolo Bonzini 	 *
90177c8cd6bSSean Christopherson 	 * A TLB flush is unnecessary; KVM zaps everything if and only if the VM
90277c8cd6bSSean Christopherson 	 * is being destroyed or the userspace VMM has exited.  In both cases,
90377c8cd6bSSean Christopherson 	 * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
90477c8cd6bSSean Christopherson 	 */
905e2b5b21dSSean Christopherson 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
906e2b5b21dSSean Christopherson 		for_each_tdp_mmu_root_yield_safe(kvm, root, i)
907e2b5b21dSSean Christopherson 			tdp_mmu_zap_root(kvm, root, false);
908e2b5b21dSSean Christopherson 	}
909faaf05b0SBen Gardon }
910bb18842eSBen Gardon 
9114c6654bdSBen Gardon /*
912f28e9c7fSSean Christopherson  * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
91322b94c4bSPaolo Bonzini  * zap" completes.
9144c6654bdSBen Gardon  */
9154c6654bdSBen Gardon void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
9164c6654bdSBen Gardon {
91722b94c4bSPaolo Bonzini 	flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
9184c6654bdSBen Gardon }
9194c6654bdSBen Gardon 
920bb18842eSBen Gardon /*
921f28e9c7fSSean Christopherson  * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
92222b94c4bSPaolo Bonzini  * is about to be zapped, e.g. in response to a memslots update.  The actual
923*edbdb43fSSean Christopherson  * zapping is performed asynchronously.  Using a separate workqueue makes it
924*edbdb43fSSean Christopherson  * easy to ensure that the destruction is performed before the "fast zap"
925*edbdb43fSSean Christopherson  * completes, without keeping a separate list of invalidated roots; the list is
926*edbdb43fSSean Christopherson  * effectively the list of work items in the workqueue.
927b7cccd39SBen Gardon  *
928*edbdb43fSSean Christopherson  * Note, the asynchronous worker is gifted the TDP MMU's reference.
929*edbdb43fSSean Christopherson  * See kvm_tdp_mmu_get_vcpu_root_hpa().
930b7cccd39SBen Gardon  */
931b7cccd39SBen Gardon void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
932b7cccd39SBen Gardon {
933b7cccd39SBen Gardon 	struct kvm_mmu_page *root;
934b7cccd39SBen Gardon 
935*edbdb43fSSean Christopherson 	/*
936*edbdb43fSSean Christopherson 	 * mmu_lock must be held for write to ensure that a root doesn't become
937*edbdb43fSSean Christopherson 	 * invalid while there are active readers (invalidating a root while
938*edbdb43fSSean Christopherson 	 * there are active readers may or may not be problematic in practice,
939*edbdb43fSSean Christopherson 	 * but it's uncharted territory and not supported).
940*edbdb43fSSean Christopherson 	 *
941*edbdb43fSSean Christopherson 	 * Waive the assertion if there are no users of @kvm, i.e. the VM is
942*edbdb43fSSean Christopherson 	 * being destroyed after all references have been put, or if no vCPUs
943*edbdb43fSSean Christopherson 	 * have been created (which means there are no roots), i.e. the VM is
944*edbdb43fSSean Christopherson 	 * being destroyed in an error path of KVM_CREATE_VM.
945*edbdb43fSSean Christopherson 	 */
946*edbdb43fSSean Christopherson 	if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
947*edbdb43fSSean Christopherson 	    refcount_read(&kvm->users_count) && kvm->created_vcpus)
948b7cccd39SBen Gardon 		lockdep_assert_held_write(&kvm->mmu_lock);
949*edbdb43fSSean Christopherson 
950*edbdb43fSSean Christopherson 	/*
951*edbdb43fSSean Christopherson 	 * As above, mmu_lock isn't held when destroying the VM!  There can't
952*edbdb43fSSean Christopherson 	 * be other references to @kvm, i.e. nothing else can invalidate roots
953*edbdb43fSSean Christopherson 	 * or be consuming roots, but walking the list of roots does need to be
954*edbdb43fSSean Christopherson 	 * guarded against roots being deleted by the asynchronous zap worker.
955*edbdb43fSSean Christopherson 	 */
956*edbdb43fSSean Christopherson 	rcu_read_lock();
957*edbdb43fSSean Christopherson 
958*edbdb43fSSean Christopherson 	list_for_each_entry_rcu(root, &kvm->arch.tdp_mmu_roots, link) {
959*edbdb43fSSean Christopherson 		if (!root->role.invalid) {
960b7cccd39SBen Gardon 			root->role.invalid = true;
96122b94c4bSPaolo Bonzini 			tdp_mmu_schedule_zap_root(kvm, root);
96222b94c4bSPaolo Bonzini 		}
963b7cccd39SBen Gardon 	}
964*edbdb43fSSean Christopherson 
965*edbdb43fSSean Christopherson 	rcu_read_unlock();
966f28e9c7fSSean Christopherson }
967b7cccd39SBen Gardon 
968bb18842eSBen Gardon /*
969bb18842eSBen Gardon  * Installs a last-level SPTE to handle a TDP page fault.
970bb18842eSBen Gardon  * (NPT/EPT violation/misconfiguration)
971bb18842eSBen Gardon  */
972cdc47767SPaolo Bonzini static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
973cdc47767SPaolo Bonzini 					  struct kvm_page_fault *fault,
974cdc47767SPaolo Bonzini 					  struct tdp_iter *iter)
975bb18842eSBen Gardon {
976c435d4b7SSean Christopherson 	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
977bb18842eSBen Gardon 	u64 new_spte;
97857a3e96dSKai Huang 	int ret = RET_PF_FIXED;
979ad67e480SPaolo Bonzini 	bool wrprot = false;
980bb18842eSBen Gardon 
98150a9ac25SSean Christopherson 	if (WARN_ON_ONCE(sp->role.level != fault->goal_level))
98250a9ac25SSean Christopherson 		return RET_PF_RETRY;
98350a9ac25SSean Christopherson 
984e710c5f6SDavid Matlack 	if (unlikely(!fault->slot))
985bb18842eSBen Gardon 		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
9869a77daacSBen Gardon 	else
98753597858SDavid Matlack 		wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
9882839180cSPaolo Bonzini 					 fault->pfn, iter->old_spte, fault->prefetch, true,
9897158bee4SPaolo Bonzini 					 fault->map_writable, &new_spte);
990bb18842eSBen Gardon 
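	/*
	 * Install the new SPTE: a no-op change means the fault was spurious, a
	 * failed compare-exchange means another task raced in and the vCPU
	 * must retry, and replacing a present non-leaf SPTE tears down the old
	 * page table, so the TLBs covering the GFN must be flushed.
	 */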
991bb18842eSBen Gardon 	if (new_spte == iter->old_spte)
992bb18842eSBen Gardon 		ret = RET_PF_SPURIOUS;
9933e72c791SDavid Matlack 	else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
9949a77daacSBen Gardon 		return RET_PF_RETRY;
995bb95dfb9SSean Christopherson 	else if (is_shadow_present_pte(iter->old_spte) &&
996bb95dfb9SSean Christopherson 		 !is_last_spte(iter->old_spte, iter->level))
9971e203847SHou Wenlong 		kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level);
998bb18842eSBen Gardon 
999bb18842eSBen Gardon 	/*
1000bb18842eSBen Gardon 	 * If the page fault was caused by a write but the page is write
1001bb18842eSBen Gardon 	 * protected, emulation is needed. If the emulation was skipped,
1002bb18842eSBen Gardon 	 * the vCPU would have the same fault again.
1003bb18842eSBen Gardon 	 */
1004ad67e480SPaolo Bonzini 	if (wrprot) {
1005cdc47767SPaolo Bonzini 		if (fault->write)
1006bb18842eSBen Gardon 			ret = RET_PF_EMULATE;
1007bb18842eSBen Gardon 	}
1008bb18842eSBen Gardon 
1009bb18842eSBen Gardon 	/* If an MMIO SPTE is installed, the MMIO will need to be emulated. */
10109a77daacSBen Gardon 	if (unlikely(is_mmio_spte(new_spte))) {
10111075d41eSSean Christopherson 		vcpu->stat.pf_mmio_spte_created++;
10129a77daacSBen Gardon 		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
10139a77daacSBen Gardon 				     new_spte);
1014bb18842eSBen Gardon 		ret = RET_PF_EMULATE;
10153849e092SSean Christopherson 	} else {
10169a77daacSBen Gardon 		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
10179a77daacSBen Gardon 				       rcu_dereference(iter->sptep));
10183849e092SSean Christopherson 	}
1019bb18842eSBen Gardon 
1020bb18842eSBen Gardon 	return ret;
1021bb18842eSBen Gardon }
1022bb18842eSBen Gardon 
1023bb18842eSBen Gardon /*
1024cb00a70bSDavid Matlack  * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
1025cb00a70bSDavid Matlack  * provided page table.
10267b7e1ab6SDavid Matlack  *
10277b7e1ab6SDavid Matlack  * @kvm: kvm instance
10287b7e1ab6SDavid Matlack  * @iter: a tdp_iter instance currently on the SPTE that should be set
10297b7e1ab6SDavid Matlack  * @sp: The new TDP page table to install.
1030cb00a70bSDavid Matlack  * @shared: This operation is running under the MMU lock in read mode.
10317b7e1ab6SDavid Matlack  *
10327b7e1ab6SDavid Matlack  * Returns: 0 if the new page table was installed. Non-0 if the page table
10337b7e1ab6SDavid Matlack  *          could not be installed (e.g. the atomic compare-exchange failed).
10347b7e1ab6SDavid Matlack  */
1035cb00a70bSDavid Matlack static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
103661f94478SSean Christopherson 			   struct kvm_mmu_page *sp, bool shared)
10377b7e1ab6SDavid Matlack {
103854275f74SSean Christopherson 	u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled());
1039cb00a70bSDavid Matlack 	int ret = 0;
10407b7e1ab6SDavid Matlack 
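	/*
	 * When mmu_lock is held for read, other tasks may be modifying SPTEs
	 * concurrently, so the SPTE must be installed with an atomic
	 * compare-exchange; under the write lock a plain update suffices.
	 */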
1041cb00a70bSDavid Matlack 	if (shared) {
10427b7e1ab6SDavid Matlack 		ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
10437b7e1ab6SDavid Matlack 		if (ret)
10447b7e1ab6SDavid Matlack 			return ret;
1045cb00a70bSDavid Matlack 	} else {
10460b7cc254SVipin Sharma 		tdp_mmu_iter_set_spte(kvm, iter, spte);
1047cb00a70bSDavid Matlack 	}
10487b7e1ab6SDavid Matlack 
104943a063caSYosry Ahmed 	tdp_account_mmu_page(kvm, sp);
10507b7e1ab6SDavid Matlack 
10517b7e1ab6SDavid Matlack 	return 0;
10527b7e1ab6SDavid Matlack }
10537b7e1ab6SDavid Matlack 
1054c4b33d28SDavid Matlack static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1055c4b33d28SDavid Matlack 				   struct kvm_mmu_page *sp, bool shared);
1056c4b33d28SDavid Matlack 
10577b7e1ab6SDavid Matlack /*
1058bb18842eSBen Gardon  * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
1059bb18842eSBen Gardon  * page tables and SPTEs to translate the faulting guest physical address.
1060bb18842eSBen Gardon  */
10612f6305ddSPaolo Bonzini int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
1062bb18842eSBen Gardon {
1063bb18842eSBen Gardon 	struct kvm_mmu *mmu = vcpu->arch.mmu;
106461f94478SSean Christopherson 	struct kvm *kvm = vcpu->kvm;
1065bb18842eSBen Gardon 	struct tdp_iter iter;
106689c0fd49SBen Gardon 	struct kvm_mmu_page *sp;
106763d28a25SPaolo Bonzini 	int ret = RET_PF_RETRY;
1068bb18842eSBen Gardon 
106973a3c659SPaolo Bonzini 	kvm_mmu_hugepage_adjust(vcpu, fault);
1070bb18842eSBen Gardon 
1071f0066d94SPaolo Bonzini 	trace_kvm_mmu_spte_requested(fault);
10727cca2d0bSBen Gardon 
10737cca2d0bSBen Gardon 	rcu_read_lock();
10747cca2d0bSBen Gardon 
10752f6305ddSPaolo Bonzini 	tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
107663d28a25SPaolo Bonzini 		int r;
107763d28a25SPaolo Bonzini 
107873a3c659SPaolo Bonzini 		if (fault->nx_huge_page_workaround_enabled)
1079536f0e6aSPaolo Bonzini 			disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
1080bb18842eSBen Gardon 
1081bb18842eSBen Gardon 		/*
1082c4b33d28SDavid Matlack 		 * If the SPTE has been frozen by another thread, just give up
1083c4b33d28SDavid Matlack 		 * and retry, avoiding an unnecessary page table allocation and free.
1084ff76d506SKai Huang 		 */
1085ff76d506SKai Huang 		if (is_removed_spte(iter.old_spte))
108663d28a25SPaolo Bonzini 			goto retry;
108763d28a25SPaolo Bonzini 
1088f5d16bb9SSean Christopherson 		if (iter.level == fault->goal_level)
108980a3e4aeSSean Christopherson 			goto map_target_level;
1090f5d16bb9SSean Christopherson 
109163d28a25SPaolo Bonzini 		/* Step down into the lower level page table if it exists. */
109263d28a25SPaolo Bonzini 		if (is_shadow_present_pte(iter.old_spte) &&
109363d28a25SPaolo Bonzini 		    !is_large_pte(iter.old_spte))
109463d28a25SPaolo Bonzini 			continue;
1095ff76d506SKai Huang 
1096c4b33d28SDavid Matlack 		/*
1097c4b33d28SDavid Matlack 		 * The SPTE is either non-present or points to a huge page that
1098c4b33d28SDavid Matlack 		 * needs to be split.
1099c4b33d28SDavid Matlack 		 */
1100a82070b6SDavid Matlack 		sp = tdp_mmu_alloc_sp(vcpu);
1101a82070b6SDavid Matlack 		tdp_mmu_init_child_sp(sp, &iter);
1102a82070b6SDavid Matlack 
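		/*
		 * Record whether a huge page was disallowed for this fault so
		 * that the new child SP can be accounted as a possible NX huge
		 * page below.
		 */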
110361f94478SSean Christopherson 		sp->nx_huge_page_disallowed = fault->huge_page_disallowed;
110461f94478SSean Christopherson 
1105c4b33d28SDavid Matlack 		if (is_shadow_present_pte(iter.old_spte))
110663d28a25SPaolo Bonzini 			r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
1107c4b33d28SDavid Matlack 		else
110863d28a25SPaolo Bonzini 			r = tdp_mmu_link_sp(kvm, &iter, sp, true);
1109c4b33d28SDavid Matlack 
111063d28a25SPaolo Bonzini 		/*
111180a3e4aeSSean Christopherson 		 * Force the guest to retry if installing an upper level SPTE
111280a3e4aeSSean Christopherson 		 * failed, e.g. because a different task modified the SPTE.
111363d28a25SPaolo Bonzini 		 */
111463d28a25SPaolo Bonzini 		if (r) {
11159a77daacSBen Gardon 			tdp_mmu_free_sp(sp);
111663d28a25SPaolo Bonzini 			goto retry;
11179a77daacSBen Gardon 		}
111861f94478SSean Christopherson 
111961f94478SSean Christopherson 		if (fault->huge_page_disallowed &&
112061f94478SSean Christopherson 		    fault->req_level >= iter.level) {
112161f94478SSean Christopherson 			spin_lock(&kvm->arch.tdp_mmu_pages_lock);
112221a36ac6SSean Christopherson 			if (sp->nx_huge_page_disallowed)
112361f94478SSean Christopherson 				track_possible_nx_huge_page(kvm, sp);
112461f94478SSean Christopherson 			spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
112561f94478SSean Christopherson 		}
1126bb18842eSBen Gardon 	}
1127bb18842eSBen Gardon 
112880a3e4aeSSean Christopherson 	/*
112980a3e4aeSSean Christopherson 	 * The walk aborted before reaching the target level, e.g. because the
113080a3e4aeSSean Christopherson 	 * iterator detected an upper level SPTE was frozen during traversal.
113180a3e4aeSSean Christopherson 	 */
113280a3e4aeSSean Christopherson 	WARN_ON_ONCE(iter.level == fault->goal_level);
113380a3e4aeSSean Christopherson 	goto retry;
113480a3e4aeSSean Christopherson 
113580a3e4aeSSean Christopherson map_target_level:
1136cdc47767SPaolo Bonzini 	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
1137bb18842eSBen Gardon 
113863d28a25SPaolo Bonzini retry:
113963d28a25SPaolo Bonzini 	rcu_read_unlock();
1140bb18842eSBen Gardon 	return ret;
1141bb18842eSBen Gardon }
1142063afacdSBen Gardon 
11433039bcc7SSean Christopherson bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
11443039bcc7SSean Christopherson 				 bool flush)
1145063afacdSBen Gardon {
1146f47e5bbbSSean Christopherson 	return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start,
114783b83a02SSean Christopherson 				     range->end, range->may_block, flush);
11483039bcc7SSean Christopherson }
11493039bcc7SSean Christopherson 
11503039bcc7SSean Christopherson typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
11513039bcc7SSean Christopherson 			      struct kvm_gfn_range *range);
11523039bcc7SSean Christopherson 
11533039bcc7SSean Christopherson static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
11543039bcc7SSean Christopherson 						   struct kvm_gfn_range *range,
1155c1b91493SSean Christopherson 						   tdp_handler_t handler)
1156063afacdSBen Gardon {
1157063afacdSBen Gardon 	struct kvm_mmu_page *root;
11583039bcc7SSean Christopherson 	struct tdp_iter iter;
11593039bcc7SSean Christopherson 	bool ret = false;
1160063afacdSBen Gardon 
1161063afacdSBen Gardon 	/*
1162e1eed584SSean Christopherson 	 * Don't support rescheduling, none of the MMU notifiers that funnel
1163e1eed584SSean Christopherson 	 * into this helper allow blocking; it'd be dead, wasteful code.
1164063afacdSBen Gardon 	 */
11653039bcc7SSean Christopherson 	for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
1166a151acecSSean Christopherson 		rcu_read_lock();
1167a151acecSSean Christopherson 
11683039bcc7SSean Christopherson 		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
11693039bcc7SSean Christopherson 			ret |= handler(kvm, &iter, range);
1170063afacdSBen Gardon 
11713039bcc7SSean Christopherson 		rcu_read_unlock();
1172a151acecSSean Christopherson 	}
1173063afacdSBen Gardon 
1174063afacdSBen Gardon 	return ret;
1175063afacdSBen Gardon }
1176063afacdSBen Gardon 
1177f8e14497SBen Gardon /*
1178f8e14497SBen Gardon  * Mark the SPTEs mapping GFNs in the range [start, end) as unaccessed and
1179f8e14497SBen Gardon  * return true if any of the GFNs in the range have been accessed.
11807ee131e3SVipin Sharma  *
11817ee131e3SVipin Sharma  * No need to mark the corresponding PFN as accessed as this call is coming
11827ee131e3SVipin Sharma  * from the clear_young() or clear_flush_young() notifier, which uses the
11837ee131e3SVipin Sharma  * return value to determine if the page has been accessed.
1184f8e14497SBen Gardon  */
11853039bcc7SSean Christopherson static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
11863039bcc7SSean Christopherson 			  struct kvm_gfn_range *range)
1187f8e14497SBen Gardon {
11887ee131e3SVipin Sharma 	u64 new_spte;
1189f8e14497SBen Gardon 
11903039bcc7SSean Christopherson 	/* If we have a non-accessed entry we don't need to change the pte. */
11913039bcc7SSean Christopherson 	if (!is_accessed_spte(iter->old_spte))
11923039bcc7SSean Christopherson 		return false;
11937cca2d0bSBen Gardon 
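	/*
	 * If A/D bits are in use, the Accessed bit can be cleared atomically
	 * in place.  Otherwise the SPTE must be marked for access tracking,
	 * which requires building a new SPTE value.
	 */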
11947ee131e3SVipin Sharma 	if (spte_ad_enabled(iter->old_spte)) {
11957ee131e3SVipin Sharma 		iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep,
11967ee131e3SVipin Sharma 							 iter->old_spte,
11977ee131e3SVipin Sharma 							 shadow_accessed_mask,
11987ee131e3SVipin Sharma 							 iter->level);
11997ee131e3SVipin Sharma 		new_spte = iter->old_spte & ~shadow_accessed_mask;
1200f8e14497SBen Gardon 	} else {
1201f8e14497SBen Gardon 		/*
1202f8e14497SBen Gardon 		 * Capture the dirty status of the page, so that it doesn't get
1203f8e14497SBen Gardon 		 * lost when the SPTE is marked for access tracking.
1204f8e14497SBen Gardon 		 */
12057ee131e3SVipin Sharma 		if (is_writable_pte(iter->old_spte))
12067ee131e3SVipin Sharma 			kvm_set_pfn_dirty(spte_to_pfn(iter->old_spte));
1207f8e14497SBen Gardon 
12087ee131e3SVipin Sharma 		new_spte = mark_spte_for_access_track(iter->old_spte);
12097ee131e3SVipin Sharma 		iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep,
12107ee131e3SVipin Sharma 							iter->old_spte, new_spte,
12117ee131e3SVipin Sharma 							iter->level);
1212f8e14497SBen Gardon 	}
1213f8e14497SBen Gardon 
1214891f1159SVipin Sharma 	trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
1215891f1159SVipin Sharma 				       iter->old_spte, new_spte);
12163039bcc7SSean Christopherson 	return true;
1217f8e14497SBen Gardon }
1218f8e14497SBen Gardon 
12193039bcc7SSean Christopherson bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1220f8e14497SBen Gardon {
12213039bcc7SSean Christopherson 	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
1222f8e14497SBen Gardon }
1223f8e14497SBen Gardon 
12243039bcc7SSean Christopherson static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
12253039bcc7SSean Christopherson 			 struct kvm_gfn_range *range)
1226f8e14497SBen Gardon {
12273039bcc7SSean Christopherson 	return is_accessed_spte(iter->old_spte);
1228f8e14497SBen Gardon }
1229f8e14497SBen Gardon 
12303039bcc7SSean Christopherson bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1231f8e14497SBen Gardon {
12323039bcc7SSean Christopherson 	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
12333039bcc7SSean Christopherson }
12343039bcc7SSean Christopherson 
12353039bcc7SSean Christopherson static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
12363039bcc7SSean Christopherson 			 struct kvm_gfn_range *range)
12373039bcc7SSean Christopherson {
12383039bcc7SSean Christopherson 	u64 new_spte;
12393039bcc7SSean Christopherson 
12403039bcc7SSean Christopherson 	/* Huge pages aren't expected to be modified without first being zapped. */
12413039bcc7SSean Christopherson 	WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
12423039bcc7SSean Christopherson 
12433039bcc7SSean Christopherson 	if (iter->level != PG_LEVEL_4K ||
12443039bcc7SSean Christopherson 	    !is_shadow_present_pte(iter->old_spte))
12453039bcc7SSean Christopherson 		return false;
12463039bcc7SSean Christopherson 
12473039bcc7SSean Christopherson 	/*
12483039bcc7SSean Christopherson 	 * Note, when changing a read-only SPTE, it's not strictly necessary to
12493039bcc7SSean Christopherson 	 * zero the SPTE before setting the new PFN, but doing so preserves the
12503039bcc7SSean Christopherson 	 * invariant that the PFN of a present leaf SPTE can never change.
125140fa907eSVipin Sharma 	 * See handle_changed_spte().
12523039bcc7SSean Christopherson 	 */
12530b7cc254SVipin Sharma 	tdp_mmu_iter_set_spte(kvm, iter, 0);
12543039bcc7SSean Christopherson 
12553039bcc7SSean Christopherson 	if (!pte_write(range->pte)) {
12563039bcc7SSean Christopherson 		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
12573039bcc7SSean Christopherson 								  pte_pfn(range->pte));
12583039bcc7SSean Christopherson 
12590b7cc254SVipin Sharma 		tdp_mmu_iter_set_spte(kvm, iter, new_spte);
12603039bcc7SSean Christopherson 	}
12613039bcc7SSean Christopherson 
12623039bcc7SSean Christopherson 	return true;
1263f8e14497SBen Gardon }
12641d8dd6b3SBen Gardon 
12651d8dd6b3SBen Gardon /*
12661d8dd6b3SBen Gardon  * Handle the changed_pte MMU notifier for the TDP MMU.
12671d8dd6b3SBen Gardon  * range->pte holds the new pte_t mapping the HVA specified by the MMU
12681d8dd6b3SBen Gardon  * notifier.
12691d8dd6b3SBen Gardon  * Returns true if a TLB flush is needed before releasing the MMU lock.
12701d8dd6b3SBen Gardon  */
12713039bcc7SSean Christopherson bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
12721d8dd6b3SBen Gardon {
127393fa50f6SSean Christopherson 	/*
127493fa50f6SSean Christopherson 	 * No need to handle the remote TLB flush under RCU protection, the
127593fa50f6SSean Christopherson 	 * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
127640fa907eSVipin Sharma 	 * shadow page. See the WARN on pfn_changed in handle_changed_spte().
127793fa50f6SSean Christopherson 	 */
127893fa50f6SSean Christopherson 	return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
12791d8dd6b3SBen Gardon }
12801d8dd6b3SBen Gardon 
1281a6a0b05dSBen Gardon /*
1282bedd9195SDavid Matlack  * Remove write access from all SPTEs at or above min_level that map GFNs
1283bedd9195SDavid Matlack  * [start, end). Returns true if an SPTE has been changed and the TLBs need to
1284bedd9195SDavid Matlack  * be flushed.
1285a6a0b05dSBen Gardon  */
1286a6a0b05dSBen Gardon static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1287a6a0b05dSBen Gardon 			     gfn_t start, gfn_t end, int min_level)
1288a6a0b05dSBen Gardon {
1289a6a0b05dSBen Gardon 	struct tdp_iter iter;
1290a6a0b05dSBen Gardon 	u64 new_spte;
1291a6a0b05dSBen Gardon 	bool spte_set = false;
1292a6a0b05dSBen Gardon 
12937cca2d0bSBen Gardon 	rcu_read_lock();
12947cca2d0bSBen Gardon 
1295a6a0b05dSBen Gardon 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1296a6a0b05dSBen Gardon 
129777aa6075SDavid Matlack 	for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
129824ae4cfaSBen Gardon retry:
129924ae4cfaSBen Gardon 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
13001af4a960SBen Gardon 			continue;
13011af4a960SBen Gardon 
1302a6a0b05dSBen Gardon 		if (!is_shadow_present_pte(iter.old_spte) ||
13030f99ee2cSBen Gardon 		    !is_last_spte(iter.old_spte, iter.level) ||
13040f99ee2cSBen Gardon 		    !(iter.old_spte & PT_WRITABLE_MASK))
1305a6a0b05dSBen Gardon 			continue;
1306a6a0b05dSBen Gardon 
1307a6a0b05dSBen Gardon 		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1308a6a0b05dSBen Gardon 
13093e72c791SDavid Matlack 		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
131024ae4cfaSBen Gardon 			goto retry;
13113255530aSDavid Matlack 
1312a6a0b05dSBen Gardon 		spte_set = true;
1313a6a0b05dSBen Gardon 	}
13147cca2d0bSBen Gardon 
13157cca2d0bSBen Gardon 	rcu_read_unlock();
1316a6a0b05dSBen Gardon 	return spte_set;
1317a6a0b05dSBen Gardon }
1318a6a0b05dSBen Gardon 
1319a6a0b05dSBen Gardon /*
1320a6a0b05dSBen Gardon  * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1321a6a0b05dSBen Gardon  * only affect leaf SPTEs down to min_level.
1322a6a0b05dSBen Gardon  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1323a6a0b05dSBen Gardon  */
1324269e9552SHamza Mahfooz bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
1325269e9552SHamza Mahfooz 			     const struct kvm_memory_slot *slot, int min_level)
1326a6a0b05dSBen Gardon {
1327a6a0b05dSBen Gardon 	struct kvm_mmu_page *root;
1328a6a0b05dSBen Gardon 	bool spte_set = false;
1329a6a0b05dSBen Gardon 
133024ae4cfaSBen Gardon 	lockdep_assert_held_read(&kvm->mmu_lock);
1331a6a0b05dSBen Gardon 
1332d62007edSSean Christopherson 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1333a6a0b05dSBen Gardon 		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1334a6a0b05dSBen Gardon 			     slot->base_gfn + slot->npages, min_level);
1335a6a0b05dSBen Gardon 
1336a6a0b05dSBen Gardon 	return spte_set;
1337a6a0b05dSBen Gardon }
1338a6a0b05dSBen Gardon 
1339a3fe5dbdSDavid Matlack static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
1340a3fe5dbdSDavid Matlack {
1341a3fe5dbdSDavid Matlack 	struct kvm_mmu_page *sp;
1342a3fe5dbdSDavid Matlack 
1343a3fe5dbdSDavid Matlack 	gfp |= __GFP_ZERO;
1344a3fe5dbdSDavid Matlack 
1345a3fe5dbdSDavid Matlack 	sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
1346a3fe5dbdSDavid Matlack 	if (!sp)
1347a3fe5dbdSDavid Matlack 		return NULL;
1348a3fe5dbdSDavid Matlack 
1349a3fe5dbdSDavid Matlack 	sp->spt = (void *)__get_free_page(gfp);
1350a3fe5dbdSDavid Matlack 	if (!sp->spt) {
1351a3fe5dbdSDavid Matlack 		kmem_cache_free(mmu_page_header_cache, sp);
1352a3fe5dbdSDavid Matlack 		return NULL;
1353a3fe5dbdSDavid Matlack 	}
1354a3fe5dbdSDavid Matlack 
1355a3fe5dbdSDavid Matlack 	return sp;
1356a3fe5dbdSDavid Matlack }
1357a3fe5dbdSDavid Matlack 
1358a3fe5dbdSDavid Matlack static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
1359cb00a70bSDavid Matlack 						       struct tdp_iter *iter,
1360cb00a70bSDavid Matlack 						       bool shared)
1361a3fe5dbdSDavid Matlack {
1362a3fe5dbdSDavid Matlack 	struct kvm_mmu_page *sp;
1363a3fe5dbdSDavid Matlack 
1364a3fe5dbdSDavid Matlack 	/*
1365a3fe5dbdSDavid Matlack 	 * Since we are allocating while under the MMU lock we have to be
1366a3fe5dbdSDavid Matlack 	 * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
1367a3fe5dbdSDavid Matlack 	 * reclaim and to avoid making any filesystem callbacks (which can end
1368a3fe5dbdSDavid Matlack 	 * up invoking KVM MMU notifiers, resulting in a deadlock).
1369a3fe5dbdSDavid Matlack 	 *
1370a3fe5dbdSDavid Matlack 	 * If this allocation fails we drop the lock and retry with reclaim
1371a3fe5dbdSDavid Matlack 	 * allowed.
1372a3fe5dbdSDavid Matlack 	 */
1373a3fe5dbdSDavid Matlack 	sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
1374a3fe5dbdSDavid Matlack 	if (sp)
1375a3fe5dbdSDavid Matlack 		return sp;
1376a3fe5dbdSDavid Matlack 
1377a3fe5dbdSDavid Matlack 	rcu_read_unlock();
1378cb00a70bSDavid Matlack 
1379cb00a70bSDavid Matlack 	if (shared)
1380a3fe5dbdSDavid Matlack 		read_unlock(&kvm->mmu_lock);
1381cb00a70bSDavid Matlack 	else
1382cb00a70bSDavid Matlack 		write_unlock(&kvm->mmu_lock);
1383a3fe5dbdSDavid Matlack 
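	/*
	 * Dropping mmu_lock counts as yielding; flag it so that the iterator
	 * knows the walk must be restarted once the lock has been reacquired.
	 */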
1384a3fe5dbdSDavid Matlack 	iter->yielded = true;
1385a3fe5dbdSDavid Matlack 	sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
1386a3fe5dbdSDavid Matlack 
1387cb00a70bSDavid Matlack 	if (shared)
1388a3fe5dbdSDavid Matlack 		read_lock(&kvm->mmu_lock);
1389cb00a70bSDavid Matlack 	else
1390cb00a70bSDavid Matlack 		write_lock(&kvm->mmu_lock);
1391cb00a70bSDavid Matlack 
1392a3fe5dbdSDavid Matlack 	rcu_read_lock();
1393a3fe5dbdSDavid Matlack 
1394a3fe5dbdSDavid Matlack 	return sp;
1395a3fe5dbdSDavid Matlack }
1396a3fe5dbdSDavid Matlack 
1397c4b33d28SDavid Matlack /* Note, the caller is responsible for initializing @sp. */
1398cb00a70bSDavid Matlack static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1399cb00a70bSDavid Matlack 				   struct kvm_mmu_page *sp, bool shared)
1400a3fe5dbdSDavid Matlack {
1401a3fe5dbdSDavid Matlack 	const u64 huge_spte = iter->old_spte;
1402a3fe5dbdSDavid Matlack 	const int level = iter->level;
1403a3fe5dbdSDavid Matlack 	int ret, i;
1404a3fe5dbdSDavid Matlack 
1405a3fe5dbdSDavid Matlack 	/*
1406a3fe5dbdSDavid Matlack 	 * No need for atomics when writing to sp->spt since the page table has
1407a3fe5dbdSDavid Matlack 	 * not been linked in yet and thus is not reachable from any other CPU.
1408a3fe5dbdSDavid Matlack 	 */
14092ca3129eSSean Christopherson 	for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
141047855da0SDavid Matlack 		sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i);
1411a3fe5dbdSDavid Matlack 
1412a3fe5dbdSDavid Matlack 	/*
1413a3fe5dbdSDavid Matlack 	 * Replace the huge spte with a pointer to the populated lower level
1414a3fe5dbdSDavid Matlack 	 * page table. Since we are making this change without a TLB flush vCPUs
1415a3fe5dbdSDavid Matlack 	 * will see a mix of the split mappings and the original huge mapping,
1416a3fe5dbdSDavid Matlack 	 * depending on what's currently in their TLB. This is fine from a
1417a3fe5dbdSDavid Matlack 	 * correctness standpoint since the translation will be the same either
1418a3fe5dbdSDavid Matlack 	 * way.
1419a3fe5dbdSDavid Matlack 	 */
142061f94478SSean Christopherson 	ret = tdp_mmu_link_sp(kvm, iter, sp, shared);
1421a3fe5dbdSDavid Matlack 	if (ret)
1422e0b728b1SDavid Matlack 		goto out;
1423a3fe5dbdSDavid Matlack 
1424a3fe5dbdSDavid Matlack 	/*
1425a3fe5dbdSDavid Matlack 	 * tdp_mmu_link_sp() will handle subtracting the huge page we
1426a3fe5dbdSDavid Matlack 	 * are overwriting from the page stats. But we have to manually update
1427a3fe5dbdSDavid Matlack 	 * the page stats with the new present child pages.
1428a3fe5dbdSDavid Matlack 	 */
14292ca3129eSSean Christopherson 	kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE);
1430a3fe5dbdSDavid Matlack 
1431e0b728b1SDavid Matlack out:
1432e0b728b1SDavid Matlack 	trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
1433e0b728b1SDavid Matlack 	return ret;
1434a3fe5dbdSDavid Matlack }
1435a3fe5dbdSDavid Matlack 
1436a3fe5dbdSDavid Matlack static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
1437a3fe5dbdSDavid Matlack 					 struct kvm_mmu_page *root,
1438a3fe5dbdSDavid Matlack 					 gfn_t start, gfn_t end,
1439cb00a70bSDavid Matlack 					 int target_level, bool shared)
1440a3fe5dbdSDavid Matlack {
1441a3fe5dbdSDavid Matlack 	struct kvm_mmu_page *sp = NULL;
1442a3fe5dbdSDavid Matlack 	struct tdp_iter iter;
1443a3fe5dbdSDavid Matlack 	int ret = 0;
1444a3fe5dbdSDavid Matlack 
1445a3fe5dbdSDavid Matlack 	rcu_read_lock();
1446a3fe5dbdSDavid Matlack 
1447a3fe5dbdSDavid Matlack 	/*
1448a3fe5dbdSDavid Matlack 	 * Traverse the page table splitting all huge pages above the target
1449a3fe5dbdSDavid Matlack 	 * level into one lower level. For example, if we encounter a 1GB page
1450a3fe5dbdSDavid Matlack 	 * we split it into 512 2MB pages.
1451a3fe5dbdSDavid Matlack 	 *
1452a3fe5dbdSDavid Matlack 	 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
1453a3fe5dbdSDavid Matlack 	 * to visit an SPTE before ever visiting its children, which means we
1454a3fe5dbdSDavid Matlack 	 * will correctly recursively split huge pages that are more than one
1455a3fe5dbdSDavid Matlack 	 * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
1456a3fe5dbdSDavid Matlack 	 * and then splitting each of those to 512 4KB pages).
1457a3fe5dbdSDavid Matlack 	 */
1458a3fe5dbdSDavid Matlack 	for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
1459a3fe5dbdSDavid Matlack retry:
1460cb00a70bSDavid Matlack 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
1461a3fe5dbdSDavid Matlack 			continue;
1462a3fe5dbdSDavid Matlack 
1463a3fe5dbdSDavid Matlack 		if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
1464a3fe5dbdSDavid Matlack 			continue;
1465a3fe5dbdSDavid Matlack 
1466a3fe5dbdSDavid Matlack 		if (!sp) {
1467cb00a70bSDavid Matlack 			sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
1468a3fe5dbdSDavid Matlack 			if (!sp) {
1469a3fe5dbdSDavid Matlack 				ret = -ENOMEM;
1470e0b728b1SDavid Matlack 				trace_kvm_mmu_split_huge_page(iter.gfn,
1471e0b728b1SDavid Matlack 							      iter.old_spte,
1472e0b728b1SDavid Matlack 							      iter.level, ret);
1473a3fe5dbdSDavid Matlack 				break;
1474a3fe5dbdSDavid Matlack 			}
1475a3fe5dbdSDavid Matlack 
1476a3fe5dbdSDavid Matlack 			if (iter.yielded)
1477a3fe5dbdSDavid Matlack 				continue;
1478a3fe5dbdSDavid Matlack 		}
1479a3fe5dbdSDavid Matlack 
1480c4b33d28SDavid Matlack 		tdp_mmu_init_child_sp(sp, &iter);
1481c4b33d28SDavid Matlack 
1482cb00a70bSDavid Matlack 		if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
1483a3fe5dbdSDavid Matlack 			goto retry;
1484a3fe5dbdSDavid Matlack 
1485a3fe5dbdSDavid Matlack 		sp = NULL;
1486a3fe5dbdSDavid Matlack 	}
1487a3fe5dbdSDavid Matlack 
1488a3fe5dbdSDavid Matlack 	rcu_read_unlock();
1489a3fe5dbdSDavid Matlack 
1490a3fe5dbdSDavid Matlack 	/*
1491a3fe5dbdSDavid Matlack 	 * It's possible to exit the loop having never used the last sp if, for
1492a3fe5dbdSDavid Matlack 	 * example, a vCPU doing HugePage NX splitting wins the race and
1493a3fe5dbdSDavid Matlack 	 * installs its own sp in place of the last sp we tried to split.
1494a3fe5dbdSDavid Matlack 	 */
1495a3fe5dbdSDavid Matlack 	if (sp)
1496a3fe5dbdSDavid Matlack 		tdp_mmu_free_sp(sp);
1497a3fe5dbdSDavid Matlack 
1498a3fe5dbdSDavid Matlack 	return ret;
1499a3fe5dbdSDavid Matlack }
1500a3fe5dbdSDavid Matlack 
1501cb00a70bSDavid Matlack 
1502a3fe5dbdSDavid Matlack /*
1503a3fe5dbdSDavid Matlack  * Try to split all huge pages mapped by the TDP MMU down to the target level.
1504a3fe5dbdSDavid Matlack  */
1505a3fe5dbdSDavid Matlack void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
1506a3fe5dbdSDavid Matlack 				      const struct kvm_memory_slot *slot,
1507a3fe5dbdSDavid Matlack 				      gfn_t start, gfn_t end,
1508cb00a70bSDavid Matlack 				      int target_level, bool shared)
1509a3fe5dbdSDavid Matlack {
1510a3fe5dbdSDavid Matlack 	struct kvm_mmu_page *root;
1511a3fe5dbdSDavid Matlack 	int r = 0;
1512a3fe5dbdSDavid Matlack 
1513cb00a70bSDavid Matlack 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
1514a3fe5dbdSDavid Matlack 
15157c554d8eSPaolo Bonzini 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
1516cb00a70bSDavid Matlack 		r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
1517a3fe5dbdSDavid Matlack 		if (r) {
1518cb00a70bSDavid Matlack 			kvm_tdp_mmu_put_root(kvm, root, shared);
1519a3fe5dbdSDavid Matlack 			break;
1520a3fe5dbdSDavid Matlack 		}
1521a3fe5dbdSDavid Matlack 	}
1522a3fe5dbdSDavid Matlack }
1523a3fe5dbdSDavid Matlack 
1524a6a0b05dSBen Gardon /*
1525a6a0b05dSBen Gardon  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1526a6a0b05dSBen Gardon  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1527a6a0b05dSBen Gardon  * If AD bits are not enabled, this will require clearing the writable bit on
1528a6a0b05dSBen Gardon  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1529a6a0b05dSBen Gardon  * be flushed.
1530a6a0b05dSBen Gardon  */
1531a6a0b05dSBen Gardon static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1532a6a0b05dSBen Gardon 			   gfn_t start, gfn_t end)
1533a6a0b05dSBen Gardon {
1534697c89beSVipin Sharma 	u64 dbit = kvm_ad_enabled() ? shadow_dirty_mask : PT_WRITABLE_MASK;
1535a6a0b05dSBen Gardon 	struct tdp_iter iter;
1536a6a0b05dSBen Gardon 	bool spte_set = false;
1537a6a0b05dSBen Gardon 
15387cca2d0bSBen Gardon 	rcu_read_lock();
15397cca2d0bSBen Gardon 
1540a6a0b05dSBen Gardon 	tdp_root_for_each_leaf_pte(iter, root, start, end) {
154124ae4cfaSBen Gardon retry:
154224ae4cfaSBen Gardon 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
15431af4a960SBen Gardon 			continue;
15441af4a960SBen Gardon 
15453354ef5aSSean Christopherson 		if (!is_shadow_present_pte(iter.old_spte))
15463354ef5aSSean Christopherson 			continue;
15473354ef5aSSean Christopherson 
15485982a539SVipin Sharma 		MMU_WARN_ON(kvm_ad_enabled() &&
15495982a539SVipin Sharma 			    spte_ad_need_write_protect(iter.old_spte));
15505982a539SVipin Sharma 
1551697c89beSVipin Sharma 		if (!(iter.old_spte & dbit))
1552a6a0b05dSBen Gardon 			continue;
1553a6a0b05dSBen Gardon 
1554697c89beSVipin Sharma 		if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit))
155524ae4cfaSBen Gardon 			goto retry;
15563255530aSDavid Matlack 
1557a6a0b05dSBen Gardon 		spte_set = true;
1558a6a0b05dSBen Gardon 	}
15597cca2d0bSBen Gardon 
15607cca2d0bSBen Gardon 	rcu_read_unlock();
1561a6a0b05dSBen Gardon 	return spte_set;
1562a6a0b05dSBen Gardon }
1563a6a0b05dSBen Gardon 
1564a6a0b05dSBen Gardon /*
1565a6a0b05dSBen Gardon  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1566a6a0b05dSBen Gardon  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1567a6a0b05dSBen Gardon  * If AD bits are not enabled, this will require clearing the writable bit on
1568a6a0b05dSBen Gardon  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1569a6a0b05dSBen Gardon  * be flushed.
1570a6a0b05dSBen Gardon  */
1571269e9552SHamza Mahfooz bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
1572269e9552SHamza Mahfooz 				  const struct kvm_memory_slot *slot)
1573a6a0b05dSBen Gardon {
1574a6a0b05dSBen Gardon 	struct kvm_mmu_page *root;
1575a6a0b05dSBen Gardon 	bool spte_set = false;
1576a6a0b05dSBen Gardon 
157724ae4cfaSBen Gardon 	lockdep_assert_held_read(&kvm->mmu_lock);
1578a6a0b05dSBen Gardon 
1579d62007edSSean Christopherson 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1580a6a0b05dSBen Gardon 		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1581a6a0b05dSBen Gardon 				slot->base_gfn + slot->npages);
1582a6a0b05dSBen Gardon 
1583a6a0b05dSBen Gardon 	return spte_set;
1584a6a0b05dSBen Gardon }
1585a6a0b05dSBen Gardon 
1586a6a0b05dSBen Gardon /*
1587a6a0b05dSBen Gardon  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1588a6a0b05dSBen Gardon  * set in mask, starting at gfn. The given memslot is expected to contain all
1589a6a0b05dSBen Gardon  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1590a6a0b05dSBen Gardon  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1591a6a0b05dSBen Gardon  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1592a6a0b05dSBen Gardon  */
1593a6a0b05dSBen Gardon static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1594a6a0b05dSBen Gardon 				  gfn_t gfn, unsigned long mask, bool wrprot)
1595a6a0b05dSBen Gardon {
1596697c89beSVipin Sharma 	u64 dbit = (wrprot || !kvm_ad_enabled()) ? PT_WRITABLE_MASK :
1597697c89beSVipin Sharma 						   shadow_dirty_mask;
1598a6a0b05dSBen Gardon 	struct tdp_iter iter;
1599a6a0b05dSBen Gardon 
16007cca2d0bSBen Gardon 	rcu_read_lock();
16017cca2d0bSBen Gardon 
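	/*
	 * @mask covers at most BITS_PER_LONG 4K GFNs starting at @gfn; skip
	 * ahead to the first set bit and bail once every set bit has been
	 * consumed.
	 */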
1602a6a0b05dSBen Gardon 	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1603a6a0b05dSBen Gardon 				    gfn + BITS_PER_LONG) {
1604a6a0b05dSBen Gardon 		if (!mask)
1605a6a0b05dSBen Gardon 			break;
1606a6a0b05dSBen Gardon 
16075982a539SVipin Sharma 		MMU_WARN_ON(kvm_ad_enabled() &&
16085982a539SVipin Sharma 			    spte_ad_need_write_protect(iter.old_spte));
16095982a539SVipin Sharma 
1610a6a0b05dSBen Gardon 		if (iter.level > PG_LEVEL_4K ||
1611a6a0b05dSBen Gardon 		    !(mask & (1UL << (iter.gfn - gfn))))
1612a6a0b05dSBen Gardon 			continue;
1613a6a0b05dSBen Gardon 
1614f1b3b06aSBen Gardon 		mask &= ~(1UL << (iter.gfn - gfn));
1615f1b3b06aSBen Gardon 
1616697c89beSVipin Sharma 		if (!(iter.old_spte & dbit))
1617a6a0b05dSBen Gardon 			continue;
1618a6a0b05dSBen Gardon 
161989c313f2SVipin Sharma 		iter.old_spte = tdp_mmu_clear_spte_bits(iter.sptep,
162089c313f2SVipin Sharma 							iter.old_spte, dbit,
162189c313f2SVipin Sharma 							iter.level);
162289c313f2SVipin Sharma 
16231e0f4298SVipin Sharma 		trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level,
16241e0f4298SVipin Sharma 					       iter.old_spte,
16251e0f4298SVipin Sharma 					       iter.old_spte & ~dbit);
16261e0f4298SVipin Sharma 		kvm_set_pfn_dirty(spte_to_pfn(iter.old_spte));
1627a6a0b05dSBen Gardon 	}
16287cca2d0bSBen Gardon 
16297cca2d0bSBen Gardon 	rcu_read_unlock();
1630a6a0b05dSBen Gardon }
1631a6a0b05dSBen Gardon 
1632a6a0b05dSBen Gardon /*
1633a6a0b05dSBen Gardon  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1634a6a0b05dSBen Gardon  * set in mask, starting at gfn. The given memslot is expected to contain all
1635a6a0b05dSBen Gardon  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1636a6a0b05dSBen Gardon  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1637a6a0b05dSBen Gardon  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1638a6a0b05dSBen Gardon  */
1639a6a0b05dSBen Gardon void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1640a6a0b05dSBen Gardon 				       struct kvm_memory_slot *slot,
1641a6a0b05dSBen Gardon 				       gfn_t gfn, unsigned long mask,
1642a6a0b05dSBen Gardon 				       bool wrprot)
1643a6a0b05dSBen Gardon {
1644a6a0b05dSBen Gardon 	struct kvm_mmu_page *root;
1645a6a0b05dSBen Gardon 
1646531810caSBen Gardon 	lockdep_assert_held_write(&kvm->mmu_lock);
1647a3f15bdaSSean Christopherson 	for_each_tdp_mmu_root(kvm, root, slot->as_id)
1648a6a0b05dSBen Gardon 		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1649a6a0b05dSBen Gardon }
1650a6a0b05dSBen Gardon 
16514b85c921SSean Christopherson static void zap_collapsible_spte_range(struct kvm *kvm,
165214881998SBen Gardon 				       struct kvm_mmu_page *root,
16534b85c921SSean Christopherson 				       const struct kvm_memory_slot *slot)
165414881998SBen Gardon {
16559eba50f8SSean Christopherson 	gfn_t start = slot->base_gfn;
16569eba50f8SSean Christopherson 	gfn_t end = start + slot->npages;
165714881998SBen Gardon 	struct tdp_iter iter;
16585ba7c4c6SBen Gardon 	int max_mapping_level;
165914881998SBen Gardon 
16607cca2d0bSBen Gardon 	rcu_read_lock();
16617cca2d0bSBen Gardon 
166285f44f8cSSean Christopherson 	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) {
166385f44f8cSSean Christopherson retry:
16644b85c921SSean Christopherson 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
16651af4a960SBen Gardon 			continue;
16661af4a960SBen Gardon 
166785f44f8cSSean Christopherson 		if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
166885f44f8cSSean Christopherson 		    !is_shadow_present_pte(iter.old_spte))
166985f44f8cSSean Christopherson 			continue;
167085f44f8cSSean Christopherson 
167185f44f8cSSean Christopherson 		/*
167285f44f8cSSean Christopherson 		 * Don't zap leaf SPTEs, if a leaf SPTE could be replaced with
167385f44f8cSSean Christopherson 		 * a large page size, then its parent would have been zapped
167485f44f8cSSean Christopherson 		 * instead of stepping down.
167585f44f8cSSean Christopherson 		 */
167685f44f8cSSean Christopherson 		if (is_last_spte(iter.old_spte, iter.level))
167785f44f8cSSean Christopherson 			continue;
167885f44f8cSSean Christopherson 
167985f44f8cSSean Christopherson 		/*
168085f44f8cSSean Christopherson 		 * If iter.gfn resides outside of the slot, i.e. the page for
168185f44f8cSSean Christopherson 		 * the current level overlaps but is not contained by the slot,
168285f44f8cSSean Christopherson 		 * then the SPTE can't be made huge.  More importantly, trying
168385f44f8cSSean Christopherson 		 * to query that info from slot->arch.lpage_info will cause an
168485f44f8cSSean Christopherson 		 * out-of-bounds access.
168585f44f8cSSean Christopherson 		 */
168685f44f8cSSean Christopherson 		if (iter.gfn < start || iter.gfn >= end)
168714881998SBen Gardon 			continue;
168814881998SBen Gardon 
16895ba7c4c6SBen Gardon 		max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
1690a8ac499bSSean Christopherson 							      iter.gfn, PG_LEVEL_NUM);
169185f44f8cSSean Christopherson 		if (max_mapping_level < iter.level)
16925ba7c4c6SBen Gardon 			continue;
16935ba7c4c6SBen Gardon 
16944b85c921SSean Christopherson 		/* Note, a successful atomic zap also does a remote TLB flush. */
169585f44f8cSSean Christopherson 		if (tdp_mmu_zap_spte_atomic(kvm, &iter))
169685f44f8cSSean Christopherson 			goto retry;
16972db6f772SBen Gardon 	}
169814881998SBen Gardon 
16997cca2d0bSBen Gardon 	rcu_read_unlock();
170014881998SBen Gardon }
170114881998SBen Gardon 
170214881998SBen Gardon /*
170385f44f8cSSean Christopherson  * Zap non-leaf SPTEs (and free their associated page tables) which could
170485f44f8cSSean Christopherson  * be replaced by huge pages, for GFNs within the slot.
170514881998SBen Gardon  */
17064b85c921SSean Christopherson void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
17074b85c921SSean Christopherson 				       const struct kvm_memory_slot *slot)
170814881998SBen Gardon {
170914881998SBen Gardon 	struct kvm_mmu_page *root;
171014881998SBen Gardon 
17112db6f772SBen Gardon 	lockdep_assert_held_read(&kvm->mmu_lock);
171214881998SBen Gardon 
1713d62007edSSean Christopherson 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
17144b85c921SSean Christopherson 		zap_collapsible_spte_range(kvm, root, slot);
171514881998SBen Gardon }
171646044f72SBen Gardon 
171746044f72SBen Gardon /*
171846044f72SBen Gardon  * Removes write access on the last level SPTE mapping this GFN and unsets the
17195fc3424fSSean Christopherson  * MMU-writable bit to ensure future writes continue to be intercepted.
172046044f72SBen Gardon  * Returns true if an SPTE was set and a TLB flush is needed.
172146044f72SBen Gardon  */
172246044f72SBen Gardon static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
17233ad93562SKeqian Zhu 			      gfn_t gfn, int min_level)
172446044f72SBen Gardon {
172546044f72SBen Gardon 	struct tdp_iter iter;
172646044f72SBen Gardon 	u64 new_spte;
172746044f72SBen Gardon 	bool spte_set = false;
172846044f72SBen Gardon 
17293ad93562SKeqian Zhu 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
17303ad93562SKeqian Zhu 
17317cca2d0bSBen Gardon 	rcu_read_lock();
17327cca2d0bSBen Gardon 
173377aa6075SDavid Matlack 	for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
17343ad93562SKeqian Zhu 		if (!is_shadow_present_pte(iter.old_spte) ||
17353ad93562SKeqian Zhu 		    !is_last_spte(iter.old_spte, iter.level))
17363ad93562SKeqian Zhu 			continue;
17373ad93562SKeqian Zhu 
173846044f72SBen Gardon 		new_spte = iter.old_spte &
17395fc3424fSSean Christopherson 			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
174046044f72SBen Gardon 
17417c8a4742SDavid Matlack 		if (new_spte == iter.old_spte)
17427c8a4742SDavid Matlack 			break;
17437c8a4742SDavid Matlack 
17440b7cc254SVipin Sharma 		tdp_mmu_iter_set_spte(kvm, &iter, new_spte);
174546044f72SBen Gardon 		spte_set = true;
174646044f72SBen Gardon 	}
174746044f72SBen Gardon 
17487cca2d0bSBen Gardon 	rcu_read_unlock();
17497cca2d0bSBen Gardon 
175046044f72SBen Gardon 	return spte_set;
175146044f72SBen Gardon }
175246044f72SBen Gardon 
175346044f72SBen Gardon /*
175446044f72SBen Gardon  * Removes write access on the last level SPTE mapping this GFN and unsets the
17555fc3424fSSean Christopherson  * MMU-writable bit to ensure future writes continue to be intercepted.
175646044f72SBen Gardon  * Returns true if an SPTE was set and a TLB flush is needed.
175746044f72SBen Gardon  */
175846044f72SBen Gardon bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
17593ad93562SKeqian Zhu 				   struct kvm_memory_slot *slot, gfn_t gfn,
17603ad93562SKeqian Zhu 				   int min_level)
176146044f72SBen Gardon {
176246044f72SBen Gardon 	struct kvm_mmu_page *root;
176346044f72SBen Gardon 	bool spte_set = false;
176446044f72SBen Gardon 
1765531810caSBen Gardon 	lockdep_assert_held_write(&kvm->mmu_lock);
1766a3f15bdaSSean Christopherson 	for_each_tdp_mmu_root(kvm, root, slot->as_id)
17673ad93562SKeqian Zhu 		spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
1768a3f15bdaSSean Christopherson 
176946044f72SBen Gardon 	return spte_set;
177046044f72SBen Gardon }
177146044f72SBen Gardon 
177295fb5b02SBen Gardon /*
177395fb5b02SBen Gardon  * Return the level of the lowest level SPTE added to sptes.
177495fb5b02SBen Gardon  * That SPTE may be non-present.
1775c5c8c7c5SDavid Matlack  *
1776c5c8c7c5SDavid Matlack  * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
177795fb5b02SBen Gardon  */
177839b4d43eSSean Christopherson int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
177939b4d43eSSean Christopherson 			 int *root_level)
178095fb5b02SBen Gardon {
178195fb5b02SBen Gardon 	struct tdp_iter iter;
178295fb5b02SBen Gardon 	struct kvm_mmu *mmu = vcpu->arch.mmu;
178395fb5b02SBen Gardon 	gfn_t gfn = addr >> PAGE_SHIFT;
17842aa07893SSean Christopherson 	int leaf = -1;
178595fb5b02SBen Gardon 
1786a972e29cSPaolo Bonzini 	*root_level = vcpu->arch.mmu->root_role.level;
178795fb5b02SBen Gardon 
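	/* sptes[] is indexed by level; the lowest level reached is returned. */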
178895fb5b02SBen Gardon 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
178995fb5b02SBen Gardon 		leaf = iter.level;
1790dde81f94SSean Christopherson 		sptes[leaf] = iter.old_spte;
179195fb5b02SBen Gardon 	}
179295fb5b02SBen Gardon 
179395fb5b02SBen Gardon 	return leaf;
179495fb5b02SBen Gardon }
17956e8eb206SDavid Matlack 
17966e8eb206SDavid Matlack /*
17976e8eb206SDavid Matlack  * Returns the last level spte pointer of the shadow page walk for the given
17986e8eb206SDavid Matlack  * gpa, and sets *spte to the spte value. This spte may be non-present. If no
17996e8eb206SDavid Matlack  * walk could be performed, returns NULL and *spte does not contain valid data.
18006e8eb206SDavid Matlack  *
18016e8eb206SDavid Matlack  * Contract:
18026e8eb206SDavid Matlack  *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
18036e8eb206SDavid Matlack  *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
18046e8eb206SDavid Matlack  *
18056e8eb206SDavid Matlack  * WARNING: This function is only intended to be called during fast_page_fault.
18066e8eb206SDavid Matlack  */
18076e8eb206SDavid Matlack u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
18086e8eb206SDavid Matlack 					u64 *spte)
18096e8eb206SDavid Matlack {
18106e8eb206SDavid Matlack 	struct tdp_iter iter;
18116e8eb206SDavid Matlack 	struct kvm_mmu *mmu = vcpu->arch.mmu;
18126e8eb206SDavid Matlack 	gfn_t gfn = addr >> PAGE_SHIFT;
18136e8eb206SDavid Matlack 	tdp_ptep_t sptep = NULL;
18146e8eb206SDavid Matlack 
18156e8eb206SDavid Matlack 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
18166e8eb206SDavid Matlack 		*spte = iter.old_spte;
18176e8eb206SDavid Matlack 		sptep = iter.sptep;
18186e8eb206SDavid Matlack 	}
18196e8eb206SDavid Matlack 
18206e8eb206SDavid Matlack 	/*
18216e8eb206SDavid Matlack 	 * Perform the rcu_dereference to get the raw spte pointer value since
18226e8eb206SDavid Matlack 	 * we are passing it up to fast_page_fault, which is shared with the
18236e8eb206SDavid Matlack 	 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
18246e8eb206SDavid Matlack 	 * annotation.
18256e8eb206SDavid Matlack 	 *
18266e8eb206SDavid Matlack 	 * This is safe since fast_page_fault obeys the contracts of this
18276e8eb206SDavid Matlack 	 * function as well as all TDP MMU contracts around modifying SPTEs
18286e8eb206SDavid Matlack 	 * outside of mmu_lock.
18296e8eb206SDavid Matlack 	 */
18306e8eb206SDavid Matlack 	return rcu_dereference(sptep);
18316e8eb206SDavid Matlack }
1832