xref: /openbmc/linux/arch/x86/kvm/mmu/tdp_mmu.c (revision 1b6043e8)
1fe5db27dSBen Gardon // SPDX-License-Identifier: GPL-2.0
2fe5db27dSBen Gardon 
302c00b3aSBen Gardon #include "mmu.h"
402c00b3aSBen Gardon #include "mmu_internal.h"
5bb18842eSBen Gardon #include "mmutrace.h"
62f2fad08SBen Gardon #include "tdp_iter.h"
7fe5db27dSBen Gardon #include "tdp_mmu.h"
802c00b3aSBen Gardon #include "spte.h"
9fe5db27dSBen Gardon 
109a77daacSBen Gardon #include <asm/cmpxchg.h>
1133dd3574SBen Gardon #include <trace/events/kvm.h>
1233dd3574SBen Gardon 
1371ba3f31SPaolo Bonzini static bool __read_mostly tdp_mmu_enabled = true;
1495fb5b02SBen Gardon module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
15fe5db27dSBen Gardon 
16fe5db27dSBen Gardon /* Initializes the TDP MMU for the VM, if enabled. */
17d501f747SBen Gardon bool kvm_mmu_init_tdp_mmu(struct kvm *kvm)
18fe5db27dSBen Gardon {
19897218ffSPaolo Bonzini 	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
20d501f747SBen Gardon 		return false;
21fe5db27dSBen Gardon 
22fe5db27dSBen Gardon 	/* This should not be changed for the lifetime of the VM. */
23fe5db27dSBen Gardon 	kvm->arch.tdp_mmu_enabled = true;
2402c00b3aSBen Gardon 
2502c00b3aSBen Gardon 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
269a77daacSBen Gardon 	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
2789c0fd49SBen Gardon 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
2822b94c4bSPaolo Bonzini 	kvm->arch.tdp_mmu_zap_wq =
2922b94c4bSPaolo Bonzini 		alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);
30d501f747SBen Gardon 
31d501f747SBen Gardon 	return true;
32fe5db27dSBen Gardon }
33fe5db27dSBen Gardon 
34226b8c8fSSean Christopherson /* Arbitrarily returns true so that this may be used in if statements. */
35226b8c8fSSean Christopherson static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
366103bc07SBen Gardon 							     bool shared)
376103bc07SBen Gardon {
386103bc07SBen Gardon 	if (shared)
396103bc07SBen Gardon 		lockdep_assert_held_read(&kvm->mmu_lock);
406103bc07SBen Gardon 	else
416103bc07SBen Gardon 		lockdep_assert_held_write(&kvm->mmu_lock);
42226b8c8fSSean Christopherson 
43226b8c8fSSean Christopherson 	return true;
446103bc07SBen Gardon }
456103bc07SBen Gardon 
46fe5db27dSBen Gardon void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
47fe5db27dSBen Gardon {
48fe5db27dSBen Gardon 	if (!kvm->arch.tdp_mmu_enabled)
49fe5db27dSBen Gardon 		return;
5002c00b3aSBen Gardon 
5122b94c4bSPaolo Bonzini 	flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
5222b94c4bSPaolo Bonzini 	destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
5322b94c4bSPaolo Bonzini 
54524a1e4eSSean Christopherson 	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
5502c00b3aSBen Gardon 	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
567cca2d0bSBen Gardon 
577cca2d0bSBen Gardon 	/*
587cca2d0bSBen Gardon 	 * Ensure that all the outstanding RCU callbacks to free shadow pages
5922b94c4bSPaolo Bonzini 	 * can run before the VM is torn down.  Work items on tdp_mmu_zap_wq
6022b94c4bSPaolo Bonzini 	 * can call kvm_tdp_mmu_put_root and create new callbacks.
617cca2d0bSBen Gardon 	 */
627cca2d0bSBen Gardon 	rcu_barrier();
6302c00b3aSBen Gardon }
6402c00b3aSBen Gardon 
652bdb3d84SBen Gardon static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
66a889ea54SBen Gardon {
672bdb3d84SBen Gardon 	free_page((unsigned long)sp->spt);
682bdb3d84SBen Gardon 	kmem_cache_free(mmu_page_header_cache, sp);
69a889ea54SBen Gardon }
70a889ea54SBen Gardon 
71c0e64238SBen Gardon /*
72c0e64238SBen Gardon  * This is called through call_rcu in order to free TDP page table memory
73c0e64238SBen Gardon  * safely with respect to other kernel threads that may be operating on
74c0e64238SBen Gardon  * the memory.
75c0e64238SBen Gardon  * Because TDP MMU page table memory is only accessed within an RCU read
76c0e64238SBen Gardon  * critical section, and is only freed after a grace period, lockless
77c0e64238SBen Gardon  * accessors cannot use the memory after it has been freed.
78c0e64238SBen Gardon  */
79c0e64238SBen Gardon static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
80a889ea54SBen Gardon {
81c0e64238SBen Gardon 	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
82c0e64238SBen Gardon 					       rcu_head);
83a889ea54SBen Gardon 
84c0e64238SBen Gardon 	tdp_mmu_free_sp(sp);
85a889ea54SBen Gardon }
86a889ea54SBen Gardon 
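/*
 * Illustrative reader-side sketch (not taken from this file): lockless
 * walkers pair with the call_rcu() scheme above by bracketing all access to
 * TDP page table memory with an RCU read-side critical section, so a page
 * handed to tdp_mmu_free_sp_rcu_callback() cannot be freed while a walker
 * might still dereference it:
 *
 *	rcu_read_lock();
 *	sptep = rcu_dereference(iter->sptep);
 *	... read or atomically update the SPTE ...
 *	rcu_read_unlock();
 */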
87e2b5b21dSSean Christopherson static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
88e2b5b21dSSean Christopherson 			     bool shared);
89e2b5b21dSSean Christopherson 
9022b94c4bSPaolo Bonzini static void tdp_mmu_zap_root_work(struct work_struct *work)
9122b94c4bSPaolo Bonzini {
9222b94c4bSPaolo Bonzini 	struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page,
9322b94c4bSPaolo Bonzini 						 tdp_mmu_async_work);
9422b94c4bSPaolo Bonzini 	struct kvm *kvm = root->tdp_mmu_async_data;
9522b94c4bSPaolo Bonzini 
9622b94c4bSPaolo Bonzini 	read_lock(&kvm->mmu_lock);
9722b94c4bSPaolo Bonzini 
9822b94c4bSPaolo Bonzini 	/*
9922b94c4bSPaolo Bonzini 	 * A TLB flush is not necessary as KVM performs a local TLB flush when
10022b94c4bSPaolo Bonzini 	 * allocating a new root (see kvm_mmu_load()), and when migrating a vCPU
10122b94c4bSPaolo Bonzini 	 * to a different pCPU.  Note, the local TLB flush on reuse also
10222b94c4bSPaolo Bonzini 	 * invalidates any paging-structure-cache entries, i.e. TLB entries for
10322b94c4bSPaolo Bonzini 	 * intermediate paging structures, that may be zapped, as such entries
10422b94c4bSPaolo Bonzini 	 * are associated with the ASID on both VMX and SVM.
10522b94c4bSPaolo Bonzini 	 */
10622b94c4bSPaolo Bonzini 	tdp_mmu_zap_root(kvm, root, true);
10722b94c4bSPaolo Bonzini 
10822b94c4bSPaolo Bonzini 	/*
10922b94c4bSPaolo Bonzini 	 * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for
11022b94c4bSPaolo Bonzini 	 * avoiding an infinite loop.  By design, the root is reachable while
11122b94c4bSPaolo Bonzini 	 * it's being asynchronously zapped, thus a different task can put its
11222b94c4bSPaolo Bonzini 	 * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an
11322b94c4bSPaolo Bonzini 	 * asynchronously zapped root is unavoidable.
11422b94c4bSPaolo Bonzini 	 */
11522b94c4bSPaolo Bonzini 	kvm_tdp_mmu_put_root(kvm, root, true);
11622b94c4bSPaolo Bonzini 
11722b94c4bSPaolo Bonzini 	read_unlock(&kvm->mmu_lock);
11822b94c4bSPaolo Bonzini }
11922b94c4bSPaolo Bonzini 
12022b94c4bSPaolo Bonzini static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root)
12122b94c4bSPaolo Bonzini {
12222b94c4bSPaolo Bonzini 	root->tdp_mmu_async_data = kvm;
12322b94c4bSPaolo Bonzini 	INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work);
12422b94c4bSPaolo Bonzini 	queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
12522b94c4bSPaolo Bonzini }
12622b94c4bSPaolo Bonzini 
1278351779cSPaolo Bonzini static inline bool kvm_tdp_root_mark_invalid(struct kvm_mmu_page *page)
1288351779cSPaolo Bonzini {
1298351779cSPaolo Bonzini 	union kvm_mmu_page_role role = page->role;
1308351779cSPaolo Bonzini 	role.invalid = true;
1318351779cSPaolo Bonzini 
1328351779cSPaolo Bonzini 	/* No need to use cmpxchg, only the invalid bit can change.  */
1338351779cSPaolo Bonzini 	role.word = xchg(&page->role.word, role.word);
1348351779cSPaolo Bonzini 	return role.invalid;
1358351779cSPaolo Bonzini }
1368351779cSPaolo Bonzini 
1376103bc07SBen Gardon void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
1386103bc07SBen Gardon 			  bool shared)
1392bdb3d84SBen Gardon {
1406103bc07SBen Gardon 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
1412bdb3d84SBen Gardon 
14211cccf5cSBen Gardon 	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
1432bdb3d84SBen Gardon 		return;
1442bdb3d84SBen Gardon 
1452bdb3d84SBen Gardon 	WARN_ON(!root->tdp_mmu_page);
1462bdb3d84SBen Gardon 
1478351779cSPaolo Bonzini 	/*
1488351779cSPaolo Bonzini 	 * The root now has refcount=0.  It is valid, but readers already
1498351779cSPaolo Bonzini 	 * cannot acquire a reference to it because kvm_tdp_mmu_get_root()
1508351779cSPaolo Bonzini 	 * rejects it.  This remains true for the rest of the execution
1518351779cSPaolo Bonzini 	 * of this function, because readers visit valid roots only
1528351779cSPaolo Bonzini 	 * (except for tdp_mmu_zap_root_work(), which however
1538351779cSPaolo Bonzini 	 * does not acquire any reference itself).
1548351779cSPaolo Bonzini 	 *
1558351779cSPaolo Bonzini 	 * Even though there are flows that need to visit all roots for
1568351779cSPaolo Bonzini 	 * correctness, they all take mmu_lock for write, so they cannot yet
1578351779cSPaolo Bonzini 	 * run concurrently. The same is true after kvm_tdp_root_mark_invalid,
1588351779cSPaolo Bonzini 	 * since the root still has refcount=0.
1598351779cSPaolo Bonzini 	 *
1608351779cSPaolo Bonzini 	 * However, tdp_mmu_zap_root can yield, and writers do not expect to
1618351779cSPaolo Bonzini 	 * see refcount=0 (see for example kvm_tdp_mmu_invalidate_all_roots()).
1628351779cSPaolo Bonzini 	 * So the root temporarily gets an extra reference, going to refcount=1
1638351779cSPaolo Bonzini 	 * while staying invalid.  Readers still cannot acquire any reference;
1648351779cSPaolo Bonzini 	 * but writers are now allowed to run if tdp_mmu_zap_root yields and
1658351779cSPaolo Bonzini 	 * they might take an extra reference if they themselves yield.  Therefore,
1668351779cSPaolo Bonzini 	 * when the reference is given back after tdp_mmu_zap_root terminates,
1678351779cSPaolo Bonzini 	 * there is no guarantee that the refcount is still 1.  If not, whoever
1688351779cSPaolo Bonzini 	 * puts the last reference will free the page, but they will not have to
1698351779cSPaolo Bonzini 	 * zap the root because a root cannot go from invalid to valid.
1708351779cSPaolo Bonzini 	 */
1718351779cSPaolo Bonzini 	if (!kvm_tdp_root_mark_invalid(root)) {
1728351779cSPaolo Bonzini 		refcount_set(&root->tdp_mmu_root_count, 1);
1738351779cSPaolo Bonzini 		tdp_mmu_zap_root(kvm, root, shared);
1748351779cSPaolo Bonzini 
1758351779cSPaolo Bonzini 		/*
1768351779cSPaolo Bonzini 		 * Give back the reference that was added back above.  We now
1778351779cSPaolo Bonzini 		 * know that the root is invalid, so go ahead and free it if
1788351779cSPaolo Bonzini 		 * no one has taken a reference in the meanwhile.
1798351779cSPaolo Bonzini 		 */
1808351779cSPaolo Bonzini 		if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
1818351779cSPaolo Bonzini 			return;
1828351779cSPaolo Bonzini 	}
1838351779cSPaolo Bonzini 
184c0e64238SBen Gardon 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
185c0e64238SBen Gardon 	list_del_rcu(&root->link);
186c0e64238SBen Gardon 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
187c0e64238SBen Gardon 	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
188a889ea54SBen Gardon }
189a889ea54SBen Gardon 
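/*
 * Worked illustration of the refcount protocol described above (a sketch,
 * assuming no other task takes a reference while the root is being zapped):
 *
 *	refcount 1 -> 0	  last reference dropped, root still marked valid
 *	mark invalid	  kvm_tdp_root_mark_invalid() returns false
 *	refcount 0 -> 1	  temporary reference so yielding writers never see 0
 *	zap		  tdp_mmu_zap_root() runs, possibly yielding mmu_lock
 *	refcount 1 -> 0	  temporary reference dropped
 *	free		  list_del_rcu() + call_rcu() after a grace period
 */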
190cfc10997SBen Gardon /*
191d62007edSSean Christopherson  * Returns the next root after @prev_root (or the first root if @prev_root is
192d62007edSSean Christopherson  * NULL).  A reference to the returned root is acquired, and the reference to
193d62007edSSean Christopherson  * @prev_root is released (the caller obviously must hold a reference to
194d62007edSSean Christopherson  * @prev_root if it's non-NULL).
195d62007edSSean Christopherson  *
196d62007edSSean Christopherson  * If @only_valid is true, invalid roots are skipped.
197d62007edSSean Christopherson  *
198d62007edSSean Christopherson  * Returns NULL if the end of tdp_mmu_roots was reached.
199cfc10997SBen Gardon  */
200cfc10997SBen Gardon static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
2016103bc07SBen Gardon 					      struct kvm_mmu_page *prev_root,
202d62007edSSean Christopherson 					      bool shared, bool only_valid)
203a889ea54SBen Gardon {
204a889ea54SBen Gardon 	struct kvm_mmu_page *next_root;
205a889ea54SBen Gardon 
206c0e64238SBen Gardon 	rcu_read_lock();
207c0e64238SBen Gardon 
208cfc10997SBen Gardon 	if (prev_root)
209c0e64238SBen Gardon 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
210c0e64238SBen Gardon 						  &prev_root->link,
211c0e64238SBen Gardon 						  typeof(*prev_root), link);
212cfc10997SBen Gardon 	else
213c0e64238SBen Gardon 		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
214cfc10997SBen Gardon 						   typeof(*next_root), link);
215cfc10997SBen Gardon 
21604dc4e6cSSean Christopherson 	while (next_root) {
217d62007edSSean Christopherson 		if ((!only_valid || !next_root->role.invalid) &&
218ad6d6b94SJinrong Liang 		    kvm_tdp_mmu_get_root(next_root))
21904dc4e6cSSean Christopherson 			break;
22004dc4e6cSSean Christopherson 
221c0e64238SBen Gardon 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
222c0e64238SBen Gardon 				&next_root->link, typeof(*next_root), link);
22304dc4e6cSSean Christopherson 	}
224fb101293SBen Gardon 
225c0e64238SBen Gardon 	rcu_read_unlock();
226cfc10997SBen Gardon 
227cfc10997SBen Gardon 	if (prev_root)
2286103bc07SBen Gardon 		kvm_tdp_mmu_put_root(kvm, prev_root, shared);
229cfc10997SBen Gardon 
230a889ea54SBen Gardon 	return next_root;
231a889ea54SBen Gardon }
232a889ea54SBen Gardon 
233a889ea54SBen Gardon /*
234a889ea54SBen Gardon  * Note: this iterator gets and puts references to the roots it iterates over.
235a889ea54SBen Gardon  * This makes it safe to release the MMU lock and yield within the loop, but
236a889ea54SBen Gardon  * if exiting the loop early, the caller must drop the reference to the most
237a889ea54SBen Gardon  * recent root. (Unless keeping a live reference is desirable.)
2386103bc07SBen Gardon  *
2396103bc07SBen Gardon  * If shared is set, this function is operating under the MMU lock in read
2406103bc07SBen Gardon  * mode. In the unlikely event that this thread must free a root, the lock
2416103bc07SBen Gardon  * will be temporarily dropped and reacquired in write mode.
242a889ea54SBen Gardon  */
243d62007edSSean Christopherson #define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
244d62007edSSean Christopherson 	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid);	\
245cfc10997SBen Gardon 	     _root;								\
246d62007edSSean Christopherson 	     _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid))	\
247614f6970SPaolo Bonzini 		if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) &&		\
248614f6970SPaolo Bonzini 		    kvm_mmu_page_as_id(_root) != _as_id) {			\
249a3f15bdaSSean Christopherson 		} else
250a889ea54SBen Gardon 
251d62007edSSean Christopherson #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
252d62007edSSean Christopherson 	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
253d62007edSSean Christopherson 
254614f6970SPaolo Bonzini #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)			\
255614f6970SPaolo Bonzini 	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false)
256d62007edSSean Christopherson 
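/*
 * Illustrative usage sketch (hypothetical caller, not from this file):
 * iterate the valid roots of an address space under mmu_lock held for read,
 * dropping the reference manually when breaking out early, as required by
 * the comment above.  "done_with_roots" is a made-up condition.
 *
 *	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, as_id, true) {
 *		if (done_with_roots) {
 *			kvm_tdp_mmu_put_root(kvm, root, true);
 *			break;
 *		}
 *	}
 */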
257226b8c8fSSean Christopherson /*
258226b8c8fSSean Christopherson  * Iterate over all TDP MMU roots.  Requires that mmu_lock be held for write,
259226b8c8fSSean Christopherson  * the implication being that any flow that holds mmu_lock for read is
260226b8c8fSSean Christopherson  * inherently yield-friendly and should use the yield-safe variant above.
261226b8c8fSSean Christopherson  * Holding mmu_lock for write obviates the need for RCU protection as the list
262226b8c8fSSean Christopherson  * is guaranteed to be stable.
263226b8c8fSSean Christopherson  */
264a3f15bdaSSean Christopherson #define for_each_tdp_mmu_root(_kvm, _root, _as_id)			\
265226b8c8fSSean Christopherson 	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)	\
266226b8c8fSSean Christopherson 		if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&	\
267226b8c8fSSean Christopherson 		    kvm_mmu_page_as_id(_root) != _as_id) {		\
268a3f15bdaSSean Christopherson 		} else
26902c00b3aSBen Gardon 
270a82070b6SDavid Matlack static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
27102c00b3aSBen Gardon {
27202c00b3aSBen Gardon 	struct kvm_mmu_page *sp;
27302c00b3aSBen Gardon 
27402c00b3aSBen Gardon 	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
27502c00b3aSBen Gardon 	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
276a82070b6SDavid Matlack 
277a82070b6SDavid Matlack 	return sp;
278a82070b6SDavid Matlack }
279a82070b6SDavid Matlack 
280c10743a1SSean Christopherson static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
281c10743a1SSean Christopherson 			    gfn_t gfn, union kvm_mmu_page_role role)
282a82070b6SDavid Matlack {
28302c00b3aSBen Gardon 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
28402c00b3aSBen Gardon 
285a3aca4deSDavid Matlack 	sp->role = role;
28602c00b3aSBen Gardon 	sp->gfn = gfn;
287c10743a1SSean Christopherson 	sp->ptep = sptep;
28802c00b3aSBen Gardon 	sp->tdp_mmu_page = true;
28902c00b3aSBen Gardon 
29033dd3574SBen Gardon 	trace_kvm_mmu_get_page(sp, true);
29102c00b3aSBen Gardon }
29202c00b3aSBen Gardon 
293a82070b6SDavid Matlack static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
294a3aca4deSDavid Matlack 				  struct tdp_iter *iter)
295a3aca4deSDavid Matlack {
296a3aca4deSDavid Matlack 	struct kvm_mmu_page *parent_sp;
297a3aca4deSDavid Matlack 	union kvm_mmu_page_role role;
298a3aca4deSDavid Matlack 
299a3aca4deSDavid Matlack 	parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
300a3aca4deSDavid Matlack 
301a3aca4deSDavid Matlack 	role = parent_sp->role;
302a3aca4deSDavid Matlack 	role.level--;
303a3aca4deSDavid Matlack 
304c10743a1SSean Christopherson 	tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
305a3aca4deSDavid Matlack }
306a3aca4deSDavid Matlack 
3076e6ec584SSean Christopherson hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
30802c00b3aSBen Gardon {
309a3aca4deSDavid Matlack 	union kvm_mmu_page_role role = vcpu->arch.mmu->mmu_role.base;
31002c00b3aSBen Gardon 	struct kvm *kvm = vcpu->kvm;
31102c00b3aSBen Gardon 	struct kvm_mmu_page *root;
31202c00b3aSBen Gardon 
3136e6ec584SSean Christopherson 	lockdep_assert_held_write(&kvm->mmu_lock);
31402c00b3aSBen Gardon 
31504dc4e6cSSean Christopherson 	/*
31604dc4e6cSSean Christopherson 	 * Check for an existing root before allocating a new one.  Note, the
31704dc4e6cSSean Christopherson 	 * role check prevents consuming an invalid root.
31804dc4e6cSSean Christopherson 	 */
319a3f15bdaSSean Christopherson 	for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
320fb101293SBen Gardon 		if (root->role.word == role.word &&
321ad6d6b94SJinrong Liang 		    kvm_tdp_mmu_get_root(root))
3226e6ec584SSean Christopherson 			goto out;
32302c00b3aSBen Gardon 	}
32402c00b3aSBen Gardon 
325a82070b6SDavid Matlack 	root = tdp_mmu_alloc_sp(vcpu);
326c10743a1SSean Christopherson 	tdp_mmu_init_sp(root, NULL, 0, role);
327a82070b6SDavid Matlack 
32811cccf5cSBen Gardon 	refcount_set(&root->tdp_mmu_root_count, 1);
32902c00b3aSBen Gardon 
330c0e64238SBen Gardon 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
331c0e64238SBen Gardon 	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
332c0e64238SBen Gardon 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
33302c00b3aSBen Gardon 
3346e6ec584SSean Christopherson out:
33502c00b3aSBen Gardon 	return __pa(root->spt);
336fe5db27dSBen Gardon }
3372f2fad08SBen Gardon 
3382f2fad08SBen Gardon static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
3399a77daacSBen Gardon 				u64 old_spte, u64 new_spte, int level,
3409a77daacSBen Gardon 				bool shared);
3412f2fad08SBen Gardon 
342f8e14497SBen Gardon static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
343f8e14497SBen Gardon {
344f8e14497SBen Gardon 	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
345f8e14497SBen Gardon 		return;
346f8e14497SBen Gardon 
347f8e14497SBen Gardon 	if (is_accessed_spte(old_spte) &&
34864bb2769SSean Christopherson 	    (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
34964bb2769SSean Christopherson 	     spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
350f8e14497SBen Gardon 		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
351f8e14497SBen Gardon }
352f8e14497SBen Gardon 
353a6a0b05dSBen Gardon static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
354a6a0b05dSBen Gardon 					  u64 old_spte, u64 new_spte, int level)
355a6a0b05dSBen Gardon {
356a6a0b05dSBen Gardon 	bool pfn_changed;
357a6a0b05dSBen Gardon 	struct kvm_memory_slot *slot;
358a6a0b05dSBen Gardon 
359a6a0b05dSBen Gardon 	if (level > PG_LEVEL_4K)
360a6a0b05dSBen Gardon 		return;
361a6a0b05dSBen Gardon 
362a6a0b05dSBen Gardon 	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
363a6a0b05dSBen Gardon 
364a6a0b05dSBen Gardon 	if ((!is_writable_pte(old_spte) || pfn_changed) &&
365a6a0b05dSBen Gardon 	    is_writable_pte(new_spte)) {
366a6a0b05dSBen Gardon 		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
367fb04a1edSPeter Xu 		mark_page_dirty_in_slot(kvm, slot, gfn);
368a6a0b05dSBen Gardon 	}
369a6a0b05dSBen Gardon }
370a6a0b05dSBen Gardon 
3712f2fad08SBen Gardon /**
372c298a30cSDavid Matlack  * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
373a9442f59SBen Gardon  *
374a9442f59SBen Gardon  * @kvm: kvm instance
375a9442f59SBen Gardon  * @sp: the page to be removed
3769a77daacSBen Gardon  * @shared: This operation may not be running under the exclusive use of
3779a77daacSBen Gardon  *	    the MMU lock and the operation must synchronize with other
3789a77daacSBen Gardon  *	    threads that might be adding or removing pages.
379a9442f59SBen Gardon  */
380c298a30cSDavid Matlack static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
3819a77daacSBen Gardon 			      bool shared)
382a9442f59SBen Gardon {
3839a77daacSBen Gardon 	if (shared)
3849a77daacSBen Gardon 		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
3859a77daacSBen Gardon 	else
386a9442f59SBen Gardon 		lockdep_assert_held_write(&kvm->mmu_lock);
387a9442f59SBen Gardon 
388a9442f59SBen Gardon 	list_del(&sp->link);
389a9442f59SBen Gardon 	if (sp->lpage_disallowed)
390a9442f59SBen Gardon 		unaccount_huge_nx_page(kvm, sp);
3919a77daacSBen Gardon 
3929a77daacSBen Gardon 	if (shared)
3939a77daacSBen Gardon 		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
394a9442f59SBen Gardon }
395a9442f59SBen Gardon 
396a9442f59SBen Gardon /**
3970f53dfa3SDavid Matlack  * handle_removed_pt() - handle a page table removed from the TDP structure
398a066e61fSBen Gardon  *
399a066e61fSBen Gardon  * @kvm: kvm instance
400a066e61fSBen Gardon  * @pt: the page removed from the paging structure
4019a77daacSBen Gardon  * @shared: This operation may not be running under the exclusive use
4029a77daacSBen Gardon  *	    of the MMU lock and the operation must synchronize with other
4039a77daacSBen Gardon  *	    threads that might be modifying SPTEs.
404a066e61fSBen Gardon  *
405a066e61fSBen Gardon  * Given a page table that has been removed from the TDP paging structure,
406a066e61fSBen Gardon  * iterates through the page table to clear SPTEs and free child page tables.
40770fb3e41SBen Gardon  *
40870fb3e41SBen Gardon  * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
40970fb3e41SBen Gardon  * protection. Since this thread removed it from the paging structure,
41070fb3e41SBen Gardon  * this thread will be responsible for ensuring the page is freed. Hence the
41170fb3e41SBen Gardon  * early rcu_dereferences in the function.
412a066e61fSBen Gardon  */
4130f53dfa3SDavid Matlack static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
414a066e61fSBen Gardon {
41570fb3e41SBen Gardon 	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
416a066e61fSBen Gardon 	int level = sp->role.level;
417e25f0e0cSBen Gardon 	gfn_t base_gfn = sp->gfn;
418a066e61fSBen Gardon 	int i;
419a066e61fSBen Gardon 
420a066e61fSBen Gardon 	trace_kvm_mmu_prepare_zap_page(sp);
421a066e61fSBen Gardon 
422c298a30cSDavid Matlack 	tdp_mmu_unlink_sp(kvm, sp, shared);
423a066e61fSBen Gardon 
424a066e61fSBen Gardon 	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
425574c3c55SBen Gardon 		u64 *sptep = rcu_dereference(pt) + i;
426574c3c55SBen Gardon 		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
427574c3c55SBen Gardon 		u64 old_child_spte;
4289a77daacSBen Gardon 
4299a77daacSBen Gardon 		if (shared) {
430e25f0e0cSBen Gardon 			/*
431e25f0e0cSBen Gardon 			 * Set the SPTE to a nonpresent value that other
432e25f0e0cSBen Gardon 			 * threads will not overwrite. If the SPTE was
433e25f0e0cSBen Gardon 			 * already marked as removed then another thread
434e25f0e0cSBen Gardon 			 * handling a page fault could overwrite it, so
435e25f0e0cSBen Gardon 			 * set the SPTE until it is set from some other
436e25f0e0cSBen Gardon 			 * keep retrying until the SPTE is observed to change
437e25f0e0cSBen Gardon 			 * from some other value to the removed SPTE value.
438e25f0e0cSBen Gardon 			for (;;) {
439e25f0e0cSBen Gardon 				old_child_spte = xchg(sptep, REMOVED_SPTE);
440e25f0e0cSBen Gardon 				if (!is_removed_spte(old_child_spte))
441e25f0e0cSBen Gardon 					break;
442e25f0e0cSBen Gardon 				cpu_relax();
443e25f0e0cSBen Gardon 			}
4449a77daacSBen Gardon 		} else {
4458df9f1afSSean Christopherson 			/*
4468df9f1afSSean Christopherson 			 * If the SPTE is not MMU-present, there is no backing
4478df9f1afSSean Christopherson 			 * page associated with the SPTE and so no side effects
4488df9f1afSSean Christopherson 			 * that need to be recorded, and exclusive ownership of
4498df9f1afSSean Christopherson 			 * mmu_lock ensures the SPTE can't be made present.
4508df9f1afSSean Christopherson 			 * Note, zapping MMIO SPTEs is also unnecessary as they
4518df9f1afSSean Christopherson 			 * are guarded by the memslots generation, not by being
4528df9f1afSSean Christopherson 			 * unreachable.
4538df9f1afSSean Christopherson 			 */
4549a77daacSBen Gardon 			old_child_spte = READ_ONCE(*sptep);
4558df9f1afSSean Christopherson 			if (!is_shadow_present_pte(old_child_spte))
4568df9f1afSSean Christopherson 				continue;
457e25f0e0cSBen Gardon 
458e25f0e0cSBen Gardon 			/*
459e25f0e0cSBen Gardon 			 * Marking the SPTE as a removed SPTE is not
460e25f0e0cSBen Gardon 			 * strictly necessary here as the MMU lock will
461e25f0e0cSBen Gardon 			 * stop other threads from concurrently modifying
462e25f0e0cSBen Gardon 			 * this SPTE. Using the removed SPTE value keeps
463e25f0e0cSBen Gardon 			 * the two branches consistent and simplifies
464e25f0e0cSBen Gardon 			 * the function.
465e25f0e0cSBen Gardon 			 */
466e25f0e0cSBen Gardon 			WRITE_ONCE(*sptep, REMOVED_SPTE);
4679a77daacSBen Gardon 		}
468e25f0e0cSBen Gardon 		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
469f1b83255SKai Huang 				    old_child_spte, REMOVED_SPTE, level,
470e25f0e0cSBen Gardon 				    shared);
471a066e61fSBen Gardon 	}
472a066e61fSBen Gardon 
4737cca2d0bSBen Gardon 	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
474a066e61fSBen Gardon }
475a066e61fSBen Gardon 
476a066e61fSBen Gardon /**
4777f6231a3SKai Huang  * __handle_changed_spte - handle bookkeeping associated with an SPTE change
4782f2fad08SBen Gardon  * @kvm: kvm instance
4792f2fad08SBen Gardon  * @as_id: the address space of the paging structure the SPTE was a part of
4802f2fad08SBen Gardon  * @gfn: the base GFN that was mapped by the SPTE
4812f2fad08SBen Gardon  * @old_spte: The value of the SPTE before the change
4822f2fad08SBen Gardon  * @new_spte: The value of the SPTE after the change
4832f2fad08SBen Gardon  * @level: the level of the PT the SPTE is part of in the paging structure
4849a77daacSBen Gardon  * @shared: This operation may not be running under the exclusive use of
4859a77daacSBen Gardon  *	    the MMU lock and the operation must synchronize with other
4869a77daacSBen Gardon  *	    threads that might be modifying SPTEs.
4872f2fad08SBen Gardon  *
4882f2fad08SBen Gardon  * Handle bookkeeping that might result from the modification of a SPTE.
4892f2fad08SBen Gardon  * This function must be called for all TDP SPTE modifications.
4902f2fad08SBen Gardon  */
4912f2fad08SBen Gardon static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
4929a77daacSBen Gardon 				  u64 old_spte, u64 new_spte, int level,
4939a77daacSBen Gardon 				  bool shared)
4942f2fad08SBen Gardon {
4952f2fad08SBen Gardon 	bool was_present = is_shadow_present_pte(old_spte);
4962f2fad08SBen Gardon 	bool is_present = is_shadow_present_pte(new_spte);
4972f2fad08SBen Gardon 	bool was_leaf = was_present && is_last_spte(old_spte, level);
4982f2fad08SBen Gardon 	bool is_leaf = is_present && is_last_spte(new_spte, level);
4992f2fad08SBen Gardon 	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
5002f2fad08SBen Gardon 
5012f2fad08SBen Gardon 	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
5022f2fad08SBen Gardon 	WARN_ON(level < PG_LEVEL_4K);
503764388ceSSean Christopherson 	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
5042f2fad08SBen Gardon 
5052f2fad08SBen Gardon 	/*
5062f2fad08SBen Gardon 	 * If this warning were to trigger it would indicate that there was a
5072f2fad08SBen Gardon 	 * missing MMU notifier or a race with some notifier handler.
5082f2fad08SBen Gardon 	 * A present, leaf SPTE should never be directly replaced with another
509d9f6e12fSIngo Molnar 	 * present leaf SPTE pointing to a different PFN. A notifier handler
5102f2fad08SBen Gardon 	 * should be zapping the SPTE before the main MM's page table is
5112f2fad08SBen Gardon 	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
5122f2fad08SBen Gardon 	 * thread before replacement.
5132f2fad08SBen Gardon 	 */
5142f2fad08SBen Gardon 	if (was_leaf && is_leaf && pfn_changed) {
5152f2fad08SBen Gardon 		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
5162f2fad08SBen Gardon 		       "SPTE with another present leaf SPTE mapping a\n"
5172f2fad08SBen Gardon 		       "different PFN!\n"
5182f2fad08SBen Gardon 		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
5192f2fad08SBen Gardon 		       as_id, gfn, old_spte, new_spte, level);
5202f2fad08SBen Gardon 
5212f2fad08SBen Gardon 		/*
5222f2fad08SBen Gardon 		 * Crash the host to prevent error propagation and guest data
523d9f6e12fSIngo Molnar 		 * corruption.
5242f2fad08SBen Gardon 		 */
5252f2fad08SBen Gardon 		BUG();
5262f2fad08SBen Gardon 	}
5272f2fad08SBen Gardon 
5282f2fad08SBen Gardon 	if (old_spte == new_spte)
5292f2fad08SBen Gardon 		return;
5302f2fad08SBen Gardon 
531b9a98c34SBen Gardon 	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
532b9a98c34SBen Gardon 
533115111efSDavid Matlack 	if (is_leaf)
534115111efSDavid Matlack 		check_spte_writable_invariants(new_spte);
535115111efSDavid Matlack 
5362f2fad08SBen Gardon 	/*
5372f2fad08SBen Gardon 	 * The only times a SPTE should be changed from a non-present to
5382f2fad08SBen Gardon 	 * non-present state is when an MMIO entry is installed/modified/
5392f2fad08SBen Gardon 	 * removed. In that case, there is nothing to do here.
5402f2fad08SBen Gardon 	 */
5412f2fad08SBen Gardon 	if (!was_present && !is_present) {
5422f2fad08SBen Gardon 		/*
54308f07c80SBen Gardon 		 * If this change does not involve an MMIO SPTE or removed SPTE,
54408f07c80SBen Gardon 		 * it is unexpected. Log the change, though it should not
54508f07c80SBen Gardon 		 * impact the guest since both the former and current SPTEs
54608f07c80SBen Gardon 		 * are nonpresent.
5472f2fad08SBen Gardon 		 */
54808f07c80SBen Gardon 		if (WARN_ON(!is_mmio_spte(old_spte) &&
54908f07c80SBen Gardon 			    !is_mmio_spte(new_spte) &&
55008f07c80SBen Gardon 			    !is_removed_spte(new_spte)))
5512f2fad08SBen Gardon 			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
5522f2fad08SBen Gardon 			       "should not be replaced with another,\n"
5532f2fad08SBen Gardon 			       "different nonpresent SPTE, unless one or both\n"
55408f07c80SBen Gardon 			       "are MMIO SPTEs, or the new SPTE is\n"
55508f07c80SBen Gardon 			       "a temporary removed SPTE.\n"
5562f2fad08SBen Gardon 			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
5572f2fad08SBen Gardon 			       as_id, gfn, old_spte, new_spte, level);
5582f2fad08SBen Gardon 		return;
5592f2fad08SBen Gardon 	}
5602f2fad08SBen Gardon 
56171f51d2cSMingwei Zhang 	if (is_leaf != was_leaf)
56271f51d2cSMingwei Zhang 		kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
5632f2fad08SBen Gardon 
5642f2fad08SBen Gardon 	if (was_leaf && is_dirty_spte(old_spte) &&
56564bb2769SSean Christopherson 	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
5662f2fad08SBen Gardon 		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
5672f2fad08SBen Gardon 
5682f2fad08SBen Gardon 	/*
5692f2fad08SBen Gardon 	 * Recursively handle child PTs if the change removed a subtree from
570c8e5a0d0SSean Christopherson 	 * the paging structure.  Note the WARN on the PFN changing without the
571c8e5a0d0SSean Christopherson 	 * SPTE being converted to a hugepage (leaf) or being zapped.  Shadow
572c8e5a0d0SSean Christopherson 	 * pages are kernel allocations and should never be migrated.
5732f2fad08SBen Gardon 	 */
574c8e5a0d0SSean Christopherson 	if (was_present && !was_leaf &&
575c8e5a0d0SSean Christopherson 	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
5760f53dfa3SDavid Matlack 		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
5772f2fad08SBen Gardon }
5782f2fad08SBen Gardon 
5792f2fad08SBen Gardon static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
5809a77daacSBen Gardon 				u64 old_spte, u64 new_spte, int level,
5819a77daacSBen Gardon 				bool shared)
5822f2fad08SBen Gardon {
5839a77daacSBen Gardon 	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
5849a77daacSBen Gardon 			      shared);
585f8e14497SBen Gardon 	handle_changed_spte_acc_track(old_spte, new_spte, level);
586a6a0b05dSBen Gardon 	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
587a6a0b05dSBen Gardon 				      new_spte, level);
5882f2fad08SBen Gardon }
589faaf05b0SBen Gardon 
590fe43fa2fSBen Gardon /*
5916ccf4438SPaolo Bonzini  * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
5926ccf4438SPaolo Bonzini  * and handle the associated bookkeeping.  Do not mark the page dirty
59324ae4cfaSBen Gardon  * in KVM's dirty bitmaps.
5949a77daacSBen Gardon  *
5953255530aSDavid Matlack  * If setting the SPTE fails because it has changed, iter->old_spte will be
5963255530aSDavid Matlack  * refreshed to the current value of the spte.
5973255530aSDavid Matlack  *
5989a77daacSBen Gardon  * @kvm: kvm instance
5999a77daacSBen Gardon  * @iter: a tdp_iter instance currently on the SPTE that should be set
6009a77daacSBen Gardon  * @new_spte: The value the SPTE should be set to
6013e72c791SDavid Matlack  * Return:
6023e72c791SDavid Matlack  * * 0      - If the SPTE was set.
6033e72c791SDavid Matlack  * * -EBUSY - If the SPTE cannot be set. In this case this function will have
6043e72c791SDavid Matlack  *            no side-effects other than setting iter->old_spte to the last
6053e72c791SDavid Matlack  *            known value of the spte.
6069a77daacSBen Gardon  */
6073e72c791SDavid Matlack static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
6089a77daacSBen Gardon 					  struct tdp_iter *iter,
6099a77daacSBen Gardon 					  u64 new_spte)
6109a77daacSBen Gardon {
6113255530aSDavid Matlack 	u64 *sptep = rcu_dereference(iter->sptep);
6123255530aSDavid Matlack 	u64 old_spte;
6133255530aSDavid Matlack 
6143a0f64deSSean Christopherson 	WARN_ON_ONCE(iter->yielded);
6153a0f64deSSean Christopherson 
6169a77daacSBen Gardon 	lockdep_assert_held_read(&kvm->mmu_lock);
6179a77daacSBen Gardon 
61808f07c80SBen Gardon 	/*
61908f07c80SBen Gardon 	 * Do not change removed SPTEs. Only the thread that froze the SPTE
62008f07c80SBen Gardon 	 * may modify it.
62108f07c80SBen Gardon 	 */
6227a51393aSSean Christopherson 	if (is_removed_spte(iter->old_spte))
6233e72c791SDavid Matlack 		return -EBUSY;
62408f07c80SBen Gardon 
6256e8eb206SDavid Matlack 	/*
6266e8eb206SDavid Matlack 	 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
6276e8eb206SDavid Matlack 	 * does not hold the mmu_lock.
6286e8eb206SDavid Matlack 	 */
6293255530aSDavid Matlack 	old_spte = cmpxchg64(sptep, iter->old_spte, new_spte);
6303255530aSDavid Matlack 	if (old_spte != iter->old_spte) {
6313255530aSDavid Matlack 		/*
6323255530aSDavid Matlack 		 * The page table entry was modified by a different logical
6333255530aSDavid Matlack 		 * CPU. Refresh iter->old_spte with the current value so the
6343255530aSDavid Matlack 		 * caller operates on fresh data, e.g. if it retries
6353255530aSDavid Matlack 		 * tdp_mmu_set_spte_atomic().
6363255530aSDavid Matlack 		 */
6373255530aSDavid Matlack 		iter->old_spte = old_spte;
6383e72c791SDavid Matlack 		return -EBUSY;
6393255530aSDavid Matlack 	}
6409a77daacSBen Gardon 
64124ae4cfaSBen Gardon 	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
64208889894SSean Christopherson 			      new_spte, iter->level, true);
64324ae4cfaSBen Gardon 	handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
6449a77daacSBen Gardon 
6453e72c791SDavid Matlack 	return 0;
6469a77daacSBen Gardon }
6479a77daacSBen Gardon 
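/*
 * Illustrative retry pattern (sketch): callers treat a -EBUSY return as
 * "another CPU changed the SPTE" and retry against the refreshed
 * iter->old_spte, as the atomic zap loops later in this file do:
 *
 *	retry:
 *		...
 *		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
 *			goto retry;
 */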
6483e72c791SDavid Matlack static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
64908f07c80SBen Gardon 					  struct tdp_iter *iter)
65008f07c80SBen Gardon {
6513e72c791SDavid Matlack 	int ret;
6523e72c791SDavid Matlack 
65308f07c80SBen Gardon 	/*
65408f07c80SBen Gardon 	 * Freeze the SPTE by setting it to a special,
65508f07c80SBen Gardon 	 * non-present value. This will stop other threads from
65608f07c80SBen Gardon 	 * immediately installing a present entry in its place
65708f07c80SBen Gardon 	 * before the TLBs are flushed.
65808f07c80SBen Gardon 	 */
6593e72c791SDavid Matlack 	ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
6603e72c791SDavid Matlack 	if (ret)
6613e72c791SDavid Matlack 		return ret;
66208f07c80SBen Gardon 
66308f07c80SBen Gardon 	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
66408f07c80SBen Gardon 					   KVM_PAGES_PER_HPAGE(iter->level));
66508f07c80SBen Gardon 
66608f07c80SBen Gardon 	/*
66708f07c80SBen Gardon 	 * No other thread can overwrite the removed SPTE as they
66808f07c80SBen Gardon 	 * must either wait on the MMU lock or use
669d9f6e12fSIngo Molnar 	 * tdp_mmu_set_spte_atomic which will not overwrite the
67008f07c80SBen Gardon 	 * special removed SPTE value. No bookkeeping is needed
67108f07c80SBen Gardon 	 * here since the SPTE is going from non-present
67208f07c80SBen Gardon 	 * to non-present.
67308f07c80SBen Gardon 	 */
6740e587aa7SSean Christopherson 	kvm_tdp_mmu_write_spte(iter->sptep, 0);
67508f07c80SBen Gardon 
6763e72c791SDavid Matlack 	return 0;
67708f07c80SBen Gardon }
67808f07c80SBen Gardon 
6799a77daacSBen Gardon 
6809a77daacSBen Gardon /*
681fe43fa2fSBen Gardon  * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
682626808d1SSean Christopherson  * @kvm:	      KVM instance
683626808d1SSean Christopherson  * @as_id:	      Address space ID, i.e. regular vs. SMM
684626808d1SSean Christopherson  * @sptep:	      Pointer to the SPTE
685626808d1SSean Christopherson  * @old_spte:	      The current value of the SPTE
686626808d1SSean Christopherson  * @new_spte:	      The new value that will be set for the SPTE
687626808d1SSean Christopherson  * @gfn:	      The base GFN that was (or will be) mapped by the SPTE
688626808d1SSean Christopherson  * @level:	      The level _containing_ the SPTE (its parent PT's level)
689fe43fa2fSBen Gardon  * @record_acc_track: Notify the MM subsystem of changes to the accessed state
690fe43fa2fSBen Gardon  *		      of the page. Should be set unless handling an MMU
691fe43fa2fSBen Gardon  *		      notifier for access tracking. Leaving record_acc_track
692fe43fa2fSBen Gardon  *		      unset in that case prevents page accesses from being
693fe43fa2fSBen Gardon  *		      double counted.
694fe43fa2fSBen Gardon  * @record_dirty_log: Record the page as dirty in the dirty bitmap if
695fe43fa2fSBen Gardon  *		      appropriate for the change being made. Should be set
696fe43fa2fSBen Gardon  *		      unless performing certain dirty logging operations.
697fe43fa2fSBen Gardon  *		      Leaving record_dirty_log unset in that case prevents page
698fe43fa2fSBen Gardon  *		      writes from being double counted.
699fe43fa2fSBen Gardon  */
700626808d1SSean Christopherson static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
701626808d1SSean Christopherson 			       u64 old_spte, u64 new_spte, gfn_t gfn, int level,
702626808d1SSean Christopherson 			       bool record_acc_track, bool record_dirty_log)
703faaf05b0SBen Gardon {
704531810caSBen Gardon 	lockdep_assert_held_write(&kvm->mmu_lock);
7053a9a4aa5SBen Gardon 
70608f07c80SBen Gardon 	/*
707966da62aSSean Christopherson 	 * No thread should be using this function to set SPTEs to or from the
70808f07c80SBen Gardon 	 * temporary removed SPTE value.
70908f07c80SBen Gardon 	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
71008f07c80SBen Gardon 	 * should be used. If operating under the MMU lock in write mode, the
71108f07c80SBen Gardon 	 * use of the removed SPTE should not be necessary.
71208f07c80SBen Gardon 	 */
713626808d1SSean Christopherson 	WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));
71408f07c80SBen Gardon 
715626808d1SSean Christopherson 	kvm_tdp_mmu_write_spte(sptep, new_spte);
716faaf05b0SBen Gardon 
717626808d1SSean Christopherson 	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
718626808d1SSean Christopherson 
719f8e14497SBen Gardon 	if (record_acc_track)
720626808d1SSean Christopherson 		handle_changed_spte_acc_track(old_spte, new_spte, level);
721a6a0b05dSBen Gardon 	if (record_dirty_log)
722626808d1SSean Christopherson 		handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
723626808d1SSean Christopherson 					      new_spte, level);
724626808d1SSean Christopherson }
725626808d1SSean Christopherson 
726626808d1SSean Christopherson static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
727626808d1SSean Christopherson 				     u64 new_spte, bool record_acc_track,
728626808d1SSean Christopherson 				     bool record_dirty_log)
729626808d1SSean Christopherson {
730626808d1SSean Christopherson 	WARN_ON_ONCE(iter->yielded);
731626808d1SSean Christopherson 
732626808d1SSean Christopherson 	__tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, iter->old_spte,
733626808d1SSean Christopherson 			   new_spte, iter->gfn, iter->level,
734626808d1SSean Christopherson 			   record_acc_track, record_dirty_log);
735f8e14497SBen Gardon }
736f8e14497SBen Gardon 
737f8e14497SBen Gardon static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
738f8e14497SBen Gardon 				    u64 new_spte)
739f8e14497SBen Gardon {
740626808d1SSean Christopherson 	_tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
741f8e14497SBen Gardon }
742f8e14497SBen Gardon 
743f8e14497SBen Gardon static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
744f8e14497SBen Gardon 						 struct tdp_iter *iter,
745f8e14497SBen Gardon 						 u64 new_spte)
746f8e14497SBen Gardon {
747626808d1SSean Christopherson 	_tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
748a6a0b05dSBen Gardon }
749a6a0b05dSBen Gardon 
750a6a0b05dSBen Gardon static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
751a6a0b05dSBen Gardon 						 struct tdp_iter *iter,
752a6a0b05dSBen Gardon 						 u64 new_spte)
753a6a0b05dSBen Gardon {
754626808d1SSean Christopherson 	_tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
755faaf05b0SBen Gardon }
756faaf05b0SBen Gardon 
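/*
 * Summary of the wrappers above, derived directly from their calls to
 * _tdp_mmu_set_spte():
 *
 *	tdp_mmu_set_spte()		record_acc_track=true,  record_dirty_log=true
 *	tdp_mmu_set_spte_no_acc_track()	record_acc_track=false, record_dirty_log=true
 *	tdp_mmu_set_spte_no_dirty_log()	record_acc_track=true,  record_dirty_log=false
 */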
757faaf05b0SBen Gardon #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
75877aa6075SDavid Matlack 	for_each_tdp_pte(_iter, _root, _start, _end)
759faaf05b0SBen Gardon 
760f8e14497SBen Gardon #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
761f8e14497SBen Gardon 	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
762f8e14497SBen Gardon 		if (!is_shadow_present_pte(_iter.old_spte) ||		\
763f8e14497SBen Gardon 		    !is_last_spte(_iter.old_spte, _iter.level))		\
764f8e14497SBen Gardon 			continue;					\
765f8e14497SBen Gardon 		else
766f8e14497SBen Gardon 
767bb18842eSBen Gardon #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
768b9e5603cSPaolo Bonzini 	for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end)
769bb18842eSBen Gardon 
770faaf05b0SBen Gardon /*
771e28a436cSBen Gardon  * Yield if the MMU lock is contended or this thread needs to return control
772e28a436cSBen Gardon  * to the scheduler.
773e28a436cSBen Gardon  *
774e139a34eSBen Gardon  * If this function should yield and flush is set, it will perform a remote
775e139a34eSBen Gardon  * TLB flush before yielding.
776e139a34eSBen Gardon  *
7773a0f64deSSean Christopherson  * If this function yields, iter->yielded is set and the caller must skip to
7783a0f64deSSean Christopherson  * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
7793a0f64deSSean Christopherson  * over the paging structures to allow the iterator to continue its traversal
7803a0f64deSSean Christopherson  * from the paging structure root.
781e28a436cSBen Gardon  *
7823a0f64deSSean Christopherson  * Returns true if this function yielded.
783e28a436cSBen Gardon  */
7843a0f64deSSean Christopherson static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
7853a0f64deSSean Christopherson 							  struct tdp_iter *iter,
7863a0f64deSSean Christopherson 							  bool flush, bool shared)
787a6a0b05dSBen Gardon {
7883a0f64deSSean Christopherson 	WARN_ON(iter->yielded);
7893a0f64deSSean Christopherson 
790ed5e484bSBen Gardon 	/* Ensure forward progress has been made before yielding. */
791ed5e484bSBen Gardon 	if (iter->next_last_level_gfn == iter->yielded_gfn)
792ed5e484bSBen Gardon 		return false;
793ed5e484bSBen Gardon 
794531810caSBen Gardon 	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
795e139a34eSBen Gardon 		if (flush)
796e139a34eSBen Gardon 			kvm_flush_remote_tlbs(kvm);
797e139a34eSBen Gardon 
798bd296779SSean Christopherson 		rcu_read_unlock();
799bd296779SSean Christopherson 
8006103bc07SBen Gardon 		if (shared)
8016103bc07SBen Gardon 			cond_resched_rwlock_read(&kvm->mmu_lock);
8026103bc07SBen Gardon 		else
803531810caSBen Gardon 			cond_resched_rwlock_write(&kvm->mmu_lock);
8046103bc07SBen Gardon 
8057cca2d0bSBen Gardon 		rcu_read_lock();
806ed5e484bSBen Gardon 
807ed5e484bSBen Gardon 		WARN_ON(iter->gfn > iter->next_last_level_gfn);
808ed5e484bSBen Gardon 
8093a0f64deSSean Christopherson 		iter->yielded = true;
810a6a0b05dSBen Gardon 	}
811e28a436cSBen Gardon 
8123a0f64deSSean Christopherson 	return iter->yielded;
813a6a0b05dSBen Gardon }
814a6a0b05dSBen Gardon 
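/*
 * Typical call-site shape (illustrative sketch; the zap loop in
 * tdp_mmu_zap_leafs() below follows this pattern): check for a yield at the
 * top of each iteration and simply continue, since on the next iteration
 * tdp_iter_next() restarts the walk from the paging-structure root:
 *
 *	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
 *		if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
 *			flush = false;
 *			continue;
 *		}
 *		...
 *	}
 */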
815e2b5b21dSSean Christopherson static inline gfn_t tdp_mmu_max_gfn_host(void)
816e2b5b21dSSean Christopherson {
817e2b5b21dSSean Christopherson 	/*
818e2b5b21dSSean Christopherson 	 * Bound TDP MMU walks at host.MAXPHYADDR; guest accesses beyond that
819e2b5b21dSSean Christopherson 	 * will hit a #PF(RSVD) and never hit an EPT Violation/Misconfig / #NPF,
820e2b5b21dSSean Christopherson 	 * and so KVM will never install a SPTE for such addresses.
821e2b5b21dSSean Christopherson 	 */
822e2b5b21dSSean Christopherson 	return 1ULL << (shadow_phys_bits - PAGE_SHIFT);
823e2b5b21dSSean Christopherson }
824e2b5b21dSSean Christopherson 
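/*
 * Worked example (illustrative, assuming a 52-bit host MAXPHYADDR and 4KiB
 * pages): with shadow_phys_bits == 52 and PAGE_SHIFT == 12, the bound is
 * 1ULL << 40 GFNs, i.e. guest physical addresses below 1ULL << 52.
 */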
825*1b6043e8SSean Christopherson static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
826*1b6043e8SSean Christopherson 			       bool shared, int zap_level)
827e2b5b21dSSean Christopherson {
828e2b5b21dSSean Christopherson 	struct tdp_iter iter;
829e2b5b21dSSean Christopherson 
830e2b5b21dSSean Christopherson 	gfn_t end = tdp_mmu_max_gfn_host();
831e2b5b21dSSean Christopherson 	gfn_t start = 0;
832e2b5b21dSSean Christopherson 
833*1b6043e8SSean Christopherson 	for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
834*1b6043e8SSean Christopherson retry:
835*1b6043e8SSean Christopherson 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
836*1b6043e8SSean Christopherson 			continue;
837*1b6043e8SSean Christopherson 
838*1b6043e8SSean Christopherson 		if (!is_shadow_present_pte(iter.old_spte))
839*1b6043e8SSean Christopherson 			continue;
840*1b6043e8SSean Christopherson 
841*1b6043e8SSean Christopherson 		if (iter.level > zap_level)
842*1b6043e8SSean Christopherson 			continue;
843*1b6043e8SSean Christopherson 
844*1b6043e8SSean Christopherson 		if (!shared)
845*1b6043e8SSean Christopherson 			tdp_mmu_set_spte(kvm, &iter, 0);
846*1b6043e8SSean Christopherson 		else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
847*1b6043e8SSean Christopherson 			goto retry;
848*1b6043e8SSean Christopherson 	}
849*1b6043e8SSean Christopherson }
850*1b6043e8SSean Christopherson 
851*1b6043e8SSean Christopherson static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
852*1b6043e8SSean Christopherson 			     bool shared)
853*1b6043e8SSean Christopherson {
854*1b6043e8SSean Christopherson 
8558351779cSPaolo Bonzini 	/*
8568351779cSPaolo Bonzini 	 * The root must have an elevated refcount so that it's reachable via
8578351779cSPaolo Bonzini 	 * mmu_notifier callbacks, which allows this path to yield and drop
8588351779cSPaolo Bonzini 	 * mmu_lock.  When handling an unmap/release mmu_notifier command, KVM
8598351779cSPaolo Bonzini 	 * must drop all references to relevant pages prior to completing the
8608351779cSPaolo Bonzini 	 * callback.  Dropping mmu_lock with an unreachable root would result
8618351779cSPaolo Bonzini 	 * in zapping SPTEs after a relevant mmu_notifier callback completes
8628351779cSPaolo Bonzini 	 * and lead to use-after-free as zapping a SPTE triggers "writeback" of
8638351779cSPaolo Bonzini 	 * dirty accessed bits to the SPTE's associated struct page.
8648351779cSPaolo Bonzini 	 */
8658351779cSPaolo Bonzini 	WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));
8668351779cSPaolo Bonzini 
867e2b5b21dSSean Christopherson 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
868e2b5b21dSSean Christopherson 
869e2b5b21dSSean Christopherson 	rcu_read_lock();
870e2b5b21dSSean Christopherson 
871e2b5b21dSSean Christopherson 	/*
872*1b6043e8SSean Christopherson 	 * To avoid RCU stalls due to recursively removing huge swaths of SPs,
873*1b6043e8SSean Christopherson 	 * split the zap into two passes.  On the first pass, zap at the 1gb
874*1b6043e8SSean Christopherson 	 * level, and then zap top-level SPs on the second pass.  "1gb" is not
875*1b6043e8SSean Christopherson 	 * arbitrary, as KVM must be able to zap a 1gb shadow page without
876*1b6043e8SSean Christopherson 	 * inducing a stall to allow in-place replacement with a 1gb hugepage.
877*1b6043e8SSean Christopherson 	 *
878*1b6043e8SSean Christopherson 	 * Because zapping a SP recurses on its children, stepping down to
879*1b6043e8SSean Christopherson 	 * PG_LEVEL_4K in the iterator itself is unnecessary.
880e2b5b21dSSean Christopherson 	 */
881*1b6043e8SSean Christopherson 	__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
882*1b6043e8SSean Christopherson 	__tdp_mmu_zap_root(kvm, root, shared, root->role.level);
883e2b5b21dSSean Christopherson 
884e2b5b21dSSean Christopherson 	rcu_read_unlock();
885e2b5b21dSSean Christopherson }
886e2b5b21dSSean Christopherson 
887c10743a1SSean Christopherson bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
888c10743a1SSean Christopherson {
889c10743a1SSean Christopherson 	u64 old_spte;
890c10743a1SSean Christopherson 
891c10743a1SSean Christopherson 	/*
892c10743a1SSean Christopherson 	 * This helper intentionally doesn't allow zapping a root shadow page,
893c10743a1SSean Christopherson 	 * which doesn't have a parent page table and thus no associated entry.
894c10743a1SSean Christopherson 	 */
895c10743a1SSean Christopherson 	if (WARN_ON_ONCE(!sp->ptep))
896c10743a1SSean Christopherson 		return false;
897c10743a1SSean Christopherson 
898c10743a1SSean Christopherson 	old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
899bb95dfb9SSean Christopherson 	if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
900c10743a1SSean Christopherson 		return false;
901c10743a1SSean Christopherson 
902c10743a1SSean Christopherson 	__tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
903c10743a1SSean Christopherson 			   sp->gfn, sp->role.level + 1, true, true);
904c10743a1SSean Christopherson 
905c10743a1SSean Christopherson 	return true;
906c10743a1SSean Christopherson }
907c10743a1SSean Christopherson 
908faaf05b0SBen Gardon /*
909cf3e2642SSean Christopherson  * Zap leaf SPTEs for the range of gfns, [start, end). Returns true if SPTEs
910cf3e2642SSean Christopherson  * have been cleared and a TLB flush is needed before releasing the MMU lock.
9116103bc07SBen Gardon  *
912063afacdSBen Gardon  * If can_yield is true, will release the MMU lock and reschedule if the
913063afacdSBen Gardon  * scheduler needs the CPU or there is contention on the MMU lock. If this
914063afacdSBen Gardon  * function cannot yield, it will not release the MMU lock or reschedule and
915063afacdSBen Gardon  * the caller must ensure it does not supply too large a GFN range, or the
9166103bc07SBen Gardon  * operation can cause a soft lockup.
917faaf05b0SBen Gardon  */
918cf3e2642SSean Christopherson static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
919acbda82aSSean Christopherson 			      gfn_t start, gfn_t end, bool can_yield, bool flush)
920faaf05b0SBen Gardon {
921faaf05b0SBen Gardon 	struct tdp_iter iter;
922faaf05b0SBen Gardon 
923e2b5b21dSSean Christopherson 	end = min(end, tdp_mmu_max_gfn_host());
924524a1e4eSSean Christopherson 
925acbda82aSSean Christopherson 	lockdep_assert_held_write(&kvm->mmu_lock);
9266103bc07SBen Gardon 
9277cca2d0bSBen Gardon 	rcu_read_lock();
9287cca2d0bSBen Gardon 
929cf3e2642SSean Christopherson 	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
9301af4a960SBen Gardon 		if (can_yield &&
931acbda82aSSean Christopherson 		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
932a835429cSSean Christopherson 			flush = false;
9331af4a960SBen Gardon 			continue;
9341af4a960SBen Gardon 		}
9351af4a960SBen Gardon 
936cf3e2642SSean Christopherson 		if (!is_shadow_present_pte(iter.old_spte) ||
937faaf05b0SBen Gardon 		    !is_last_spte(iter.old_spte, iter.level))
938faaf05b0SBen Gardon 			continue;
939faaf05b0SBen Gardon 
940faaf05b0SBen Gardon 		tdp_mmu_set_spte(kvm, &iter, 0);
941a835429cSSean Christopherson 		flush = true;
942faaf05b0SBen Gardon 	}
9437cca2d0bSBen Gardon 
9447cca2d0bSBen Gardon 	rcu_read_unlock();
945bb95dfb9SSean Christopherson 
946bb95dfb9SSean Christopherson 	/*
947bb95dfb9SSean Christopherson 	 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
948bb95dfb9SSean Christopherson 	 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
949bb95dfb9SSean Christopherson 	 */
950a835429cSSean Christopherson 	return flush;
951faaf05b0SBen Gardon }
952faaf05b0SBen Gardon 
953faaf05b0SBen Gardon /*
954faaf05b0SBen Gardon  * Zap leaf SPTEs for the range of gfns, [start, end), in all roots of the
955faaf05b0SBen Gardon  * given address space; non-leaf shadow pages are not freed. Returns true if
956faaf05b0SBen Gardon  * SPTEs have been cleared and a TLB flush is needed before releasing the
957faaf05b0SBen Gardon  * MMU lock.
958faaf05b0SBen Gardon  */
959cf3e2642SSean Christopherson bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
960cf3e2642SSean Christopherson 			   bool can_yield, bool flush)
961faaf05b0SBen Gardon {
962faaf05b0SBen Gardon 	struct kvm_mmu_page *root;
963faaf05b0SBen Gardon 
964614f6970SPaolo Bonzini 	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
965cf3e2642SSean Christopherson 		flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush);
966faaf05b0SBen Gardon 
967faaf05b0SBen Gardon 	return flush;
968faaf05b0SBen Gardon }
969faaf05b0SBen Gardon 
970faaf05b0SBen Gardon void kvm_tdp_mmu_zap_all(struct kvm *kvm)
971faaf05b0SBen Gardon {
972e2b5b21dSSean Christopherson 	struct kvm_mmu_page *root;
9732b9663d8SSean Christopherson 	int i;
974faaf05b0SBen Gardon 
97577c8cd6bSSean Christopherson 	/*
97622b94c4bSPaolo Bonzini 	 * Zap all roots, including invalid roots, as all SPTEs must be dropped
97722b94c4bSPaolo Bonzini 	 * before returning to the caller.  Zap directly even if the root is
97822b94c4bSPaolo Bonzini 	 * also being zapped by a worker.  Walking zapped top-level SPTEs isn't
97922b94c4bSPaolo Bonzini 	 * all that expensive and mmu_lock is already held, which means the
98022b94c4bSPaolo Bonzini 	 * worker has yielded, i.e. flushing the work instead of zapping here
98122b94c4bSPaolo Bonzini 	 * isn't guaranteed to be any faster.
98222b94c4bSPaolo Bonzini 	 *
98377c8cd6bSSean Christopherson 	 * A TLB flush is unnecessary; KVM zaps everything if and only if the VM
98477c8cd6bSSean Christopherson 	 * is being destroyed or the userspace VMM has exited.  In both cases,
98577c8cd6bSSean Christopherson 	 * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
98677c8cd6bSSean Christopherson 	 */
987e2b5b21dSSean Christopherson 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
988e2b5b21dSSean Christopherson 		for_each_tdp_mmu_root_yield_safe(kvm, root, i)
989e2b5b21dSSean Christopherson 			tdp_mmu_zap_root(kvm, root, false);
990e2b5b21dSSean Christopherson 	}
991faaf05b0SBen Gardon }
992bb18842eSBen Gardon 
9934c6654bdSBen Gardon /*
994f28e9c7fSSean Christopherson  * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
99522b94c4bSPaolo Bonzini  * zap" completes.
9964c6654bdSBen Gardon  */
9974c6654bdSBen Gardon void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
9984c6654bdSBen Gardon {
99922b94c4bSPaolo Bonzini 	flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
10004c6654bdSBen Gardon }
10014c6654bdSBen Gardon 
1002bb18842eSBen Gardon /*
1003f28e9c7fSSean Christopherson  * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
100422b94c4bSPaolo Bonzini  * is about to be zapped, e.g. in response to a memslots update.  The actual
100522b94c4bSPaolo Bonzini  * zapping is performed asynchronously, so a reference is taken on all roots.
100622b94c4bSPaolo Bonzini  * Using a separate workqueue makes it easy to ensure that the destruction is
100722b94c4bSPaolo Bonzini  * performed before the "fast zap" completes, without keeping a separate list
100822b94c4bSPaolo Bonzini  * of invalidated roots; the list is effectively the list of work items in
100922b94c4bSPaolo Bonzini  * the workqueue.
1010b7cccd39SBen Gardon  *
101122b94c4bSPaolo Bonzini  * Get a reference even if the root is already invalid, the asynchronous worker
101222b94c4bSPaolo Bonzini  * assumes it was gifted a reference to the root it processes.  Because mmu_lock
101322b94c4bSPaolo Bonzini  * is held for write, it should be impossible to observe a root with zero refcount,
101422b94c4bSPaolo Bonzini  * i.e. the list of roots cannot be stale.
10154c6654bdSBen Gardon  *
1016b7cccd39SBen Gardon  * This has essentially the same effect for the TDP MMU
1017b7cccd39SBen Gardon  * as updating mmu_valid_gen does for the shadow MMU.
1018b7cccd39SBen Gardon  */
1019b7cccd39SBen Gardon void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
1020b7cccd39SBen Gardon {
1021b7cccd39SBen Gardon 	struct kvm_mmu_page *root;
1022b7cccd39SBen Gardon 
1023b7cccd39SBen Gardon 	lockdep_assert_held_write(&kvm->mmu_lock);
1024f28e9c7fSSean Christopherson 	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
102522b94c4bSPaolo Bonzini 		if (!WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) {
1026b7cccd39SBen Gardon 			root->role.invalid = true;
102722b94c4bSPaolo Bonzini 			tdp_mmu_schedule_zap_root(kvm, root);
102822b94c4bSPaolo Bonzini 		}
1029b7cccd39SBen Gardon 	}
1030f28e9c7fSSean Christopherson }
1031b7cccd39SBen Gardon 
1032bb18842eSBen Gardon /*
1033bb18842eSBen Gardon  * Installs a last-level SPTE to handle a TDP page fault.
1034bb18842eSBen Gardon  * (NPT/EPT violation/misconfiguration)
1035bb18842eSBen Gardon  */
1036cdc47767SPaolo Bonzini static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
1037cdc47767SPaolo Bonzini 					  struct kvm_page_fault *fault,
1038cdc47767SPaolo Bonzini 					  struct tdp_iter *iter)
1039bb18842eSBen Gardon {
1040c435d4b7SSean Christopherson 	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
1041bb18842eSBen Gardon 	u64 new_spte;
104257a3e96dSKai Huang 	int ret = RET_PF_FIXED;
1043ad67e480SPaolo Bonzini 	bool wrprot = false;
1044bb18842eSBen Gardon 
10457158bee4SPaolo Bonzini 	WARN_ON(sp->role.level != fault->goal_level);
1046e710c5f6SDavid Matlack 	if (unlikely(!fault->slot))
1047bb18842eSBen Gardon 		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
10489a77daacSBen Gardon 	else
104953597858SDavid Matlack 		wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
10502839180cSPaolo Bonzini 					 fault->pfn, iter->old_spte, fault->prefetch, true,
10517158bee4SPaolo Bonzini 					 fault->map_writable, &new_spte);
1052bb18842eSBen Gardon 
1053bb18842eSBen Gardon 	if (new_spte == iter->old_spte)
1054bb18842eSBen Gardon 		ret = RET_PF_SPURIOUS;
10553e72c791SDavid Matlack 	else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
10569a77daacSBen Gardon 		return RET_PF_RETRY;
1057bb95dfb9SSean Christopherson 	else if (is_shadow_present_pte(iter->old_spte) &&
1058bb95dfb9SSean Christopherson 		 !is_last_spte(iter->old_spte, iter->level))
1059bb95dfb9SSean Christopherson 		kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
1060bb95dfb9SSean Christopherson 						   KVM_PAGES_PER_HPAGE(iter->level + 1));
1061bb18842eSBen Gardon 
1062bb18842eSBen Gardon 	/*
1063bb18842eSBen Gardon 	 * If the page fault was caused by a write but the page is write
1064bb18842eSBen Gardon 	 * protected, emulation is needed. If the emulation was skipped,
1065bb18842eSBen Gardon 	 * the vCPU would have the same fault again.
1066bb18842eSBen Gardon 	 */
1067ad67e480SPaolo Bonzini 	if (wrprot) {
1068cdc47767SPaolo Bonzini 		if (fault->write)
1069bb18842eSBen Gardon 			ret = RET_PF_EMULATE;
1070bb18842eSBen Gardon 	}
1071bb18842eSBen Gardon 
1072bb18842eSBen Gardon 	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
10739a77daacSBen Gardon 	if (unlikely(is_mmio_spte(new_spte))) {
10749a77daacSBen Gardon 		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
10759a77daacSBen Gardon 				     new_spte);
1076bb18842eSBen Gardon 		ret = RET_PF_EMULATE;
10773849e092SSean Christopherson 	} else {
10789a77daacSBen Gardon 		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
10799a77daacSBen Gardon 				       rcu_dereference(iter->sptep));
10803849e092SSean Christopherson 	}
1081bb18842eSBen Gardon 
1082857f8474SKai Huang 	/*
1083857f8474SKai Huang 	 * Increase pf_fixed in both RET_PF_EMULATE and RET_PF_FIXED to be
1084857f8474SKai Huang 	 * consistent with legacy MMU behavior.
1085857f8474SKai Huang 	 */
1086857f8474SKai Huang 	if (ret != RET_PF_SPURIOUS)
1087bb18842eSBen Gardon 		vcpu->stat.pf_fixed++;
1088bb18842eSBen Gardon 
1089bb18842eSBen Gardon 	return ret;
1090bb18842eSBen Gardon }
1091bb18842eSBen Gardon 
1092bb18842eSBen Gardon /*
1093cb00a70bSDavid Matlack  * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
1094cb00a70bSDavid Matlack  * provided page table.
10957b7e1ab6SDavid Matlack  *
10967b7e1ab6SDavid Matlack  * @kvm: kvm instance
10977b7e1ab6SDavid Matlack  * @iter: a tdp_iter instance currently on the SPTE that should be set
10987b7e1ab6SDavid Matlack  * @sp: The new TDP page table to install.
10997b7e1ab6SDavid Matlack  * @account_nx: True if this page table is being installed to split a
11007b7e1ab6SDavid Matlack  *              non-executable huge page.
1101cb00a70bSDavid Matlack  * @shared: This operation is running under the MMU lock in read mode.
11027b7e1ab6SDavid Matlack  *
11037b7e1ab6SDavid Matlack  * Returns: 0 if the new page table was installed. Non-0 if the page table
11047b7e1ab6SDavid Matlack  *          could not be installed (e.g. the atomic compare-exchange failed).
11057b7e1ab6SDavid Matlack  */
1106cb00a70bSDavid Matlack static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
1107cb00a70bSDavid Matlack 			   struct kvm_mmu_page *sp, bool account_nx,
1108cb00a70bSDavid Matlack 			   bool shared)
11097b7e1ab6SDavid Matlack {
11107b7e1ab6SDavid Matlack 	u64 spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask);
1111cb00a70bSDavid Matlack 	int ret = 0;
11127b7e1ab6SDavid Matlack 
1113cb00a70bSDavid Matlack 	if (shared) {
11147b7e1ab6SDavid Matlack 		ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
11157b7e1ab6SDavid Matlack 		if (ret)
11167b7e1ab6SDavid Matlack 			return ret;
1117cb00a70bSDavid Matlack 	} else {
1118cb00a70bSDavid Matlack 		tdp_mmu_set_spte(kvm, iter, spte);
1119cb00a70bSDavid Matlack 	}
11207b7e1ab6SDavid Matlack 
11217b7e1ab6SDavid Matlack 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
11227b7e1ab6SDavid Matlack 	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
11237b7e1ab6SDavid Matlack 	if (account_nx)
11247b7e1ab6SDavid Matlack 		account_huge_nx_page(kvm, sp);
11257b7e1ab6SDavid Matlack 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
11267b7e1ab6SDavid Matlack 
11277b7e1ab6SDavid Matlack 	return 0;
11287b7e1ab6SDavid Matlack }
11297b7e1ab6SDavid Matlack 
11307b7e1ab6SDavid Matlack /*
1131bb18842eSBen Gardon  * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
1132bb18842eSBen Gardon  * page tables and SPTEs to translate the faulting guest physical address.
1133bb18842eSBen Gardon  */
11342f6305ddSPaolo Bonzini int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
1135bb18842eSBen Gardon {
1136bb18842eSBen Gardon 	struct kvm_mmu *mmu = vcpu->arch.mmu;
1137bb18842eSBen Gardon 	struct tdp_iter iter;
113889c0fd49SBen Gardon 	struct kvm_mmu_page *sp;
1139bb18842eSBen Gardon 	int ret;
1140bb18842eSBen Gardon 
114173a3c659SPaolo Bonzini 	kvm_mmu_hugepage_adjust(vcpu, fault);
1142bb18842eSBen Gardon 
1143f0066d94SPaolo Bonzini 	trace_kvm_mmu_spte_requested(fault);
11447cca2d0bSBen Gardon 
11457cca2d0bSBen Gardon 	rcu_read_lock();
11467cca2d0bSBen Gardon 
11472f6305ddSPaolo Bonzini 	tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
114873a3c659SPaolo Bonzini 		if (fault->nx_huge_page_workaround_enabled)
1149536f0e6aSPaolo Bonzini 			disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
1150bb18842eSBen Gardon 
115173a3c659SPaolo Bonzini 		if (iter.level == fault->goal_level)
1152bb18842eSBen Gardon 			break;
1153bb18842eSBen Gardon 
1154bb18842eSBen Gardon 		/*
1155bb18842eSBen Gardon 		 * If there is an SPTE mapping a large page at a higher level
1156bb18842eSBen Gardon 		 * than the target, that SPTE must be cleared and replaced
1157bb18842eSBen Gardon 		 * with a non-leaf SPTE.
1158bb18842eSBen Gardon 		 */
1159bb18842eSBen Gardon 		if (is_shadow_present_pte(iter.old_spte) &&
1160bb18842eSBen Gardon 		    is_large_pte(iter.old_spte)) {
11613e72c791SDavid Matlack 			if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
11629a77daacSBen Gardon 				break;
1163bb18842eSBen Gardon 
1164bb18842eSBen Gardon 			/*
1165bb18842eSBen Gardon 			 * The iter must explicitly re-read the spte here
1166bb18842eSBen Gardon 			 * because the new value informs the !present
1167bb18842eSBen Gardon 			 * path below.
1168bb18842eSBen Gardon 			 */
11690e587aa7SSean Christopherson 			iter.old_spte = kvm_tdp_mmu_read_spte(iter.sptep);
1170bb18842eSBen Gardon 		}
1171bb18842eSBen Gardon 
1172bb18842eSBen Gardon 		if (!is_shadow_present_pte(iter.old_spte)) {
11737b7e1ab6SDavid Matlack 			bool account_nx = fault->huge_page_disallowed &&
11747b7e1ab6SDavid Matlack 					  fault->req_level >= iter.level;
11757b7e1ab6SDavid Matlack 
1176ff76d506SKai Huang 			/*
1177c4342633SIngo Molnar 			 * If the SPTE has been frozen by another thread, just
1178ff76d506SKai Huang 			 * give up and retry, avoiding an unnecessary page table
1179ff76d506SKai Huang 			 * allocation and free.
1180ff76d506SKai Huang 			 */
1181ff76d506SKai Huang 			if (is_removed_spte(iter.old_spte))
1182ff76d506SKai Huang 				break;
1183ff76d506SKai Huang 
1184a82070b6SDavid Matlack 			sp = tdp_mmu_alloc_sp(vcpu);
1185a82070b6SDavid Matlack 			tdp_mmu_init_child_sp(sp, &iter);
1186a82070b6SDavid Matlack 
1187cb00a70bSDavid Matlack 			if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) {
11889a77daacSBen Gardon 				tdp_mmu_free_sp(sp);
11899a77daacSBen Gardon 				break;
11909a77daacSBen Gardon 			}
1191bb18842eSBen Gardon 		}
1192bb18842eSBen Gardon 	}
1193bb18842eSBen Gardon 
119473a3c659SPaolo Bonzini 	if (iter.level != fault->goal_level) {
11957cca2d0bSBen Gardon 		rcu_read_unlock();
1196bb18842eSBen Gardon 		return RET_PF_RETRY;
11977cca2d0bSBen Gardon 	}
1198bb18842eSBen Gardon 
1199cdc47767SPaolo Bonzini 	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
12007cca2d0bSBen Gardon 	rcu_read_unlock();
1201bb18842eSBen Gardon 
1202bb18842eSBen Gardon 	return ret;
1203bb18842eSBen Gardon }
1204063afacdSBen Gardon 
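/*
 * Zap leaf SPTEs for the GFN range of an MMU notifier unmap event.  Forwards
 * to kvm_tdp_mmu_zap_leafs() for the range's address space and returns true
 * if a TLB flush is needed before releasing the MMU lock.
 */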
12053039bcc7SSean Christopherson bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
12063039bcc7SSean Christopherson 				 bool flush)
1207063afacdSBen Gardon {
1208cf3e2642SSean Christopherson 	return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start,
120983b83a02SSean Christopherson 				     range->end, range->may_block, flush);
12103039bcc7SSean Christopherson }
12113039bcc7SSean Christopherson 
12123039bcc7SSean Christopherson typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
12133039bcc7SSean Christopherson 			      struct kvm_gfn_range *range);
12143039bcc7SSean Christopherson 
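/*
 * Run the handler on every leaf SPTE that maps a GFN in the notifier range,
 * for each root in the range's address space, and OR together the per-SPTE
 * results.
 */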
12153039bcc7SSean Christopherson static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
12163039bcc7SSean Christopherson 						   struct kvm_gfn_range *range,
1217c1b91493SSean Christopherson 						   tdp_handler_t handler)
1218063afacdSBen Gardon {
1219063afacdSBen Gardon 	struct kvm_mmu_page *root;
12203039bcc7SSean Christopherson 	struct tdp_iter iter;
12213039bcc7SSean Christopherson 	bool ret = false;
1222063afacdSBen Gardon 
1223063afacdSBen Gardon 	/*
1224e1eed584SSean Christopherson 	 * Don't support rescheduling, none of the MMU notifiers that funnel
1225e1eed584SSean Christopherson 	 * into this helper allow blocking; it'd be dead, wasteful code.
1226063afacdSBen Gardon 	 */
12273039bcc7SSean Christopherson 	for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
1228a151acecSSean Christopherson 		rcu_read_lock();
1229a151acecSSean Christopherson 
12303039bcc7SSean Christopherson 		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
12313039bcc7SSean Christopherson 			ret |= handler(kvm, &iter, range);
1232063afacdSBen Gardon 
12333039bcc7SSean Christopherson 		rcu_read_unlock();
1234a151acecSSean Christopherson 	}
1235063afacdSBen Gardon 
1236063afacdSBen Gardon 	return ret;
1237063afacdSBen Gardon }
1238063afacdSBen Gardon 
1239f8e14497SBen Gardon /*
1240f8e14497SBen Gardon  * Mark the SPTEs mapping GFNs in the range [start, end) unaccessed and return
1241f8e14497SBen Gardon  * true if any of the GFNs in the range have been accessed.
1242f8e14497SBen Gardon  */
12433039bcc7SSean Christopherson static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
12443039bcc7SSean Christopherson 			  struct kvm_gfn_range *range)
1245f8e14497SBen Gardon {
1246f8e14497SBen Gardon 	u64 new_spte = 0;
1247f8e14497SBen Gardon 
12483039bcc7SSean Christopherson 	/* If we have a non-accessed entry we don't need to change the pte. */
12493039bcc7SSean Christopherson 	if (!is_accessed_spte(iter->old_spte))
12503039bcc7SSean Christopherson 		return false;
12517cca2d0bSBen Gardon 
12523039bcc7SSean Christopherson 	new_spte = iter->old_spte;
1253f8e14497SBen Gardon 
1254f8e14497SBen Gardon 	if (spte_ad_enabled(new_spte)) {
12558f8f52a4SSean Christopherson 		new_spte &= ~shadow_accessed_mask;
1256f8e14497SBen Gardon 	} else {
1257f8e14497SBen Gardon 		/*
1258f8e14497SBen Gardon 		 * Capture the dirty status of the page, so that it doesn't get
1259f8e14497SBen Gardon 		 * lost when the SPTE is marked for access tracking.
1260f8e14497SBen Gardon 		 */
1261f8e14497SBen Gardon 		if (is_writable_pte(new_spte))
1262f8e14497SBen Gardon 			kvm_set_pfn_dirty(spte_to_pfn(new_spte));
1263f8e14497SBen Gardon 
1264f8e14497SBen Gardon 		new_spte = mark_spte_for_access_track(new_spte);
1265f8e14497SBen Gardon 	}
1266f8e14497SBen Gardon 
12673039bcc7SSean Christopherson 	tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
126833dd3574SBen Gardon 
12693039bcc7SSean Christopherson 	return true;
1270f8e14497SBen Gardon }
1271f8e14497SBen Gardon 
12723039bcc7SSean Christopherson bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1273f8e14497SBen Gardon {
12743039bcc7SSean Christopherson 	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
1275f8e14497SBen Gardon }
1276f8e14497SBen Gardon 
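/* Return true if the SPTE has been marked accessed, without modifying it. */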
12773039bcc7SSean Christopherson static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
12783039bcc7SSean Christopherson 			 struct kvm_gfn_range *range)
1279f8e14497SBen Gardon {
12803039bcc7SSean Christopherson 	return is_accessed_spte(iter->old_spte);
1281f8e14497SBen Gardon }
1282f8e14497SBen Gardon 
12833039bcc7SSean Christopherson bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1284f8e14497SBen Gardon {
12853039bcc7SSean Christopherson 	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
12863039bcc7SSean Christopherson }
12873039bcc7SSean Christopherson 
12883039bcc7SSean Christopherson static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
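/*
 * change_pte handler for a single 4K GFN: zap the existing SPTE and, if the
 * new host PTE is read-only, install a new SPTE pointing at the new PFN.
 * Writable PTEs are left unmapped and will be faulted back in on demand.
 */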
12893039bcc7SSean Christopherson 			 struct kvm_gfn_range *range)
12903039bcc7SSean Christopherson {
12913039bcc7SSean Christopherson 	u64 new_spte;
12923039bcc7SSean Christopherson 
12933039bcc7SSean Christopherson 	/* Huge pages aren't expected to be modified without first being zapped. */
12943039bcc7SSean Christopherson 	WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
12953039bcc7SSean Christopherson 
12963039bcc7SSean Christopherson 	if (iter->level != PG_LEVEL_4K ||
12973039bcc7SSean Christopherson 	    !is_shadow_present_pte(iter->old_spte))
12983039bcc7SSean Christopherson 		return false;
12993039bcc7SSean Christopherson 
13003039bcc7SSean Christopherson 	/*
13013039bcc7SSean Christopherson 	 * Note, when changing a read-only SPTE, it's not strictly necessary to
13023039bcc7SSean Christopherson 	 * zero the SPTE before setting the new PFN, but doing so preserves the
13033039bcc7SSean Christopherson 	 * invariant that the PFN of a present leaf SPTE can never change.
13043039bcc7SSean Christopherson 	 * See __handle_changed_spte().
13053039bcc7SSean Christopherson 	 */
13063039bcc7SSean Christopherson 	tdp_mmu_set_spte(kvm, iter, 0);
13073039bcc7SSean Christopherson 
13083039bcc7SSean Christopherson 	if (!pte_write(range->pte)) {
13093039bcc7SSean Christopherson 		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
13103039bcc7SSean Christopherson 								  pte_pfn(range->pte));
13113039bcc7SSean Christopherson 
13123039bcc7SSean Christopherson 		tdp_mmu_set_spte(kvm, iter, new_spte);
13133039bcc7SSean Christopherson 	}
13143039bcc7SSean Christopherson 
13153039bcc7SSean Christopherson 	return true;
1316f8e14497SBen Gardon }
13171d8dd6b3SBen Gardon 
13181d8dd6b3SBen Gardon /*
13191d8dd6b3SBen Gardon  * Handle the changed_pte MMU notifier for the TDP MMU.
13201d8dd6b3SBen Gardon  * range->pte holds the new pte_t mapping the HVA specified by the MMU
13211d8dd6b3SBen Gardon  * notifier.
13221d8dd6b3SBen Gardon  * Returns true if a flush is needed before releasing the MMU lock.
13231d8dd6b3SBen Gardon  */
13243039bcc7SSean Christopherson bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
13251d8dd6b3SBen Gardon {
132693fa50f6SSean Christopherson 	/*
132793fa50f6SSean Christopherson 	 * No need to handle the remote TLB flush under RCU protection, the
132893fa50f6SSean Christopherson 	 * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
132993fa50f6SSean Christopherson 	 * shadow page.  See the WARN on pfn_changed in __handle_changed_spte().
133093fa50f6SSean Christopherson 	 */
133193fa50f6SSean Christopherson 	return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
13321d8dd6b3SBen Gardon }
13331d8dd6b3SBen Gardon 
1334a6a0b05dSBen Gardon /*
1335bedd9195SDavid Matlack  * Remove write access from all SPTEs at or above min_level that map GFNs
1336bedd9195SDavid Matlack  * [start, end). Returns true if an SPTE has been changed and the TLBs need to
1337bedd9195SDavid Matlack  * be flushed.
1338a6a0b05dSBen Gardon  */
1339a6a0b05dSBen Gardon static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1340a6a0b05dSBen Gardon 			     gfn_t start, gfn_t end, int min_level)
1341a6a0b05dSBen Gardon {
1342a6a0b05dSBen Gardon 	struct tdp_iter iter;
1343a6a0b05dSBen Gardon 	u64 new_spte;
1344a6a0b05dSBen Gardon 	bool spte_set = false;
1345a6a0b05dSBen Gardon 
13467cca2d0bSBen Gardon 	rcu_read_lock();
13477cca2d0bSBen Gardon 
1348a6a0b05dSBen Gardon 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1349a6a0b05dSBen Gardon 
135077aa6075SDavid Matlack 	for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
135124ae4cfaSBen Gardon retry:
135224ae4cfaSBen Gardon 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
13531af4a960SBen Gardon 			continue;
13541af4a960SBen Gardon 
1355a6a0b05dSBen Gardon 		if (!is_shadow_present_pte(iter.old_spte) ||
13560f99ee2cSBen Gardon 		    !is_last_spte(iter.old_spte, iter.level) ||
13570f99ee2cSBen Gardon 		    !(iter.old_spte & PT_WRITABLE_MASK))
1358a6a0b05dSBen Gardon 			continue;
1359a6a0b05dSBen Gardon 
1360a6a0b05dSBen Gardon 		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1361a6a0b05dSBen Gardon 
13623e72c791SDavid Matlack 		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
136324ae4cfaSBen Gardon 			goto retry;
13643255530aSDavid Matlack 
1365a6a0b05dSBen Gardon 		spte_set = true;
1366a6a0b05dSBen Gardon 	}
13677cca2d0bSBen Gardon 
13687cca2d0bSBen Gardon 	rcu_read_unlock();
1369a6a0b05dSBen Gardon 	return spte_set;
1370a6a0b05dSBen Gardon }
1371a6a0b05dSBen Gardon 
1372a6a0b05dSBen Gardon /*
1373a6a0b05dSBen Gardon  * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1374a6a0b05dSBen Gardon  * only affect leaf SPTEs down to min_level.
1375a6a0b05dSBen Gardon  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1376a6a0b05dSBen Gardon  */
1377269e9552SHamza Mahfooz bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
1378269e9552SHamza Mahfooz 			     const struct kvm_memory_slot *slot, int min_level)
1379a6a0b05dSBen Gardon {
1380a6a0b05dSBen Gardon 	struct kvm_mmu_page *root;
1381a6a0b05dSBen Gardon 	bool spte_set = false;
1382a6a0b05dSBen Gardon 
138324ae4cfaSBen Gardon 	lockdep_assert_held_read(&kvm->mmu_lock);
1384a6a0b05dSBen Gardon 
1385d62007edSSean Christopherson 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1386a6a0b05dSBen Gardon 		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1387a6a0b05dSBen Gardon 			     slot->base_gfn + slot->npages, min_level);
1388a6a0b05dSBen Gardon 
1389a6a0b05dSBen Gardon 	return spte_set;
1390a6a0b05dSBen Gardon }
1391a6a0b05dSBen Gardon 
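/*
 * Allocate a shadow page and its backing page table for eager huge page
 * splitting, using the caller-provided GFP flags for both allocations.
 */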
1392a3fe5dbdSDavid Matlack static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
1393a3fe5dbdSDavid Matlack {
1394a3fe5dbdSDavid Matlack 	struct kvm_mmu_page *sp;
1395a3fe5dbdSDavid Matlack 
1396a3fe5dbdSDavid Matlack 	gfp |= __GFP_ZERO;
1397a3fe5dbdSDavid Matlack 
1398a3fe5dbdSDavid Matlack 	sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
1399a3fe5dbdSDavid Matlack 	if (!sp)
1400a3fe5dbdSDavid Matlack 		return NULL;
1401a3fe5dbdSDavid Matlack 
1402a3fe5dbdSDavid Matlack 	sp->spt = (void *)__get_free_page(gfp);
1403a3fe5dbdSDavid Matlack 	if (!sp->spt) {
1404a3fe5dbdSDavid Matlack 		kmem_cache_free(mmu_page_header_cache, sp);
1405a3fe5dbdSDavid Matlack 		return NULL;
1406a3fe5dbdSDavid Matlack 	}
1407a3fe5dbdSDavid Matlack 
1408a3fe5dbdSDavid Matlack 	return sp;
1409a3fe5dbdSDavid Matlack }
1410a3fe5dbdSDavid Matlack 
1411a3fe5dbdSDavid Matlack static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
1412cb00a70bSDavid Matlack 						       struct tdp_iter *iter,
1413cb00a70bSDavid Matlack 						       bool shared)
1414a3fe5dbdSDavid Matlack {
1415a3fe5dbdSDavid Matlack 	struct kvm_mmu_page *sp;
1416a3fe5dbdSDavid Matlack 
1417a3fe5dbdSDavid Matlack 	/*
1418a3fe5dbdSDavid Matlack 	 * Since we are allocating while under the MMU lock we have to be
1419a3fe5dbdSDavid Matlack 	 * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
1420a3fe5dbdSDavid Matlack 	 * reclaim and to avoid making any filesystem callbacks (which can end
1421a3fe5dbdSDavid Matlack 	 * up invoking KVM MMU notifiers, resulting in a deadlock).
1422a3fe5dbdSDavid Matlack 	 *
1423a3fe5dbdSDavid Matlack 	 * If this allocation fails we drop the lock and retry with reclaim
1424a3fe5dbdSDavid Matlack 	 * allowed.
1425a3fe5dbdSDavid Matlack 	 */
1426a3fe5dbdSDavid Matlack 	sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
1427a3fe5dbdSDavid Matlack 	if (sp)
1428a3fe5dbdSDavid Matlack 		return sp;
1429a3fe5dbdSDavid Matlack 
1430a3fe5dbdSDavid Matlack 	rcu_read_unlock();
1431cb00a70bSDavid Matlack 
1432cb00a70bSDavid Matlack 	if (shared)
1433a3fe5dbdSDavid Matlack 		read_unlock(&kvm->mmu_lock);
1434cb00a70bSDavid Matlack 	else
1435cb00a70bSDavid Matlack 		write_unlock(&kvm->mmu_lock);
1436a3fe5dbdSDavid Matlack 
1437a3fe5dbdSDavid Matlack 	iter->yielded = true;
1438a3fe5dbdSDavid Matlack 	sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
1439a3fe5dbdSDavid Matlack 
1440cb00a70bSDavid Matlack 	if (shared)
1441a3fe5dbdSDavid Matlack 		read_lock(&kvm->mmu_lock);
1442cb00a70bSDavid Matlack 	else
1443cb00a70bSDavid Matlack 		write_lock(&kvm->mmu_lock);
1444cb00a70bSDavid Matlack 
1445a3fe5dbdSDavid Matlack 	rcu_read_lock();
1446a3fe5dbdSDavid Matlack 
1447a3fe5dbdSDavid Matlack 	return sp;
1448a3fe5dbdSDavid Matlack }
1449a3fe5dbdSDavid Matlack 
1450cb00a70bSDavid Matlack static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1451cb00a70bSDavid Matlack 				   struct kvm_mmu_page *sp, bool shared)
1452a3fe5dbdSDavid Matlack {
1453a3fe5dbdSDavid Matlack 	const u64 huge_spte = iter->old_spte;
1454a3fe5dbdSDavid Matlack 	const int level = iter->level;
1455a3fe5dbdSDavid Matlack 	int ret, i;
1456a3fe5dbdSDavid Matlack 
1457a3fe5dbdSDavid Matlack 	tdp_mmu_init_child_sp(sp, iter);
1458a3fe5dbdSDavid Matlack 
1459a3fe5dbdSDavid Matlack 	/*
1460a3fe5dbdSDavid Matlack 	 * No need for atomics when writing to sp->spt since the page table has
1461a3fe5dbdSDavid Matlack 	 * not been linked in yet and thus is not reachable from any other CPU.
1462a3fe5dbdSDavid Matlack 	 */
1463a3fe5dbdSDavid Matlack 	for (i = 0; i < PT64_ENT_PER_PAGE; i++)
1464a3fe5dbdSDavid Matlack 		sp->spt[i] = make_huge_page_split_spte(huge_spte, level, i);
1465a3fe5dbdSDavid Matlack 
1466a3fe5dbdSDavid Matlack 	/*
1467a3fe5dbdSDavid Matlack 	 * Replace the huge spte with a pointer to the populated lower level
1468a3fe5dbdSDavid Matlack 	 * page table. Since we are making this change without a TLB flush vCPUs
1469a3fe5dbdSDavid Matlack 	 * will see a mix of the split mappings and the original huge mapping,
1470a3fe5dbdSDavid Matlack 	 * depending on what's currently in their TLB. This is fine from a
1471a3fe5dbdSDavid Matlack 	 * correctness standpoint since the translation will be the same either
1472a3fe5dbdSDavid Matlack 	 * way.
1473a3fe5dbdSDavid Matlack 	 */
1474cb00a70bSDavid Matlack 	ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared);
1475a3fe5dbdSDavid Matlack 	if (ret)
1476e0b728b1SDavid Matlack 		goto out;
1477a3fe5dbdSDavid Matlack 
1478a3fe5dbdSDavid Matlack 	/*
1479a3fe5dbdSDavid Matlack 	 * tdp_mmu_link_sp() will handle subtracting the huge page we
1480a3fe5dbdSDavid Matlack 	 * are overwriting from the page stats. But we have to manually update
1481a3fe5dbdSDavid Matlack 	 * the page stats with the new present child pages.
1482a3fe5dbdSDavid Matlack 	 */
1483a3fe5dbdSDavid Matlack 	kvm_update_page_stats(kvm, level - 1, PT64_ENT_PER_PAGE);
1484a3fe5dbdSDavid Matlack 
1485e0b728b1SDavid Matlack out:
1486e0b728b1SDavid Matlack 	trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
1487e0b728b1SDavid Matlack 	return ret;
1488a3fe5dbdSDavid Matlack }
1489a3fe5dbdSDavid Matlack 
1490a3fe5dbdSDavid Matlack static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
1491a3fe5dbdSDavid Matlack 					 struct kvm_mmu_page *root,
1492a3fe5dbdSDavid Matlack 					 gfn_t start, gfn_t end,
1493cb00a70bSDavid Matlack 					 int target_level, bool shared)
1494a3fe5dbdSDavid Matlack {
1495a3fe5dbdSDavid Matlack 	struct kvm_mmu_page *sp = NULL;
1496a3fe5dbdSDavid Matlack 	struct tdp_iter iter;
1497a3fe5dbdSDavid Matlack 	int ret = 0;
1498a3fe5dbdSDavid Matlack 
1499a3fe5dbdSDavid Matlack 	rcu_read_lock();
1500a3fe5dbdSDavid Matlack 
1501a3fe5dbdSDavid Matlack 	/*
1502a3fe5dbdSDavid Matlack 	 * Traverse the page table splitting all huge pages above the target
1503a3fe5dbdSDavid Matlack 	 * level into one lower level. For example, if we encounter a 1GB page
1504a3fe5dbdSDavid Matlack 	 * we split it into 512 2MB pages.
1505a3fe5dbdSDavid Matlack 	 *
1506a3fe5dbdSDavid Matlack 	 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
1507a3fe5dbdSDavid Matlack 	 * to visit an SPTE before ever visiting its children, which means we
1508a3fe5dbdSDavid Matlack 	 * will correctly recursively split huge pages that are more than one
1509a3fe5dbdSDavid Matlack 	 * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
1510a3fe5dbdSDavid Matlack 	 * and then splitting each of those to 512 4KB pages).
1511a3fe5dbdSDavid Matlack 	 */
1512a3fe5dbdSDavid Matlack 	for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
1513a3fe5dbdSDavid Matlack retry:
1514cb00a70bSDavid Matlack 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
1515a3fe5dbdSDavid Matlack 			continue;
1516a3fe5dbdSDavid Matlack 
1517a3fe5dbdSDavid Matlack 		if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
1518a3fe5dbdSDavid Matlack 			continue;
1519a3fe5dbdSDavid Matlack 
1520a3fe5dbdSDavid Matlack 		if (!sp) {
1521cb00a70bSDavid Matlack 			sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
1522a3fe5dbdSDavid Matlack 			if (!sp) {
1523a3fe5dbdSDavid Matlack 				ret = -ENOMEM;
1524e0b728b1SDavid Matlack 				trace_kvm_mmu_split_huge_page(iter.gfn,
1525e0b728b1SDavid Matlack 							      iter.old_spte,
1526e0b728b1SDavid Matlack 							      iter.level, ret);
1527a3fe5dbdSDavid Matlack 				break;
1528a3fe5dbdSDavid Matlack 			}
1529a3fe5dbdSDavid Matlack 
1530a3fe5dbdSDavid Matlack 			if (iter.yielded)
1531a3fe5dbdSDavid Matlack 				continue;
1532a3fe5dbdSDavid Matlack 		}
1533a3fe5dbdSDavid Matlack 
1534cb00a70bSDavid Matlack 		if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
1535a3fe5dbdSDavid Matlack 			goto retry;
1536a3fe5dbdSDavid Matlack 
1537a3fe5dbdSDavid Matlack 		sp = NULL;
1538a3fe5dbdSDavid Matlack 	}
1539a3fe5dbdSDavid Matlack 
1540a3fe5dbdSDavid Matlack 	rcu_read_unlock();
1541a3fe5dbdSDavid Matlack 
1542a3fe5dbdSDavid Matlack 	/*
1543a3fe5dbdSDavid Matlack 	 * It's possible to exit the loop having never used the last sp if, for
1544a3fe5dbdSDavid Matlack 	 * example, a vCPU doing HugePage NX splitting wins the race and
1545a3fe5dbdSDavid Matlack 	 * installs its own sp in place of the last sp we tried to split.
1546a3fe5dbdSDavid Matlack 	 */
1547a3fe5dbdSDavid Matlack 	if (sp)
1548a3fe5dbdSDavid Matlack 		tdp_mmu_free_sp(sp);
1549a3fe5dbdSDavid Matlack 
1550a3fe5dbdSDavid Matlack 	return ret;
1551a3fe5dbdSDavid Matlack }
1552a3fe5dbdSDavid Matlack 
1553cb00a70bSDavid Matlack 
1554a3fe5dbdSDavid Matlack /*
1555a3fe5dbdSDavid Matlack  * Try to split all huge pages mapped by the TDP MMU down to the target level.
1556a3fe5dbdSDavid Matlack  */
1557a3fe5dbdSDavid Matlack void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
1558a3fe5dbdSDavid Matlack 				      const struct kvm_memory_slot *slot,
1559a3fe5dbdSDavid Matlack 				      gfn_t start, gfn_t end,
1560cb00a70bSDavid Matlack 				      int target_level, bool shared)
1561a3fe5dbdSDavid Matlack {
1562a3fe5dbdSDavid Matlack 	struct kvm_mmu_page *root;
1563a3fe5dbdSDavid Matlack 	int r = 0;
1564a3fe5dbdSDavid Matlack 
1565cb00a70bSDavid Matlack 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
1566a3fe5dbdSDavid Matlack 
15677c554d8eSPaolo Bonzini 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
1568cb00a70bSDavid Matlack 		r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
1569a3fe5dbdSDavid Matlack 		if (r) {
1570cb00a70bSDavid Matlack 			kvm_tdp_mmu_put_root(kvm, root, shared);
1571a3fe5dbdSDavid Matlack 			break;
1572a3fe5dbdSDavid Matlack 		}
1573a3fe5dbdSDavid Matlack 	}
1574a3fe5dbdSDavid Matlack }
1575a3fe5dbdSDavid Matlack 
1576a6a0b05dSBen Gardon /*
1577a6a0b05dSBen Gardon  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1578a6a0b05dSBen Gardon  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1579a6a0b05dSBen Gardon  * If AD bits are not enabled, this will require clearing the writable bit on
1580a6a0b05dSBen Gardon  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1581a6a0b05dSBen Gardon  * be flushed.
1582a6a0b05dSBen Gardon  */
1583a6a0b05dSBen Gardon static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1584a6a0b05dSBen Gardon 			   gfn_t start, gfn_t end)
1585a6a0b05dSBen Gardon {
1586a6a0b05dSBen Gardon 	struct tdp_iter iter;
1587a6a0b05dSBen Gardon 	u64 new_spte;
1588a6a0b05dSBen Gardon 	bool spte_set = false;
1589a6a0b05dSBen Gardon 
15907cca2d0bSBen Gardon 	rcu_read_lock();
15917cca2d0bSBen Gardon 
1592a6a0b05dSBen Gardon 	tdp_root_for_each_leaf_pte(iter, root, start, end) {
159324ae4cfaSBen Gardon retry:
159424ae4cfaSBen Gardon 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
15951af4a960SBen Gardon 			continue;
15961af4a960SBen Gardon 
15973354ef5aSSean Christopherson 		if (!is_shadow_present_pte(iter.old_spte))
15983354ef5aSSean Christopherson 			continue;
15993354ef5aSSean Christopherson 
1600a6a0b05dSBen Gardon 		if (spte_ad_need_write_protect(iter.old_spte)) {
1601a6a0b05dSBen Gardon 			if (is_writable_pte(iter.old_spte))
1602a6a0b05dSBen Gardon 				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1603a6a0b05dSBen Gardon 			else
1604a6a0b05dSBen Gardon 				continue;
1605a6a0b05dSBen Gardon 		} else {
1606a6a0b05dSBen Gardon 			if (iter.old_spte & shadow_dirty_mask)
1607a6a0b05dSBen Gardon 				new_spte = iter.old_spte & ~shadow_dirty_mask;
1608a6a0b05dSBen Gardon 			else
1609a6a0b05dSBen Gardon 				continue;
1610a6a0b05dSBen Gardon 		}
1611a6a0b05dSBen Gardon 
16123e72c791SDavid Matlack 		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
161324ae4cfaSBen Gardon 			goto retry;
16143255530aSDavid Matlack 
1615a6a0b05dSBen Gardon 		spte_set = true;
1616a6a0b05dSBen Gardon 	}
16177cca2d0bSBen Gardon 
16187cca2d0bSBen Gardon 	rcu_read_unlock();
1619a6a0b05dSBen Gardon 	return spte_set;
1620a6a0b05dSBen Gardon }
1621a6a0b05dSBen Gardon 
1622a6a0b05dSBen Gardon /*
1623a6a0b05dSBen Gardon  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1624a6a0b05dSBen Gardon  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1625a6a0b05dSBen Gardon  * If AD bits are not enabled, this will require clearing the writable bit on
1626a6a0b05dSBen Gardon  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1627a6a0b05dSBen Gardon  * be flushed.
1628a6a0b05dSBen Gardon  */
1629269e9552SHamza Mahfooz bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
1630269e9552SHamza Mahfooz 				  const struct kvm_memory_slot *slot)
1631a6a0b05dSBen Gardon {
1632a6a0b05dSBen Gardon 	struct kvm_mmu_page *root;
1633a6a0b05dSBen Gardon 	bool spte_set = false;
1634a6a0b05dSBen Gardon 
163524ae4cfaSBen Gardon 	lockdep_assert_held_read(&kvm->mmu_lock);
1636a6a0b05dSBen Gardon 
1637d62007edSSean Christopherson 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1638a6a0b05dSBen Gardon 		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1639a6a0b05dSBen Gardon 				slot->base_gfn + slot->npages);
1640a6a0b05dSBen Gardon 
1641a6a0b05dSBen Gardon 	return spte_set;
1642a6a0b05dSBen Gardon }
1643a6a0b05dSBen Gardon 
1644a6a0b05dSBen Gardon /*
1645a6a0b05dSBen Gardon  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1646a6a0b05dSBen Gardon  * set in mask, starting at gfn. The given memslot is expected to contain all
1647a6a0b05dSBen Gardon  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1648a6a0b05dSBen Gardon  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1649a6a0b05dSBen Gardon  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1650a6a0b05dSBen Gardon  */
1651a6a0b05dSBen Gardon static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1652a6a0b05dSBen Gardon 				  gfn_t gfn, unsigned long mask, bool wrprot)
1653a6a0b05dSBen Gardon {
1654a6a0b05dSBen Gardon 	struct tdp_iter iter;
1655a6a0b05dSBen Gardon 	u64 new_spte;
1656a6a0b05dSBen Gardon 
16577cca2d0bSBen Gardon 	rcu_read_lock();
16587cca2d0bSBen Gardon 
1659a6a0b05dSBen Gardon 	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1660a6a0b05dSBen Gardon 				    gfn + BITS_PER_LONG) {
1661a6a0b05dSBen Gardon 		if (!mask)
1662a6a0b05dSBen Gardon 			break;
1663a6a0b05dSBen Gardon 
1664a6a0b05dSBen Gardon 		if (iter.level > PG_LEVEL_4K ||
1665a6a0b05dSBen Gardon 		    !(mask & (1UL << (iter.gfn - gfn))))
1666a6a0b05dSBen Gardon 			continue;
1667a6a0b05dSBen Gardon 
1668f1b3b06aSBen Gardon 		mask &= ~(1UL << (iter.gfn - gfn));
1669f1b3b06aSBen Gardon 
1670a6a0b05dSBen Gardon 		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1671a6a0b05dSBen Gardon 			if (is_writable_pte(iter.old_spte))
1672a6a0b05dSBen Gardon 				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1673a6a0b05dSBen Gardon 			else
1674a6a0b05dSBen Gardon 				continue;
1675a6a0b05dSBen Gardon 		} else {
1676a6a0b05dSBen Gardon 			if (iter.old_spte & shadow_dirty_mask)
1677a6a0b05dSBen Gardon 				new_spte = iter.old_spte & ~shadow_dirty_mask;
1678a6a0b05dSBen Gardon 			else
1679a6a0b05dSBen Gardon 				continue;
1680a6a0b05dSBen Gardon 		}
1681a6a0b05dSBen Gardon 
1682a6a0b05dSBen Gardon 		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1683a6a0b05dSBen Gardon 	}
16847cca2d0bSBen Gardon 
16857cca2d0bSBen Gardon 	rcu_read_unlock();
1686a6a0b05dSBen Gardon }
1687a6a0b05dSBen Gardon 
1688a6a0b05dSBen Gardon /*
1689a6a0b05dSBen Gardon  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1690a6a0b05dSBen Gardon  * set in mask, starting at gfn. The given memslot is expected to contain all
1691a6a0b05dSBen Gardon  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1692a6a0b05dSBen Gardon  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1693a6a0b05dSBen Gardon  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1694a6a0b05dSBen Gardon  */
1695a6a0b05dSBen Gardon void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1696a6a0b05dSBen Gardon 				       struct kvm_memory_slot *slot,
1697a6a0b05dSBen Gardon 				       gfn_t gfn, unsigned long mask,
1698a6a0b05dSBen Gardon 				       bool wrprot)
1699a6a0b05dSBen Gardon {
1700a6a0b05dSBen Gardon 	struct kvm_mmu_page *root;
1701a6a0b05dSBen Gardon 
1702531810caSBen Gardon 	lockdep_assert_held_write(&kvm->mmu_lock);
1703a3f15bdaSSean Christopherson 	for_each_tdp_mmu_root(kvm, root, slot->as_id)
1704a6a0b05dSBen Gardon 		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1705a6a0b05dSBen Gardon }
1706a6a0b05dSBen Gardon 
1707a6a0b05dSBen Gardon /*
170887aa9ec9SBen Gardon  * Clear leaf entries which could be replaced by large mappings, for
170987aa9ec9SBen Gardon  * GFNs within the slot.
171014881998SBen Gardon  */
17114b85c921SSean Christopherson static void zap_collapsible_spte_range(struct kvm *kvm,
171214881998SBen Gardon 				       struct kvm_mmu_page *root,
17134b85c921SSean Christopherson 				       const struct kvm_memory_slot *slot)
171414881998SBen Gardon {
17159eba50f8SSean Christopherson 	gfn_t start = slot->base_gfn;
17169eba50f8SSean Christopherson 	gfn_t end = start + slot->npages;
171714881998SBen Gardon 	struct tdp_iter iter;
171814881998SBen Gardon 	kvm_pfn_t pfn;
171914881998SBen Gardon 
17207cca2d0bSBen Gardon 	rcu_read_lock();
17217cca2d0bSBen Gardon 
172214881998SBen Gardon 	tdp_root_for_each_pte(iter, root, start, end) {
17232db6f772SBen Gardon retry:
17244b85c921SSean Christopherson 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
17251af4a960SBen Gardon 			continue;
17261af4a960SBen Gardon 
172714881998SBen Gardon 		if (!is_shadow_present_pte(iter.old_spte) ||
172887aa9ec9SBen Gardon 		    !is_last_spte(iter.old_spte, iter.level))
172914881998SBen Gardon 			continue;
173014881998SBen Gardon 
173114881998SBen Gardon 		pfn = spte_to_pfn(iter.old_spte);
173214881998SBen Gardon 		if (kvm_is_reserved_pfn(pfn) ||
17339eba50f8SSean Christopherson 		    iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
17349eba50f8SSean Christopherson 							    pfn, PG_LEVEL_NUM))
173514881998SBen Gardon 			continue;
173614881998SBen Gardon 
17374b85c921SSean Christopherson 		/* Note, a successful atomic zap also does a remote TLB flush. */
17383e72c791SDavid Matlack 		if (tdp_mmu_zap_spte_atomic(kvm, &iter))
17392db6f772SBen Gardon 			goto retry;
17402db6f772SBen Gardon 	}
174114881998SBen Gardon 
17427cca2d0bSBen Gardon 	rcu_read_unlock();
174314881998SBen Gardon }
174414881998SBen Gardon 
174514881998SBen Gardon /*
174614881998SBen Gardon  * Zap leaf SPTEs which could be replaced by large mappings, for GFNs within
174714881998SBen Gardon  * the slot, so that huge mappings can be recreated on subsequent faults.
174814881998SBen Gardon  */
17494b85c921SSean Christopherson void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
17504b85c921SSean Christopherson 				       const struct kvm_memory_slot *slot)
175114881998SBen Gardon {
175214881998SBen Gardon 	struct kvm_mmu_page *root;
175314881998SBen Gardon 
17542db6f772SBen Gardon 	lockdep_assert_held_read(&kvm->mmu_lock);
175514881998SBen Gardon 
1756d62007edSSean Christopherson 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
17574b85c921SSean Christopherson 		zap_collapsible_spte_range(kvm, root, slot);
175814881998SBen Gardon }
175946044f72SBen Gardon 
176046044f72SBen Gardon /*
176146044f72SBen Gardon  * Removes write access on the last level SPTE mapping this GFN and unsets the
17625fc3424fSSean Christopherson  * MMU-writable bit to ensure future writes continue to be intercepted.
176346044f72SBen Gardon  * Returns true if an SPTE was set and a TLB flush is needed.
176446044f72SBen Gardon  */
176546044f72SBen Gardon static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
17663ad93562SKeqian Zhu 			      gfn_t gfn, int min_level)
176746044f72SBen Gardon {
176846044f72SBen Gardon 	struct tdp_iter iter;
176946044f72SBen Gardon 	u64 new_spte;
177046044f72SBen Gardon 	bool spte_set = false;
177146044f72SBen Gardon 
17723ad93562SKeqian Zhu 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
17733ad93562SKeqian Zhu 
17747cca2d0bSBen Gardon 	rcu_read_lock();
17757cca2d0bSBen Gardon 
177677aa6075SDavid Matlack 	for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
17773ad93562SKeqian Zhu 		if (!is_shadow_present_pte(iter.old_spte) ||
17783ad93562SKeqian Zhu 		    !is_last_spte(iter.old_spte, iter.level))
17793ad93562SKeqian Zhu 			continue;
17803ad93562SKeqian Zhu 
178146044f72SBen Gardon 		new_spte = iter.old_spte &
17825fc3424fSSean Christopherson 			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
178346044f72SBen Gardon 
17847c8a4742SDavid Matlack 		if (new_spte == iter.old_spte)
17857c8a4742SDavid Matlack 			break;
17867c8a4742SDavid Matlack 
178746044f72SBen Gardon 		tdp_mmu_set_spte(kvm, &iter, new_spte);
178846044f72SBen Gardon 		spte_set = true;
178946044f72SBen Gardon 	}
179046044f72SBen Gardon 
17917cca2d0bSBen Gardon 	rcu_read_unlock();
17927cca2d0bSBen Gardon 
179346044f72SBen Gardon 	return spte_set;
179446044f72SBen Gardon }
179546044f72SBen Gardon 
179646044f72SBen Gardon /*
179746044f72SBen Gardon  * Removes write access on the last level SPTE mapping this GFN and unsets the
17985fc3424fSSean Christopherson  * MMU-writable bit to ensure future writes continue to be intercepted.
179946044f72SBen Gardon  * Returns true if an SPTE was set and a TLB flush is needed.
180046044f72SBen Gardon  */
180146044f72SBen Gardon bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
18023ad93562SKeqian Zhu 				   struct kvm_memory_slot *slot, gfn_t gfn,
18033ad93562SKeqian Zhu 				   int min_level)
180446044f72SBen Gardon {
180546044f72SBen Gardon 	struct kvm_mmu_page *root;
180646044f72SBen Gardon 	bool spte_set = false;
180746044f72SBen Gardon 
1808531810caSBen Gardon 	lockdep_assert_held_write(&kvm->mmu_lock);
1809a3f15bdaSSean Christopherson 	for_each_tdp_mmu_root(kvm, root, slot->as_id)
18103ad93562SKeqian Zhu 		spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
1811a3f15bdaSSean Christopherson 
181246044f72SBen Gardon 	return spte_set;
181346044f72SBen Gardon }
181446044f72SBen Gardon 
181595fb5b02SBen Gardon /*
181695fb5b02SBen Gardon  * Return the level of the lowest level SPTE added to sptes.
181795fb5b02SBen Gardon  * That SPTE may be non-present.
1818c5c8c7c5SDavid Matlack  *
1819c5c8c7c5SDavid Matlack  * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
182095fb5b02SBen Gardon  */
182139b4d43eSSean Christopherson int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
182239b4d43eSSean Christopherson 			 int *root_level)
182395fb5b02SBen Gardon {
182495fb5b02SBen Gardon 	struct tdp_iter iter;
182595fb5b02SBen Gardon 	struct kvm_mmu *mmu = vcpu->arch.mmu;
182695fb5b02SBen Gardon 	gfn_t gfn = addr >> PAGE_SHIFT;
18272aa07893SSean Christopherson 	int leaf = -1;
182895fb5b02SBen Gardon 
182939b4d43eSSean Christopherson 	*root_level = vcpu->arch.mmu->shadow_root_level;
183095fb5b02SBen Gardon 
183195fb5b02SBen Gardon 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
183295fb5b02SBen Gardon 		leaf = iter.level;
1833dde81f94SSean Christopherson 		sptes[leaf] = iter.old_spte;
183495fb5b02SBen Gardon 	}
183595fb5b02SBen Gardon 
183695fb5b02SBen Gardon 	return leaf;
183795fb5b02SBen Gardon }
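
/*
 * Example (sketch): a typical lockless walk, modeled on the get_mmio_spte()
 * path in mmu.c rather than copied from it.  The caller, not this helper,
 * brackets the walk:
 *
 *	u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
 *	int root_level, leaf;
 *
 *	kvm_tdp_mmu_walk_lockless_begin();
 *	leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root_level);
 *	kvm_tdp_mmu_walk_lockless_end();
 *
 * A negative return value ("leaf") means no paging structure was walked for
 * the given address.
 */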
18386e8eb206SDavid Matlack 
18396e8eb206SDavid Matlack /*
18406e8eb206SDavid Matlack  * Returns the last level spte pointer of the shadow page walk for the given
18416e8eb206SDavid Matlack  * gpa, and sets *spte to the spte value. This spte may be non-present. If no
18426e8eb206SDavid Matlack  * walk could be performed, returns NULL and *spte does not contain valid data.
18436e8eb206SDavid Matlack  *
18446e8eb206SDavid Matlack  * Contract:
18456e8eb206SDavid Matlack  *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
18466e8eb206SDavid Matlack  *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
18476e8eb206SDavid Matlack  *
18486e8eb206SDavid Matlack  * WARNING: This function is only intended to be called during fast_page_fault.
18496e8eb206SDavid Matlack  */
18506e8eb206SDavid Matlack u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
18516e8eb206SDavid Matlack 					u64 *spte)
18526e8eb206SDavid Matlack {
18536e8eb206SDavid Matlack 	struct tdp_iter iter;
18546e8eb206SDavid Matlack 	struct kvm_mmu *mmu = vcpu->arch.mmu;
18556e8eb206SDavid Matlack 	gfn_t gfn = addr >> PAGE_SHIFT;
18566e8eb206SDavid Matlack 	tdp_ptep_t sptep = NULL;
18576e8eb206SDavid Matlack 
18586e8eb206SDavid Matlack 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
18596e8eb206SDavid Matlack 		*spte = iter.old_spte;
18606e8eb206SDavid Matlack 		sptep = iter.sptep;
18616e8eb206SDavid Matlack 	}
18626e8eb206SDavid Matlack 
18636e8eb206SDavid Matlack 	/*
18646e8eb206SDavid Matlack 	 * Perform the rcu_dereference to get the raw spte pointer value since
18656e8eb206SDavid Matlack 	 * we are passing it up to fast_page_fault, which is shared with the
18666e8eb206SDavid Matlack 	 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
18676e8eb206SDavid Matlack 	 * annotation.
18686e8eb206SDavid Matlack 	 *
18696e8eb206SDavid Matlack 	 * This is safe since fast_page_fault obeys the contracts of this
18706e8eb206SDavid Matlack 	 * function as well as all TDP MMU contracts around modifying SPTEs
18716e8eb206SDavid Matlack 	 * outside of mmu_lock.
18726e8eb206SDavid Matlack 	 */
18736e8eb206SDavid Matlack 	return rcu_dereference(sptep);
18746e8eb206SDavid Matlack }
1875