kvm/mmu/tdp_mmu.c

// SPDX-License-Identifier: GPL-2.0

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>

71ba3f31SPaolo Bonzinistatic bool __read_mostly tdp_mmu_enabled = true;
95fb5b02SBen Gardonmodule_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
fe5db27dSBen Gardon
fe5db27dSBen Gardon/* Initializes the TDP MMU for the VM, if enabled. */
a1a39128SPaolo Bonziniint kvm_mmu_init_tdp_mmu(struct kvm *kvm)
fe5db27dSBen Gardon{
a1a39128SPaolo Bonzini	struct workqueue_struct *wq;
a1a39128SPaolo Bonzini
897218ffSPaolo Bonzini	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
a1a39128SPaolo Bonzini		return 0;
a1a39128SPaolo Bonzini
a1a39128SPaolo Bonzini	wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);
a1a39128SPaolo Bonzini	if (!wq)
a1a39128SPaolo Bonzini		return -ENOMEM;
fe5db27dSBen Gardon
fe5db27dSBen Gardon	/* This should not be changed for the lifetime of the VM. */
fe5db27dSBen Gardon	kvm->arch.tdp_mmu_enabled = true;
02c00b3aSBen Gardon	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
9a77daacSBen Gardon	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
a1a39128SPaolo Bonzini	kvm->arch.tdp_mmu_zap_wq = wq;
a1a39128SPaolo Bonzini	return 1;
fe5db27dSBen Gardon}
fe5db27dSBen Gardon
226b8c8fSSean Christopherson/* Arbitrarily returns true so that this may be used in if statements. */
226b8c8fSSean Christophersonstatic __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
6103bc07SBen Gardon							     bool shared)
6103bc07SBen Gardon{
6103bc07SBen Gardon	if (shared)
6103bc07SBen Gardon		lockdep_assert_held_read(&kvm->mmu_lock);
6103bc07SBen Gardon	else
6103bc07SBen Gardon		lockdep_assert_held_write(&kvm->mmu_lock);
226b8c8fSSean Christopherson
226b8c8fSSean Christopherson	return true;
6103bc07SBen Gardon}
6103bc07SBen Gardon
fe5db27dSBen Gardonvoid kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
fe5db27dSBen Gardon{
fe5db27dSBen Gardon	if (!kvm->arch.tdp_mmu_enabled)
fe5db27dSBen Gardon		return;
02c00b3aSBen Gardon
3203a56aSLv Ruyi	/* Also waits for any queued work items.  */
22b94c4bSPaolo Bonzini	destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
22b94c4bSPaolo Bonzini
d25ceb92SSean Christopherson	WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
02c00b3aSBen Gardon	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
7cca2d0bSBen Gardon
7cca2d0bSBen Gardon	/*
7cca2d0bSBen Gardon	 * Ensure that all the outstanding RCU callbacks to free shadow pages
22b94c4bSPaolo Bonzini	 * can run before the VM is torn down.  Work items on tdp_mmu_zap_wq
22b94c4bSPaolo Bonzini	 * can call kvm_tdp_mmu_put_root and create new callbacks.
7cca2d0bSBen Gardon	 */
7cca2d0bSBen Gardon	rcu_barrier();
02c00b3aSBen Gardon}
02c00b3aSBen Gardon
2bdb3d84SBen Gardonstatic void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
a889ea54SBen Gardon{
2bdb3d84SBen Gardon	free_page((unsigned long)sp->spt);
2bdb3d84SBen Gardon	kmem_cache_free(mmu_page_header_cache, sp);
a889ea54SBen Gardon}
a889ea54SBen Gardon
c0e64238SBen Gardon/*
c0e64238SBen Gardon * This is called through call_rcu in order to free TDP page table memory
c0e64238SBen Gardon * safely with respect to other kernel threads that may be operating on
c0e64238SBen Gardon * the memory.
c0e64238SBen Gardon * By only accessing TDP MMU page table memory in an RCU read critical
c0e64238SBen Gardon * section, and freeing it after a grace period, lockless access to that
c0e64238SBen Gardon * memory won't use it after it is freed.
c0e64238SBen Gardon */
c0e64238SBen Gardonstatic void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
a889ea54SBen Gardon{
c0e64238SBen Gardon	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
c0e64238SBen Gardon					       rcu_head);
a889ea54SBen Gardon
c0e64238SBen Gardon	tdp_mmu_free_sp(sp);
a889ea54SBen Gardon}
a889ea54SBen Gardon
e2b5b21dSSean Christophersonstatic void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
e2b5b21dSSean Christopherson			     bool shared);
e2b5b21dSSean Christopherson
22b94c4bSPaolo Bonzinistatic void tdp_mmu_zap_root_work(struct work_struct *work)
22b94c4bSPaolo Bonzini{
22b94c4bSPaolo Bonzini	struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page,
22b94c4bSPaolo Bonzini						 tdp_mmu_async_work);
22b94c4bSPaolo Bonzini	struct kvm *kvm = root->tdp_mmu_async_data;
22b94c4bSPaolo Bonzini
22b94c4bSPaolo Bonzini	read_lock(&kvm->mmu_lock);
22b94c4bSPaolo Bonzini
22b94c4bSPaolo Bonzini	/*
22b94c4bSPaolo Bonzini	 * A TLB flush is not necessary as KVM performs a local TLB flush when
22b94c4bSPaolo Bonzini	 * allocating a new root (see kvm_mmu_load()), and when migrating vCPU
22b94c4bSPaolo Bonzini	 * to a different pCPU.  Note, the local TLB flush on reuse also
22b94c4bSPaolo Bonzini	 * invalidates any paging-structure-cache entries, i.e. TLB entries for
22b94c4bSPaolo Bonzini	 * intermediate paging structures, that may be zapped, as such entries
22b94c4bSPaolo Bonzini	 * are associated with the ASID on both VMX and SVM.
22b94c4bSPaolo Bonzini	 */
22b94c4bSPaolo Bonzini	tdp_mmu_zap_root(kvm, root, true);
22b94c4bSPaolo Bonzini
22b94c4bSPaolo Bonzini	/*
22b94c4bSPaolo Bonzini	 * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for
22b94c4bSPaolo Bonzini	 * avoiding an infinite loop.  By design, the root is reachable while
22b94c4bSPaolo Bonzini	 * it's being asynchronously zapped, thus a different task can put its
22b94c4bSPaolo Bonzini	 * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an
22b94c4bSPaolo Bonzini	 * asynchronously zapped root is unavoidable.
22b94c4bSPaolo Bonzini	 */
22b94c4bSPaolo Bonzini	kvm_tdp_mmu_put_root(kvm, root, true);
22b94c4bSPaolo Bonzini
22b94c4bSPaolo Bonzini	read_unlock(&kvm->mmu_lock);
22b94c4bSPaolo Bonzini}
22b94c4bSPaolo Bonzini
22b94c4bSPaolo Bonzinistatic void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root)
22b94c4bSPaolo Bonzini{
22b94c4bSPaolo Bonzini	root->tdp_mmu_async_data = kvm;
22b94c4bSPaolo Bonzini	INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work);
22b94c4bSPaolo Bonzini	queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
22b94c4bSPaolo Bonzini}
22b94c4bSPaolo Bonzini
8351779cSPaolo Bonzinistatic inline bool kvm_tdp_root_mark_invalid(struct kvm_mmu_page *page)
8351779cSPaolo Bonzini{
8351779cSPaolo Bonzini	union kvm_mmu_page_role role = page->role;
8351779cSPaolo Bonzini	role.invalid = true;
8351779cSPaolo Bonzini
8351779cSPaolo Bonzini	/* No need to use cmpxchg, only the invalid bit can change.  */
8351779cSPaolo Bonzini	role.word = xchg(&page->role.word, role.word);
8351779cSPaolo Bonzini	return role.invalid;
8351779cSPaolo Bonzini}
8351779cSPaolo Bonzini
6103bc07SBen Gardonvoid kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
6103bc07SBen Gardon			  bool shared)
2bdb3d84SBen Gardon{
6103bc07SBen Gardon	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
2bdb3d84SBen Gardon
11cccf5cSBen Gardon	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
2bdb3d84SBen Gardon		return;
2bdb3d84SBen Gardon
2bdb3d84SBen Gardon	WARN_ON(!root->tdp_mmu_page);
2bdb3d84SBen Gardon
8351779cSPaolo Bonzini	/*
8351779cSPaolo Bonzini	 * The root now has refcount=0.  It is valid, but readers already
8351779cSPaolo Bonzini	 * cannot acquire a reference to it because kvm_tdp_mmu_get_root()
8351779cSPaolo Bonzini	 * rejects it.  This remains true for the rest of the execution
8351779cSPaolo Bonzini	 * of this function, because readers visit valid roots only
8351779cSPaolo Bonzini	 * (except for tdp_mmu_zap_root_work(), which however
8351779cSPaolo Bonzini	 * does not acquire any reference itself).
8351779cSPaolo Bonzini	 *
8351779cSPaolo Bonzini	 * Even though there are flows that need to visit all roots for
8351779cSPaolo Bonzini	 * correctness, they all take mmu_lock for write, so they cannot yet
8351779cSPaolo Bonzini	 * run concurrently. The same is true after kvm_tdp_root_mark_invalid,
8351779cSPaolo Bonzini	 * since the root still has refcount=0.
8351779cSPaolo Bonzini	 *
8351779cSPaolo Bonzini	 * However, tdp_mmu_zap_root can yield, and writers do not expect to
8351779cSPaolo Bonzini	 * see refcount=0 (see for example kvm_tdp_mmu_invalidate_all_roots()).
8351779cSPaolo Bonzini	 * So the root temporarily gets an extra reference, going to refcount=1
8351779cSPaolo Bonzini	 * while staying invalid.  Readers still cannot acquire any reference;
8351779cSPaolo Bonzini	 * but writers are now allowed to run if tdp_mmu_zap_root yields and
efd995daSPaolo Bonzini	 * they might take an extra reference if they themselves yield.
efd995daSPaolo Bonzini	 * Therefore, when the reference is given back by the worker,
8351779cSPaolo Bonzini	 * there is no guarantee that the refcount is still 1.  If not, whoever
8351779cSPaolo Bonzini	 * puts the last reference will free the page, but they will not have to
8351779cSPaolo Bonzini	 * zap the root because a root cannot go from invalid to valid.
8351779cSPaolo Bonzini	 */
8351779cSPaolo Bonzini	if (!kvm_tdp_root_mark_invalid(root)) {
8351779cSPaolo Bonzini		refcount_set(&root->tdp_mmu_root_count, 1);
8351779cSPaolo Bonzini
8351779cSPaolo Bonzini		/*
efd995daSPaolo Bonzini		 * Zapping the root in a worker is not just "nice to have";
efd995daSPaolo Bonzini		 * it is required because kvm_tdp_mmu_invalidate_all_roots()
efd995daSPaolo Bonzini		 * skips already-invalid roots.  If kvm_tdp_mmu_put_root() did
efd995daSPaolo Bonzini		 * not add the root to the workqueue, kvm_tdp_mmu_zap_all_fast()
efd995daSPaolo Bonzini		 * might return with some roots not zapped yet.
8351779cSPaolo Bonzini		 */
efd995daSPaolo Bonzini		tdp_mmu_schedule_zap_root(kvm, root);
8351779cSPaolo Bonzini		return;
8351779cSPaolo Bonzini	}
8351779cSPaolo Bonzini
c0e64238SBen Gardon	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
c0e64238SBen Gardon	list_del_rcu(&root->link);
c0e64238SBen Gardon	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
c0e64238SBen Gardon	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
a889ea54SBen Gardon}
a889ea54SBen Gardon
cfc10997SBen Gardon/*
d62007edSSean Christopherson * Returns the next root after @prev_root (or the first root if @prev_root is
d62007edSSean Christopherson * NULL).  A reference to the returned root is acquired, and the reference to
d62007edSSean Christopherson * @prev_root is released (the caller obviously must hold a reference to
d62007edSSean Christopherson * @prev_root if it's non-NULL).
d62007edSSean Christopherson *
d62007edSSean Christopherson * If @only_valid is true, invalid roots are skipped.
d62007edSSean Christopherson *
d62007edSSean Christopherson * Returns NULL if the end of tdp_mmu_roots was reached.
cfc10997SBen Gardon */
cfc10997SBen Gardonstatic struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
6103bc07SBen Gardon					      struct kvm_mmu_page *prev_root,
d62007edSSean Christopherson					      bool shared, bool only_valid)
a889ea54SBen Gardon{
a889ea54SBen Gardon	struct kvm_mmu_page *next_root;
a889ea54SBen Gardon
c0e64238SBen Gardon	rcu_read_lock();
c0e64238SBen Gardon
cfc10997SBen Gardon	if (prev_root)
c0e64238SBen Gardon		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
c0e64238SBen Gardon						  &prev_root->link,
c0e64238SBen Gardon						  typeof(*prev_root), link);
cfc10997SBen Gardon	else
c0e64238SBen Gardon		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
cfc10997SBen Gardon						   typeof(*next_root), link);
cfc10997SBen Gardon
04dc4e6cSSean Christopherson	while (next_root) {
d62007edSSean Christopherson		if ((!only_valid || !next_root->role.invalid) &&
ad6d6b94SJinrong Liang		    kvm_tdp_mmu_get_root(next_root))
04dc4e6cSSean Christopherson			break;
04dc4e6cSSean Christopherson
c0e64238SBen Gardon		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
c0e64238SBen Gardon				&next_root->link, typeof(*next_root), link);
04dc4e6cSSean Christopherson	}
fb101293SBen Gardon
c0e64238SBen Gardon	rcu_read_unlock();
cfc10997SBen Gardon
cfc10997SBen Gardon	if (prev_root)
6103bc07SBen Gardon		kvm_tdp_mmu_put_root(kvm, prev_root, shared);
cfc10997SBen Gardon
a889ea54SBen Gardon	return next_root;
a889ea54SBen Gardon}
a889ea54SBen Gardon
/*
 * Note: this iterator gets and puts references to the roots it iterates over.
 * This makes it safe to release the MMU lock and yield within the loop, but
 * if exiting the loop early, the caller must drop the reference to the most
 * recent root. (Unless keeping a live reference is desirable.)
 *
 * If shared is set, the caller holds the MMU lock in read mode, otherwise in
 * write mode; this is asserted via lockdep on every iteration.  Putting the
 * last reference to a root does not require changing the lock mode:
 * kvm_tdp_mmu_put_root() either hands the root to the zap workqueue or unlinks
 * it under tdp_mmu_pages_lock.
 *
 * Roots whose address space ID differs from _as_id are skipped.
 */
#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid);	\
	     _root;								\
	     _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid))	\
		if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) &&		\
		    kvm_mmu_page_as_id(_root) != (_as_id)) {			\
		} else

/* Yield-safe walk over the valid roots of address space _as_id. */
#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)

/*
 * Yield-safe walk over all roots (valid or not) of address space _as_id, with
 * mmu_lock held for write.
 */
#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)			\
	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false)

/*
 * Iterate over all TDP MMU roots.  Requires that mmu_lock be held for write,
 * the implication being that any flow that holds mmu_lock for read is
 * inherently yield-friendly and should use the yield-safe variant above.
 * Holding mmu_lock for write obviates the need for RCU protection as the list
 * is guaranteed to be stable.
 *
 * Roots whose address space ID differs from _as_id are skipped.  No reference
 * is taken on the roots that are visited.
 */
#define for_each_tdp_mmu_root(_kvm, _root, _as_id)			\
	list_for_each_entry(_root, &(_kvm)->arch.tdp_mmu_roots, link)	\
		if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&	\
		    kvm_mmu_page_as_id(_root) != (_as_id)) {		\
		} else

a82070b6SDavid Matlackstatic struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
02c00b3aSBen Gardon{
02c00b3aSBen Gardon	struct kvm_mmu_page *sp;
02c00b3aSBen Gardon
02c00b3aSBen Gardon	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
02c00b3aSBen Gardon	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
a82070b6SDavid Matlack
a82070b6SDavid Matlack	return sp;
a82070b6SDavid Matlack}
a82070b6SDavid Matlack
c10743a1SSean Christophersonstatic void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
c10743a1SSean Christopherson			    gfn_t gfn, union kvm_mmu_page_role role)
a82070b6SDavid Matlack{
55c510e2SSean Christopherson	INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);
428e9216SSean Christopherson
02c00b3aSBen Gardon	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
02c00b3aSBen Gardon
a3aca4deSDavid Matlack	sp->role = role;
02c00b3aSBen Gardon	sp->gfn = gfn;
c10743a1SSean Christopherson	sp->ptep = sptep;
02c00b3aSBen Gardon	sp->tdp_mmu_page = true;
02c00b3aSBen Gardon
33dd3574SBen Gardon	trace_kvm_mmu_get_page(sp, true);
02c00b3aSBen Gardon}
02c00b3aSBen Gardon
a82070b6SDavid Matlackstatic void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
a3aca4deSDavid Matlack				  struct tdp_iter *iter)
a3aca4deSDavid Matlack{
a3aca4deSDavid Matlack	struct kvm_mmu_page *parent_sp;
a3aca4deSDavid Matlack	union kvm_mmu_page_role role;
a3aca4deSDavid Matlack
a3aca4deSDavid Matlack	parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
a3aca4deSDavid Matlack
a3aca4deSDavid Matlack	role = parent_sp->role;
a3aca4deSDavid Matlack	role.level--;
a3aca4deSDavid Matlack
c10743a1SSean Christopherson	tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
a3aca4deSDavid Matlack}
a3aca4deSDavid Matlack
6e6ec584SSean Christophersonhpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
02c00b3aSBen Gardon{
7a458f0eSPaolo Bonzini	union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
02c00b3aSBen Gardon	struct kvm *kvm = vcpu->kvm;
02c00b3aSBen Gardon	struct kvm_mmu_page *root;
02c00b3aSBen Gardon
6e6ec584SSean Christopherson	lockdep_assert_held_write(&kvm->mmu_lock);
02c00b3aSBen Gardon
04dc4e6cSSean Christopherson	/*
04dc4e6cSSean Christopherson	 * Check for an existing root before allocating a new one.  Note, the
04dc4e6cSSean Christopherson	 * role check prevents consuming an invalid root.
04dc4e6cSSean Christopherson	 */
a3f15bdaSSean Christopherson	for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
fb101293SBen Gardon		if (root->role.word == role.word &&
ad6d6b94SJinrong Liang		    kvm_tdp_mmu_get_root(root))
6e6ec584SSean Christopherson			goto out;
02c00b3aSBen Gardon	}
02c00b3aSBen Gardon
a82070b6SDavid Matlack	root = tdp_mmu_alloc_sp(vcpu);
c10743a1SSean Christopherson	tdp_mmu_init_sp(root, NULL, 0, role);
a82070b6SDavid Matlack
11cccf5cSBen Gardon	refcount_set(&root->tdp_mmu_root_count, 1);
02c00b3aSBen Gardon
c0e64238SBen Gardon	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
c0e64238SBen Gardon	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
c0e64238SBen Gardon	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
02c00b3aSBen Gardon
6e6ec584SSean Christophersonout:
02c00b3aSBen Gardon	return __pa(root->spt);
fe5db27dSBen Gardon}
2f2fad08SBen Gardon
2f2fad08SBen Gardonstatic void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
9a77daacSBen Gardon				u64 old_spte, u64 new_spte, int level,
9a77daacSBen Gardon				bool shared);
2f2fad08SBen Gardon
/*
 * Propagates Accessed state when an SPTE changes.  Only present leaf SPTEs
 * are considered: if the old SPTE was accessed and the new SPTE is not
 * present, is not accessed, or maps a different PFN, the old PFN is marked
 * accessed.
 */
static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
{
	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
		return;

	if (is_accessed_spte(old_spte) &&
	    (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
	     spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}

/*
 * Updates the dirty log when an SPTE changes.  Only 4K SPTEs are handled
 * here: @gfn is marked dirty in its memslot when the new SPTE is writable and
 * either the old SPTE was not writable or the PFN changed.
 */
static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
					  u64 old_spte, u64 new_spte, int level)
{
	bool pfn_changed;
	struct kvm_memory_slot *slot;

	if (level > PG_LEVEL_4K)
		return;

	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	if ((!is_writable_pte(old_spte) || pfn_changed) &&
	    is_writable_pte(new_spte)) {
		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
		mark_page_dirty_in_slot(kvm, slot, gfn);
	}
}

43a063caSYosry Ahmedstatic void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
43a063caSYosry Ahmed{
43a063caSYosry Ahmed	kvm_account_pgtable_pages((void *)sp->spt, +1);
d25ceb92SSean Christopherson	atomic64_inc(&kvm->arch.tdp_mmu_pages);
43a063caSYosry Ahmed}
43a063caSYosry Ahmed
43a063caSYosry Ahmedstatic void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
43a063caSYosry Ahmed{
43a063caSYosry Ahmed	kvm_account_pgtable_pages((void *)sp->spt, -1);
d25ceb92SSean Christopherson	atomic64_dec(&kvm->arch.tdp_mmu_pages);
43a063caSYosry Ahmed}
43a063caSYosry Ahmed
2f2fad08SBen Gardon/**
c298a30cSDavid Matlack * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
a9442f59SBen Gardon *
a9442f59SBen Gardon * @kvm: kvm instance
a9442f59SBen Gardon * @sp: the page to be removed
9a77daacSBen Gardon * @shared: This operation may not be running under the exclusive use of
9a77daacSBen Gardon *	    the MMU lock and the operation must synchronize with other
9a77daacSBen Gardon *	    threads that might be adding or removing pages.
a9442f59SBen Gardon */
c298a30cSDavid Matlackstatic void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
9a77daacSBen Gardon			      bool shared)
a9442f59SBen Gardon{
43a063caSYosry Ahmed	tdp_unaccount_mmu_page(kvm, sp);
d25ceb92SSean Christopherson
d25ceb92SSean Christopherson	if (!sp->nx_huge_page_disallowed)
d25ceb92SSean Christopherson		return;
d25ceb92SSean Christopherson
9a77daacSBen Gardon	if (shared)
9a77daacSBen Gardon		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
9a77daacSBen Gardon	else
a9442f59SBen Gardon		lockdep_assert_held_write(&kvm->mmu_lock);
a9442f59SBen Gardon
61f94478SSean Christopherson	sp->nx_huge_page_disallowed = false;
61f94478SSean Christopherson	untrack_possible_nx_huge_page(kvm, sp);
9a77daacSBen Gardon
9a77daacSBen Gardon	if (shared)
9a77daacSBen Gardon		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
a9442f59SBen Gardon}
a9442f59SBen Gardon
a9442f59SBen Gardon/**
0f53dfa3SDavid Matlack * handle_removed_pt() - handle a page table removed from the TDP structure
a066e61fSBen Gardon *
a066e61fSBen Gardon * @kvm: kvm instance
a066e61fSBen Gardon * @pt: the page removed from the paging structure
9a77daacSBen Gardon * @shared: This operation may not be running under the exclusive use
9a77daacSBen Gardon *	    of the MMU lock and the operation must synchronize with other
9a77daacSBen Gardon *	    threads that might be modifying SPTEs.
a066e61fSBen Gardon *
a066e61fSBen Gardon * Given a page table that has been removed from the TDP paging structure,
a066e61fSBen Gardon * iterates through the page table to clear SPTEs and free child page tables.
70fb3e41SBen Gardon *
70fb3e41SBen Gardon * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
70fb3e41SBen Gardon * protection. Since this thread removed it from the paging structure,
70fb3e41SBen Gardon * this thread will be responsible for ensuring the page is freed. Hence the
70fb3e41SBen Gardon * early rcu_dereferences in the function.
a066e61fSBen Gardon */
0f53dfa3SDavid Matlackstatic void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
a066e61fSBen Gardon{
70fb3e41SBen Gardon	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
a066e61fSBen Gardon	int level = sp->role.level;
e25f0e0cSBen Gardon	gfn_t base_gfn = sp->gfn;
a066e61fSBen Gardon	int i;
a066e61fSBen Gardon
a066e61fSBen Gardon	trace_kvm_mmu_prepare_zap_page(sp);
a066e61fSBen Gardon
c298a30cSDavid Matlack	tdp_mmu_unlink_sp(kvm, sp, shared);
a066e61fSBen Gardon
2ca3129eSSean Christopherson	for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
ba3a6120SSean Christopherson		tdp_ptep_t sptep = pt + i;
574c3c55SBen Gardon		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
ba3a6120SSean Christopherson		u64 old_spte;
9a77daacSBen Gardon
9a77daacSBen Gardon		if (shared) {
e25f0e0cSBen Gardon			/*
e25f0e0cSBen Gardon			 * Set the SPTE to a nonpresent value that other
e25f0e0cSBen Gardon			 * threads will not overwrite. If the SPTE was
e25f0e0cSBen Gardon			 * already marked as removed then another thread
e25f0e0cSBen Gardon			 * handling a page fault could overwrite it, so
e25f0e0cSBen Gardon			 * set the SPTE until it is set from some other
e25f0e0cSBen Gardon			 * value to the removed SPTE value.
e25f0e0cSBen Gardon			 */
e25f0e0cSBen Gardon			for (;;) {
ba3a6120SSean Christopherson				old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
ba3a6120SSean Christopherson				if (!is_removed_spte(old_spte))
e25f0e0cSBen Gardon					break;
e25f0e0cSBen Gardon				cpu_relax();
e25f0e0cSBen Gardon			}
9a77daacSBen Gardon		} else {
8df9f1afSSean Christopherson			/*
8df9f1afSSean Christopherson			 * If the SPTE is not MMU-present, there is no backing
8df9f1afSSean Christopherson			 * page associated with the SPTE and so no side effects
8df9f1afSSean Christopherson			 * that need to be recorded, and exclusive ownership of
8df9f1afSSean Christopherson			 * mmu_lock ensures the SPTE can't be made present.
8df9f1afSSean Christopherson			 * Note, zapping MMIO SPTEs is also unnecessary as they
8df9f1afSSean Christopherson			 * are guarded by the memslots generation, not by being
8df9f1afSSean Christopherson			 * unreachable.
8df9f1afSSean Christopherson			 */
ba3a6120SSean Christopherson			old_spte = kvm_tdp_mmu_read_spte(sptep);
ba3a6120SSean Christopherson			if (!is_shadow_present_pte(old_spte))
8df9f1afSSean Christopherson				continue;
e25f0e0cSBen Gardon
e25f0e0cSBen Gardon			/*
ba3a6120SSean Christopherson			 * Use the common helper instead of a raw WRITE_ONCE as
ba3a6120SSean Christopherson			 * the SPTE needs to be updated atomically if it can be
ba3a6120SSean Christopherson			 * modified by a different vCPU outside of mmu_lock.
ba3a6120SSean Christopherson			 * Even though the parent SPTE is !PRESENT, the TLB
ba3a6120SSean Christopherson			 * hasn't yet been flushed, and both Intel and AMD
ba3a6120SSean Christopherson			 * document that A/D assists can use upper-level PxE
ba3a6120SSean Christopherson			 * entries that are cached in the TLB, i.e. the CPU can
ba3a6120SSean Christopherson			 * still access the page and mark it dirty.
ba3a6120SSean Christopherson			 *
ba3a6120SSean Christopherson			 * No retry is needed in the atomic update path as the
ba3a6120SSean Christopherson			 * sole concern is dropping a Dirty bit, i.e. no other
ba3a6120SSean Christopherson			 * task can zap/remove the SPTE as mmu_lock is held for
ba3a6120SSean Christopherson			 * write.  Marking the SPTE as a removed SPTE is not
ba3a6120SSean Christopherson			 * strictly necessary for the same reason, but using
ba3a6120SSean Christopherson			 * the remove SPTE value keeps the shared/exclusive
ba3a6120SSean Christopherson			 * paths consistent and allows the handle_changed_spte()
ba3a6120SSean Christopherson			 * call below to hardcode the new value to REMOVED_SPTE.
ba3a6120SSean Christopherson			 *
ba3a6120SSean Christopherson			 * Note, even though dropping a Dirty bit is the only
ba3a6120SSean Christopherson			 * scenario where a non-atomic update could result in a
ba3a6120SSean Christopherson			 * functional bug, simply checking the Dirty bit isn't
ba3a6120SSean Christopherson			 * sufficient as a fast page fault could read the upper
ba3a6120SSean Christopherson			 * level SPTE before it is zapped, and then make this
ba3a6120SSean Christopherson			 * target SPTE writable, resume the guest, and set the
ba3a6120SSean Christopherson			 * Dirty bit between reading the SPTE above and writing
ba3a6120SSean Christopherson			 * it here.
e25f0e0cSBen Gardon			 */
ba3a6120SSean Christopherson			old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
ba3a6120SSean Christopherson							  REMOVED_SPTE, level);
9a77daacSBen Gardon		}
e25f0e0cSBen Gardon		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
ba3a6120SSean Christopherson				    old_spte, REMOVED_SPTE, level, shared);
a066e61fSBen Gardon	}
a066e61fSBen Gardon
7cca2d0bSBen Gardon	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
a066e61fSBen Gardon}
a066e61fSBen Gardon
a066e61fSBen Gardon/**
7f6231a3SKai Huang * __handle_changed_spte - handle bookkeeping associated with an SPTE change
2f2fad08SBen Gardon * @kvm: kvm instance
2f2fad08SBen Gardon * @as_id: the address space of the paging structure the SPTE was a part of
2f2fad08SBen Gardon * @gfn: the base GFN that was mapped by the SPTE
2f2fad08SBen Gardon * @old_spte: The value of the SPTE before the change
2f2fad08SBen Gardon * @new_spte: The value of the SPTE after the change
2f2fad08SBen Gardon * @level: the level of the PT the SPTE is part of in the paging structure
9a77daacSBen Gardon * @shared: This operation may not be running under the exclusive use of
9a77daacSBen Gardon *	    the MMU lock and the operation must synchronize with other
9a77daacSBen Gardon *	    threads that might be modifying SPTEs.
2f2fad08SBen Gardon *
2f2fad08SBen Gardon * Handle bookkeeping that might result from the modification of a SPTE.
2f2fad08SBen Gardon * This function must be called for all TDP SPTE modifications.
2f2fad08SBen Gardon */
2f2fad08SBen Gardonstatic void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
9a77daacSBen Gardon				  u64 old_spte, u64 new_spte, int level,
9a77daacSBen Gardon				  bool shared)
2f2fad08SBen Gardon{
2f2fad08SBen Gardon	bool was_present = is_shadow_present_pte(old_spte);
2f2fad08SBen Gardon	bool is_present = is_shadow_present_pte(new_spte);
2f2fad08SBen Gardon	bool was_leaf = was_present && is_last_spte(old_spte, level);
2f2fad08SBen Gardon	bool is_leaf = is_present && is_last_spte(new_spte, level);
2f2fad08SBen Gardon	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
2f2fad08SBen Gardon
2f2fad08SBen Gardon	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
2f2fad08SBen Gardon	WARN_ON(level < PG_LEVEL_4K);
764388ceSSean Christopherson	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
2f2fad08SBen Gardon
2f2fad08SBen Gardon	/*
2f2fad08SBen Gardon	 * If this warning were to trigger it would indicate that there was a
2f2fad08SBen Gardon	 * missing MMU notifier or a race with some notifier handler.
2f2fad08SBen Gardon	 * A present, leaf SPTE should never be directly replaced with another
d9f6e12fSIngo Molnar	 * present leaf SPTE pointing to a different PFN. A notifier handler
2f2fad08SBen Gardon	 * should be zapping the SPTE before the main MM's page table is
2f2fad08SBen Gardon	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
2f2fad08SBen Gardon	 * thread before replacement.
2f2fad08SBen Gardon	 */
2f2fad08SBen Gardon	if (was_leaf && is_leaf && pfn_changed) {
2f2fad08SBen Gardon		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
2f2fad08SBen Gardon		       "SPTE with another present leaf SPTE mapping a\n"
2f2fad08SBen Gardon		       "different PFN!\n"
2f2fad08SBen Gardon		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
2f2fad08SBen Gardon		       as_id, gfn, old_spte, new_spte, level);
2f2fad08SBen Gardon
2f2fad08SBen Gardon		/*
2f2fad08SBen Gardon		 * Crash the host to prevent error propagation and guest data
d9f6e12fSIngo Molnar		 * corruption.
2f2fad08SBen Gardon		 */
2f2fad08SBen Gardon		BUG();
2f2fad08SBen Gardon	}
2f2fad08SBen Gardon
2f2fad08SBen Gardon	if (old_spte == new_spte)
2f2fad08SBen Gardon		return;
2f2fad08SBen Gardon
b9a98c34SBen Gardon	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
b9a98c34SBen Gardon
115111efSDavid Matlack	if (is_leaf)
115111efSDavid Matlack		check_spte_writable_invariants(new_spte);
115111efSDavid Matlack
2f2fad08SBen Gardon	/*
2f2fad08SBen Gardon	 * The only times a SPTE should be changed from a non-present to
2f2fad08SBen Gardon	 * non-present state is when an MMIO entry is installed/modified/
2f2fad08SBen Gardon	 * removed. In that case, there is nothing to do here.
2f2fad08SBen Gardon	 */
2f2fad08SBen Gardon	if (!was_present && !is_present) {
2f2fad08SBen Gardon		/*
08f07c80SBen Gardon		 * If this change does not involve a MMIO SPTE or removed SPTE,
08f07c80SBen Gardon		 * it is unexpected. Log the change, though it should not
08f07c80SBen Gardon		 * impact the guest since both the former and current SPTEs
08f07c80SBen Gardon		 * are nonpresent.
2f2fad08SBen Gardon		 */
08f07c80SBen Gardon		if (WARN_ON(!is_mmio_spte(old_spte) &&
08f07c80SBen Gardon			    !is_mmio_spte(new_spte) &&
08f07c80SBen Gardon			    !is_removed_spte(new_spte)))
2f2fad08SBen Gardon			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
2f2fad08SBen Gardon			       "should not be replaced with another,\n"
2f2fad08SBen Gardon			       "different nonpresent SPTE, unless one or both\n"
08f07c80SBen Gardon			       "are MMIO SPTEs, or the new SPTE is\n"
08f07c80SBen Gardon			       "a temporary removed SPTE.\n"
2f2fad08SBen Gardon			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
2f2fad08SBen Gardon			       as_id, gfn, old_spte, new_spte, level);
2f2fad08SBen Gardon		return;
2f2fad08SBen Gardon	}
2f2fad08SBen Gardon
71f51d2cSMingwei Zhang	if (is_leaf != was_leaf)
71f51d2cSMingwei Zhang		kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
2f2fad08SBen Gardon
2f2fad08SBen Gardon	if (was_leaf && is_dirty_spte(old_spte) &&
64bb2769SSean Christopherson	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
2f2fad08SBen Gardon		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
2f2fad08SBen Gardon
2f2fad08SBen Gardon	/*
2f2fad08SBen Gardon	 * Recursively handle child PTs if the change removed a subtree from
c8e5a0d0SSean Christopherson	 * the paging structure.  Note the WARN on the PFN changing without the
c8e5a0d0SSean Christopherson	 * SPTE being converted to a hugepage (leaf) or being zapped.  Shadow
c8e5a0d0SSean Christopherson	 * pages are kernel allocations and should never be migrated.
2f2fad08SBen Gardon	 */
c8e5a0d0SSean Christopherson	if (was_present && !was_leaf &&
c8e5a0d0SSean Christopherson	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
0f53dfa3SDavid Matlack		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
2f2fad08SBen Gardon}
2f2fad08SBen Gardon
2f2fad08SBen Gardonstatic void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
9a77daacSBen Gardon				u64 old_spte, u64 new_spte, int level,
9a77daacSBen Gardon				bool shared)
2f2fad08SBen Gardon{
9a77daacSBen Gardon	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
9a77daacSBen Gardon			      shared);
f8e14497SBen Gardon	handle_changed_spte_acc_track(old_spte, new_spte, level);
a6a0b05dSBen Gardon	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
a6a0b05dSBen Gardon				      new_spte, level);
2f2fad08SBen Gardon}
faaf05b0SBen Gardon
fe43fa2fSBen Gardon/*
6ccf4438SPaolo Bonzini * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
6ccf4438SPaolo Bonzini * and handle the associated bookkeeping.  Do not mark the page dirty
24ae4cfaSBen Gardon * in KVM's dirty bitmaps.
9a77daacSBen Gardon *
3255530aSDavid Matlack * If setting the SPTE fails because it has changed, iter->old_spte will be
3255530aSDavid Matlack * refreshed to the current value of the spte.
3255530aSDavid Matlack *
9a77daacSBen Gardon * @kvm: kvm instance
9a77daacSBen Gardon * @iter: a tdp_iter instance currently on the SPTE that should be set
9a77daacSBen Gardon * @new_spte: The value the SPTE should be set to
3e72c791SDavid Matlack * Return:
3e72c791SDavid Matlack * * 0      - If the SPTE was set.
3e72c791SDavid Matlack * * -EBUSY - If the SPTE cannot be set. In this case this function will have
3e72c791SDavid Matlack *            no side-effects other than setting iter->old_spte to the last
3e72c791SDavid Matlack *            known value of the spte.
9a77daacSBen Gardon */
3e72c791SDavid Matlackstatic inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
9a77daacSBen Gardon					  struct tdp_iter *iter,
9a77daacSBen Gardon					  u64 new_spte)
9a77daacSBen Gardon{
3255530aSDavid Matlack	u64 *sptep = rcu_dereference(iter->sptep);
3255530aSDavid Matlack
396fd74dSSean Christopherson	/*
396fd74dSSean Christopherson	 * The caller is responsible for ensuring the old SPTE is not a REMOVED
396fd74dSSean Christopherson	 * SPTE.  KVM should never attempt to zap or manipulate a REMOVED SPTE,
396fd74dSSean Christopherson	 * and pre-checking before inserting a new SPTE is advantageous as it
396fd74dSSean Christopherson	 * avoids unnecessary work.
396fd74dSSean Christopherson	 */
396fd74dSSean Christopherson	WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));
3a0f64deSSean Christopherson
9a77daacSBen Gardon	lockdep_assert_held_read(&kvm->mmu_lock);
9a77daacSBen Gardon
08f07c80SBen Gardon	/*
6e8eb206SDavid Matlack	 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
6e8eb206SDavid Matlack	 * does not hold the mmu_lock.
6e8eb206SDavid Matlack	 */
aee98a68SUros Bizjak	if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
3e72c791SDavid Matlack		return -EBUSY;
9a77daacSBen Gardon
24ae4cfaSBen Gardon	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
08889894SSean Christopherson			      new_spte, iter->level, true);
24ae4cfaSBen Gardon	handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
9a77daacSBen Gardon
3e72c791SDavid Matlack	return 0;
9a77daacSBen Gardon}
9a77daacSBen Gardon
3e72c791SDavid Matlackstatic inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
08f07c80SBen Gardon					  struct tdp_iter *iter)
08f07c80SBen Gardon{
3e72c791SDavid Matlack	int ret;
3e72c791SDavid Matlack
08f07c80SBen Gardon	/*
08f07c80SBen Gardon	 * Freeze the SPTE by setting it to a special,
08f07c80SBen Gardon	 * non-present value. This will stop other threads from
08f07c80SBen Gardon	 * immediately installing a present entry in its place
08f07c80SBen Gardon	 * before the TLBs are flushed.
08f07c80SBen Gardon	 */
3e72c791SDavid Matlack	ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
3e72c791SDavid Matlack	if (ret)
3e72c791SDavid Matlack		return ret;
08f07c80SBen Gardon
08f07c80SBen Gardon	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
08f07c80SBen Gardon					   KVM_PAGES_PER_HPAGE(iter->level));
08f07c80SBen Gardon
08f07c80SBen Gardon	/*
ba3a6120SSean Christopherson	 * No other thread can overwrite the removed SPTE as they must either
ba3a6120SSean Christopherson	 * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
ba3a6120SSean Christopherson	 * overwrite the special removed SPTE value. No bookkeeping is needed
ba3a6120SSean Christopherson	 * here since the SPTE is going from non-present to non-present.  Use
ba3a6120SSean Christopherson	 * the raw write helper to avoid an unnecessary check on volatile bits.
08f07c80SBen Gardon	 */
ba3a6120SSean Christopherson	__kvm_tdp_mmu_write_spte(iter->sptep, 0);
08f07c80SBen Gardon
3e72c791SDavid Matlack	return 0;
08f07c80SBen Gardon}
08f07c80SBen Gardon
9a77daacSBen Gardon
9a77daacSBen Gardon/*
fe43fa2fSBen Gardon * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
626808d1SSean Christopherson * @kvm:	      KVM instance
626808d1SSean Christopherson * @as_id:	      Address space ID, i.e. regular vs. SMM
626808d1SSean Christopherson * @sptep:	      Pointer to the SPTE
626808d1SSean Christopherson * @old_spte:	      The current value of the SPTE
626808d1SSean Christopherson * @new_spte:	      The new value that will be set for the SPTE
626808d1SSean Christopherson * @gfn:	      The base GFN that was (or will be) mapped by the SPTE
626808d1SSean Christopherson * @level:	      The level _containing_ the SPTE (its parent PT's level)
fe43fa2fSBen Gardon * @record_acc_track: Notify the MM subsystem of changes to the accessed state
fe43fa2fSBen Gardon *		      of the page. Should be set unless handling an MMU
fe43fa2fSBen Gardon *		      notifier for access tracking. Leaving record_acc_track
fe43fa2fSBen Gardon *		      unset in that case prevents page accesses from being
fe43fa2fSBen Gardon *		      double counted.
fe43fa2fSBen Gardon * @record_dirty_log: Record the page as dirty in the dirty bitmap if
fe43fa2fSBen Gardon *		      appropriate for the change being made. Should be set
fe43fa2fSBen Gardon *		      unless performing certain dirty logging operations.
fe43fa2fSBen Gardon *		      Leaving record_dirty_log unset in that case prevents page
fe43fa2fSBen Gardon *		      writes from being double counted.
ba3a6120SSean Christopherson *
ba3a6120SSean Christopherson * Returns the old SPTE value, which _may_ be different than @old_spte if the
ba3a6120SSean Christopherson * SPTE had voldatile bits.
fe43fa2fSBen Gardon */
ba3a6120SSean Christophersonstatic u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
626808d1SSean Christopherson			      u64 old_spte, u64 new_spte, gfn_t gfn, int level,
626808d1SSean Christopherson			      bool record_acc_track, bool record_dirty_log)
faaf05b0SBen Gardon{
531810caSBen Gardon	lockdep_assert_held_write(&kvm->mmu_lock);
3a9a4aa5SBen Gardon
08f07c80SBen Gardon	/*
966da62aSSean Christopherson	 * No thread should be using this function to set SPTEs to or from the
08f07c80SBen Gardon	 * temporary removed SPTE value.
08f07c80SBen Gardon	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
08f07c80SBen Gardon	 * should be used. If operating under the MMU lock in write mode, the
08f07c80SBen Gardon	 * use of the removed SPTE should not be necessary.
08f07c80SBen Gardon	 */
626808d1SSean Christopherson	WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));
08f07c80SBen Gardon
ba3a6120SSean Christopherson	old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
faaf05b0SBen Gardon
626808d1SSean Christopherson	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
626808d1SSean Christopherson
f8e14497SBen Gardon	if (record_acc_track)
626808d1SSean Christopherson		handle_changed_spte_acc_track(old_spte, new_spte, level);
a6a0b05dSBen Gardon	if (record_dirty_log)
626808d1SSean Christopherson		handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
626808d1SSean Christopherson					      new_spte, level);
ba3a6120SSean Christopherson	return old_spte;
626808d1SSean Christopherson}
626808d1SSean Christopherson
626808d1SSean Christophersonstatic inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
626808d1SSean Christopherson				     u64 new_spte, bool record_acc_track,
626808d1SSean Christopherson				     bool record_dirty_log)
626808d1SSean Christopherson{
626808d1SSean Christopherson	WARN_ON_ONCE(iter->yielded);
626808d1SSean Christopherson
ba3a6120SSean Christopherson	iter->old_spte = __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
ba3a6120SSean Christopherson					    iter->old_spte, new_spte,
ba3a6120SSean Christopherson					    iter->gfn, iter->level,
626808d1SSean Christopherson					    record_acc_track, record_dirty_log);
f8e14497SBen Gardon}
f8e14497SBen Gardon
f8e14497SBen Gardonstatic inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
f8e14497SBen Gardon				    u64 new_spte)
f8e14497SBen Gardon{
626808d1SSean Christopherson	_tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
f8e14497SBen Gardon}
f8e14497SBen Gardon
f8e14497SBen Gardonstatic inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
f8e14497SBen Gardon						 struct tdp_iter *iter,
f8e14497SBen Gardon						 u64 new_spte)
f8e14497SBen Gardon{
626808d1SSean Christopherson	_tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
a6a0b05dSBen Gardon}
a6a0b05dSBen Gardon
a6a0b05dSBen Gardonstatic inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
a6a0b05dSBen Gardon						 struct tdp_iter *iter,
a6a0b05dSBen Gardon						 u64 new_spte)
a6a0b05dSBen Gardon{
626808d1SSean Christopherson	_tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
faaf05b0SBen Gardon}
faaf05b0SBen Gardon
/* Iterate over the SPTEs that for_each_tdp_pte() yields for @_root. */
#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
	for_each_tdp_pte(_iter, _root, _start, _end)

/*
 * Same walk as tdp_root_for_each_pte(), but the loop body only runs for
 * SPTEs that are shadow-present and last-level.  The "if ... continue; else"
 * shape keeps a caller-supplied body from pairing with a dangling else.
 */
#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
		if (!is_shadow_present_pte((_iter).old_spte) ||		\
		    !is_last_spte((_iter).old_spte, (_iter).level))	\
			continue;					\
		else

/* Walk starting from the shadow page that backs @_mmu's root.hpa. */
#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
	for_each_tdp_pte(_iter, to_shadow_page((_mmu)->root.hpa), _start, _end)
bb18842eSBen Gardon
faaf05b0SBen Gardon/*
e28a436cSBen Gardon * Yield if the MMU lock is contended or this thread needs to return control
e28a436cSBen Gardon * to the scheduler.
e28a436cSBen Gardon *
e139a34eSBen Gardon * If this function should yield and flush is set, it will perform a remote
e139a34eSBen Gardon * TLB flush before yielding.
e139a34eSBen Gardon *
3a0f64deSSean Christopherson * If this function yields, iter->yielded is set and the caller must skip to
3a0f64deSSean Christopherson * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
3a0f64deSSean Christopherson * over the paging structures to allow the iterator to continue its traversal
3a0f64deSSean Christopherson * from the paging structure root.
e28a436cSBen Gardon *
3a0f64deSSean Christopherson * Returns true if this function yielded.
e28a436cSBen Gardon */
3a0f64deSSean Christophersonstatic inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
3a0f64deSSean Christopherson							  struct tdp_iter *iter,
3a0f64deSSean Christopherson							  bool flush, bool shared)
a6a0b05dSBen Gardon{
3a0f64deSSean Christopherson	WARN_ON(iter->yielded);
3a0f64deSSean Christopherson
ed5e484bSBen Gardon	/* Ensure forward progress has been made before yielding. */
ed5e484bSBen Gardon	if (iter->next_last_level_gfn == iter->yielded_gfn)
ed5e484bSBen Gardon		return false;
ed5e484bSBen Gardon
531810caSBen Gardon	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
e139a34eSBen Gardon		if (flush)
e139a34eSBen Gardon			kvm_flush_remote_tlbs(kvm);
e139a34eSBen Gardon
bd296779SSean Christopherson		rcu_read_unlock();
bd296779SSean Christopherson
6103bc07SBen Gardon		if (shared)
6103bc07SBen Gardon			cond_resched_rwlock_read(&kvm->mmu_lock);
6103bc07SBen Gardon		else
531810caSBen Gardon			cond_resched_rwlock_write(&kvm->mmu_lock);
6103bc07SBen Gardon
7cca2d0bSBen Gardon		rcu_read_lock();
ed5e484bSBen Gardon
ed5e484bSBen Gardon		WARN_ON(iter->gfn > iter->next_last_level_gfn);
ed5e484bSBen Gardon
3a0f64deSSean Christopherson		iter->yielded = true;
a6a0b05dSBen Gardon	}
e28a436cSBen Gardon
3a0f64deSSean Christopherson	return iter->yielded;
a6a0b05dSBen Gardon}
a6a0b05dSBen Gardon
86931ff7SSean Christophersonstatic inline gfn_t tdp_mmu_max_gfn_exclusive(void)
e2b5b21dSSean Christopherson{
e2b5b21dSSean Christopherson	/*
86931ff7SSean Christopherson	 * Bound TDP MMU walks at host.MAXPHYADDR.  KVM disallows memslots with
86931ff7SSean Christopherson	 * a gpa range that would exceed the max gfn, and KVM does not create
86931ff7SSean Christopherson	 * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
86931ff7SSean Christopherson	 * the slow emulation path every time.
e2b5b21dSSean Christopherson	 */
86931ff7SSean Christopherson	return kvm_mmu_max_gfn() + 1;
e2b5b21dSSean Christopherson}
e2b5b21dSSean Christopherson
1b6043e8SSean Christophersonstatic void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
1b6043e8SSean Christopherson			       bool shared, int zap_level)
e2b5b21dSSean Christopherson{
e2b5b21dSSean Christopherson	struct tdp_iter iter;
e2b5b21dSSean Christopherson
86931ff7SSean Christopherson	gfn_t end = tdp_mmu_max_gfn_exclusive();
e2b5b21dSSean Christopherson	gfn_t start = 0;
e2b5b21dSSean Christopherson
1b6043e8SSean Christopherson	for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
1b6043e8SSean Christophersonretry:
1b6043e8SSean Christopherson		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
1b6043e8SSean Christopherson			continue;
1b6043e8SSean Christopherson
1b6043e8SSean Christopherson		if (!is_shadow_present_pte(iter.old_spte))
1b6043e8SSean Christopherson			continue;
1b6043e8SSean Christopherson
1b6043e8SSean Christopherson		if (iter.level > zap_level)
1b6043e8SSean Christopherson			continue;
1b6043e8SSean Christopherson
1b6043e8SSean Christopherson		if (!shared)
1b6043e8SSean Christopherson			tdp_mmu_set_spte(kvm, &iter, 0);
1b6043e8SSean Christopherson		else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
1b6043e8SSean Christopherson			goto retry;
1b6043e8SSean Christopherson	}
1b6043e8SSean Christopherson}
1b6043e8SSean Christopherson
1b6043e8SSean Christophersonstatic void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
1b6043e8SSean Christopherson			     bool shared)
1b6043e8SSean Christopherson{
1b6043e8SSean Christopherson
8351779cSPaolo Bonzini	/*
8351779cSPaolo Bonzini	 * The root must have an elevated refcount so that it's reachable via
8351779cSPaolo Bonzini	 * mmu_notifier callbacks, which allows this path to yield and drop
8351779cSPaolo Bonzini	 * mmu_lock.  When handling an unmap/release mmu_notifier command, KVM
8351779cSPaolo Bonzini	 * must drop all references to relevant pages prior to completing the
8351779cSPaolo Bonzini	 * callback.  Dropping mmu_lock with an unreachable root would result
8351779cSPaolo Bonzini	 * in zapping SPTEs after a relevant mmu_notifier callback completes
8351779cSPaolo Bonzini	 * and lead to use-after-free as zapping a SPTE triggers "writeback" of
8351779cSPaolo Bonzini	 * dirty accessed bits to the SPTE's associated struct page.
8351779cSPaolo Bonzini	 */
8351779cSPaolo Bonzini	WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));
8351779cSPaolo Bonzini
e2b5b21dSSean Christopherson	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
e2b5b21dSSean Christopherson
e2b5b21dSSean Christopherson	rcu_read_lock();
e2b5b21dSSean Christopherson
e2b5b21dSSean Christopherson	/*
1b6043e8SSean Christopherson	 * To avoid RCU stalls due to recursively removing huge swaths of SPs,
1b6043e8SSean Christopherson	 * split the zap into two passes.  On the first pass, zap at the 1gb
1b6043e8SSean Christopherson	 * level, and then zap top-level SPs on the second pass.  "1gb" is not
1b6043e8SSean Christopherson	 * arbitrary, as KVM must be able to zap a 1gb shadow page without
1b6043e8SSean Christopherson	 * inducing a stall to allow in-place replacement with a 1gb hugepage.
1b6043e8SSean Christopherson	 *
1b6043e8SSean Christopherson	 * Because zapping a SP recurses on its children, stepping down to
1b6043e8SSean Christopherson	 * PG_LEVEL_4K in the iterator itself is unnecessary.
e2b5b21dSSean Christopherson	 */
1b6043e8SSean Christopherson	__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
1b6043e8SSean Christopherson	__tdp_mmu_zap_root(kvm, root, shared, root->role.level);
e2b5b21dSSean Christopherson
e2b5b21dSSean Christopherson	rcu_read_unlock();
e2b5b21dSSean Christopherson}
e2b5b21dSSean Christopherson
c10743a1SSean Christophersonbool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
c10743a1SSean Christopherson{
c10743a1SSean Christopherson	u64 old_spte;
c10743a1SSean Christopherson
c10743a1SSean Christopherson	/*
c10743a1SSean Christopherson	 * This helper intentionally doesn't allow zapping a root shadow page,
c10743a1SSean Christopherson	 * which doesn't have a parent page table and thus no associated entry.
c10743a1SSean Christopherson	 */
c10743a1SSean Christopherson	if (WARN_ON_ONCE(!sp->ptep))
c10743a1SSean Christopherson		return false;
c10743a1SSean Christopherson
c10743a1SSean Christopherson	old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
bb95dfb9SSean Christopherson	if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
c10743a1SSean Christopherson		return false;
c10743a1SSean Christopherson
c10743a1SSean Christopherson	__tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
c10743a1SSean Christopherson			   sp->gfn, sp->role.level + 1, true, true);
c10743a1SSean Christopherson
c10743a1SSean Christopherson	return true;
c10743a1SSean Christopherson}
c10743a1SSean Christopherson
faaf05b0SBen Gardon/*
063afacdSBen Gardon * If can_yield is true, will release the MMU lock and reschedule if the
063afacdSBen Gardon * scheduler needs the CPU or there is contention on the MMU lock. If this
063afacdSBen Gardon * function cannot yield, it will not release the MMU lock or reschedule and
063afacdSBen Gardon * the caller must ensure it does not supply too large a GFN range, or the
6103bc07SBen Gardon * operation can cause a soft lockup.
faaf05b0SBen Gardon */
f47e5bbbSSean Christophersonstatic bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
acbda82aSSean Christopherson			      gfn_t start, gfn_t end, bool can_yield, bool flush)
faaf05b0SBen Gardon{
faaf05b0SBen Gardon	struct tdp_iter iter;
faaf05b0SBen Gardon
86931ff7SSean Christopherson	end = min(end, tdp_mmu_max_gfn_exclusive());
524a1e4eSSean Christopherson
acbda82aSSean Christopherson	lockdep_assert_held_write(&kvm->mmu_lock);
6103bc07SBen Gardon
7cca2d0bSBen Gardon	rcu_read_lock();
7cca2d0bSBen Gardon
f47e5bbbSSean Christopherson	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
1af4a960SBen Gardon		if (can_yield &&
acbda82aSSean Christopherson		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
a835429cSSean Christopherson			flush = false;
1af4a960SBen Gardon			continue;
1af4a960SBen Gardon		}
1af4a960SBen Gardon
f47e5bbbSSean Christopherson		if (!is_shadow_present_pte(iter.old_spte) ||
faaf05b0SBen Gardon		    !is_last_spte(iter.old_spte, iter.level))
faaf05b0SBen Gardon			continue;
faaf05b0SBen Gardon
faaf05b0SBen Gardon		tdp_mmu_set_spte(kvm, &iter, 0);
a835429cSSean Christopherson		flush = true;
faaf05b0SBen Gardon	}
7cca2d0bSBen Gardon
7cca2d0bSBen Gardon	rcu_read_unlock();
bb95dfb9SSean Christopherson
f47e5bbbSSean Christopherson	/*
f47e5bbbSSean Christopherson	 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
f47e5bbbSSean Christopherson	 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
f47e5bbbSSean Christopherson	 */
f47e5bbbSSean Christopherson	return flush;
faaf05b0SBen Gardon}
faaf05b0SBen Gardon
faaf05b0SBen Gardon/*
7edc3a68SKai Huang * Zap leaf SPTEs for the range of gfns, [start, end), for all roots. Returns
7edc3a68SKai Huang * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or
7edc3a68SKai Huang * more SPTEs were zapped since the MMU lock was last acquired.
faaf05b0SBen Gardon */
f47e5bbbSSean Christophersonbool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
f47e5bbbSSean Christopherson			   bool can_yield, bool flush)
faaf05b0SBen Gardon{
faaf05b0SBen Gardon	struct kvm_mmu_page *root;
faaf05b0SBen Gardon
614f6970SPaolo Bonzini	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
f47e5bbbSSean Christopherson		flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush);
faaf05b0SBen Gardon
faaf05b0SBen Gardon	return flush;
faaf05b0SBen Gardon}
faaf05b0SBen Gardon
faaf05b0SBen Gardonvoid kvm_tdp_mmu_zap_all(struct kvm *kvm)
faaf05b0SBen Gardon{
e2b5b21dSSean Christopherson	struct kvm_mmu_page *root;
2b9663d8SSean Christopherson	int i;
faaf05b0SBen Gardon
77c8cd6bSSean Christopherson	/*
22b94c4bSPaolo Bonzini	 * Zap all roots, including invalid roots, as all SPTEs must be dropped
22b94c4bSPaolo Bonzini	 * before returning to the caller.  Zap directly even if the root is
22b94c4bSPaolo Bonzini	 * also being zapped by a worker.  Walking zapped top-level SPTEs isn't
22b94c4bSPaolo Bonzini	 * all that expensive and mmu_lock is already held, which means the
22b94c4bSPaolo Bonzini	 * worker has yielded, i.e. flushing the work instead of zapping here
22b94c4bSPaolo Bonzini	 * isn't guaranteed to be any faster.
22b94c4bSPaolo Bonzini	 *
77c8cd6bSSean Christopherson	 * A TLB flush is unnecessary, KVM zaps everything if and only the VM
77c8cd6bSSean Christopherson	 * is being destroyed or the userspace VMM has exited.  In both cases,
77c8cd6bSSean Christopherson	 * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
77c8cd6bSSean Christopherson	 */
e2b5b21dSSean Christopherson	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
e2b5b21dSSean Christopherson		for_each_tdp_mmu_root_yield_safe(kvm, root, i)
e2b5b21dSSean Christopherson			tdp_mmu_zap_root(kvm, root, false);
e2b5b21dSSean Christopherson	}
faaf05b0SBen Gardon}
bb18842eSBen Gardon
4c6654bdSBen Gardon/*
f28e9c7fSSean Christopherson * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
22b94c4bSPaolo Bonzini * zap" completes.
4c6654bdSBen Gardon */
4c6654bdSBen Gardonvoid kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
4c6654bdSBen Gardon{
22b94c4bSPaolo Bonzini	flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
4c6654bdSBen Gardon}
4c6654bdSBen Gardon
bb18842eSBen Gardon/*
f28e9c7fSSean Christopherson * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
22b94c4bSPaolo Bonzini * is about to be zapped, e.g. in response to a memslots update.  The actual
22b94c4bSPaolo Bonzini * zapping is performed asynchronously, so a reference is taken on all roots.
22b94c4bSPaolo Bonzini * Using a separate workqueue makes it easy to ensure that the destruction is
22b94c4bSPaolo Bonzini * performed before the "fast zap" completes, without keeping a separate list
22b94c4bSPaolo Bonzini * of invalidated roots; the list is effectively the list of work items in
22b94c4bSPaolo Bonzini * the workqueue.
b7cccd39SBen Gardon *
22b94c4bSPaolo Bonzini * Get a reference even if the root is already invalid, the asynchronous worker
22b94c4bSPaolo Bonzini * assumes it was gifted a reference to the root it processes.  Because mmu_lock
22b94c4bSPaolo Bonzini * is held for write, it should be impossible to observe a root with zero refcount,
22b94c4bSPaolo Bonzini * i.e. the list of roots cannot be stale.
4c6654bdSBen Gardon *
b7cccd39SBen Gardon * This has essentially the same effect for the TDP MMU
b7cccd39SBen Gardon * as updating mmu_valid_gen does for the shadow MMU.
b7cccd39SBen Gardon */
b7cccd39SBen Gardonvoid kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
b7cccd39SBen Gardon{
b7cccd39SBen Gardon	struct kvm_mmu_page *root;
b7cccd39SBen Gardon
b7cccd39SBen Gardon	lockdep_assert_held_write(&kvm->mmu_lock);
f28e9c7fSSean Christopherson	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
efd995daSPaolo Bonzini		if (!root->role.invalid &&
efd995daSPaolo Bonzini		    !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) {
b7cccd39SBen Gardon			root->role.invalid = true;
22b94c4bSPaolo Bonzini			tdp_mmu_schedule_zap_root(kvm, root);
22b94c4bSPaolo Bonzini		}
b7cccd39SBen Gardon	}
f28e9c7fSSean Christopherson}
b7cccd39SBen Gardon
bb18842eSBen Gardon/*
bb18842eSBen Gardon * Installs a last-level SPTE to handle a TDP page fault.
bb18842eSBen Gardon * (NPT/EPT violation/misconfiguration)
bb18842eSBen Gardon */
cdc47767SPaolo Bonzinistatic int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
cdc47767SPaolo Bonzini					  struct kvm_page_fault *fault,
cdc47767SPaolo Bonzini					  struct tdp_iter *iter)
bb18842eSBen Gardon{
c435d4b7SSean Christopherson	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
bb18842eSBen Gardon	u64 new_spte;
57a3e96dSKai Huang	int ret = RET_PF_FIXED;
ad67e480SPaolo Bonzini	bool wrprot = false;
bb18842eSBen Gardon
7158bee4SPaolo Bonzini	WARN_ON(sp->role.level != fault->goal_level);
e710c5f6SDavid Matlack	if (unlikely(!fault->slot))
bb18842eSBen Gardon		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
9a77daacSBen Gardon	else
53597858SDavid Matlack		wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
2839180cSPaolo Bonzini					 fault->pfn, iter->old_spte, fault->prefetch, true,
7158bee4SPaolo Bonzini					 fault->map_writable, &new_spte);
bb18842eSBen Gardon
bb18842eSBen Gardon	if (new_spte == iter->old_spte)
bb18842eSBen Gardon		ret = RET_PF_SPURIOUS;
3e72c791SDavid Matlack	else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
9a77daacSBen Gardon		return RET_PF_RETRY;
bb95dfb9SSean Christopherson	else if (is_shadow_present_pte(iter->old_spte) &&
bb95dfb9SSean Christopherson		 !is_last_spte(iter->old_spte, iter->level))
bb95dfb9SSean Christopherson		kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
bb95dfb9SSean Christopherson						   KVM_PAGES_PER_HPAGE(iter->level + 1));
bb18842eSBen Gardon
bb18842eSBen Gardon	/*
bb18842eSBen Gardon	 * If the page fault was caused by a write but the page is write
bb18842eSBen Gardon	 * protected, emulation is needed. If the emulation was skipped,
bb18842eSBen Gardon	 * the vCPU would have the same fault again.
bb18842eSBen Gardon	 */
ad67e480SPaolo Bonzini	if (wrprot) {
cdc47767SPaolo Bonzini		if (fault->write)
bb18842eSBen Gardon			ret = RET_PF_EMULATE;
bb18842eSBen Gardon	}
bb18842eSBen Gardon
bb18842eSBen Gardon	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
9a77daacSBen Gardon	if (unlikely(is_mmio_spte(new_spte))) {
1075d41eSSean Christopherson		vcpu->stat.pf_mmio_spte_created++;
9a77daacSBen Gardon		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
9a77daacSBen Gardon				     new_spte);
bb18842eSBen Gardon		ret = RET_PF_EMULATE;
3849e092SSean Christopherson	} else {
9a77daacSBen Gardon		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
9a77daacSBen Gardon				       rcu_dereference(iter->sptep));
3849e092SSean Christopherson	}
bb18842eSBen Gardon
bb18842eSBen Gardon	return ret;
bb18842eSBen Gardon}
bb18842eSBen Gardon
bb18842eSBen Gardon/*
cb00a70bSDavid Matlack * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
cb00a70bSDavid Matlack * provided page table.
7b7e1ab6SDavid Matlack *
7b7e1ab6SDavid Matlack * @kvm: kvm instance
7b7e1ab6SDavid Matlack * @iter: a tdp_iter instance currently on the SPTE that should be set
7b7e1ab6SDavid Matlack * @sp: The new TDP page table to install.
cb00a70bSDavid Matlack * @shared: This operation is running under the MMU lock in read mode.
7b7e1ab6SDavid Matlack *
7b7e1ab6SDavid Matlack * Returns: 0 if the new page table was installed. Non-0 if the page table
7b7e1ab6SDavid Matlack *          could not be installed (e.g. the atomic compare-exchange failed).
7b7e1ab6SDavid Matlack */
cb00a70bSDavid Matlackstatic int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
61f94478SSean Christopherson			   struct kvm_mmu_page *sp, bool shared)
7b7e1ab6SDavid Matlack{
54275f74SSean Christopherson	u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled());
cb00a70bSDavid Matlack	int ret = 0;
7b7e1ab6SDavid Matlack
cb00a70bSDavid Matlack	if (shared) {
7b7e1ab6SDavid Matlack		ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
7b7e1ab6SDavid Matlack		if (ret)
7b7e1ab6SDavid Matlack			return ret;
cb00a70bSDavid Matlack	} else {
cb00a70bSDavid Matlack		tdp_mmu_set_spte(kvm, iter, spte);
cb00a70bSDavid Matlack	}
7b7e1ab6SDavid Matlack
43a063caSYosry Ahmed	tdp_account_mmu_page(kvm, sp);
7b7e1ab6SDavid Matlack
7b7e1ab6SDavid Matlack	return 0;
7b7e1ab6SDavid Matlack}
7b7e1ab6SDavid Matlack
/* Forward declaration; the definition lives elsewhere in this file. */
static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
				   struct kvm_mmu_page *sp, bool shared);
c4b33d28SDavid Matlack
7b7e1ab6SDavid Matlack/*
bb18842eSBen Gardon * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
bb18842eSBen Gardon * page tables and SPTEs to translate the faulting guest physical address.
bb18842eSBen Gardon */
2f6305ddSPaolo Bonziniint kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
bb18842eSBen Gardon{
bb18842eSBen Gardon	struct kvm_mmu *mmu = vcpu->arch.mmu;
61f94478SSean Christopherson	struct kvm *kvm = vcpu->kvm;
bb18842eSBen Gardon	struct tdp_iter iter;
89c0fd49SBen Gardon	struct kvm_mmu_page *sp;
63d28a25SPaolo Bonzini	int ret = RET_PF_RETRY;
bb18842eSBen Gardon
73a3c659SPaolo Bonzini	kvm_mmu_hugepage_adjust(vcpu, fault);
bb18842eSBen Gardon
f0066d94SPaolo Bonzini	trace_kvm_mmu_spte_requested(fault);
7cca2d0bSBen Gardon
7cca2d0bSBen Gardon	rcu_read_lock();
7cca2d0bSBen Gardon
2f6305ddSPaolo Bonzini	tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
63d28a25SPaolo Bonzini		int r;
63d28a25SPaolo Bonzini
73a3c659SPaolo Bonzini		if (fault->nx_huge_page_workaround_enabled)
536f0e6aSPaolo Bonzini			disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
bb18842eSBen Gardon
bb18842eSBen Gardon		/*
c4b33d28SDavid Matlack		 * If SPTE has been frozen by another thread, just give up and
c4b33d28SDavid Matlack		 * retry, avoiding unnecessary page table allocation and free.
ff76d506SKai Huang		 */
ff76d506SKai Huang		if (is_removed_spte(iter.old_spte))
63d28a25SPaolo Bonzini			goto retry;
63d28a25SPaolo Bonzini
f5d16bb9SSean Christopherson		if (iter.level == fault->goal_level)
*80a3e4aeSSean Christopherson			goto map_target_level;
f5d16bb9SSean Christopherson
63d28a25SPaolo Bonzini		/* Step down into the lower level page table if it exists. */
63d28a25SPaolo Bonzini		if (is_shadow_present_pte(iter.old_spte) &&
63d28a25SPaolo Bonzini		    !is_large_pte(iter.old_spte))
63d28a25SPaolo Bonzini			continue;
ff76d506SKai Huang
c4b33d28SDavid Matlack		/*
c4b33d28SDavid Matlack		 * The SPTE is either non-present or points to a huge page that
c4b33d28SDavid Matlack		 * needs to be split.
c4b33d28SDavid Matlack		 */
a82070b6SDavid Matlack		sp = tdp_mmu_alloc_sp(vcpu);
a82070b6SDavid Matlack		tdp_mmu_init_child_sp(sp, &iter);
a82070b6SDavid Matlack
61f94478SSean Christopherson		sp->nx_huge_page_disallowed = fault->huge_page_disallowed;
61f94478SSean Christopherson
c4b33d28SDavid Matlack		if (is_shadow_present_pte(iter.old_spte))
63d28a25SPaolo Bonzini			r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
c4b33d28SDavid Matlack		else
63d28a25SPaolo Bonzini			r = tdp_mmu_link_sp(kvm, &iter, sp, true);
c4b33d28SDavid Matlack
63d28a25SPaolo Bonzini		/*
*80a3e4aeSSean Christopherson		 * Force the guest to retry if installing an upper level SPTE
*80a3e4aeSSean Christopherson		 * failed, e.g. because a different task modified the SPTE.
63d28a25SPaolo Bonzini		 */
63d28a25SPaolo Bonzini		if (r) {
9a77daacSBen Gardon			tdp_mmu_free_sp(sp);
63d28a25SPaolo Bonzini			goto retry;
9a77daacSBen Gardon		}
61f94478SSean Christopherson
61f94478SSean Christopherson		if (fault->huge_page_disallowed &&
61f94478SSean Christopherson		    fault->req_level >= iter.level) {
61f94478SSean Christopherson			spin_lock(&kvm->arch.tdp_mmu_pages_lock);
61f94478SSean Christopherson			track_possible_nx_huge_page(kvm, sp);
61f94478SSean Christopherson			spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
61f94478SSean Christopherson		}
bb18842eSBen Gardon	}
bb18842eSBen Gardon
*80a3e4aeSSean Christopherson	/*
*80a3e4aeSSean Christopherson	 * The walk aborted before reaching the target level, e.g. because the
*80a3e4aeSSean Christopherson	 * iterator detected an upper level SPTE was frozen during traversal.
*80a3e4aeSSean Christopherson	 */
*80a3e4aeSSean Christopherson	WARN_ON_ONCE(iter.level == fault->goal_level);
*80a3e4aeSSean Christopherson	goto retry;
*80a3e4aeSSean Christopherson
*80a3e4aeSSean Christophersonmap_target_level:
cdc47767SPaolo Bonzini	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
bb18842eSBen Gardon
63d28a25SPaolo Bonziniretry:
63d28a25SPaolo Bonzini	rcu_read_unlock();
bb18842eSBen Gardon	return ret;
bb18842eSBen Gardon}
063afacdSBen Gardon
3039bcc7SSean Christophersonbool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
3039bcc7SSean Christopherson				 bool flush)
063afacdSBen Gardon{
f47e5bbbSSean Christopherson	return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start,
83b83a02SSean Christopherson				     range->end, range->may_block, flush);
3039bcc7SSean Christopherson}
3039bcc7SSean Christopherson
3039bcc7SSean Christophersontypedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
3039bcc7SSean Christopherson			      struct kvm_gfn_range *range);
3039bcc7SSean Christopherson
3039bcc7SSean Christophersonstatic __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
3039bcc7SSean Christopherson						   struct kvm_gfn_range *range,
c1b91493SSean Christopherson						   tdp_handler_t handler)
063afacdSBen Gardon{
063afacdSBen Gardon	struct kvm_mmu_page *root;
3039bcc7SSean Christopherson	struct tdp_iter iter;
3039bcc7SSean Christopherson	bool ret = false;
063afacdSBen Gardon
063afacdSBen Gardon	/*
e1eed584SSean Christopherson	 * Don't support rescheduling, none of the MMU notifiers that funnel
e1eed584SSean Christopherson	 * into this helper allow blocking; it'd be dead, wasteful code.
063afacdSBen Gardon	 */
3039bcc7SSean Christopherson	for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
a151acecSSean Christopherson		rcu_read_lock();
a151acecSSean Christopherson
3039bcc7SSean Christopherson		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
3039bcc7SSean Christopherson			ret |= handler(kvm, &iter, range);
063afacdSBen Gardon
3039bcc7SSean Christopherson		rcu_read_unlock();
a151acecSSean Christopherson	}
063afacdSBen Gardon
063afacdSBen Gardon	return ret;
063afacdSBen Gardon}
063afacdSBen Gardon
f8e14497SBen Gardon/*
f8e14497SBen Gardon * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
f8e14497SBen Gardon * if any of the GFNs in the range have been accessed.
f8e14497SBen Gardon */
3039bcc7SSean Christophersonstatic bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
3039bcc7SSean Christopherson			  struct kvm_gfn_range *range)
f8e14497SBen Gardon{
f8e14497SBen Gardon	u64 new_spte = 0;
f8e14497SBen Gardon
3039bcc7SSean Christopherson	/* If we have a non-accessed entry we don't need to change the pte. */
3039bcc7SSean Christopherson	if (!is_accessed_spte(iter->old_spte))
3039bcc7SSean Christopherson		return false;
7cca2d0bSBen Gardon
3039bcc7SSean Christopherson	new_spte = iter->old_spte;
f8e14497SBen Gardon
f8e14497SBen Gardon	if (spte_ad_enabled(new_spte)) {
8f8f52a4SSean Christopherson		new_spte &= ~shadow_accessed_mask;
f8e14497SBen Gardon	} else {
f8e14497SBen Gardon		/*
f8e14497SBen Gardon		 * Capture the dirty status of the page, so that it doesn't get
f8e14497SBen Gardon		 * lost when the SPTE is marked for access tracking.
f8e14497SBen Gardon		 */
f8e14497SBen Gardon		if (is_writable_pte(new_spte))
f8e14497SBen Gardon			kvm_set_pfn_dirty(spte_to_pfn(new_spte));
f8e14497SBen Gardon
f8e14497SBen Gardon		new_spte = mark_spte_for_access_track(new_spte);
f8e14497SBen Gardon	}
f8e14497SBen Gardon
3039bcc7SSean Christopherson	tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
33dd3574SBen Gardon
3039bcc7SSean Christopherson	return true;
f8e14497SBen Gardon}
f8e14497SBen Gardon
3039bcc7SSean Christophersonbool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
f8e14497SBen Gardon{
3039bcc7SSean Christopherson	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
f8e14497SBen Gardon}
f8e14497SBen Gardon
3039bcc7SSean Christophersonstatic bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
3039bcc7SSean Christopherson			 struct kvm_gfn_range *range)
f8e14497SBen Gardon{
3039bcc7SSean Christopherson	return is_accessed_spte(iter->old_spte);
f8e14497SBen Gardon}
f8e14497SBen Gardon
3039bcc7SSean Christophersonbool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
f8e14497SBen Gardon{
3039bcc7SSean Christopherson	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
3039bcc7SSean Christopherson}
3039bcc7SSean Christopherson
3039bcc7SSean Christophersonstatic bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
3039bcc7SSean Christopherson			 struct kvm_gfn_range *range)
3039bcc7SSean Christopherson{
3039bcc7SSean Christopherson	u64 new_spte;
3039bcc7SSean Christopherson
3039bcc7SSean Christopherson	/* Huge pages aren't expected to be modified without first being zapped. */
3039bcc7SSean Christopherson	WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
3039bcc7SSean Christopherson
3039bcc7SSean Christopherson	if (iter->level != PG_LEVEL_4K ||
3039bcc7SSean Christopherson	    !is_shadow_present_pte(iter->old_spte))
3039bcc7SSean Christopherson		return false;
3039bcc7SSean Christopherson
3039bcc7SSean Christopherson	/*
3039bcc7SSean Christopherson	 * Note, when changing a read-only SPTE, it's not strictly necessary to
3039bcc7SSean Christopherson	 * zero the SPTE before setting the new PFN, but doing so preserves the
3039bcc7SSean Christopherson	 * invariant that the PFN of a present * leaf SPTE can never change.
3039bcc7SSean Christopherson	 * See __handle_changed_spte().
3039bcc7SSean Christopherson	 */
3039bcc7SSean Christopherson	tdp_mmu_set_spte(kvm, iter, 0);
3039bcc7SSean Christopherson
3039bcc7SSean Christopherson	if (!pte_write(range->pte)) {
3039bcc7SSean Christopherson		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
3039bcc7SSean Christopherson								  pte_pfn(range->pte));
3039bcc7SSean Christopherson
3039bcc7SSean Christopherson		tdp_mmu_set_spte(kvm, iter, new_spte);
3039bcc7SSean Christopherson	}
3039bcc7SSean Christopherson
3039bcc7SSean Christopherson	return true;
f8e14497SBen Gardon}
1d8dd6b3SBen Gardon
1d8dd6b3SBen Gardon/*
1d8dd6b3SBen Gardon * Handle the changed_pte MMU notifier for the TDP MMU.
1d8dd6b3SBen Gardon * data is a pointer to the new pte_t mapping the HVA specified by the MMU
1d8dd6b3SBen Gardon * notifier.
1d8dd6b3SBen Gardon * Returns non-zero if a flush is needed before releasing the MMU lock.
1d8dd6b3SBen Gardon */
3039bcc7SSean Christophersonbool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1d8dd6b3SBen Gardon{
93fa50f6SSean Christopherson	/*
93fa50f6SSean Christopherson	 * No need to handle the remote TLB flush under RCU protection, the
93fa50f6SSean Christopherson	 * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
93fa50f6SSean Christopherson	 * shadow page.  See the WARN on pfn_changed in __handle_changed_spte().
93fa50f6SSean Christopherson	 */
93fa50f6SSean Christopherson	return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
1d8dd6b3SBen Gardon}
1d8dd6b3SBen Gardon
a6a0b05dSBen Gardon/*
bedd9195SDavid Matlack * Remove write access from all SPTEs at or above min_level that map GFNs
bedd9195SDavid Matlack * [start, end). Returns true if an SPTE has been changed and the TLBs need to
bedd9195SDavid Matlack * be flushed.
a6a0b05dSBen Gardon */
a6a0b05dSBen Gardonstatic bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
a6a0b05dSBen Gardon			     gfn_t start, gfn_t end, int min_level)
a6a0b05dSBen Gardon{
a6a0b05dSBen Gardon	struct tdp_iter iter;
a6a0b05dSBen Gardon	u64 new_spte;
a6a0b05dSBen Gardon	bool spte_set = false;
a6a0b05dSBen Gardon
7cca2d0bSBen Gardon	rcu_read_lock();
7cca2d0bSBen Gardon
a6a0b05dSBen Gardon	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
a6a0b05dSBen Gardon
77aa6075SDavid Matlack	for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
24ae4cfaSBen Gardonretry:
24ae4cfaSBen Gardon		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1af4a960SBen Gardon			continue;
1af4a960SBen Gardon
a6a0b05dSBen Gardon		if (!is_shadow_present_pte(iter.old_spte) ||
0f99ee2cSBen Gardon		    !is_last_spte(iter.old_spte, iter.level) ||
0f99ee2cSBen Gardon		    !(iter.old_spte & PT_WRITABLE_MASK))
a6a0b05dSBen Gardon			continue;
a6a0b05dSBen Gardon
a6a0b05dSBen Gardon		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
a6a0b05dSBen Gardon
3e72c791SDavid Matlack		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
24ae4cfaSBen Gardon			goto retry;
3255530aSDavid Matlack
a6a0b05dSBen Gardon		spte_set = true;
a6a0b05dSBen Gardon	}
7cca2d0bSBen Gardon
7cca2d0bSBen Gardon	rcu_read_unlock();
a6a0b05dSBen Gardon	return spte_set;
a6a0b05dSBen Gardon}
a6a0b05dSBen Gardon
a6a0b05dSBen Gardon/*
a6a0b05dSBen Gardon * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
a6a0b05dSBen Gardon * only affect leaf SPTEs down to min_level.
a6a0b05dSBen Gardon * Returns true if an SPTE has been changed and the TLBs need to be flushed.
a6a0b05dSBen Gardon */
269e9552SHamza Mahfoozbool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
269e9552SHamza Mahfooz			     const struct kvm_memory_slot *slot, int min_level)
a6a0b05dSBen Gardon{
a6a0b05dSBen Gardon	struct kvm_mmu_page *root;
a6a0b05dSBen Gardon	bool spte_set = false;
a6a0b05dSBen Gardon
24ae4cfaSBen Gardon	lockdep_assert_held_read(&kvm->mmu_lock);
a6a0b05dSBen Gardon
d62007edSSean Christopherson	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
a6a0b05dSBen Gardon		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
a6a0b05dSBen Gardon			     slot->base_gfn + slot->npages, min_level);
a6a0b05dSBen Gardon
a6a0b05dSBen Gardon	return spte_set;
a6a0b05dSBen Gardon}
a6a0b05dSBen Gardon
/*
 * Allocate a zeroed shadow page header plus a zeroed page for its page table,
 * using @gfp for both allocations.  Returns NULL if either allocation fails;
 * nothing is left allocated in that case.
 */
static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
{
	struct kvm_mmu_page *sp;

	gfp |= __GFP_ZERO;

	sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
	if (!sp)
		return NULL;

	sp->spt = (void *)__get_free_page(gfp);
	if (sp->spt)
		return sp;

	/* No page for the table itself: give the header back. */
	kmem_cache_free(mmu_page_header_cache, sp);
	return NULL;
}
a3fe5dbdSDavid Matlack
a3fe5dbdSDavid Matlackstatic struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
cb00a70bSDavid Matlack						       struct tdp_iter *iter,
cb00a70bSDavid Matlack						       bool shared)
a3fe5dbdSDavid Matlack{
a3fe5dbdSDavid Matlack	struct kvm_mmu_page *sp;
a3fe5dbdSDavid Matlack
a3fe5dbdSDavid Matlack	/*
a3fe5dbdSDavid Matlack	 * Since we are allocating while under the MMU lock we have to be
a3fe5dbdSDavid Matlack	 * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
a3fe5dbdSDavid Matlack	 * reclaim and to avoid making any filesystem callbacks (which can end
a3fe5dbdSDavid Matlack	 * up invoking KVM MMU notifiers, resulting in a deadlock).
a3fe5dbdSDavid Matlack	 *
a3fe5dbdSDavid Matlack	 * If this allocation fails we drop the lock and retry with reclaim
a3fe5dbdSDavid Matlack	 * allowed.
a3fe5dbdSDavid Matlack	 */
a3fe5dbdSDavid Matlack	sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
a3fe5dbdSDavid Matlack	if (sp)
a3fe5dbdSDavid Matlack		return sp;
a3fe5dbdSDavid Matlack
a3fe5dbdSDavid Matlack	rcu_read_unlock();
cb00a70bSDavid Matlack
cb00a70bSDavid Matlack	if (shared)
a3fe5dbdSDavid Matlack		read_unlock(&kvm->mmu_lock);
cb00a70bSDavid Matlack	else
cb00a70bSDavid Matlack		write_unlock(&kvm->mmu_lock);
a3fe5dbdSDavid Matlack
a3fe5dbdSDavid Matlack	iter->yielded = true;
a3fe5dbdSDavid Matlack	sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
a3fe5dbdSDavid Matlack
cb00a70bSDavid Matlack	if (shared)
a3fe5dbdSDavid Matlack		read_lock(&kvm->mmu_lock);
cb00a70bSDavid Matlack	else
cb00a70bSDavid Matlack		write_lock(&kvm->mmu_lock);
cb00a70bSDavid Matlack
a3fe5dbdSDavid Matlack	rcu_read_lock();
a3fe5dbdSDavid Matlack
a3fe5dbdSDavid Matlack	return sp;
a3fe5dbdSDavid Matlack}
a3fe5dbdSDavid Matlack
c4b33d28SDavid Matlack/* Note, the caller is responsible for initializing @sp. */
cb00a70bSDavid Matlackstatic int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
cb00a70bSDavid Matlack				   struct kvm_mmu_page *sp, bool shared)
a3fe5dbdSDavid Matlack{
a3fe5dbdSDavid Matlack	const u64 huge_spte = iter->old_spte;
a3fe5dbdSDavid Matlack	const int level = iter->level;
a3fe5dbdSDavid Matlack	int ret, i;
a3fe5dbdSDavid Matlack
a3fe5dbdSDavid Matlack	/*
a3fe5dbdSDavid Matlack	 * No need for atomics when writing to sp->spt since the page table has
a3fe5dbdSDavid Matlack	 * not been linked in yet and thus is not reachable from any other CPU.
a3fe5dbdSDavid Matlack	 */
2ca3129eSSean Christopherson	for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
47855da0SDavid Matlack		sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i);
a3fe5dbdSDavid Matlack
a3fe5dbdSDavid Matlack	/*
a3fe5dbdSDavid Matlack	 * Replace the huge spte with a pointer to the populated lower level
a3fe5dbdSDavid Matlack	 * page table. Since we are making this change without a TLB flush vCPUs
a3fe5dbdSDavid Matlack	 * will see a mix of the split mappings and the original huge mapping,
a3fe5dbdSDavid Matlack	 * depending on what's currently in their TLB. This is fine from a
a3fe5dbdSDavid Matlack	 * correctness standpoint since the translation will be the same either
a3fe5dbdSDavid Matlack	 * way.
a3fe5dbdSDavid Matlack	 */
61f94478SSean Christopherson	ret = tdp_mmu_link_sp(kvm, iter, sp, shared);
a3fe5dbdSDavid Matlack	if (ret)
e0b728b1SDavid Matlack		goto out;
a3fe5dbdSDavid Matlack
a3fe5dbdSDavid Matlack	/*
a3fe5dbdSDavid Matlack	 * tdp_mmu_link_sp_atomic() will handle subtracting the huge page we
a3fe5dbdSDavid Matlack	 * are overwriting from the page stats. But we have to manually update
a3fe5dbdSDavid Matlack	 * the page stats with the new present child pages.
a3fe5dbdSDavid Matlack	 */
2ca3129eSSean Christopherson	kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE);
a3fe5dbdSDavid Matlack
e0b728b1SDavid Matlackout:
e0b728b1SDavid Matlack	trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
e0b728b1SDavid Matlack	return ret;
a3fe5dbdSDavid Matlack}
a3fe5dbdSDavid Matlack
/*
 * Split every huge page above @target_level that maps GFNs in [start, end)
 * under @root.  Returns 0 on success or -ENOMEM if a shadow page could not be
 * allocated.
 */
static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
					 struct kvm_mmu_page *root,
					 gfn_t start, gfn_t end,
					 int target_level, bool shared)
{
	struct kvm_mmu_page *spare = NULL;
	struct tdp_iter iter;
	int ret = 0;

	rcu_read_lock();

	/*
	 * Walk the page table, splitting each huge page above the target
	 * level into the next lower level, e.g. a 1GB page becomes 512 2MB
	 * pages.
	 *
	 * The TDP iterator does a pre-order traversal, so an SPTE is always
	 * visited before its children.  Huge pages more than one level above
	 * the target are therefore split recursively (e.g. 1GB to 512 2MB
	 * pages, then each of those to 512 4KB pages).
	 */
	for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
again:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
			continue;

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		if (!is_large_pte(iter.old_spte))
			continue;

		if (!spare) {
			spare = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
			if (!spare) {
				ret = -ENOMEM;
				trace_kvm_mmu_split_huge_page(iter.gfn,
							      iter.old_spte,
							      iter.level, ret);
				break;
			}

			/* The allocation dropped the locks; restart the step. */
			if (iter.yielded)
				continue;
		}

		tdp_mmu_init_child_sp(spare, &iter);

		/* Splitting failed; re-evaluate this same SPTE. */
		if (tdp_mmu_split_huge_page(kvm, &iter, spare, shared))
			goto again;

		/* The page table is now linked in and no longer ours to free. */
		spare = NULL;
	}

	rcu_read_unlock();

	/*
	 * The last allocated page may never have been used, for example if a
	 * vCPU doing HugePage NX splitting won the race and installed its own
	 * page table in place of the one we tried to install.
	 */
	if (spare)
		tdp_mmu_free_sp(spare);

	return ret;
}
a3fe5dbdSDavid Matlack
cb00a70bSDavid Matlack
a3fe5dbdSDavid Matlack/*
a3fe5dbdSDavid Matlack * Try to split all huge pages mapped by the TDP MMU down to the target level.
a3fe5dbdSDavid Matlack */
a3fe5dbdSDavid Matlackvoid kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
a3fe5dbdSDavid Matlack				      const struct kvm_memory_slot *slot,
a3fe5dbdSDavid Matlack				      gfn_t start, gfn_t end,
cb00a70bSDavid Matlack				      int target_level, bool shared)
a3fe5dbdSDavid Matlack{
a3fe5dbdSDavid Matlack	struct kvm_mmu_page *root;
a3fe5dbdSDavid Matlack	int r = 0;
a3fe5dbdSDavid Matlack
cb00a70bSDavid Matlack	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
a3fe5dbdSDavid Matlack
7c554d8eSPaolo Bonzini	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
cb00a70bSDavid Matlack		r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
a3fe5dbdSDavid Matlack		if (r) {
cb00a70bSDavid Matlack			kvm_tdp_mmu_put_root(kvm, root, shared);
a3fe5dbdSDavid Matlack			break;
a3fe5dbdSDavid Matlack		}
a3fe5dbdSDavid Matlack	}
a3fe5dbdSDavid Matlack}
a3fe5dbdSDavid Matlack
a6a0b05dSBen Gardon/*
a6a0b05dSBen Gardon * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
a6a0b05dSBen Gardon * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
a6a0b05dSBen Gardon * If AD bits are not enabled, this will require clearing the writable bit on
a6a0b05dSBen Gardon * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
a6a0b05dSBen Gardon * be flushed.
a6a0b05dSBen Gardon */
a6a0b05dSBen Gardonstatic bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
a6a0b05dSBen Gardon			   gfn_t start, gfn_t end)
a6a0b05dSBen Gardon{
a6a0b05dSBen Gardon	struct tdp_iter iter;
a6a0b05dSBen Gardon	u64 new_spte;
a6a0b05dSBen Gardon	bool spte_set = false;
a6a0b05dSBen Gardon
7cca2d0bSBen Gardon	rcu_read_lock();
7cca2d0bSBen Gardon
a6a0b05dSBen Gardon	tdp_root_for_each_leaf_pte(iter, root, start, end) {
24ae4cfaSBen Gardonretry:
24ae4cfaSBen Gardon		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1af4a960SBen Gardon			continue;
1af4a960SBen Gardon
3354ef5aSSean Christopherson		if (!is_shadow_present_pte(iter.old_spte))
3354ef5aSSean Christopherson			continue;
3354ef5aSSean Christopherson
a6a0b05dSBen Gardon		if (spte_ad_need_write_protect(iter.old_spte)) {
a6a0b05dSBen Gardon			if (is_writable_pte(iter.old_spte))
a6a0b05dSBen Gardon				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
a6a0b05dSBen Gardon			else
a6a0b05dSBen Gardon				continue;
a6a0b05dSBen Gardon		} else {
a6a0b05dSBen Gardon			if (iter.old_spte & shadow_dirty_mask)
a6a0b05dSBen Gardon				new_spte = iter.old_spte & ~shadow_dirty_mask;
a6a0b05dSBen Gardon			else
a6a0b05dSBen Gardon				continue;
a6a0b05dSBen Gardon		}
a6a0b05dSBen Gardon
3e72c791SDavid Matlack		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
24ae4cfaSBen Gardon			goto retry;
3255530aSDavid Matlack
a6a0b05dSBen Gardon		spte_set = true;
a6a0b05dSBen Gardon	}
7cca2d0bSBen Gardon
7cca2d0bSBen Gardon	rcu_read_unlock();
a6a0b05dSBen Gardon	return spte_set;
a6a0b05dSBen Gardon}
a6a0b05dSBen Gardon
a6a0b05dSBen Gardon/*
a6a0b05dSBen Gardon * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
a6a0b05dSBen Gardon * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
a6a0b05dSBen Gardon * If AD bits are not enabled, this will require clearing the writable bit on
a6a0b05dSBen Gardon * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
a6a0b05dSBen Gardon * be flushed.
a6a0b05dSBen Gardon */
269e9552SHamza Mahfoozbool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
269e9552SHamza Mahfooz				  const struct kvm_memory_slot *slot)
a6a0b05dSBen Gardon{
a6a0b05dSBen Gardon	struct kvm_mmu_page *root;
a6a0b05dSBen Gardon	bool spte_set = false;
a6a0b05dSBen Gardon
24ae4cfaSBen Gardon	lockdep_assert_held_read(&kvm->mmu_lock);
a6a0b05dSBen Gardon
d62007edSSean Christopherson	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
a6a0b05dSBen Gardon		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
a6a0b05dSBen Gardon				slot->base_gfn + slot->npages);
a6a0b05dSBen Gardon
a6a0b05dSBen Gardon	return spte_set;
a6a0b05dSBen Gardon}
a6a0b05dSBen Gardon
a6a0b05dSBen Gardon/*
a6a0b05dSBen Gardon * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
a6a0b05dSBen Gardon * set in mask, starting at gfn. The given memslot is expected to contain all
a6a0b05dSBen Gardon * the GFNs represented by set bits in the mask. If AD bits are enabled,
a6a0b05dSBen Gardon * clearing the dirty status will involve clearing the dirty bit on each SPTE
a6a0b05dSBen Gardon * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
a6a0b05dSBen Gardon */
a6a0b05dSBen Gardonstatic void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
a6a0b05dSBen Gardon				  gfn_t gfn, unsigned long mask, bool wrprot)
a6a0b05dSBen Gardon{
a6a0b05dSBen Gardon	struct tdp_iter iter;
a6a0b05dSBen Gardon	u64 new_spte;
a6a0b05dSBen Gardon
7cca2d0bSBen Gardon	rcu_read_lock();
7cca2d0bSBen Gardon
a6a0b05dSBen Gardon	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
a6a0b05dSBen Gardon				    gfn + BITS_PER_LONG) {
a6a0b05dSBen Gardon		if (!mask)
a6a0b05dSBen Gardon			break;
a6a0b05dSBen Gardon
a6a0b05dSBen Gardon		if (iter.level > PG_LEVEL_4K ||
a6a0b05dSBen Gardon		    !(mask & (1UL << (iter.gfn - gfn))))
a6a0b05dSBen Gardon			continue;
a6a0b05dSBen Gardon
f1b3b06aSBen Gardon		mask &= ~(1UL << (iter.gfn - gfn));
f1b3b06aSBen Gardon
a6a0b05dSBen Gardon		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
a6a0b05dSBen Gardon			if (is_writable_pte(iter.old_spte))
a6a0b05dSBen Gardon				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
a6a0b05dSBen Gardon			else
a6a0b05dSBen Gardon				continue;
a6a0b05dSBen Gardon		} else {
a6a0b05dSBen Gardon			if (iter.old_spte & shadow_dirty_mask)
a6a0b05dSBen Gardon				new_spte = iter.old_spte & ~shadow_dirty_mask;
a6a0b05dSBen Gardon			else
a6a0b05dSBen Gardon				continue;
a6a0b05dSBen Gardon		}
a6a0b05dSBen Gardon
a6a0b05dSBen Gardon		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
a6a0b05dSBen Gardon	}
7cca2d0bSBen Gardon
7cca2d0bSBen Gardon	rcu_read_unlock();
a6a0b05dSBen Gardon}
a6a0b05dSBen Gardon
a6a0b05dSBen Gardon/*
a6a0b05dSBen Gardon * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
a6a0b05dSBen Gardon * set in mask, starting at gfn. The given memslot is expected to contain all
a6a0b05dSBen Gardon * the GFNs represented by set bits in the mask. If AD bits are enabled,
a6a0b05dSBen Gardon * clearing the dirty status will involve clearing the dirty bit on each SPTE
a6a0b05dSBen Gardon * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
a6a0b05dSBen Gardon */
a6a0b05dSBen Gardonvoid kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
a6a0b05dSBen Gardon				       struct kvm_memory_slot *slot,
a6a0b05dSBen Gardon				       gfn_t gfn, unsigned long mask,
a6a0b05dSBen Gardon				       bool wrprot)
a6a0b05dSBen Gardon{
a6a0b05dSBen Gardon	struct kvm_mmu_page *root;
a6a0b05dSBen Gardon
531810caSBen Gardon	lockdep_assert_held_write(&kvm->mmu_lock);
a3f15bdaSSean Christopherson	for_each_tdp_mmu_root(kvm, root, slot->as_id)
a6a0b05dSBen Gardon		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
a6a0b05dSBen Gardon}
a6a0b05dSBen Gardon
4b85c921SSean Christophersonstatic void zap_collapsible_spte_range(struct kvm *kvm,
14881998SBen Gardon				       struct kvm_mmu_page *root,
4b85c921SSean Christopherson				       const struct kvm_memory_slot *slot)
14881998SBen Gardon{
9eba50f8SSean Christopherson	gfn_t start = slot->base_gfn;
9eba50f8SSean Christopherson	gfn_t end = start + slot->npages;
14881998SBen Gardon	struct tdp_iter iter;
5ba7c4c6SBen Gardon	int max_mapping_level;
14881998SBen Gardon
7cca2d0bSBen Gardon	rcu_read_lock();
7cca2d0bSBen Gardon
85f44f8cSSean Christopherson	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) {
85f44f8cSSean Christophersonretry:
4b85c921SSean Christopherson		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1af4a960SBen Gardon			continue;
1af4a960SBen Gardon
85f44f8cSSean Christopherson		if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
85f44f8cSSean Christopherson		    !is_shadow_present_pte(iter.old_spte))
85f44f8cSSean Christopherson			continue;
85f44f8cSSean Christopherson
85f44f8cSSean Christopherson		/*
85f44f8cSSean Christopherson		 * Don't zap leaf SPTEs, if a leaf SPTE could be replaced with
85f44f8cSSean Christopherson		 * a large page size, then its parent would have been zapped
85f44f8cSSean Christopherson		 * instead of stepping down.
85f44f8cSSean Christopherson		 */
85f44f8cSSean Christopherson		if (is_last_spte(iter.old_spte, iter.level))
85f44f8cSSean Christopherson			continue;
85f44f8cSSean Christopherson
85f44f8cSSean Christopherson		/*
85f44f8cSSean Christopherson		 * If iter.gfn resides outside of the slot, i.e. the page for
85f44f8cSSean Christopherson		 * the current level overlaps but is not contained by the slot,
85f44f8cSSean Christopherson		 * then the SPTE can't be made huge.  More importantly, trying
85f44f8cSSean Christopherson		 * to query that info from slot->arch.lpage_info will cause an
85f44f8cSSean Christopherson		 * out-of-bounds access.
85f44f8cSSean Christopherson		 */
85f44f8cSSean Christopherson		if (iter.gfn < start || iter.gfn >= end)
14881998SBen Gardon			continue;
14881998SBen Gardon
5ba7c4c6SBen Gardon		max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
a8ac499bSSean Christopherson							      iter.gfn, PG_LEVEL_NUM);
85f44f8cSSean Christopherson		if (max_mapping_level < iter.level)
5ba7c4c6SBen Gardon			continue;
5ba7c4c6SBen Gardon
4b85c921SSean Christopherson		/* Note, a successful atomic zap also does a remote TLB flush. */
85f44f8cSSean Christopherson		if (tdp_mmu_zap_spte_atomic(kvm, &iter))
85f44f8cSSean Christopherson			goto retry;
2db6f772SBen Gardon	}
14881998SBen Gardon
7cca2d0bSBen Gardon	rcu_read_unlock();
14881998SBen Gardon}
14881998SBen Gardon
14881998SBen Gardon/*
85f44f8cSSean Christopherson * Zap non-leaf SPTEs (and free their associated page tables) which could
85f44f8cSSean Christopherson * be replaced by huge pages, for GFNs within the slot.
14881998SBen Gardon */
4b85c921SSean Christophersonvoid kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
4b85c921SSean Christopherson				       const struct kvm_memory_slot *slot)
14881998SBen Gardon{
14881998SBen Gardon	struct kvm_mmu_page *root;
14881998SBen Gardon
2db6f772SBen Gardon	lockdep_assert_held_read(&kvm->mmu_lock);
14881998SBen Gardon
d62007edSSean Christopherson	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
4b85c921SSean Christopherson		zap_collapsible_spte_range(kvm, root, slot);
14881998SBen Gardon}
46044f72SBen Gardon
46044f72SBen Gardon/*
46044f72SBen Gardon * Removes write access on the last level SPTE mapping this GFN and unsets the
5fc3424fSSean Christopherson * MMU-writable bit to ensure future writes continue to be intercepted.
46044f72SBen Gardon * Returns true if an SPTE was set and a TLB flush is needed.
46044f72SBen Gardon */
46044f72SBen Gardonstatic bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
3ad93562SKeqian Zhu			      gfn_t gfn, int min_level)
46044f72SBen Gardon{
46044f72SBen Gardon	struct tdp_iter iter;
46044f72SBen Gardon	u64 new_spte;
46044f72SBen Gardon	bool spte_set = false;
46044f72SBen Gardon
3ad93562SKeqian Zhu	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
3ad93562SKeqian Zhu
7cca2d0bSBen Gardon	rcu_read_lock();
7cca2d0bSBen Gardon
77aa6075SDavid Matlack	for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
3ad93562SKeqian Zhu		if (!is_shadow_present_pte(iter.old_spte) ||
3ad93562SKeqian Zhu		    !is_last_spte(iter.old_spte, iter.level))
3ad93562SKeqian Zhu			continue;
3ad93562SKeqian Zhu
46044f72SBen Gardon		new_spte = iter.old_spte &
5fc3424fSSean Christopherson			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
46044f72SBen Gardon
7c8a4742SDavid Matlack		if (new_spte == iter.old_spte)
7c8a4742SDavid Matlack			break;
7c8a4742SDavid Matlack
46044f72SBen Gardon		tdp_mmu_set_spte(kvm, &iter, new_spte);
46044f72SBen Gardon		spte_set = true;
46044f72SBen Gardon	}
46044f72SBen Gardon
7cca2d0bSBen Gardon	rcu_read_unlock();
7cca2d0bSBen Gardon
46044f72SBen Gardon	return spte_set;
46044f72SBen Gardon}
46044f72SBen Gardon
46044f72SBen Gardon/*
46044f72SBen Gardon * Removes write access on the last level SPTE mapping this GFN and unsets the
5fc3424fSSean Christopherson * MMU-writable bit to ensure future writes continue to be intercepted.
46044f72SBen Gardon * Returns true if an SPTE was set and a TLB flush is needed.
46044f72SBen Gardon */
46044f72SBen Gardonbool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
3ad93562SKeqian Zhu				   struct kvm_memory_slot *slot, gfn_t gfn,
3ad93562SKeqian Zhu				   int min_level)
46044f72SBen Gardon{
46044f72SBen Gardon	struct kvm_mmu_page *root;
46044f72SBen Gardon	bool spte_set = false;
46044f72SBen Gardon
531810caSBen Gardon	lockdep_assert_held_write(&kvm->mmu_lock);
a3f15bdaSSean Christopherson	for_each_tdp_mmu_root(kvm, root, slot->as_id)
3ad93562SKeqian Zhu		spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
a3f15bdaSSean Christopherson
46044f72SBen Gardon	return spte_set;
46044f72SBen Gardon}
46044f72SBen Gardon
95fb5b02SBen Gardon/*
95fb5b02SBen Gardon * Return the level of the lowest level SPTE added to sptes.
95fb5b02SBen Gardon * That SPTE may be non-present.
c5c8c7c5SDavid Matlack *
c5c8c7c5SDavid Matlack * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
95fb5b02SBen Gardon */
39b4d43eSSean Christophersonint kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
39b4d43eSSean Christopherson			 int *root_level)
95fb5b02SBen Gardon{
95fb5b02SBen Gardon	struct tdp_iter iter;
95fb5b02SBen Gardon	struct kvm_mmu *mmu = vcpu->arch.mmu;
95fb5b02SBen Gardon	gfn_t gfn = addr >> PAGE_SHIFT;
2aa07893SSean Christopherson	int leaf = -1;
95fb5b02SBen Gardon
a972e29cSPaolo Bonzini	*root_level = vcpu->arch.mmu->root_role.level;
95fb5b02SBen Gardon
95fb5b02SBen Gardon	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
95fb5b02SBen Gardon		leaf = iter.level;
dde81f94SSean Christopherson		sptes[leaf] = iter.old_spte;
95fb5b02SBen Gardon	}
95fb5b02SBen Gardon
95fb5b02SBen Gardon	return leaf;
95fb5b02SBen Gardon}
6e8eb206SDavid Matlack
6e8eb206SDavid Matlack/*
6e8eb206SDavid Matlack * Returns the last level spte pointer of the shadow page walk for the given
6e8eb206SDavid Matlack * gpa, and sets *spte to the spte value. This spte may be non-preset. If no
6e8eb206SDavid Matlack * walk could be performed, returns NULL and *spte does not contain valid data.
6e8eb206SDavid Matlack *
6e8eb206SDavid Matlack * Contract:
6e8eb206SDavid Matlack *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
6e8eb206SDavid Matlack *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
6e8eb206SDavid Matlack *
6e8eb206SDavid Matlack * WARNING: This function is only intended to be called during fast_page_fault.
6e8eb206SDavid Matlack */
6e8eb206SDavid Matlacku64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
6e8eb206SDavid Matlack					u64 *spte)
6e8eb206SDavid Matlack{
6e8eb206SDavid Matlack	struct tdp_iter iter;
6e8eb206SDavid Matlack	struct kvm_mmu *mmu = vcpu->arch.mmu;
6e8eb206SDavid Matlack	gfn_t gfn = addr >> PAGE_SHIFT;
6e8eb206SDavid Matlack	tdp_ptep_t sptep = NULL;
6e8eb206SDavid Matlack
6e8eb206SDavid Matlack	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
6e8eb206SDavid Matlack		*spte = iter.old_spte;
6e8eb206SDavid Matlack		sptep = iter.sptep;
6e8eb206SDavid Matlack	}
6e8eb206SDavid Matlack
6e8eb206SDavid Matlack	/*
6e8eb206SDavid Matlack	 * Perform the rcu_dereference to get the raw spte pointer value since
6e8eb206SDavid Matlack	 * we are passing it up to fast_page_fault, which is shared with the
6e8eb206SDavid Matlack	 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
6e8eb206SDavid Matlack	 * annotation.
6e8eb206SDavid Matlack	 *
6e8eb206SDavid Matlack	 * This is safe since fast_page_fault obeys the contracts of this
6e8eb206SDavid Matlack	 * function as well as all TDP MMU contracts around modifying SPTEs
6e8eb206SDavid Matlack	 * outside of mmu_lock.
6e8eb206SDavid Matlack	 */
6e8eb206SDavid Matlack	return rcu_dereference(sptep);
6e8eb206SDavid Matlack}