1fe5db27dSBen Gardon // SPDX-License-Identifier: GPL-2.0 28d20bd63SSean Christopherson #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 3fe5db27dSBen Gardon 402c00b3aSBen Gardon #include "mmu.h" 502c00b3aSBen Gardon #include "mmu_internal.h" 6bb18842eSBen Gardon #include "mmutrace.h" 72f2fad08SBen Gardon #include "tdp_iter.h" 8fe5db27dSBen Gardon #include "tdp_mmu.h" 902c00b3aSBen Gardon #include "spte.h" 10fe5db27dSBen Gardon 119a77daacSBen Gardon #include <asm/cmpxchg.h> 1233dd3574SBen Gardon #include <trace/events/kvm.h> 1333dd3574SBen Gardon 14fe5db27dSBen Gardon /* Initializes the TDP MMU for the VM, if enabled. */ 15a1a39128SPaolo Bonzini int kvm_mmu_init_tdp_mmu(struct kvm *kvm) 16fe5db27dSBen Gardon { 17a1a39128SPaolo Bonzini struct workqueue_struct *wq; 18a1a39128SPaolo Bonzini 19a1a39128SPaolo Bonzini wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0); 20a1a39128SPaolo Bonzini if (!wq) 21a1a39128SPaolo Bonzini return -ENOMEM; 22fe5db27dSBen Gardon 2302c00b3aSBen Gardon INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots); 249a77daacSBen Gardon spin_lock_init(&kvm->arch.tdp_mmu_pages_lock); 25a1a39128SPaolo Bonzini kvm->arch.tdp_mmu_zap_wq = wq; 26a1a39128SPaolo Bonzini return 1; 27fe5db27dSBen Gardon } 28fe5db27dSBen Gardon 29226b8c8fSSean Christopherson /* Arbitrarily returns true so that this may be used in if statements. 
*/ 30226b8c8fSSean Christopherson static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm, 316103bc07SBen Gardon bool shared) 326103bc07SBen Gardon { 336103bc07SBen Gardon if (shared) 346103bc07SBen Gardon lockdep_assert_held_read(&kvm->mmu_lock); 356103bc07SBen Gardon else 366103bc07SBen Gardon lockdep_assert_held_write(&kvm->mmu_lock); 37226b8c8fSSean Christopherson 38226b8c8fSSean Christopherson return true; 396103bc07SBen Gardon } 406103bc07SBen Gardon 41fe5db27dSBen Gardon void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) 42fe5db27dSBen Gardon { 43*edbdb43fSSean Christopherson /* 44*edbdb43fSSean Christopherson * Invalidate all roots, which besides the obvious, schedules all roots 45*edbdb43fSSean Christopherson * for zapping and thus puts the TDP MMU's reference to each root, i.e. 46*edbdb43fSSean Christopherson * ultimately frees all roots. 47*edbdb43fSSean Christopherson */ 48*edbdb43fSSean Christopherson kvm_tdp_mmu_invalidate_all_roots(kvm); 49*edbdb43fSSean Christopherson 50*edbdb43fSSean Christopherson /* 51*edbdb43fSSean Christopherson * Destroying a workqueue also first flushes the workqueue, i.e. no 52*edbdb43fSSean Christopherson * need to invoke kvm_tdp_mmu_zap_invalidated_roots(). 53*edbdb43fSSean Christopherson */ 5422b94c4bSPaolo Bonzini destroy_workqueue(kvm->arch.tdp_mmu_zap_wq); 5522b94c4bSPaolo Bonzini 56d25ceb92SSean Christopherson WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages)); 5702c00b3aSBen Gardon WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); 587cca2d0bSBen Gardon 597cca2d0bSBen Gardon /* 607cca2d0bSBen Gardon * Ensure that all the outstanding RCU callbacks to free shadow pages 6122b94c4bSPaolo Bonzini * can run before the VM is torn down. Work items on tdp_mmu_zap_wq 6222b94c4bSPaolo Bonzini * can call kvm_tdp_mmu_put_root and create new callbacks. 
637cca2d0bSBen Gardon */ 647cca2d0bSBen Gardon rcu_barrier(); 6502c00b3aSBen Gardon } 6602c00b3aSBen Gardon 672bdb3d84SBen Gardon static void tdp_mmu_free_sp(struct kvm_mmu_page *sp) 68a889ea54SBen Gardon { 692bdb3d84SBen Gardon free_page((unsigned long)sp->spt); 702bdb3d84SBen Gardon kmem_cache_free(mmu_page_header_cache, sp); 71a889ea54SBen Gardon } 72a889ea54SBen Gardon 73c0e64238SBen Gardon /* 74c0e64238SBen Gardon * This is called through call_rcu in order to free TDP page table memory 75c0e64238SBen Gardon * safely with respect to other kernel threads that may be operating on 76c0e64238SBen Gardon * the memory. 77c0e64238SBen Gardon * By only accessing TDP MMU page table memory in an RCU read critical 78c0e64238SBen Gardon * section, and freeing it after a grace period, lockless access to that 79c0e64238SBen Gardon * memory won't use it after it is freed. 80c0e64238SBen Gardon */ 81c0e64238SBen Gardon static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head) 82a889ea54SBen Gardon { 83c0e64238SBen Gardon struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page, 84c0e64238SBen Gardon rcu_head); 85a889ea54SBen Gardon 86c0e64238SBen Gardon tdp_mmu_free_sp(sp); 87a889ea54SBen Gardon } 88a889ea54SBen Gardon 89e2b5b21dSSean Christopherson static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root, 90e2b5b21dSSean Christopherson bool shared); 91e2b5b21dSSean Christopherson 9222b94c4bSPaolo Bonzini static void tdp_mmu_zap_root_work(struct work_struct *work) 9322b94c4bSPaolo Bonzini { 9422b94c4bSPaolo Bonzini struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page, 9522b94c4bSPaolo Bonzini tdp_mmu_async_work); 9622b94c4bSPaolo Bonzini struct kvm *kvm = root->tdp_mmu_async_data; 9722b94c4bSPaolo Bonzini 9822b94c4bSPaolo Bonzini read_lock(&kvm->mmu_lock); 9922b94c4bSPaolo Bonzini 10022b94c4bSPaolo Bonzini /* 10122b94c4bSPaolo Bonzini * A TLB flush is not necessary as KVM performs a local TLB flush when 10222b94c4bSPaolo Bonzini * 
allocating a new root (see kvm_mmu_load()), and when migrating vCPU 10322b94c4bSPaolo Bonzini * to a different pCPU. Note, the local TLB flush on reuse also 10422b94c4bSPaolo Bonzini * invalidates any paging-structure-cache entries, i.e. TLB entries for 10522b94c4bSPaolo Bonzini * intermediate paging structures, that may be zapped, as such entries 10622b94c4bSPaolo Bonzini * are associated with the ASID on both VMX and SVM. 10722b94c4bSPaolo Bonzini */ 10822b94c4bSPaolo Bonzini tdp_mmu_zap_root(kvm, root, true); 10922b94c4bSPaolo Bonzini 11022b94c4bSPaolo Bonzini /* 11122b94c4bSPaolo Bonzini * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for 11222b94c4bSPaolo Bonzini * avoiding an infinite loop. By design, the root is reachable while 11322b94c4bSPaolo Bonzini * it's being asynchronously zapped, thus a different task can put its 11422b94c4bSPaolo Bonzini * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an 11522b94c4bSPaolo Bonzini * asynchronously zapped root is unavoidable. 
11622b94c4bSPaolo Bonzini */ 11722b94c4bSPaolo Bonzini kvm_tdp_mmu_put_root(kvm, root, true); 11822b94c4bSPaolo Bonzini 11922b94c4bSPaolo Bonzini read_unlock(&kvm->mmu_lock); 12022b94c4bSPaolo Bonzini } 12122b94c4bSPaolo Bonzini 12222b94c4bSPaolo Bonzini static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root) 12322b94c4bSPaolo Bonzini { 12422b94c4bSPaolo Bonzini root->tdp_mmu_async_data = kvm; 12522b94c4bSPaolo Bonzini INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work); 12622b94c4bSPaolo Bonzini queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work); 12722b94c4bSPaolo Bonzini } 12822b94c4bSPaolo Bonzini 1296103bc07SBen Gardon void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, 1306103bc07SBen Gardon bool shared) 1312bdb3d84SBen Gardon { 1326103bc07SBen Gardon kvm_lockdep_assert_mmu_lock_held(kvm, shared); 1332bdb3d84SBen Gardon 13411cccf5cSBen Gardon if (!refcount_dec_and_test(&root->tdp_mmu_root_count)) 1352bdb3d84SBen Gardon return; 1362bdb3d84SBen Gardon 1378351779cSPaolo Bonzini /* 138*edbdb43fSSean Christopherson * The TDP MMU itself holds a reference to each root until the root is 139*edbdb43fSSean Christopherson * explicitly invalidated, i.e. the final reference should be never be 140*edbdb43fSSean Christopherson * put for a valid root. 1418351779cSPaolo Bonzini */ 142*edbdb43fSSean Christopherson KVM_BUG_ON(!is_tdp_mmu_page(root) || !root->role.invalid, kvm); 1438351779cSPaolo Bonzini 144c0e64238SBen Gardon spin_lock(&kvm->arch.tdp_mmu_pages_lock); 145c0e64238SBen Gardon list_del_rcu(&root->link); 146c0e64238SBen Gardon spin_unlock(&kvm->arch.tdp_mmu_pages_lock); 147c0e64238SBen Gardon call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback); 148a889ea54SBen Gardon } 149a889ea54SBen Gardon 150cfc10997SBen Gardon /* 151d62007edSSean Christopherson * Returns the next root after @prev_root (or the first root if @prev_root is 152d62007edSSean Christopherson * NULL). 
A reference to the returned root is acquired, and the reference to 153d62007edSSean Christopherson * @prev_root is released (the caller obviously must hold a reference to 154d62007edSSean Christopherson * @prev_root if it's non-NULL). 155d62007edSSean Christopherson * 156d62007edSSean Christopherson * If @only_valid is true, invalid roots are skipped. 157d62007edSSean Christopherson * 158d62007edSSean Christopherson * Returns NULL if the end of tdp_mmu_roots was reached. 159cfc10997SBen Gardon */ 160cfc10997SBen Gardon static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, 1616103bc07SBen Gardon struct kvm_mmu_page *prev_root, 162d62007edSSean Christopherson bool shared, bool only_valid) 163a889ea54SBen Gardon { 164a889ea54SBen Gardon struct kvm_mmu_page *next_root; 165a889ea54SBen Gardon 166c0e64238SBen Gardon rcu_read_lock(); 167c0e64238SBen Gardon 168cfc10997SBen Gardon if (prev_root) 169c0e64238SBen Gardon next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, 170c0e64238SBen Gardon &prev_root->link, 171c0e64238SBen Gardon typeof(*prev_root), link); 172cfc10997SBen Gardon else 173c0e64238SBen Gardon next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots, 174cfc10997SBen Gardon typeof(*next_root), link); 175cfc10997SBen Gardon 17604dc4e6cSSean Christopherson while (next_root) { 177d62007edSSean Christopherson if ((!only_valid || !next_root->role.invalid) && 178ad6d6b94SJinrong Liang kvm_tdp_mmu_get_root(next_root)) 17904dc4e6cSSean Christopherson break; 18004dc4e6cSSean Christopherson 181c0e64238SBen Gardon next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, 182c0e64238SBen Gardon &next_root->link, typeof(*next_root), link); 18304dc4e6cSSean Christopherson } 184fb101293SBen Gardon 185c0e64238SBen Gardon rcu_read_unlock(); 186cfc10997SBen Gardon 187cfc10997SBen Gardon if (prev_root) 1886103bc07SBen Gardon kvm_tdp_mmu_put_root(kvm, prev_root, shared); 189cfc10997SBen Gardon 190a889ea54SBen Gardon return next_root; 191a889ea54SBen Gardon } 
192a889ea54SBen Gardon 193a889ea54SBen Gardon /* 194a889ea54SBen Gardon * Note: this iterator gets and puts references to the roots it iterates over. 195a889ea54SBen Gardon * This makes it safe to release the MMU lock and yield within the loop, but 196a889ea54SBen Gardon * if exiting the loop early, the caller must drop the reference to the most 197a889ea54SBen Gardon * recent root. (Unless keeping a live reference is desirable.) 1986103bc07SBen Gardon * 1996103bc07SBen Gardon * If shared is set, this function is operating under the MMU lock in read 2006103bc07SBen Gardon * mode. In the unlikely event that this thread must free a root, the lock 2016103bc07SBen Gardon * will be temporarily dropped and reacquired in write mode. 202a889ea54SBen Gardon */ 203d62007edSSean Christopherson #define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\ 204d62007edSSean Christopherson for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid); \ 205cfc10997SBen Gardon _root; \ 206d62007edSSean Christopherson _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid)) \ 207614f6970SPaolo Bonzini if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) && \ 208614f6970SPaolo Bonzini kvm_mmu_page_as_id(_root) != _as_id) { \ 209a3f15bdaSSean Christopherson } else 210a889ea54SBen Gardon 211d62007edSSean Christopherson #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \ 212d62007edSSean Christopherson __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true) 213d62007edSSean Christopherson 214614f6970SPaolo Bonzini #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \ 215614f6970SPaolo Bonzini __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false) 216d62007edSSean Christopherson 217226b8c8fSSean Christopherson /* 218226b8c8fSSean Christopherson * Iterate over all TDP MMU roots. 
Requires that mmu_lock be held for write, 219226b8c8fSSean Christopherson * the implication being that any flow that holds mmu_lock for read is 220226b8c8fSSean Christopherson * inherently yield-friendly and should use the yield-safe variant above. 221226b8c8fSSean Christopherson * Holding mmu_lock for write obviates the need for RCU protection as the list 222226b8c8fSSean Christopherson * is guaranteed to be stable. 223226b8c8fSSean Christopherson */ 224a3f15bdaSSean Christopherson #define for_each_tdp_mmu_root(_kvm, _root, _as_id) \ 225226b8c8fSSean Christopherson list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \ 226226b8c8fSSean Christopherson if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \ 227226b8c8fSSean Christopherson kvm_mmu_page_as_id(_root) != _as_id) { \ 228a3f15bdaSSean Christopherson } else 22902c00b3aSBen Gardon 230a82070b6SDavid Matlack static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu) 23102c00b3aSBen Gardon { 23202c00b3aSBen Gardon struct kvm_mmu_page *sp; 23302c00b3aSBen Gardon 23402c00b3aSBen Gardon sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); 23502c00b3aSBen Gardon sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache); 236a82070b6SDavid Matlack 237a82070b6SDavid Matlack return sp; 238a82070b6SDavid Matlack } 239a82070b6SDavid Matlack 240c10743a1SSean Christopherson static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep, 241c10743a1SSean Christopherson gfn_t gfn, union kvm_mmu_page_role role) 242a82070b6SDavid Matlack { 24355c510e2SSean Christopherson INIT_LIST_HEAD(&sp->possible_nx_huge_page_link); 244428e9216SSean Christopherson 24502c00b3aSBen Gardon set_page_private(virt_to_page(sp->spt), (unsigned long)sp); 24602c00b3aSBen Gardon 247a3aca4deSDavid Matlack sp->role = role; 24802c00b3aSBen Gardon sp->gfn = gfn; 249c10743a1SSean Christopherson sp->ptep = sptep; 25002c00b3aSBen Gardon sp->tdp_mmu_page = true; 25102c00b3aSBen Gardon 25233dd3574SBen Gardon 
trace_kvm_mmu_get_page(sp, true); 25302c00b3aSBen Gardon } 25402c00b3aSBen Gardon 255a82070b6SDavid Matlack static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp, 256a3aca4deSDavid Matlack struct tdp_iter *iter) 257a3aca4deSDavid Matlack { 258a3aca4deSDavid Matlack struct kvm_mmu_page *parent_sp; 259a3aca4deSDavid Matlack union kvm_mmu_page_role role; 260a3aca4deSDavid Matlack 261a3aca4deSDavid Matlack parent_sp = sptep_to_sp(rcu_dereference(iter->sptep)); 262a3aca4deSDavid Matlack 263a3aca4deSDavid Matlack role = parent_sp->role; 264a3aca4deSDavid Matlack role.level--; 265a3aca4deSDavid Matlack 266c10743a1SSean Christopherson tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role); 267a3aca4deSDavid Matlack } 268a3aca4deSDavid Matlack 2696e6ec584SSean Christopherson hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) 27002c00b3aSBen Gardon { 2717a458f0eSPaolo Bonzini union kvm_mmu_page_role role = vcpu->arch.mmu->root_role; 27202c00b3aSBen Gardon struct kvm *kvm = vcpu->kvm; 27302c00b3aSBen Gardon struct kvm_mmu_page *root; 27402c00b3aSBen Gardon 2756e6ec584SSean Christopherson lockdep_assert_held_write(&kvm->mmu_lock); 27602c00b3aSBen Gardon 27704dc4e6cSSean Christopherson /* 27804dc4e6cSSean Christopherson * Check for an existing root before allocating a new one. Note, the 27904dc4e6cSSean Christopherson * role check prevents consuming an invalid root. 
28004dc4e6cSSean Christopherson */ 281a3f15bdaSSean Christopherson for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) { 282fb101293SBen Gardon if (root->role.word == role.word && 283ad6d6b94SJinrong Liang kvm_tdp_mmu_get_root(root)) 2846e6ec584SSean Christopherson goto out; 28502c00b3aSBen Gardon } 28602c00b3aSBen Gardon 287a82070b6SDavid Matlack root = tdp_mmu_alloc_sp(vcpu); 288c10743a1SSean Christopherson tdp_mmu_init_sp(root, NULL, 0, role); 289a82070b6SDavid Matlack 290*edbdb43fSSean Christopherson /* 291*edbdb43fSSean Christopherson * TDP MMU roots are kept until they are explicitly invalidated, either 292*edbdb43fSSean Christopherson * by a memslot update or by the destruction of the VM. Initialize the 293*edbdb43fSSean Christopherson * refcount to two; one reference for the vCPU, and one reference for 294*edbdb43fSSean Christopherson * the TDP MMU itself, which is held until the root is invalidated and 295*edbdb43fSSean Christopherson * is ultimately put by tdp_mmu_zap_root_work(). 
296*edbdb43fSSean Christopherson */ 297*edbdb43fSSean Christopherson refcount_set(&root->tdp_mmu_root_count, 2); 29802c00b3aSBen Gardon 299c0e64238SBen Gardon spin_lock(&kvm->arch.tdp_mmu_pages_lock); 300c0e64238SBen Gardon list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots); 301c0e64238SBen Gardon spin_unlock(&kvm->arch.tdp_mmu_pages_lock); 30202c00b3aSBen Gardon 3036e6ec584SSean Christopherson out: 30402c00b3aSBen Gardon return __pa(root->spt); 305fe5db27dSBen Gardon } 3062f2fad08SBen Gardon 3072f2fad08SBen Gardon static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, 3089a77daacSBen Gardon u64 old_spte, u64 new_spte, int level, 3099a77daacSBen Gardon bool shared); 3102f2fad08SBen Gardon 31143a063caSYosry Ahmed static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) 31243a063caSYosry Ahmed { 31343a063caSYosry Ahmed kvm_account_pgtable_pages((void *)sp->spt, +1); 314d25ceb92SSean Christopherson atomic64_inc(&kvm->arch.tdp_mmu_pages); 31543a063caSYosry Ahmed } 31643a063caSYosry Ahmed 31743a063caSYosry Ahmed static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) 31843a063caSYosry Ahmed { 31943a063caSYosry Ahmed kvm_account_pgtable_pages((void *)sp->spt, -1); 320d25ceb92SSean Christopherson atomic64_dec(&kvm->arch.tdp_mmu_pages); 32143a063caSYosry Ahmed } 32243a063caSYosry Ahmed 3232f2fad08SBen Gardon /** 324c298a30cSDavid Matlack * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages 325a9442f59SBen Gardon * 326a9442f59SBen Gardon * @kvm: kvm instance 327a9442f59SBen Gardon * @sp: the page to be removed 3289a77daacSBen Gardon * @shared: This operation may not be running under the exclusive use of 3299a77daacSBen Gardon * the MMU lock and the operation must synchronize with other 3309a77daacSBen Gardon * threads that might be adding or removing pages. 
331a9442f59SBen Gardon */ 332c298a30cSDavid Matlack static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp, 3339a77daacSBen Gardon bool shared) 334a9442f59SBen Gardon { 33543a063caSYosry Ahmed tdp_unaccount_mmu_page(kvm, sp); 336d25ceb92SSean Christopherson 337d25ceb92SSean Christopherson if (!sp->nx_huge_page_disallowed) 338d25ceb92SSean Christopherson return; 339d25ceb92SSean Christopherson 3409a77daacSBen Gardon if (shared) 3419a77daacSBen Gardon spin_lock(&kvm->arch.tdp_mmu_pages_lock); 3429a77daacSBen Gardon else 343a9442f59SBen Gardon lockdep_assert_held_write(&kvm->mmu_lock); 344a9442f59SBen Gardon 34561f94478SSean Christopherson sp->nx_huge_page_disallowed = false; 34661f94478SSean Christopherson untrack_possible_nx_huge_page(kvm, sp); 3479a77daacSBen Gardon 3489a77daacSBen Gardon if (shared) 3499a77daacSBen Gardon spin_unlock(&kvm->arch.tdp_mmu_pages_lock); 350a9442f59SBen Gardon } 351a9442f59SBen Gardon 352a9442f59SBen Gardon /** 3530f53dfa3SDavid Matlack * handle_removed_pt() - handle a page table removed from the TDP structure 354a066e61fSBen Gardon * 355a066e61fSBen Gardon * @kvm: kvm instance 356a066e61fSBen Gardon * @pt: the page removed from the paging structure 3579a77daacSBen Gardon * @shared: This operation may not be running under the exclusive use 3589a77daacSBen Gardon * of the MMU lock and the operation must synchronize with other 3599a77daacSBen Gardon * threads that might be modifying SPTEs. 360a066e61fSBen Gardon * 361a066e61fSBen Gardon * Given a page table that has been removed from the TDP paging structure, 362a066e61fSBen Gardon * iterates through the page table to clear SPTEs and free child page tables. 36370fb3e41SBen Gardon * 36470fb3e41SBen Gardon * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU 36570fb3e41SBen Gardon * protection. Since this thread removed it from the paging structure, 36670fb3e41SBen Gardon * this thread will be responsible for ensuring the page is freed. 
Hence the 36770fb3e41SBen Gardon * early rcu_dereferences in the function. 368a066e61fSBen Gardon */ 3690f53dfa3SDavid Matlack static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) 370a066e61fSBen Gardon { 37170fb3e41SBen Gardon struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt)); 372a066e61fSBen Gardon int level = sp->role.level; 373e25f0e0cSBen Gardon gfn_t base_gfn = sp->gfn; 374a066e61fSBen Gardon int i; 375a066e61fSBen Gardon 376a066e61fSBen Gardon trace_kvm_mmu_prepare_zap_page(sp); 377a066e61fSBen Gardon 378c298a30cSDavid Matlack tdp_mmu_unlink_sp(kvm, sp, shared); 379a066e61fSBen Gardon 3802ca3129eSSean Christopherson for (i = 0; i < SPTE_ENT_PER_PAGE; i++) { 381ba3a6120SSean Christopherson tdp_ptep_t sptep = pt + i; 382574c3c55SBen Gardon gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level); 383ba3a6120SSean Christopherson u64 old_spte; 3849a77daacSBen Gardon 3859a77daacSBen Gardon if (shared) { 386e25f0e0cSBen Gardon /* 387e25f0e0cSBen Gardon * Set the SPTE to a nonpresent value that other 388e25f0e0cSBen Gardon * threads will not overwrite. If the SPTE was 389e25f0e0cSBen Gardon * already marked as removed then another thread 390e25f0e0cSBen Gardon * handling a page fault could overwrite it, so 391e25f0e0cSBen Gardon * set the SPTE until it is set from some other 392e25f0e0cSBen Gardon * value to the removed SPTE value. 
393e25f0e0cSBen Gardon */ 394e25f0e0cSBen Gardon for (;;) { 395ba3a6120SSean Christopherson old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE); 396ba3a6120SSean Christopherson if (!is_removed_spte(old_spte)) 397e25f0e0cSBen Gardon break; 398e25f0e0cSBen Gardon cpu_relax(); 399e25f0e0cSBen Gardon } 4009a77daacSBen Gardon } else { 4018df9f1afSSean Christopherson /* 4028df9f1afSSean Christopherson * If the SPTE is not MMU-present, there is no backing 4038df9f1afSSean Christopherson * page associated with the SPTE and so no side effects 4048df9f1afSSean Christopherson * that need to be recorded, and exclusive ownership of 4058df9f1afSSean Christopherson * mmu_lock ensures the SPTE can't be made present. 4068df9f1afSSean Christopherson * Note, zapping MMIO SPTEs is also unnecessary as they 4078df9f1afSSean Christopherson * are guarded by the memslots generation, not by being 4088df9f1afSSean Christopherson * unreachable. 4098df9f1afSSean Christopherson */ 410ba3a6120SSean Christopherson old_spte = kvm_tdp_mmu_read_spte(sptep); 411ba3a6120SSean Christopherson if (!is_shadow_present_pte(old_spte)) 4128df9f1afSSean Christopherson continue; 413e25f0e0cSBen Gardon 414e25f0e0cSBen Gardon /* 415ba3a6120SSean Christopherson * Use the common helper instead of a raw WRITE_ONCE as 416ba3a6120SSean Christopherson * the SPTE needs to be updated atomically if it can be 417ba3a6120SSean Christopherson * modified by a different vCPU outside of mmu_lock. 418ba3a6120SSean Christopherson * Even though the parent SPTE is !PRESENT, the TLB 419ba3a6120SSean Christopherson * hasn't yet been flushed, and both Intel and AMD 420ba3a6120SSean Christopherson * document that A/D assists can use upper-level PxE 421ba3a6120SSean Christopherson * entries that are cached in the TLB, i.e. the CPU can 422ba3a6120SSean Christopherson * still access the page and mark it dirty. 
423ba3a6120SSean Christopherson * 424ba3a6120SSean Christopherson * No retry is needed in the atomic update path as the 425ba3a6120SSean Christopherson * sole concern is dropping a Dirty bit, i.e. no other 426ba3a6120SSean Christopherson * task can zap/remove the SPTE as mmu_lock is held for 427ba3a6120SSean Christopherson * write. Marking the SPTE as a removed SPTE is not 428ba3a6120SSean Christopherson * strictly necessary for the same reason, but using 429ba3a6120SSean Christopherson * the remove SPTE value keeps the shared/exclusive 430ba3a6120SSean Christopherson * paths consistent and allows the handle_changed_spte() 431ba3a6120SSean Christopherson * call below to hardcode the new value to REMOVED_SPTE. 432ba3a6120SSean Christopherson * 433ba3a6120SSean Christopherson * Note, even though dropping a Dirty bit is the only 434ba3a6120SSean Christopherson * scenario where a non-atomic update could result in a 435ba3a6120SSean Christopherson * functional bug, simply checking the Dirty bit isn't 436ba3a6120SSean Christopherson * sufficient as a fast page fault could read the upper 437ba3a6120SSean Christopherson * level SPTE before it is zapped, and then make this 438ba3a6120SSean Christopherson * target SPTE writable, resume the guest, and set the 439ba3a6120SSean Christopherson * Dirty bit between reading the SPTE above and writing 440ba3a6120SSean Christopherson * it here. 
441e25f0e0cSBen Gardon */ 442ba3a6120SSean Christopherson old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, 443ba3a6120SSean Christopherson REMOVED_SPTE, level); 4449a77daacSBen Gardon } 445e25f0e0cSBen Gardon handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn, 446ba3a6120SSean Christopherson old_spte, REMOVED_SPTE, level, shared); 447a066e61fSBen Gardon } 448a066e61fSBen Gardon 4497cca2d0bSBen Gardon call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback); 450a066e61fSBen Gardon } 451a066e61fSBen Gardon 452a066e61fSBen Gardon /** 45340fa907eSVipin Sharma * handle_changed_spte - handle bookkeeping associated with an SPTE change 4542f2fad08SBen Gardon * @kvm: kvm instance 4552f2fad08SBen Gardon * @as_id: the address space of the paging structure the SPTE was a part of 4562f2fad08SBen Gardon * @gfn: the base GFN that was mapped by the SPTE 4572f2fad08SBen Gardon * @old_spte: The value of the SPTE before the change 4582f2fad08SBen Gardon * @new_spte: The value of the SPTE after the change 4592f2fad08SBen Gardon * @level: the level of the PT the SPTE is part of in the paging structure 4609a77daacSBen Gardon * @shared: This operation may not be running under the exclusive use of 4619a77daacSBen Gardon * the MMU lock and the operation must synchronize with other 4629a77daacSBen Gardon * threads that might be modifying SPTEs. 4632f2fad08SBen Gardon * 4641f997345SVipin Sharma * Handle bookkeeping that might result from the modification of a SPTE. Note, 4651f997345SVipin Sharma * dirty logging updates are handled in common code, not here (see make_spte() 4661f997345SVipin Sharma * and fast_pf_fix_direct_spte()). 
4672f2fad08SBen Gardon */ 46840fa907eSVipin Sharma static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, 4699a77daacSBen Gardon u64 old_spte, u64 new_spte, int level, 4709a77daacSBen Gardon bool shared) 4712f2fad08SBen Gardon { 4722f2fad08SBen Gardon bool was_present = is_shadow_present_pte(old_spte); 4732f2fad08SBen Gardon bool is_present = is_shadow_present_pte(new_spte); 4742f2fad08SBen Gardon bool was_leaf = was_present && is_last_spte(old_spte, level); 4752f2fad08SBen Gardon bool is_leaf = is_present && is_last_spte(new_spte, level); 4762f2fad08SBen Gardon bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); 4772f2fad08SBen Gardon 4782f2fad08SBen Gardon WARN_ON(level > PT64_ROOT_MAX_LEVEL); 4792f2fad08SBen Gardon WARN_ON(level < PG_LEVEL_4K); 480764388ceSSean Christopherson WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1)); 4812f2fad08SBen Gardon 4822f2fad08SBen Gardon /* 4832f2fad08SBen Gardon * If this warning were to trigger it would indicate that there was a 4842f2fad08SBen Gardon * missing MMU notifier or a race with some notifier handler. 4852f2fad08SBen Gardon * A present, leaf SPTE should never be directly replaced with another 486d9f6e12fSIngo Molnar * present leaf SPTE pointing to a different PFN. A notifier handler 4872f2fad08SBen Gardon * should be zapping the SPTE before the main MM's page table is 4882f2fad08SBen Gardon * changed, or the SPTE should be zeroed, and the TLBs flushed by the 4892f2fad08SBen Gardon * thread before replacement. 
4902f2fad08SBen Gardon */ 4912f2fad08SBen Gardon if (was_leaf && is_leaf && pfn_changed) { 4922f2fad08SBen Gardon pr_err("Invalid SPTE change: cannot replace a present leaf\n" 4932f2fad08SBen Gardon "SPTE with another present leaf SPTE mapping a\n" 4942f2fad08SBen Gardon "different PFN!\n" 4952f2fad08SBen Gardon "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d", 4962f2fad08SBen Gardon as_id, gfn, old_spte, new_spte, level); 4972f2fad08SBen Gardon 4982f2fad08SBen Gardon /* 4992f2fad08SBen Gardon * Crash the host to prevent error propagation and guest data 500d9f6e12fSIngo Molnar * corruption. 5012f2fad08SBen Gardon */ 5022f2fad08SBen Gardon BUG(); 5032f2fad08SBen Gardon } 5042f2fad08SBen Gardon 5052f2fad08SBen Gardon if (old_spte == new_spte) 5062f2fad08SBen Gardon return; 5072f2fad08SBen Gardon 508b9a98c34SBen Gardon trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte); 509b9a98c34SBen Gardon 510115111efSDavid Matlack if (is_leaf) 511115111efSDavid Matlack check_spte_writable_invariants(new_spte); 512115111efSDavid Matlack 5132f2fad08SBen Gardon /* 5142f2fad08SBen Gardon * The only times a SPTE should be changed from a non-present to 5152f2fad08SBen Gardon * non-present state is when an MMIO entry is installed/modified/ 5162f2fad08SBen Gardon * removed. In that case, there is nothing to do here. 5172f2fad08SBen Gardon */ 5182f2fad08SBen Gardon if (!was_present && !is_present) { 5192f2fad08SBen Gardon /* 52008f07c80SBen Gardon * If this change does not involve a MMIO SPTE or removed SPTE, 52108f07c80SBen Gardon * it is unexpected. Log the change, though it should not 52208f07c80SBen Gardon * impact the guest since both the former and current SPTEs 52308f07c80SBen Gardon * are nonpresent. 5242f2fad08SBen Gardon */ 52508f07c80SBen Gardon if (WARN_ON(!is_mmio_spte(old_spte) && 52608f07c80SBen Gardon !is_mmio_spte(new_spte) && 52708f07c80SBen Gardon !is_removed_spte(new_spte))) 5282f2fad08SBen Gardon pr_err("Unexpected SPTE change! 
Nonpresent SPTEs\n" 5292f2fad08SBen Gardon "should not be replaced with another,\n" 5302f2fad08SBen Gardon "different nonpresent SPTE, unless one or both\n" 53108f07c80SBen Gardon "are MMIO SPTEs, or the new SPTE is\n" 53208f07c80SBen Gardon "a temporary removed SPTE.\n" 5332f2fad08SBen Gardon "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d", 5342f2fad08SBen Gardon as_id, gfn, old_spte, new_spte, level); 5352f2fad08SBen Gardon return; 5362f2fad08SBen Gardon } 5372f2fad08SBen Gardon 53871f51d2cSMingwei Zhang if (is_leaf != was_leaf) 53971f51d2cSMingwei Zhang kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1); 5402f2fad08SBen Gardon 5412f2fad08SBen Gardon if (was_leaf && is_dirty_spte(old_spte) && 54264bb2769SSean Christopherson (!is_present || !is_dirty_spte(new_spte) || pfn_changed)) 5432f2fad08SBen Gardon kvm_set_pfn_dirty(spte_to_pfn(old_spte)); 5442f2fad08SBen Gardon 5452f2fad08SBen Gardon /* 5462f2fad08SBen Gardon * Recursively handle child PTs if the change removed a subtree from 547c8e5a0d0SSean Christopherson * the paging structure. Note the WARN on the PFN changing without the 548c8e5a0d0SSean Christopherson * SPTE being converted to a hugepage (leaf) or being zapped. Shadow 549c8e5a0d0SSean Christopherson * pages are kernel allocations and should never be migrated. 
5502f2fad08SBen Gardon */ 551c8e5a0d0SSean Christopherson if (was_present && !was_leaf && 552c8e5a0d0SSean Christopherson (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed))) 5530f53dfa3SDavid Matlack handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared); 5542f2fad08SBen Gardon 55540fa907eSVipin Sharma if (was_leaf && is_accessed_spte(old_spte) && 55640fa907eSVipin Sharma (!is_present || !is_accessed_spte(new_spte) || pfn_changed)) 55740fa907eSVipin Sharma kvm_set_pfn_accessed(spte_to_pfn(old_spte)); 5582f2fad08SBen Gardon } 559faaf05b0SBen Gardon 560fe43fa2fSBen Gardon /* 5616ccf4438SPaolo Bonzini * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically 5626ccf4438SPaolo Bonzini * and handle the associated bookkeeping. Do not mark the page dirty 56324ae4cfaSBen Gardon * in KVM's dirty bitmaps. 5649a77daacSBen Gardon * 5653255530aSDavid Matlack * If setting the SPTE fails because it has changed, iter->old_spte will be 5663255530aSDavid Matlack * refreshed to the current value of the spte. 5673255530aSDavid Matlack * 5689a77daacSBen Gardon * @kvm: kvm instance 5699a77daacSBen Gardon * @iter: a tdp_iter instance currently on the SPTE that should be set 5709a77daacSBen Gardon * @new_spte: The value the SPTE should be set to 5713e72c791SDavid Matlack * Return: 5723e72c791SDavid Matlack * * 0 - If the SPTE was set. 5733e72c791SDavid Matlack * * -EBUSY - If the SPTE cannot be set. In this case this function will have 5743e72c791SDavid Matlack * no side-effects other than setting iter->old_spte to the last 5753e72c791SDavid Matlack * known value of the spte. 
5769a77daacSBen Gardon */ 5773e72c791SDavid Matlack static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm, 5789a77daacSBen Gardon struct tdp_iter *iter, 5799a77daacSBen Gardon u64 new_spte) 5809a77daacSBen Gardon { 5813255530aSDavid Matlack u64 *sptep = rcu_dereference(iter->sptep); 5823255530aSDavid Matlack 583396fd74dSSean Christopherson /* 584396fd74dSSean Christopherson * The caller is responsible for ensuring the old SPTE is not a REMOVED 585396fd74dSSean Christopherson * SPTE. KVM should never attempt to zap or manipulate a REMOVED SPTE, 586396fd74dSSean Christopherson * and pre-checking before inserting a new SPTE is advantageous as it 587396fd74dSSean Christopherson * avoids unnecessary work. 588396fd74dSSean Christopherson */ 589396fd74dSSean Christopherson WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte)); 5903a0f64deSSean Christopherson 5919a77daacSBen Gardon lockdep_assert_held_read(&kvm->mmu_lock); 5929a77daacSBen Gardon 59308f07c80SBen Gardon /* 5946e8eb206SDavid Matlack * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and 5956e8eb206SDavid Matlack * does not hold the mmu_lock. 5966e8eb206SDavid Matlack */ 597aee98a68SUros Bizjak if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte)) 5983e72c791SDavid Matlack return -EBUSY; 5999a77daacSBen Gardon 60040fa907eSVipin Sharma handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, 60108889894SSean Christopherson new_spte, iter->level, true); 6029a77daacSBen Gardon 6033e72c791SDavid Matlack return 0; 6049a77daacSBen Gardon } 6059a77daacSBen Gardon 6063e72c791SDavid Matlack static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm, 60708f07c80SBen Gardon struct tdp_iter *iter) 60808f07c80SBen Gardon { 6093e72c791SDavid Matlack int ret; 6103e72c791SDavid Matlack 61108f07c80SBen Gardon /* 61208f07c80SBen Gardon * Freeze the SPTE by setting it to a special, 61308f07c80SBen Gardon * non-present value. 
This will stop other threads from 61408f07c80SBen Gardon * immediately installing a present entry in its place 61508f07c80SBen Gardon * before the TLBs are flushed. 61608f07c80SBen Gardon */ 6173e72c791SDavid Matlack ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE); 6183e72c791SDavid Matlack if (ret) 6193e72c791SDavid Matlack return ret; 62008f07c80SBen Gardon 6214ad980aeSHou Wenlong kvm_flush_remote_tlbs_gfn(kvm, iter->gfn, iter->level); 62208f07c80SBen Gardon 62308f07c80SBen Gardon /* 624ba3a6120SSean Christopherson * No other thread can overwrite the removed SPTE as they must either 625ba3a6120SSean Christopherson * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not 626ba3a6120SSean Christopherson * overwrite the special removed SPTE value. No bookkeeping is needed 627ba3a6120SSean Christopherson * here since the SPTE is going from non-present to non-present. Use 628ba3a6120SSean Christopherson * the raw write helper to avoid an unnecessary check on volatile bits. 62908f07c80SBen Gardon */ 630ba3a6120SSean Christopherson __kvm_tdp_mmu_write_spte(iter->sptep, 0); 63108f07c80SBen Gardon 6323e72c791SDavid Matlack return 0; 63308f07c80SBen Gardon } 63408f07c80SBen Gardon 6359a77daacSBen Gardon 6369a77daacSBen Gardon /* 6370b7cc254SVipin Sharma * tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping 638626808d1SSean Christopherson * @kvm: KVM instance 639626808d1SSean Christopherson * @as_id: Address space ID, i.e. regular vs. 
SMM 640626808d1SSean Christopherson * @sptep: Pointer to the SPTE 641626808d1SSean Christopherson * @old_spte: The current value of the SPTE 642626808d1SSean Christopherson * @new_spte: The new value that will be set for the SPTE 643626808d1SSean Christopherson * @gfn: The base GFN that was (or will be) mapped by the SPTE 644626808d1SSean Christopherson * @level: The level _containing_ the SPTE (its parent PT's level) 645ba3a6120SSean Christopherson * 646ba3a6120SSean Christopherson * Returns the old SPTE value, which _may_ be different than @old_spte if the 647ba3a6120SSean Christopherson * SPTE had voldatile bits. 648fe43fa2fSBen Gardon */ 6490b7cc254SVipin Sharma static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep, 6500b7cc254SVipin Sharma u64 old_spte, u64 new_spte, gfn_t gfn, int level) 651faaf05b0SBen Gardon { 652531810caSBen Gardon lockdep_assert_held_write(&kvm->mmu_lock); 6533a9a4aa5SBen Gardon 65408f07c80SBen Gardon /* 655966da62aSSean Christopherson * No thread should be using this function to set SPTEs to or from the 65608f07c80SBen Gardon * temporary removed SPTE value. 65708f07c80SBen Gardon * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic 65808f07c80SBen Gardon * should be used. If operating under the MMU lock in write mode, the 65908f07c80SBen Gardon * use of the removed SPTE should not be necessary. 
66008f07c80SBen Gardon */ 661626808d1SSean Christopherson WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte)); 66208f07c80SBen Gardon 663ba3a6120SSean Christopherson old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level); 664faaf05b0SBen Gardon 66540fa907eSVipin Sharma handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false); 666ba3a6120SSean Christopherson return old_spte; 667626808d1SSean Christopherson } 668626808d1SSean Christopherson 6690b7cc254SVipin Sharma static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter, 670f8e14497SBen Gardon u64 new_spte) 671f8e14497SBen Gardon { 6720b7cc254SVipin Sharma WARN_ON_ONCE(iter->yielded); 6730b7cc254SVipin Sharma iter->old_spte = tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, 6740b7cc254SVipin Sharma iter->old_spte, new_spte, 6750b7cc254SVipin Sharma iter->gfn, iter->level); 676f8e14497SBen Gardon } 677f8e14497SBen Gardon 678faaf05b0SBen Gardon #define tdp_root_for_each_pte(_iter, _root, _start, _end) \ 67977aa6075SDavid Matlack for_each_tdp_pte(_iter, _root, _start, _end) 680faaf05b0SBen Gardon 681f8e14497SBen Gardon #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \ 682f8e14497SBen Gardon tdp_root_for_each_pte(_iter, _root, _start, _end) \ 683f8e14497SBen Gardon if (!is_shadow_present_pte(_iter.old_spte) || \ 684f8e14497SBen Gardon !is_last_spte(_iter.old_spte, _iter.level)) \ 685f8e14497SBen Gardon continue; \ 686f8e14497SBen Gardon else 687f8e14497SBen Gardon 688bb18842eSBen Gardon #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \ 689b9e5603cSPaolo Bonzini for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end) 690bb18842eSBen Gardon 691faaf05b0SBen Gardon /* 692e28a436cSBen Gardon * Yield if the MMU lock is contended or this thread needs to return control 693e28a436cSBen Gardon * to the scheduler. 
694e28a436cSBen Gardon * 695e139a34eSBen Gardon * If this function should yield and flush is set, it will perform a remote 696e139a34eSBen Gardon * TLB flush before yielding. 697e139a34eSBen Gardon * 6983a0f64deSSean Christopherson * If this function yields, iter->yielded is set and the caller must skip to 6993a0f64deSSean Christopherson * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk 7003a0f64deSSean Christopherson * over the paging structures to allow the iterator to continue its traversal 7013a0f64deSSean Christopherson * from the paging structure root. 702e28a436cSBen Gardon * 7033a0f64deSSean Christopherson * Returns true if this function yielded. 704e28a436cSBen Gardon */ 7053a0f64deSSean Christopherson static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm, 7063a0f64deSSean Christopherson struct tdp_iter *iter, 7073a0f64deSSean Christopherson bool flush, bool shared) 708a6a0b05dSBen Gardon { 7093a0f64deSSean Christopherson WARN_ON(iter->yielded); 7103a0f64deSSean Christopherson 711ed5e484bSBen Gardon /* Ensure forward progress has been made before yielding. 
*/ 712ed5e484bSBen Gardon if (iter->next_last_level_gfn == iter->yielded_gfn) 713ed5e484bSBen Gardon return false; 714ed5e484bSBen Gardon 715531810caSBen Gardon if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { 716e139a34eSBen Gardon if (flush) 717e139a34eSBen Gardon kvm_flush_remote_tlbs(kvm); 718e139a34eSBen Gardon 719bd296779SSean Christopherson rcu_read_unlock(); 720bd296779SSean Christopherson 7216103bc07SBen Gardon if (shared) 7226103bc07SBen Gardon cond_resched_rwlock_read(&kvm->mmu_lock); 7236103bc07SBen Gardon else 724531810caSBen Gardon cond_resched_rwlock_write(&kvm->mmu_lock); 7256103bc07SBen Gardon 7267cca2d0bSBen Gardon rcu_read_lock(); 727ed5e484bSBen Gardon 728ed5e484bSBen Gardon WARN_ON(iter->gfn > iter->next_last_level_gfn); 729ed5e484bSBen Gardon 7303a0f64deSSean Christopherson iter->yielded = true; 731a6a0b05dSBen Gardon } 732e28a436cSBen Gardon 7333a0f64deSSean Christopherson return iter->yielded; 734a6a0b05dSBen Gardon } 735a6a0b05dSBen Gardon 73686931ff7SSean Christopherson static inline gfn_t tdp_mmu_max_gfn_exclusive(void) 737e2b5b21dSSean Christopherson { 738e2b5b21dSSean Christopherson /* 73986931ff7SSean Christopherson * Bound TDP MMU walks at host.MAXPHYADDR. KVM disallows memslots with 74086931ff7SSean Christopherson * a gpa range that would exceed the max gfn, and KVM does not create 74186931ff7SSean Christopherson * MMIO SPTEs for "impossible" gfns, instead sending such accesses down 74286931ff7SSean Christopherson * the slow emulation path every time. 
743e2b5b21dSSean Christopherson */ 74486931ff7SSean Christopherson return kvm_mmu_max_gfn() + 1; 745e2b5b21dSSean Christopherson } 746e2b5b21dSSean Christopherson 7471b6043e8SSean Christopherson static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root, 7481b6043e8SSean Christopherson bool shared, int zap_level) 749e2b5b21dSSean Christopherson { 750e2b5b21dSSean Christopherson struct tdp_iter iter; 751e2b5b21dSSean Christopherson 75286931ff7SSean Christopherson gfn_t end = tdp_mmu_max_gfn_exclusive(); 753e2b5b21dSSean Christopherson gfn_t start = 0; 754e2b5b21dSSean Christopherson 7551b6043e8SSean Christopherson for_each_tdp_pte_min_level(iter, root, zap_level, start, end) { 7561b6043e8SSean Christopherson retry: 7571b6043e8SSean Christopherson if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared)) 7581b6043e8SSean Christopherson continue; 7591b6043e8SSean Christopherson 7601b6043e8SSean Christopherson if (!is_shadow_present_pte(iter.old_spte)) 7611b6043e8SSean Christopherson continue; 7621b6043e8SSean Christopherson 7631b6043e8SSean Christopherson if (iter.level > zap_level) 7641b6043e8SSean Christopherson continue; 7651b6043e8SSean Christopherson 7661b6043e8SSean Christopherson if (!shared) 7670b7cc254SVipin Sharma tdp_mmu_iter_set_spte(kvm, &iter, 0); 7681b6043e8SSean Christopherson else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0)) 7691b6043e8SSean Christopherson goto retry; 7701b6043e8SSean Christopherson } 7711b6043e8SSean Christopherson } 7721b6043e8SSean Christopherson 7731b6043e8SSean Christopherson static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root, 7741b6043e8SSean Christopherson bool shared) 7751b6043e8SSean Christopherson { 7761b6043e8SSean Christopherson 7778351779cSPaolo Bonzini /* 7788351779cSPaolo Bonzini * The root must have an elevated refcount so that it's reachable via 7798351779cSPaolo Bonzini * mmu_notifier callbacks, which allows this path to yield and drop 7808351779cSPaolo Bonzini * mmu_lock. 
When handling an unmap/release mmu_notifier command, KVM 7818351779cSPaolo Bonzini * must drop all references to relevant pages prior to completing the 7828351779cSPaolo Bonzini * callback. Dropping mmu_lock with an unreachable root would result 7838351779cSPaolo Bonzini * in zapping SPTEs after a relevant mmu_notifier callback completes 7848351779cSPaolo Bonzini * and lead to use-after-free as zapping a SPTE triggers "writeback" of 7858351779cSPaolo Bonzini * dirty accessed bits to the SPTE's associated struct page. 7868351779cSPaolo Bonzini */ 7878351779cSPaolo Bonzini WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count)); 7888351779cSPaolo Bonzini 789e2b5b21dSSean Christopherson kvm_lockdep_assert_mmu_lock_held(kvm, shared); 790e2b5b21dSSean Christopherson 791e2b5b21dSSean Christopherson rcu_read_lock(); 792e2b5b21dSSean Christopherson 793e2b5b21dSSean Christopherson /* 7941b6043e8SSean Christopherson * To avoid RCU stalls due to recursively removing huge swaths of SPs, 7951b6043e8SSean Christopherson * split the zap into two passes. On the first pass, zap at the 1gb 7961b6043e8SSean Christopherson * level, and then zap top-level SPs on the second pass. "1gb" is not 7971b6043e8SSean Christopherson * arbitrary, as KVM must be able to zap a 1gb shadow page without 7981b6043e8SSean Christopherson * inducing a stall to allow in-place replacement with a 1gb hugepage. 7991b6043e8SSean Christopherson * 8001b6043e8SSean Christopherson * Because zapping a SP recurses on its children, stepping down to 8011b6043e8SSean Christopherson * PG_LEVEL_4K in the iterator itself is unnecessary. 
802e2b5b21dSSean Christopherson */ 8031b6043e8SSean Christopherson __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G); 8041b6043e8SSean Christopherson __tdp_mmu_zap_root(kvm, root, shared, root->role.level); 805e2b5b21dSSean Christopherson 806e2b5b21dSSean Christopherson rcu_read_unlock(); 807e2b5b21dSSean Christopherson } 808e2b5b21dSSean Christopherson 809c10743a1SSean Christopherson bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) 810c10743a1SSean Christopherson { 811c10743a1SSean Christopherson u64 old_spte; 812c10743a1SSean Christopherson 813c10743a1SSean Christopherson /* 814c10743a1SSean Christopherson * This helper intentionally doesn't allow zapping a root shadow page, 815c10743a1SSean Christopherson * which doesn't have a parent page table and thus no associated entry. 816c10743a1SSean Christopherson */ 817c10743a1SSean Christopherson if (WARN_ON_ONCE(!sp->ptep)) 818c10743a1SSean Christopherson return false; 819c10743a1SSean Christopherson 820c10743a1SSean Christopherson old_spte = kvm_tdp_mmu_read_spte(sp->ptep); 821bb95dfb9SSean Christopherson if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte))) 822c10743a1SSean Christopherson return false; 823c10743a1SSean Christopherson 8240b7cc254SVipin Sharma tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0, 8250b7cc254SVipin Sharma sp->gfn, sp->role.level + 1); 826c10743a1SSean Christopherson 827c10743a1SSean Christopherson return true; 828c10743a1SSean Christopherson } 829c10743a1SSean Christopherson 830faaf05b0SBen Gardon /* 831063afacdSBen Gardon * If can_yield is true, will release the MMU lock and reschedule if the 832063afacdSBen Gardon * scheduler needs the CPU or there is contention on the MMU lock. If this 833063afacdSBen Gardon * function cannot yield, it will not release the MMU lock or reschedule and 834063afacdSBen Gardon * the caller must ensure it does not supply too large a GFN range, or the 8356103bc07SBen Gardon * operation can cause a soft lockup. 
836faaf05b0SBen Gardon */ 837f47e5bbbSSean Christopherson static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root, 838acbda82aSSean Christopherson gfn_t start, gfn_t end, bool can_yield, bool flush) 839faaf05b0SBen Gardon { 840faaf05b0SBen Gardon struct tdp_iter iter; 841faaf05b0SBen Gardon 84286931ff7SSean Christopherson end = min(end, tdp_mmu_max_gfn_exclusive()); 843524a1e4eSSean Christopherson 844acbda82aSSean Christopherson lockdep_assert_held_write(&kvm->mmu_lock); 8456103bc07SBen Gardon 8467cca2d0bSBen Gardon rcu_read_lock(); 8477cca2d0bSBen Gardon 848f47e5bbbSSean Christopherson for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) { 8491af4a960SBen Gardon if (can_yield && 850acbda82aSSean Christopherson tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) { 851a835429cSSean Christopherson flush = false; 8521af4a960SBen Gardon continue; 8531af4a960SBen Gardon } 8541af4a960SBen Gardon 855f47e5bbbSSean Christopherson if (!is_shadow_present_pte(iter.old_spte) || 856faaf05b0SBen Gardon !is_last_spte(iter.old_spte, iter.level)) 857faaf05b0SBen Gardon continue; 858faaf05b0SBen Gardon 8590b7cc254SVipin Sharma tdp_mmu_iter_set_spte(kvm, &iter, 0); 860a835429cSSean Christopherson flush = true; 861faaf05b0SBen Gardon } 8627cca2d0bSBen Gardon 8637cca2d0bSBen Gardon rcu_read_unlock(); 864bb95dfb9SSean Christopherson 865f47e5bbbSSean Christopherson /* 866f47e5bbbSSean Christopherson * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need 867f47e5bbbSSean Christopherson * to provide RCU protection as no 'struct kvm_mmu_page' will be freed. 868f47e5bbbSSean Christopherson */ 869f47e5bbbSSean Christopherson return flush; 870faaf05b0SBen Gardon } 871faaf05b0SBen Gardon 872faaf05b0SBen Gardon /* 8737edc3a68SKai Huang * Zap leaf SPTEs for the range of gfns, [start, end), for all roots. Returns 8747edc3a68SKai Huang * true if a TLB flush is needed before releasing the MMU lock, i.e. 
if one or 8757edc3a68SKai Huang * more SPTEs were zapped since the MMU lock was last acquired. 876faaf05b0SBen Gardon */ 877f47e5bbbSSean Christopherson bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end, 878f47e5bbbSSean Christopherson bool can_yield, bool flush) 879faaf05b0SBen Gardon { 880faaf05b0SBen Gardon struct kvm_mmu_page *root; 881faaf05b0SBen Gardon 882614f6970SPaolo Bonzini for_each_tdp_mmu_root_yield_safe(kvm, root, as_id) 883f47e5bbbSSean Christopherson flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush); 884faaf05b0SBen Gardon 885faaf05b0SBen Gardon return flush; 886faaf05b0SBen Gardon } 887faaf05b0SBen Gardon 888faaf05b0SBen Gardon void kvm_tdp_mmu_zap_all(struct kvm *kvm) 889faaf05b0SBen Gardon { 890e2b5b21dSSean Christopherson struct kvm_mmu_page *root; 8912b9663d8SSean Christopherson int i; 892faaf05b0SBen Gardon 89377c8cd6bSSean Christopherson /* 89422b94c4bSPaolo Bonzini * Zap all roots, including invalid roots, as all SPTEs must be dropped 89522b94c4bSPaolo Bonzini * before returning to the caller. Zap directly even if the root is 89622b94c4bSPaolo Bonzini * also being zapped by a worker. Walking zapped top-level SPTEs isn't 89722b94c4bSPaolo Bonzini * all that expensive and mmu_lock is already held, which means the 89822b94c4bSPaolo Bonzini * worker has yielded, i.e. flushing the work instead of zapping here 89922b94c4bSPaolo Bonzini * isn't guaranteed to be any faster. 90022b94c4bSPaolo Bonzini * 90177c8cd6bSSean Christopherson * A TLB flush is unnecessary, KVM zaps everything if and only the VM 90277c8cd6bSSean Christopherson * is being destroyed or the userspace VMM has exited. In both cases, 90377c8cd6bSSean Christopherson * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request. 
90477c8cd6bSSean Christopherson */ 905e2b5b21dSSean Christopherson for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { 906e2b5b21dSSean Christopherson for_each_tdp_mmu_root_yield_safe(kvm, root, i) 907e2b5b21dSSean Christopherson tdp_mmu_zap_root(kvm, root, false); 908e2b5b21dSSean Christopherson } 909faaf05b0SBen Gardon } 910bb18842eSBen Gardon 9114c6654bdSBen Gardon /* 912f28e9c7fSSean Christopherson * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast 91322b94c4bSPaolo Bonzini * zap" completes. 9144c6654bdSBen Gardon */ 9154c6654bdSBen Gardon void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) 9164c6654bdSBen Gardon { 91722b94c4bSPaolo Bonzini flush_workqueue(kvm->arch.tdp_mmu_zap_wq); 9184c6654bdSBen Gardon } 9194c6654bdSBen Gardon 920bb18842eSBen Gardon /* 921f28e9c7fSSean Christopherson * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that 92222b94c4bSPaolo Bonzini * is about to be zapped, e.g. in response to a memslots update. The actual 923*edbdb43fSSean Christopherson * zapping is performed asynchronously. Using a separate workqueue makes it 924*edbdb43fSSean Christopherson * easy to ensure that the destruction is performed before the "fast zap" 925*edbdb43fSSean Christopherson * completes, without keeping a separate list of invalidated roots; the list is 926*edbdb43fSSean Christopherson * effectively the list of work items in the workqueue. 927b7cccd39SBen Gardon * 928*edbdb43fSSean Christopherson * Note, the asynchronous worker is gifted the TDP MMU's reference. 929*edbdb43fSSean Christopherson * See kvm_tdp_mmu_get_vcpu_root_hpa(). 
930b7cccd39SBen Gardon */ 931b7cccd39SBen Gardon void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm) 932b7cccd39SBen Gardon { 933b7cccd39SBen Gardon struct kvm_mmu_page *root; 934b7cccd39SBen Gardon 935*edbdb43fSSean Christopherson /* 936*edbdb43fSSean Christopherson * mmu_lock must be held for write to ensure that a root doesn't become 937*edbdb43fSSean Christopherson * invalid while there are active readers (invalidating a root while 938*edbdb43fSSean Christopherson * there are active readers may or may not be problematic in practice, 939*edbdb43fSSean Christopherson * but it's uncharted territory and not supported). 940*edbdb43fSSean Christopherson * 941*edbdb43fSSean Christopherson * Waive the assertion if there are no users of @kvm, i.e. the VM is 942*edbdb43fSSean Christopherson * being destroyed after all references have been put, or if no vCPUs 943*edbdb43fSSean Christopherson * have been created (which means there are no roots), i.e. the VM is 944*edbdb43fSSean Christopherson * being destroyed in an error path of KVM_CREATE_VM. 945*edbdb43fSSean Christopherson */ 946*edbdb43fSSean Christopherson if (IS_ENABLED(CONFIG_PROVE_LOCKING) && 947*edbdb43fSSean Christopherson refcount_read(&kvm->users_count) && kvm->created_vcpus) 948b7cccd39SBen Gardon lockdep_assert_held_write(&kvm->mmu_lock); 949*edbdb43fSSean Christopherson 950*edbdb43fSSean Christopherson /* 951*edbdb43fSSean Christopherson * As above, mmu_lock isn't held when destroying the VM! There can't 952*edbdb43fSSean Christopherson * be other references to @kvm, i.e. nothing else can invalidate roots 953*edbdb43fSSean Christopherson * or be consuming roots, but walking the list of roots does need to be 954*edbdb43fSSean Christopherson * guarded against roots being deleted by the asynchronous zap worker. 
955*edbdb43fSSean Christopherson */ 956*edbdb43fSSean Christopherson rcu_read_lock(); 957*edbdb43fSSean Christopherson 958*edbdb43fSSean Christopherson list_for_each_entry_rcu(root, &kvm->arch.tdp_mmu_roots, link) { 959*edbdb43fSSean Christopherson if (!root->role.invalid) { 960b7cccd39SBen Gardon root->role.invalid = true; 96122b94c4bSPaolo Bonzini tdp_mmu_schedule_zap_root(kvm, root); 96222b94c4bSPaolo Bonzini } 963b7cccd39SBen Gardon } 964*edbdb43fSSean Christopherson 965*edbdb43fSSean Christopherson rcu_read_unlock(); 966f28e9c7fSSean Christopherson } 967b7cccd39SBen Gardon 968bb18842eSBen Gardon /* 969bb18842eSBen Gardon * Installs a last-level SPTE to handle a TDP page fault. 970bb18842eSBen Gardon * (NPT/EPT violation/misconfiguration) 971bb18842eSBen Gardon */ 972cdc47767SPaolo Bonzini static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, 973cdc47767SPaolo Bonzini struct kvm_page_fault *fault, 974cdc47767SPaolo Bonzini struct tdp_iter *iter) 975bb18842eSBen Gardon { 976c435d4b7SSean Christopherson struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep)); 977bb18842eSBen Gardon u64 new_spte; 97857a3e96dSKai Huang int ret = RET_PF_FIXED; 979ad67e480SPaolo Bonzini bool wrprot = false; 980bb18842eSBen Gardon 98150a9ac25SSean Christopherson if (WARN_ON_ONCE(sp->role.level != fault->goal_level)) 98250a9ac25SSean Christopherson return RET_PF_RETRY; 98350a9ac25SSean Christopherson 984e710c5f6SDavid Matlack if (unlikely(!fault->slot)) 985bb18842eSBen Gardon new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); 9869a77daacSBen Gardon else 98753597858SDavid Matlack wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn, 9882839180cSPaolo Bonzini fault->pfn, iter->old_spte, fault->prefetch, true, 9897158bee4SPaolo Bonzini fault->map_writable, &new_spte); 990bb18842eSBen Gardon 991bb18842eSBen Gardon if (new_spte == iter->old_spte) 992bb18842eSBen Gardon ret = RET_PF_SPURIOUS; 9933e72c791SDavid Matlack else if 
(tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte)) 9949a77daacSBen Gardon return RET_PF_RETRY; 995bb95dfb9SSean Christopherson else if (is_shadow_present_pte(iter->old_spte) && 996bb95dfb9SSean Christopherson !is_last_spte(iter->old_spte, iter->level)) 9971e203847SHou Wenlong kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level); 998bb18842eSBen Gardon 999bb18842eSBen Gardon /* 1000bb18842eSBen Gardon * If the page fault was caused by a write but the page is write 1001bb18842eSBen Gardon * protected, emulation is needed. If the emulation was skipped, 1002bb18842eSBen Gardon * the vCPU would have the same fault again. 1003bb18842eSBen Gardon */ 1004ad67e480SPaolo Bonzini if (wrprot) { 1005cdc47767SPaolo Bonzini if (fault->write) 1006bb18842eSBen Gardon ret = RET_PF_EMULATE; 1007bb18842eSBen Gardon } 1008bb18842eSBen Gardon 1009bb18842eSBen Gardon /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */ 10109a77daacSBen Gardon if (unlikely(is_mmio_spte(new_spte))) { 10111075d41eSSean Christopherson vcpu->stat.pf_mmio_spte_created++; 10129a77daacSBen Gardon trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn, 10139a77daacSBen Gardon new_spte); 1014bb18842eSBen Gardon ret = RET_PF_EMULATE; 10153849e092SSean Christopherson } else { 10169a77daacSBen Gardon trace_kvm_mmu_set_spte(iter->level, iter->gfn, 10179a77daacSBen Gardon rcu_dereference(iter->sptep)); 10183849e092SSean Christopherson } 1019bb18842eSBen Gardon 1020bb18842eSBen Gardon return ret; 1021bb18842eSBen Gardon } 1022bb18842eSBen Gardon 1023bb18842eSBen Gardon /* 1024cb00a70bSDavid Matlack * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the 1025cb00a70bSDavid Matlack * provided page table. 10267b7e1ab6SDavid Matlack * 10277b7e1ab6SDavid Matlack * @kvm: kvm instance 10287b7e1ab6SDavid Matlack * @iter: a tdp_iter instance currently on the SPTE that should be set 10297b7e1ab6SDavid Matlack * @sp: The new TDP page table to install. 
1030cb00a70bSDavid Matlack * @shared: This operation is running under the MMU lock in read mode. 10317b7e1ab6SDavid Matlack * 10327b7e1ab6SDavid Matlack * Returns: 0 if the new page table was installed. Non-0 if the page table 10337b7e1ab6SDavid Matlack * could not be installed (e.g. the atomic compare-exchange failed). 10347b7e1ab6SDavid Matlack */ 1035cb00a70bSDavid Matlack static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter, 103661f94478SSean Christopherson struct kvm_mmu_page *sp, bool shared) 10377b7e1ab6SDavid Matlack { 103854275f74SSean Christopherson u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled()); 1039cb00a70bSDavid Matlack int ret = 0; 10407b7e1ab6SDavid Matlack 1041cb00a70bSDavid Matlack if (shared) { 10427b7e1ab6SDavid Matlack ret = tdp_mmu_set_spte_atomic(kvm, iter, spte); 10437b7e1ab6SDavid Matlack if (ret) 10447b7e1ab6SDavid Matlack return ret; 1045cb00a70bSDavid Matlack } else { 10460b7cc254SVipin Sharma tdp_mmu_iter_set_spte(kvm, iter, spte); 1047cb00a70bSDavid Matlack } 10487b7e1ab6SDavid Matlack 104943a063caSYosry Ahmed tdp_account_mmu_page(kvm, sp); 10507b7e1ab6SDavid Matlack 10517b7e1ab6SDavid Matlack return 0; 10527b7e1ab6SDavid Matlack } 10537b7e1ab6SDavid Matlack 1054c4b33d28SDavid Matlack static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, 1055c4b33d28SDavid Matlack struct kvm_mmu_page *sp, bool shared); 1056c4b33d28SDavid Matlack 10577b7e1ab6SDavid Matlack /* 1058bb18842eSBen Gardon * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing 1059bb18842eSBen Gardon * page tables and SPTEs to translate the faulting guest physical address. 
1060bb18842eSBen Gardon */ 10612f6305ddSPaolo Bonzini int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) 1062bb18842eSBen Gardon { 1063bb18842eSBen Gardon struct kvm_mmu *mmu = vcpu->arch.mmu; 106461f94478SSean Christopherson struct kvm *kvm = vcpu->kvm; 1065bb18842eSBen Gardon struct tdp_iter iter; 106689c0fd49SBen Gardon struct kvm_mmu_page *sp; 106763d28a25SPaolo Bonzini int ret = RET_PF_RETRY; 1068bb18842eSBen Gardon 106973a3c659SPaolo Bonzini kvm_mmu_hugepage_adjust(vcpu, fault); 1070bb18842eSBen Gardon 1071f0066d94SPaolo Bonzini trace_kvm_mmu_spte_requested(fault); 10727cca2d0bSBen Gardon 10737cca2d0bSBen Gardon rcu_read_lock(); 10747cca2d0bSBen Gardon 10752f6305ddSPaolo Bonzini tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) { 107663d28a25SPaolo Bonzini int r; 107763d28a25SPaolo Bonzini 107873a3c659SPaolo Bonzini if (fault->nx_huge_page_workaround_enabled) 1079536f0e6aSPaolo Bonzini disallowed_hugepage_adjust(fault, iter.old_spte, iter.level); 1080bb18842eSBen Gardon 1081bb18842eSBen Gardon /* 1082c4b33d28SDavid Matlack * If SPTE has been frozen by another thread, just give up and 1083c4b33d28SDavid Matlack * retry, avoiding unnecessary page table allocation and free. 1084ff76d506SKai Huang */ 1085ff76d506SKai Huang if (is_removed_spte(iter.old_spte)) 108663d28a25SPaolo Bonzini goto retry; 108763d28a25SPaolo Bonzini 1088f5d16bb9SSean Christopherson if (iter.level == fault->goal_level) 108980a3e4aeSSean Christopherson goto map_target_level; 1090f5d16bb9SSean Christopherson 109163d28a25SPaolo Bonzini /* Step down into the lower level page table if it exists. */ 109263d28a25SPaolo Bonzini if (is_shadow_present_pte(iter.old_spte) && 109363d28a25SPaolo Bonzini !is_large_pte(iter.old_spte)) 109463d28a25SPaolo Bonzini continue; 1095ff76d506SKai Huang 1096c4b33d28SDavid Matlack /* 1097c4b33d28SDavid Matlack * The SPTE is either non-present or points to a huge page that 1098c4b33d28SDavid Matlack * needs to be split. 
1099c4b33d28SDavid Matlack */ 1100a82070b6SDavid Matlack sp = tdp_mmu_alloc_sp(vcpu); 1101a82070b6SDavid Matlack tdp_mmu_init_child_sp(sp, &iter); 1102a82070b6SDavid Matlack 110361f94478SSean Christopherson sp->nx_huge_page_disallowed = fault->huge_page_disallowed; 110461f94478SSean Christopherson 1105c4b33d28SDavid Matlack if (is_shadow_present_pte(iter.old_spte)) 110663d28a25SPaolo Bonzini r = tdp_mmu_split_huge_page(kvm, &iter, sp, true); 1107c4b33d28SDavid Matlack else 110863d28a25SPaolo Bonzini r = tdp_mmu_link_sp(kvm, &iter, sp, true); 1109c4b33d28SDavid Matlack 111063d28a25SPaolo Bonzini /* 111180a3e4aeSSean Christopherson * Force the guest to retry if installing an upper level SPTE 111280a3e4aeSSean Christopherson * failed, e.g. because a different task modified the SPTE. 111363d28a25SPaolo Bonzini */ 111463d28a25SPaolo Bonzini if (r) { 11159a77daacSBen Gardon tdp_mmu_free_sp(sp); 111663d28a25SPaolo Bonzini goto retry; 11179a77daacSBen Gardon } 111861f94478SSean Christopherson 111961f94478SSean Christopherson if (fault->huge_page_disallowed && 112061f94478SSean Christopherson fault->req_level >= iter.level) { 112161f94478SSean Christopherson spin_lock(&kvm->arch.tdp_mmu_pages_lock); 112221a36ac6SSean Christopherson if (sp->nx_huge_page_disallowed) 112361f94478SSean Christopherson track_possible_nx_huge_page(kvm, sp); 112461f94478SSean Christopherson spin_unlock(&kvm->arch.tdp_mmu_pages_lock); 112561f94478SSean Christopherson } 1126bb18842eSBen Gardon } 1127bb18842eSBen Gardon 112880a3e4aeSSean Christopherson /* 112980a3e4aeSSean Christopherson * The walk aborted before reaching the target level, e.g. because the 113080a3e4aeSSean Christopherson * iterator detected an upper level SPTE was frozen during traversal. 
	 */
	WARN_ON_ONCE(iter.level == fault->goal_level);
	goto retry;

map_target_level:
	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);

retry:
	rcu_read_unlock();
	return ret;
}

/*
 * Unmap the GFN range described by @range by zapping its leaf SPTEs via
 * kvm_tdp_mmu_zap_leafs(), passing through the range's address space ID,
 * [start, end) bounds and may_block setting along with the incoming @flush.
 * Returns whatever kvm_tdp_mmu_zap_leafs() returns.
 */
bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
				 bool flush)
{
	return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start,
				     range->end, range->may_block, flush);
}

/*
 * Callback invoked by kvm_tdp_mmu_handle_gfn() for each leaf SPTE it visits.
 * The bool results of all invocations are OR-ed together by that helper.
 */
typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
			      struct kvm_gfn_range *range);

/*
 * Run @handler on every leaf SPTE that tdp_root_for_each_leaf_pte() yields
 * for GFNs [range->start, range->end), for each root that
 * for_each_tdp_mmu_root() yields for range->slot->as_id. Each root is walked
 * inside its own rcu_read_lock()/rcu_read_unlock() section.
 *
 * Returns true if any invocation of @handler returned true.
 */
static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
						   struct kvm_gfn_range *range,
						   tdp_handler_t handler)
{
	struct kvm_mmu_page *root;
	struct tdp_iter iter;
	bool ret = false;

	/*
	 * Don't support rescheduling, none of the MMU notifiers that funnel
	 * into this helper allow blocking; it'd be dead, wasteful code.
	 */
	for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
		rcu_read_lock();

		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
			ret |= handler(kvm, &iter, range);

		rcu_read_unlock();
	}

	return ret;
}

/*
 * Handler for kvm_tdp_mmu_age_gfn_range(): mark the SPTE at @iter as not
 * accessed. Returns false, changing nothing, if the SPTE was not accessed to
 * begin with; returns true otherwise.
 *
 * No need to mark the corresponding PFN as accessed as this call is coming
 * from the clear_young() or clear_flush_young() notifier, which uses the
 * return value to determine if the page has been accessed.
 */
static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
			  struct kvm_gfn_range *range)
{
	u64 new_spte;

	/* If we have a non-accessed entry we don't need to change the pte. */
	if (!is_accessed_spte(iter->old_spte))
		return false;

	if (spte_ad_enabled(iter->old_spte)) {
		/*
		 * Clear only the accessed bit in place. In this branch
		 * new_spte is consumed solely by the tracepoint below.
		 */
		iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep,
							 iter->old_spte,
							 shadow_accessed_mask,
							 iter->level);
		new_spte = iter->old_spte & ~shadow_accessed_mask;
	} else {
		/*
		 * Capture the dirty status of the page, so that it doesn't get
		 * lost when the SPTE is marked for access tracking.
		 */
		if (is_writable_pte(iter->old_spte))
			kvm_set_pfn_dirty(spte_to_pfn(iter->old_spte));

		new_spte = mark_spte_for_access_track(iter->old_spte);
		iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep,
							iter->old_spte, new_spte,
							iter->level);
	}

	trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
				       iter->old_spte, new_spte);
	return true;
}

/*
 * Age every leaf SPTE in @range via age_gfn_range(). Returns true if any of
 * them had been accessed.
 */
bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
}

/*
 * Handler for kvm_tdp_mmu_test_age_gfn(): report whether the SPTE at @iter is
 * marked accessed. Nothing is modified.
 */
static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
			 struct kvm_gfn_range *range)
{
	return is_accessed_spte(iter->old_spte);
}

/* Returns true if any leaf SPTE in @range is marked accessed. */
bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
}

/*
 * Handler for kvm_tdp_mmu_set_spte_gfn(): update the 4K SPTE at @iter for the
 * new host PTE carried in range->pte. The SPTE is always cleared first; a
 * replacement SPTE is installed only when the new PTE is not writable.
 *
 * Returns false if @iter is not a present 4K SPTE, true otherwise.
 */
static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
			 struct kvm_gfn_range *range)
{
	u64 new_spte;

	/* Huge pages aren't expected to be modified without first being zapped. */
	WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);

	if (iter->level != PG_LEVEL_4K ||
	    !is_shadow_present_pte(iter->old_spte))
		return false;

	/*
	 * Note, when changing a read-only SPTE, it's not strictly necessary to
	 * zero the SPTE before setting the new PFN, but doing so preserves the
	 * invariant that the PFN of a present leaf SPTE can never change.
	 * See handle_changed_spte().
	 */
	tdp_mmu_iter_set_spte(kvm, iter, 0);

	if (!pte_write(range->pte)) {
		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
								  pte_pfn(range->pte));

		tdp_mmu_iter_set_spte(kvm, iter, new_spte);
	}

	return true;
}

/*
 * Handle the changed_pte MMU notifier for the TDP MMU.
 * The new pte mapping the HVA specified by the MMU notifier is carried in
 * range->pte.
 * Returns true if a flush is needed before releasing the MMU lock.
 */
bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	/*
	 * No need to handle the remote TLB flush under RCU protection, the
	 * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
	 * shadow page. See the WARN on pfn_changed in handle_changed_spte().
	 */
	return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
}

/*
 * Remove write access from all SPTEs at or above min_level that map GFNs
 * [start, end). Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
1285a6a0b05dSBen Gardon */ 1286a6a0b05dSBen Gardon static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, 1287a6a0b05dSBen Gardon gfn_t start, gfn_t end, int min_level) 1288a6a0b05dSBen Gardon { 1289a6a0b05dSBen Gardon struct tdp_iter iter; 1290a6a0b05dSBen Gardon u64 new_spte; 1291a6a0b05dSBen Gardon bool spte_set = false; 1292a6a0b05dSBen Gardon 12937cca2d0bSBen Gardon rcu_read_lock(); 12947cca2d0bSBen Gardon 1295a6a0b05dSBen Gardon BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); 1296a6a0b05dSBen Gardon 129777aa6075SDavid Matlack for_each_tdp_pte_min_level(iter, root, min_level, start, end) { 129824ae4cfaSBen Gardon retry: 129924ae4cfaSBen Gardon if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) 13001af4a960SBen Gardon continue; 13011af4a960SBen Gardon 1302a6a0b05dSBen Gardon if (!is_shadow_present_pte(iter.old_spte) || 13030f99ee2cSBen Gardon !is_last_spte(iter.old_spte, iter.level) || 13040f99ee2cSBen Gardon !(iter.old_spte & PT_WRITABLE_MASK)) 1305a6a0b05dSBen Gardon continue; 1306a6a0b05dSBen Gardon 1307a6a0b05dSBen Gardon new_spte = iter.old_spte & ~PT_WRITABLE_MASK; 1308a6a0b05dSBen Gardon 13093e72c791SDavid Matlack if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) 131024ae4cfaSBen Gardon goto retry; 13113255530aSDavid Matlack 1312a6a0b05dSBen Gardon spte_set = true; 1313a6a0b05dSBen Gardon } 13147cca2d0bSBen Gardon 13157cca2d0bSBen Gardon rcu_read_unlock(); 1316a6a0b05dSBen Gardon return spte_set; 1317a6a0b05dSBen Gardon } 1318a6a0b05dSBen Gardon 1319a6a0b05dSBen Gardon /* 1320a6a0b05dSBen Gardon * Remove write access from all the SPTEs mapping GFNs in the memslot. Will 1321a6a0b05dSBen Gardon * only affect leaf SPTEs down to min_level. 1322a6a0b05dSBen Gardon * Returns true if an SPTE has been changed and the TLBs need to be flushed. 
 */
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
			     const struct kvm_memory_slot *slot, int min_level)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_read(&kvm->mmu_lock);

	/* Accumulate the "flush needed" result over every root walked. */
	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
			     slot->base_gfn + slot->npages, min_level);

	return spte_set;
}

/*
 * Allocate a zeroed struct kvm_mmu_page from mmu_page_header_cache together
 * with a zeroed page for its page table (sp->spt), using @gfp for both.
 *
 * Returns NULL if either allocation fails; the header is freed again when
 * only the page table allocation fails.
 */
static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
{
	struct kvm_mmu_page *sp;

	gfp |= __GFP_ZERO;

	sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
	if (!sp)
		return NULL;

	sp->spt = (void *)__get_free_page(gfp);
	if (!sp->spt) {
		kmem_cache_free(mmu_page_header_cache, sp);
		return NULL;
	}

	return sp;
}

/*
 * Allocate a shadow page for splitting a huge page. A non-blocking
 * allocation is tried first. If that fails, the RCU read lock and mmu_lock
 * (read or write, per @shared) are dropped, iter->yielded is set, the
 * allocation is retried with GFP_KERNEL_ACCOUNT, and mmu_lock and the RCU
 * read lock are re-taken before returning.
 *
 * May return NULL. Because the locks may have been dropped, the caller is
 * expected to check iter->yielded before using @iter further.
 */
static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
						       struct tdp_iter *iter,
						       bool shared)
{
	struct kvm_mmu_page *sp;

	/*
	 * Since we are allocating while under the MMU lock we have to be
	 * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
	 * reclaim and to avoid making any filesystem callbacks (which can end
	 * up invoking KVM MMU notifiers, resulting in a deadlock).
	 *
	 * If this allocation fails we drop the lock and retry with reclaim
	 * allowed.
	 */
	sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
	if (sp)
		return sp;

	rcu_read_unlock();

	if (shared)
		read_unlock(&kvm->mmu_lock);
	else
		write_unlock(&kvm->mmu_lock);

	/* Set regardless of whether the second allocation succeeds. */
	iter->yielded = true;
	sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);

	if (shared)
		read_lock(&kvm->mmu_lock);
	else
		write_lock(&kvm->mmu_lock);

	rcu_read_lock();

	return sp;
}

/* Note, the caller is responsible for initializing @sp.
 *
 * Fill @sp with one SPTE per entry, each derived from the huge SPTE at @iter
 * via make_huge_page_split_spte(), then link @sp in place of the huge SPTE.
 * Returns 0 on success, or the non-zero result of tdp_mmu_link_sp().
 */
static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
				   struct kvm_mmu_page *sp, bool shared)
{
	const u64 huge_spte = iter->old_spte;
	const int level = iter->level;
	int ret, i;

	/*
	 * No need for atomics when writing to sp->spt since the page table has
	 * not been linked in yet and thus is not reachable from any other CPU.
	 */
	for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
		sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i);

	/*
	 * Replace the huge spte with a pointer to the populated lower level
	 * page table. Since we are making this change without a TLB flush vCPUs
	 * will see a mix of the split mappings and the original huge mapping,
	 * depending on what's currently in their TLB. This is fine from a
	 * correctness standpoint since the translation will be the same either
	 * way.
	 */
	ret = tdp_mmu_link_sp(kvm, iter, sp, shared);
	if (ret)
		goto out;

	/*
	 * tdp_mmu_link_sp() will handle subtracting the huge page we
	 * are overwriting from the page stats. But we have to manually update
	 * the page stats with the new present child pages.
	 */
	kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE);

out:
	/* Traced on both the success and the failure path. */
	trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
	return ret;
}

/*
 * Split the huge pages under @root that map GFNs [start, end) and sit above
 * @target_level. Returns 0, or -ENOMEM if a shadow page could not be
 * allocated.
 */
static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
					 struct kvm_mmu_page *root,
					 gfn_t start, gfn_t end,
					 int target_level, bool shared)
{
	struct kvm_mmu_page *sp = NULL;
	struct tdp_iter iter;
	int ret = 0;

	rcu_read_lock();

	/*
	 * Traverse the page table splitting all huge pages above the target
	 * level into one lower level. For example, if we encounter a 1GB page
	 * we split it into 512 2MB pages.
	 *
	 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
	 * to visit an SPTE before ever visiting its children, which means we
	 * will correctly recursively split huge pages that are more than one
	 * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
	 * and then splitting each of those to 512 4KB pages).
	 */
	for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
			continue;

		if (!sp) {
			sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
			if (!sp) {
				ret = -ENOMEM;
				trace_kvm_mmu_split_huge_page(iter.gfn,
							      iter.old_spte,
							      iter.level, ret);
				break;
			}

			/*
			 * The allocation dropped and re-took mmu_lock; don't
			 * act on this entry. sp stays allocated and is used by
			 * a later iteration.
			 */
			if (iter.yielded)
				continue;
		}

		tdp_mmu_init_child_sp(sp, &iter);

		/* On failure, re-examine the same entry with the same sp. */
		if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
			goto retry;

		/* sp has been linked in; the next split needs a new one. */
		sp = NULL;
	}

	rcu_read_unlock();

	/*
	 * It's possible to exit the loop having never used the last sp if, for
	 * example, a vCPU doing HugePage NX splitting wins the race and
	 * installs its own sp in place of the last sp we tried to split.
	 */
	if (sp)
		tdp_mmu_free_sp(sp);

	return ret;
}


/*
 * Try to split all huge pages mapped by the TDP MMU down to the target level.
 *
 * Stops at the first root for which tdp_mmu_split_huge_pages_root() fails.
 * The error is not reported to the caller.
 */
void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
				      const struct kvm_memory_slot *slot,
				      gfn_t start, gfn_t end,
				      int target_level, bool shared)
{
	struct kvm_mmu_page *root;
	int r = 0;

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
		r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
		if (r) {
			/*
			 * Leaving the iteration early: put @root explicitly.
			 * NOTE(review): presumably the yield-safe iterator
			 * holds a reference that it would otherwise drop when
			 * advancing - confirm against its definition.
			 */
			kvm_tdp_mmu_put_root(kvm, root, shared);
			break;
		}
	}
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
1530a6a0b05dSBen Gardon */ 1531a6a0b05dSBen Gardon static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, 1532a6a0b05dSBen Gardon gfn_t start, gfn_t end) 1533a6a0b05dSBen Gardon { 1534697c89beSVipin Sharma u64 dbit = kvm_ad_enabled() ? shadow_dirty_mask : PT_WRITABLE_MASK; 1535a6a0b05dSBen Gardon struct tdp_iter iter; 1536a6a0b05dSBen Gardon bool spte_set = false; 1537a6a0b05dSBen Gardon 15387cca2d0bSBen Gardon rcu_read_lock(); 15397cca2d0bSBen Gardon 1540a6a0b05dSBen Gardon tdp_root_for_each_leaf_pte(iter, root, start, end) { 154124ae4cfaSBen Gardon retry: 154224ae4cfaSBen Gardon if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) 15431af4a960SBen Gardon continue; 15441af4a960SBen Gardon 15453354ef5aSSean Christopherson if (!is_shadow_present_pte(iter.old_spte)) 15463354ef5aSSean Christopherson continue; 15473354ef5aSSean Christopherson 15485982a539SVipin Sharma MMU_WARN_ON(kvm_ad_enabled() && 15495982a539SVipin Sharma spte_ad_need_write_protect(iter.old_spte)); 15505982a539SVipin Sharma 1551697c89beSVipin Sharma if (!(iter.old_spte & dbit)) 1552a6a0b05dSBen Gardon continue; 1553a6a0b05dSBen Gardon 1554697c89beSVipin Sharma if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit)) 155524ae4cfaSBen Gardon goto retry; 15563255530aSDavid Matlack 1557a6a0b05dSBen Gardon spte_set = true; 1558a6a0b05dSBen Gardon } 15597cca2d0bSBen Gardon 15607cca2d0bSBen Gardon rcu_read_unlock(); 1561a6a0b05dSBen Gardon return spte_set; 1562a6a0b05dSBen Gardon } 1563a6a0b05dSBen Gardon 1564a6a0b05dSBen Gardon /* 1565a6a0b05dSBen Gardon * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If 1566a6a0b05dSBen Gardon * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. 1567a6a0b05dSBen Gardon * If AD bits are not enabled, this will require clearing the writable bit on 1568a6a0b05dSBen Gardon * each SPTE. Returns true if an SPTE has been changed and the TLBs need to 1569a6a0b05dSBen Gardon * be flushed. 
 */
bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
				  const struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_read(&kvm->mmu_lock);

	/* Accumulate the "flush needed" result over every root walked. */
	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
				slot->base_gfn + slot->npages);

	return spte_set;
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 * If @wrprot is true the writable bit is cleared even when AD bits are
 * enabled.
 *
 * NOTE(review): the walk starts at gfn + __ffs(mask); __ffs() is documented
 * as undefined for 0, so this assumes callers pass a non-zero mask - confirm.
 */
static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t gfn, unsigned long mask, bool wrprot)
{
	u64 dbit = (wrprot || !kvm_ad_enabled()) ? PT_WRITABLE_MASK :
						   shadow_dirty_mask;
	struct tdp_iter iter;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
				   gfn + BITS_PER_LONG) {
		/* Every requested GFN has been handled. */
		if (!mask)
			break;

		MMU_WARN_ON(kvm_ad_enabled() &&
			    spte_ad_need_write_protect(iter.old_spte));

		/* Skip entries above 4K and GFNs whose mask bit is clear. */
		if (iter.level > PG_LEVEL_4K ||
		    !(mask & (1UL << (iter.gfn - gfn))))
			continue;

		/* Mark this GFN as handled. */
		mask &= ~(1UL << (iter.gfn - gfn));

		if (!(iter.old_spte & dbit))
			continue;

		iter.old_spte = tdp_mmu_clear_spte_bits(iter.sptep,
							iter.old_spte, dbit,
							iter.level);

		trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level,
					       iter.old_spte,
					       iter.old_spte & ~dbit);
		kvm_set_pfn_dirty(spte_to_pfn(iter.old_spte));
	}

	rcu_read_unlock();
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask.
 * If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
				       struct kvm_memory_slot *slot,
				       gfn_t gfn, unsigned long mask,
				       bool wrprot)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);
	/* Apply the same gfn/mask/wrprot to each root walked. */
	for_each_tdp_mmu_root(kvm, root, slot->as_id)
		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
}

/*
 * Walk @root over the GFNs of @slot and atomically zap each present non-leaf
 * SPTE, at or below KVM_MAX_HUGEPAGE_LEVEL, whose GFN lies inside the slot and
 * whose level does not exceed what kvm_mmu_max_mapping_level() reports for
 * that GFN.
 */
static void zap_collapsible_spte_range(struct kvm *kvm,
				       struct kvm_mmu_page *root,
				       const struct kvm_memory_slot *slot)
{
	gfn_t start = slot->base_gfn;
	gfn_t end = start + slot->npages;
	struct tdp_iter iter;
	int max_mapping_level;

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
		    !is_shadow_present_pte(iter.old_spte))
			continue;

		/*
		 * Don't zap leaf SPTEs, if a leaf SPTE could be replaced with
		 * a large page size, then its parent would have been zapped
		 * instead of stepping down.
		 */
		if (is_last_spte(iter.old_spte, iter.level))
			continue;

		/*
		 * If iter.gfn resides outside of the slot, i.e. the page for
		 * the current level overlaps but is not contained by the slot,
		 * then the SPTE can't be made huge.  More importantly, trying
		 * to query that info from slot->arch.lpage_info will cause an
		 * out-of-bounds access.
		 */
		if (iter.gfn < start || iter.gfn >= end)
			continue;

		max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
							      iter.gfn, PG_LEVEL_NUM);
		if (max_mapping_level < iter.level)
			continue;

		/* Note, a successful atomic zap also does a remote TLB flush. */
		if (tdp_mmu_zap_spte_atomic(kvm, &iter))
			goto retry;
	}

	rcu_read_unlock();
}

/*
 * Zap non-leaf SPTEs (and free their associated page tables) which could
 * be replaced by huge pages, for GFNs within the slot.
170514881998SBen Gardon */ 17064b85c921SSean Christopherson void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, 17074b85c921SSean Christopherson const struct kvm_memory_slot *slot) 170814881998SBen Gardon { 170914881998SBen Gardon struct kvm_mmu_page *root; 171014881998SBen Gardon 17112db6f772SBen Gardon lockdep_assert_held_read(&kvm->mmu_lock); 171214881998SBen Gardon 1713d62007edSSean Christopherson for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) 17144b85c921SSean Christopherson zap_collapsible_spte_range(kvm, root, slot); 171514881998SBen Gardon } 171646044f72SBen Gardon 171746044f72SBen Gardon /* 171846044f72SBen Gardon * Removes write access on the last level SPTE mapping this GFN and unsets the 17195fc3424fSSean Christopherson * MMU-writable bit to ensure future writes continue to be intercepted. 172046044f72SBen Gardon * Returns true if an SPTE was set and a TLB flush is needed. 172146044f72SBen Gardon */ 172246044f72SBen Gardon static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, 17233ad93562SKeqian Zhu gfn_t gfn, int min_level) 172446044f72SBen Gardon { 172546044f72SBen Gardon struct tdp_iter iter; 172646044f72SBen Gardon u64 new_spte; 172746044f72SBen Gardon bool spte_set = false; 172846044f72SBen Gardon 17293ad93562SKeqian Zhu BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); 17303ad93562SKeqian Zhu 17317cca2d0bSBen Gardon rcu_read_lock(); 17327cca2d0bSBen Gardon 173377aa6075SDavid Matlack for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) { 17343ad93562SKeqian Zhu if (!is_shadow_present_pte(iter.old_spte) || 17353ad93562SKeqian Zhu !is_last_spte(iter.old_spte, iter.level)) 17363ad93562SKeqian Zhu continue; 17373ad93562SKeqian Zhu 173846044f72SBen Gardon new_spte = iter.old_spte & 17395fc3424fSSean Christopherson ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask); 174046044f72SBen Gardon 17417c8a4742SDavid Matlack if (new_spte == iter.old_spte) 17427c8a4742SDavid Matlack break; 17437c8a4742SDavid Matlack 
17440b7cc254SVipin Sharma tdp_mmu_iter_set_spte(kvm, &iter, new_spte); 174546044f72SBen Gardon spte_set = true; 174646044f72SBen Gardon } 174746044f72SBen Gardon 17487cca2d0bSBen Gardon rcu_read_unlock(); 17497cca2d0bSBen Gardon 175046044f72SBen Gardon return spte_set; 175146044f72SBen Gardon } 175246044f72SBen Gardon 175346044f72SBen Gardon /* 175446044f72SBen Gardon * Removes write access on the last level SPTE mapping this GFN and unsets the 17555fc3424fSSean Christopherson * MMU-writable bit to ensure future writes continue to be intercepted. 175646044f72SBen Gardon * Returns true if an SPTE was set and a TLB flush is needed. 175746044f72SBen Gardon */ 175846044f72SBen Gardon bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, 17593ad93562SKeqian Zhu struct kvm_memory_slot *slot, gfn_t gfn, 17603ad93562SKeqian Zhu int min_level) 176146044f72SBen Gardon { 176246044f72SBen Gardon struct kvm_mmu_page *root; 176346044f72SBen Gardon bool spte_set = false; 176446044f72SBen Gardon 1765531810caSBen Gardon lockdep_assert_held_write(&kvm->mmu_lock); 1766a3f15bdaSSean Christopherson for_each_tdp_mmu_root(kvm, root, slot->as_id) 17673ad93562SKeqian Zhu spte_set |= write_protect_gfn(kvm, root, gfn, min_level); 1768a3f15bdaSSean Christopherson 176946044f72SBen Gardon return spte_set; 177046044f72SBen Gardon } 177146044f72SBen Gardon 177295fb5b02SBen Gardon /* 177395fb5b02SBen Gardon * Return the level of the lowest level SPTE added to sptes. 177495fb5b02SBen Gardon * That SPTE may be non-present. 1775c5c8c7c5SDavid Matlack * 1776c5c8c7c5SDavid Matlack * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. 
177795fb5b02SBen Gardon */ 177839b4d43eSSean Christopherson int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, 177939b4d43eSSean Christopherson int *root_level) 178095fb5b02SBen Gardon { 178195fb5b02SBen Gardon struct tdp_iter iter; 178295fb5b02SBen Gardon struct kvm_mmu *mmu = vcpu->arch.mmu; 178395fb5b02SBen Gardon gfn_t gfn = addr >> PAGE_SHIFT; 17842aa07893SSean Christopherson int leaf = -1; 178595fb5b02SBen Gardon 1786a972e29cSPaolo Bonzini *root_level = vcpu->arch.mmu->root_role.level; 178795fb5b02SBen Gardon 178895fb5b02SBen Gardon tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { 178995fb5b02SBen Gardon leaf = iter.level; 1790dde81f94SSean Christopherson sptes[leaf] = iter.old_spte; 179195fb5b02SBen Gardon } 179295fb5b02SBen Gardon 179395fb5b02SBen Gardon return leaf; 179495fb5b02SBen Gardon } 17956e8eb206SDavid Matlack 17966e8eb206SDavid Matlack /* 17976e8eb206SDavid Matlack * Returns the last level spte pointer of the shadow page walk for the given 17986e8eb206SDavid Matlack * gpa, and sets *spte to the spte value. This spte may be non-preset. If no 17996e8eb206SDavid Matlack * walk could be performed, returns NULL and *spte does not contain valid data. 18006e8eb206SDavid Matlack * 18016e8eb206SDavid Matlack * Contract: 18026e8eb206SDavid Matlack * - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. 18036e8eb206SDavid Matlack * - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end. 18046e8eb206SDavid Matlack * 18056e8eb206SDavid Matlack * WARNING: This function is only intended to be called during fast_page_fault. 
18066e8eb206SDavid Matlack */ 18076e8eb206SDavid Matlack u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, 18086e8eb206SDavid Matlack u64 *spte) 18096e8eb206SDavid Matlack { 18106e8eb206SDavid Matlack struct tdp_iter iter; 18116e8eb206SDavid Matlack struct kvm_mmu *mmu = vcpu->arch.mmu; 18126e8eb206SDavid Matlack gfn_t gfn = addr >> PAGE_SHIFT; 18136e8eb206SDavid Matlack tdp_ptep_t sptep = NULL; 18146e8eb206SDavid Matlack 18156e8eb206SDavid Matlack tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { 18166e8eb206SDavid Matlack *spte = iter.old_spte; 18176e8eb206SDavid Matlack sptep = iter.sptep; 18186e8eb206SDavid Matlack } 18196e8eb206SDavid Matlack 18206e8eb206SDavid Matlack /* 18216e8eb206SDavid Matlack * Perform the rcu_dereference to get the raw spte pointer value since 18226e8eb206SDavid Matlack * we are passing it up to fast_page_fault, which is shared with the 18236e8eb206SDavid Matlack * legacy MMU and thus does not retain the TDP MMU-specific __rcu 18246e8eb206SDavid Matlack * annotation. 18256e8eb206SDavid Matlack * 18266e8eb206SDavid Matlack * This is safe since fast_page_fault obeys the contracts of this 18276e8eb206SDavid Matlack * function as well as all TDP MMU contracts around modifying SPTEs 18286e8eb206SDavid Matlack * outside of mmu_lock. 18296e8eb206SDavid Matlack */ 18306e8eb206SDavid Matlack return rcu_dereference(sptep); 18316e8eb206SDavid Matlack } 1832