1fe5db27dSBen Gardon // SPDX-License-Identifier: GPL-2.0 28d20bd63SSean Christopherson #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 3fe5db27dSBen Gardon 402c00b3aSBen Gardon #include "mmu.h" 502c00b3aSBen Gardon #include "mmu_internal.h" 6bb18842eSBen Gardon #include "mmutrace.h" 72f2fad08SBen Gardon #include "tdp_iter.h" 8fe5db27dSBen Gardon #include "tdp_mmu.h" 902c00b3aSBen Gardon #include "spte.h" 10fe5db27dSBen Gardon 119a77daacSBen Gardon #include <asm/cmpxchg.h> 1233dd3574SBen Gardon #include <trace/events/kvm.h> 1333dd3574SBen Gardon 14fe5db27dSBen Gardon /* Initializes the TDP MMU for the VM, if enabled. */ 15a1a39128SPaolo Bonzini int kvm_mmu_init_tdp_mmu(struct kvm *kvm) 16fe5db27dSBen Gardon { 17a1a39128SPaolo Bonzini struct workqueue_struct *wq; 18a1a39128SPaolo Bonzini 19a1a39128SPaolo Bonzini wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0); 20a1a39128SPaolo Bonzini if (!wq) 21a1a39128SPaolo Bonzini return -ENOMEM; 22fe5db27dSBen Gardon 2302c00b3aSBen Gardon INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots); 249a77daacSBen Gardon spin_lock_init(&kvm->arch.tdp_mmu_pages_lock); 25a1a39128SPaolo Bonzini kvm->arch.tdp_mmu_zap_wq = wq; 26a1a39128SPaolo Bonzini return 1; 27fe5db27dSBen Gardon } 28fe5db27dSBen Gardon 29226b8c8fSSean Christopherson /* Arbitrarily returns true so that this may be used in if statements. */ 30226b8c8fSSean Christopherson static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm, 316103bc07SBen Gardon bool shared) 326103bc07SBen Gardon { 336103bc07SBen Gardon if (shared) 346103bc07SBen Gardon lockdep_assert_held_read(&kvm->mmu_lock); 356103bc07SBen Gardon else 366103bc07SBen Gardon lockdep_assert_held_write(&kvm->mmu_lock); 37226b8c8fSSean Christopherson 38226b8c8fSSean Christopherson return true; 396103bc07SBen Gardon } 406103bc07SBen Gardon 41fe5db27dSBen Gardon void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) 42fe5db27dSBen Gardon { 43edbdb43fSSean Christopherson /* 44edbdb43fSSean Christopherson * Invalidate all roots, which besides the obvious, schedules all roots 45edbdb43fSSean Christopherson * for zapping and thus puts the TDP MMU's reference to each root, i.e. 46edbdb43fSSean Christopherson * ultimately frees all roots. 47edbdb43fSSean Christopherson */ 48edbdb43fSSean Christopherson kvm_tdp_mmu_invalidate_all_roots(kvm); 49edbdb43fSSean Christopherson 50edbdb43fSSean Christopherson /* 51edbdb43fSSean Christopherson * Destroying a workqueue also first flushes the workqueue, i.e. no 52edbdb43fSSean Christopherson * need to invoke kvm_tdp_mmu_zap_invalidated_roots(). 53edbdb43fSSean Christopherson */ 5422b94c4bSPaolo Bonzini destroy_workqueue(kvm->arch.tdp_mmu_zap_wq); 5522b94c4bSPaolo Bonzini 56d25ceb92SSean Christopherson WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages)); 5702c00b3aSBen Gardon WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); 587cca2d0bSBen Gardon 597cca2d0bSBen Gardon /* 607cca2d0bSBen Gardon * Ensure that all the outstanding RCU callbacks to free shadow pages 6122b94c4bSPaolo Bonzini * can run before the VM is torn down. Work items on tdp_mmu_zap_wq 6222b94c4bSPaolo Bonzini * can call kvm_tdp_mmu_put_root and create new callbacks. 637cca2d0bSBen Gardon */ 647cca2d0bSBen Gardon rcu_barrier(); 6502c00b3aSBen Gardon } 6602c00b3aSBen Gardon 672bdb3d84SBen Gardon static void tdp_mmu_free_sp(struct kvm_mmu_page *sp) 68a889ea54SBen Gardon { 692bdb3d84SBen Gardon free_page((unsigned long)sp->spt); 702bdb3d84SBen Gardon kmem_cache_free(mmu_page_header_cache, sp); 71a889ea54SBen Gardon } 72a889ea54SBen Gardon 73c0e64238SBen Gardon /* 74c0e64238SBen Gardon * This is called through call_rcu in order to free TDP page table memory 75c0e64238SBen Gardon * safely with respect to other kernel threads that may be operating on 76c0e64238SBen Gardon * the memory. 77c0e64238SBen Gardon * By only accessing TDP MMU page table memory in an RCU read critical 78c0e64238SBen Gardon * section, and freeing it after a grace period, lockless access to that 79c0e64238SBen Gardon * memory won't use it after it is freed. 80c0e64238SBen Gardon */ 81c0e64238SBen Gardon static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head) 82a889ea54SBen Gardon { 83c0e64238SBen Gardon struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page, 84c0e64238SBen Gardon rcu_head); 85a889ea54SBen Gardon 86c0e64238SBen Gardon tdp_mmu_free_sp(sp); 87a889ea54SBen Gardon } 88a889ea54SBen Gardon 89e2b5b21dSSean Christopherson static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root, 90e2b5b21dSSean Christopherson bool shared); 91e2b5b21dSSean Christopherson 9222b94c4bSPaolo Bonzini static void tdp_mmu_zap_root_work(struct work_struct *work) 9322b94c4bSPaolo Bonzini { 9422b94c4bSPaolo Bonzini struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page, 9522b94c4bSPaolo Bonzini tdp_mmu_async_work); 9622b94c4bSPaolo Bonzini struct kvm *kvm = root->tdp_mmu_async_data; 9722b94c4bSPaolo Bonzini 9822b94c4bSPaolo Bonzini read_lock(&kvm->mmu_lock); 9922b94c4bSPaolo Bonzini 10022b94c4bSPaolo Bonzini /* 10122b94c4bSPaolo Bonzini * A TLB flush is not necessary as KVM performs a local TLB flush when 10222b94c4bSPaolo Bonzini * allocating a new root (see kvm_mmu_load()), and when migrating vCPU 10322b94c4bSPaolo Bonzini * to a different pCPU. Note, the local TLB flush on reuse also 10422b94c4bSPaolo Bonzini * invalidates any paging-structure-cache entries, i.e. TLB entries for 10522b94c4bSPaolo Bonzini * intermediate paging structures, that may be zapped, as such entries 10622b94c4bSPaolo Bonzini * are associated with the ASID on both VMX and SVM. 10722b94c4bSPaolo Bonzini */ 10822b94c4bSPaolo Bonzini tdp_mmu_zap_root(kvm, root, true); 10922b94c4bSPaolo Bonzini 11022b94c4bSPaolo Bonzini /* 11122b94c4bSPaolo Bonzini * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for 11222b94c4bSPaolo Bonzini * avoiding an infinite loop. By design, the root is reachable while 11322b94c4bSPaolo Bonzini * it's being asynchronously zapped, thus a different task can put its 11422b94c4bSPaolo Bonzini * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an 11522b94c4bSPaolo Bonzini * asynchronously zapped root is unavoidable. 11622b94c4bSPaolo Bonzini */ 11722b94c4bSPaolo Bonzini kvm_tdp_mmu_put_root(kvm, root, true); 11822b94c4bSPaolo Bonzini 11922b94c4bSPaolo Bonzini read_unlock(&kvm->mmu_lock); 12022b94c4bSPaolo Bonzini } 12122b94c4bSPaolo Bonzini 12222b94c4bSPaolo Bonzini static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root) 12322b94c4bSPaolo Bonzini { 12422b94c4bSPaolo Bonzini root->tdp_mmu_async_data = kvm; 12522b94c4bSPaolo Bonzini INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work); 12622b94c4bSPaolo Bonzini queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work); 12722b94c4bSPaolo Bonzini } 12822b94c4bSPaolo Bonzini 1296103bc07SBen Gardon void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, 1306103bc07SBen Gardon bool shared) 1312bdb3d84SBen Gardon { 1326103bc07SBen Gardon kvm_lockdep_assert_mmu_lock_held(kvm, shared); 1332bdb3d84SBen Gardon 13411cccf5cSBen Gardon if (!refcount_dec_and_test(&root->tdp_mmu_root_count)) 1352bdb3d84SBen Gardon return; 1362bdb3d84SBen Gardon 1378351779cSPaolo Bonzini /* 138edbdb43fSSean Christopherson * The TDP MMU itself holds a reference to each root until the root is 139edbdb43fSSean Christopherson * explicitly invalidated, i.e. the final reference should be never be 140edbdb43fSSean Christopherson * put for a valid root. 1418351779cSPaolo Bonzini */ 142edbdb43fSSean Christopherson KVM_BUG_ON(!is_tdp_mmu_page(root) || !root->role.invalid, kvm); 1438351779cSPaolo Bonzini 144c0e64238SBen Gardon spin_lock(&kvm->arch.tdp_mmu_pages_lock); 145c0e64238SBen Gardon list_del_rcu(&root->link); 146c0e64238SBen Gardon spin_unlock(&kvm->arch.tdp_mmu_pages_lock); 147c0e64238SBen Gardon call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback); 148a889ea54SBen Gardon } 149a889ea54SBen Gardon 150cfc10997SBen Gardon /* 151d62007edSSean Christopherson * Returns the next root after @prev_root (or the first root if @prev_root is 152d62007edSSean Christopherson * NULL). A reference to the returned root is acquired, and the reference to 153d62007edSSean Christopherson * @prev_root is released (the caller obviously must hold a reference to 154d62007edSSean Christopherson * @prev_root if it's non-NULL). 155d62007edSSean Christopherson * 156d62007edSSean Christopherson * If @only_valid is true, invalid roots are skipped. 157d62007edSSean Christopherson * 158d62007edSSean Christopherson * Returns NULL if the end of tdp_mmu_roots was reached. 159cfc10997SBen Gardon */ 160cfc10997SBen Gardon static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, 1616103bc07SBen Gardon struct kvm_mmu_page *prev_root, 162d62007edSSean Christopherson bool shared, bool only_valid) 163a889ea54SBen Gardon { 164a889ea54SBen Gardon struct kvm_mmu_page *next_root; 165a889ea54SBen Gardon 166c0e64238SBen Gardon rcu_read_lock(); 167c0e64238SBen Gardon 168cfc10997SBen Gardon if (prev_root) 169c0e64238SBen Gardon next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, 170c0e64238SBen Gardon &prev_root->link, 171c0e64238SBen Gardon typeof(*prev_root), link); 172cfc10997SBen Gardon else 173c0e64238SBen Gardon next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots, 174cfc10997SBen Gardon typeof(*next_root), link); 175cfc10997SBen Gardon 17604dc4e6cSSean Christopherson while (next_root) { 177d62007edSSean Christopherson if ((!only_valid || !next_root->role.invalid) && 178ad6d6b94SJinrong Liang kvm_tdp_mmu_get_root(next_root)) 17904dc4e6cSSean Christopherson break; 18004dc4e6cSSean Christopherson 181c0e64238SBen Gardon next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, 182c0e64238SBen Gardon &next_root->link, typeof(*next_root), link); 18304dc4e6cSSean Christopherson } 184fb101293SBen Gardon 185c0e64238SBen Gardon rcu_read_unlock(); 186cfc10997SBen Gardon 187cfc10997SBen Gardon if (prev_root) 1886103bc07SBen Gardon kvm_tdp_mmu_put_root(kvm, prev_root, shared); 189cfc10997SBen Gardon 190a889ea54SBen Gardon return next_root; 191a889ea54SBen Gardon } 192a889ea54SBen Gardon 193a889ea54SBen Gardon /* 194a889ea54SBen Gardon * Note: this iterator gets and puts references to the roots it iterates over. 195a889ea54SBen Gardon * This makes it safe to release the MMU lock and yield within the loop, but 196a889ea54SBen Gardon * if exiting the loop early, the caller must drop the reference to the most 197a889ea54SBen Gardon * recent root. (Unless keeping a live reference is desirable.) 1986103bc07SBen Gardon * 1996103bc07SBen Gardon * If shared is set, this function is operating under the MMU lock in read 2006103bc07SBen Gardon * mode. In the unlikely event that this thread must free a root, the lock 2016103bc07SBen Gardon * will be temporarily dropped and reacquired in write mode. 202a889ea54SBen Gardon */ 203d62007edSSean Christopherson #define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\ 204d62007edSSean Christopherson for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid); \ 205cfc10997SBen Gardon _root; \ 206d62007edSSean Christopherson _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid)) \ 207614f6970SPaolo Bonzini if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) && \ 208614f6970SPaolo Bonzini kvm_mmu_page_as_id(_root) != _as_id) { \ 209a3f15bdaSSean Christopherson } else 210a889ea54SBen Gardon 211d62007edSSean Christopherson #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \ 212d62007edSSean Christopherson __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true) 213d62007edSSean Christopherson 214614f6970SPaolo Bonzini #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \ 215614f6970SPaolo Bonzini __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false) 216d62007edSSean Christopherson 217226b8c8fSSean Christopherson /* 218226b8c8fSSean Christopherson * Iterate over all TDP MMU roots. Requires that mmu_lock be held for write, 219226b8c8fSSean Christopherson * the implication being that any flow that holds mmu_lock for read is 220226b8c8fSSean Christopherson * inherently yield-friendly and should use the yield-safe variant above. 221226b8c8fSSean Christopherson * Holding mmu_lock for write obviates the need for RCU protection as the list 222226b8c8fSSean Christopherson * is guaranteed to be stable. 223226b8c8fSSean Christopherson */ 224a3f15bdaSSean Christopherson #define for_each_tdp_mmu_root(_kvm, _root, _as_id) \ 225226b8c8fSSean Christopherson list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \ 226226b8c8fSSean Christopherson if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \ 227226b8c8fSSean Christopherson kvm_mmu_page_as_id(_root) != _as_id) { \ 228a3f15bdaSSean Christopherson } else 22902c00b3aSBen Gardon 230a82070b6SDavid Matlack static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu) 23102c00b3aSBen Gardon { 23202c00b3aSBen Gardon struct kvm_mmu_page *sp; 23302c00b3aSBen Gardon 23402c00b3aSBen Gardon sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); 23502c00b3aSBen Gardon sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache); 236a82070b6SDavid Matlack 237a82070b6SDavid Matlack return sp; 238a82070b6SDavid Matlack } 239a82070b6SDavid Matlack 240c10743a1SSean Christopherson static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep, 241c10743a1SSean Christopherson gfn_t gfn, union kvm_mmu_page_role role) 242a82070b6SDavid Matlack { 24355c510e2SSean Christopherson INIT_LIST_HEAD(&sp->possible_nx_huge_page_link); 244428e9216SSean Christopherson 24502c00b3aSBen Gardon set_page_private(virt_to_page(sp->spt), (unsigned long)sp); 24602c00b3aSBen Gardon 247a3aca4deSDavid Matlack sp->role = role; 24802c00b3aSBen Gardon sp->gfn = gfn; 249c10743a1SSean Christopherson sp->ptep = sptep; 25002c00b3aSBen Gardon sp->tdp_mmu_page = true; 25102c00b3aSBen Gardon 25233dd3574SBen Gardon trace_kvm_mmu_get_page(sp, true); 25302c00b3aSBen Gardon } 25402c00b3aSBen Gardon 255a82070b6SDavid Matlack static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp, 256a3aca4deSDavid Matlack struct tdp_iter *iter) 257a3aca4deSDavid Matlack { 258a3aca4deSDavid Matlack struct kvm_mmu_page *parent_sp; 259a3aca4deSDavid Matlack union kvm_mmu_page_role role; 260a3aca4deSDavid Matlack 261a3aca4deSDavid Matlack parent_sp = sptep_to_sp(rcu_dereference(iter->sptep)); 262a3aca4deSDavid Matlack 263a3aca4deSDavid Matlack role = parent_sp->role; 264a3aca4deSDavid Matlack role.level--; 265a3aca4deSDavid Matlack 266c10743a1SSean Christopherson tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role); 267a3aca4deSDavid Matlack } 268a3aca4deSDavid Matlack 2696e6ec584SSean Christopherson hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) 27002c00b3aSBen Gardon { 2717a458f0eSPaolo Bonzini union kvm_mmu_page_role role = vcpu->arch.mmu->root_role; 27202c00b3aSBen Gardon struct kvm *kvm = vcpu->kvm; 27302c00b3aSBen Gardon struct kvm_mmu_page *root; 27402c00b3aSBen Gardon 2756e6ec584SSean Christopherson lockdep_assert_held_write(&kvm->mmu_lock); 27602c00b3aSBen Gardon 27704dc4e6cSSean Christopherson /* 27804dc4e6cSSean Christopherson * Check for an existing root before allocating a new one. Note, the 27904dc4e6cSSean Christopherson * role check prevents consuming an invalid root. 28004dc4e6cSSean Christopherson */ 281a3f15bdaSSean Christopherson for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) { 282fb101293SBen Gardon if (root->role.word == role.word && 283ad6d6b94SJinrong Liang kvm_tdp_mmu_get_root(root)) 2846e6ec584SSean Christopherson goto out; 28502c00b3aSBen Gardon } 28602c00b3aSBen Gardon 287a82070b6SDavid Matlack root = tdp_mmu_alloc_sp(vcpu); 288c10743a1SSean Christopherson tdp_mmu_init_sp(root, NULL, 0, role); 289a82070b6SDavid Matlack 290edbdb43fSSean Christopherson /* 291edbdb43fSSean Christopherson * TDP MMU roots are kept until they are explicitly invalidated, either 292edbdb43fSSean Christopherson * by a memslot update or by the destruction of the VM. Initialize the 293edbdb43fSSean Christopherson * refcount to two; one reference for the vCPU, and one reference for 294edbdb43fSSean Christopherson * the TDP MMU itself, which is held until the root is invalidated and 295edbdb43fSSean Christopherson * is ultimately put by tdp_mmu_zap_root_work(). 296edbdb43fSSean Christopherson */ 297edbdb43fSSean Christopherson refcount_set(&root->tdp_mmu_root_count, 2); 29802c00b3aSBen Gardon 299c0e64238SBen Gardon spin_lock(&kvm->arch.tdp_mmu_pages_lock); 300c0e64238SBen Gardon list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots); 301c0e64238SBen Gardon spin_unlock(&kvm->arch.tdp_mmu_pages_lock); 30202c00b3aSBen Gardon 3036e6ec584SSean Christopherson out: 30402c00b3aSBen Gardon return __pa(root->spt); 305fe5db27dSBen Gardon } 3062f2fad08SBen Gardon 3072f2fad08SBen Gardon static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, 3089a77daacSBen Gardon u64 old_spte, u64 new_spte, int level, 3099a77daacSBen Gardon bool shared); 3102f2fad08SBen Gardon 31143a063caSYosry Ahmed static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) 31243a063caSYosry Ahmed { 31343a063caSYosry Ahmed kvm_account_pgtable_pages((void *)sp->spt, +1); 314d25ceb92SSean Christopherson atomic64_inc(&kvm->arch.tdp_mmu_pages); 31543a063caSYosry Ahmed } 31643a063caSYosry Ahmed 31743a063caSYosry Ahmed static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) 31843a063caSYosry Ahmed { 31943a063caSYosry Ahmed kvm_account_pgtable_pages((void *)sp->spt, -1); 320d25ceb92SSean Christopherson atomic64_dec(&kvm->arch.tdp_mmu_pages); 32143a063caSYosry Ahmed } 32243a063caSYosry Ahmed 3232f2fad08SBen Gardon /** 324c298a30cSDavid Matlack * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages 325a9442f59SBen Gardon * 326a9442f59SBen Gardon * @kvm: kvm instance 327a9442f59SBen Gardon * @sp: the page to be removed 3289a77daacSBen Gardon * @shared: This operation may not be running under the exclusive use of 3299a77daacSBen Gardon * the MMU lock and the operation must synchronize with other 3309a77daacSBen Gardon * threads that might be adding or removing pages. 331a9442f59SBen Gardon */ 332c298a30cSDavid Matlack static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp, 3339a77daacSBen Gardon bool shared) 334a9442f59SBen Gardon { 33543a063caSYosry Ahmed tdp_unaccount_mmu_page(kvm, sp); 336d25ceb92SSean Christopherson 337d25ceb92SSean Christopherson if (!sp->nx_huge_page_disallowed) 338d25ceb92SSean Christopherson return; 339d25ceb92SSean Christopherson 3409a77daacSBen Gardon if (shared) 3419a77daacSBen Gardon spin_lock(&kvm->arch.tdp_mmu_pages_lock); 3429a77daacSBen Gardon else 343a9442f59SBen Gardon lockdep_assert_held_write(&kvm->mmu_lock); 344a9442f59SBen Gardon 34561f94478SSean Christopherson sp->nx_huge_page_disallowed = false; 34661f94478SSean Christopherson untrack_possible_nx_huge_page(kvm, sp); 3479a77daacSBen Gardon 3489a77daacSBen Gardon if (shared) 3499a77daacSBen Gardon spin_unlock(&kvm->arch.tdp_mmu_pages_lock); 350a9442f59SBen Gardon } 351a9442f59SBen Gardon 352a9442f59SBen Gardon /** 3530f53dfa3SDavid Matlack * handle_removed_pt() - handle a page table removed from the TDP structure 354a066e61fSBen Gardon * 355a066e61fSBen Gardon * @kvm: kvm instance 356a066e61fSBen Gardon * @pt: the page removed from the paging structure 3579a77daacSBen Gardon * @shared: This operation may not be running under the exclusive use 3589a77daacSBen Gardon * of the MMU lock and the operation must synchronize with other 3599a77daacSBen Gardon * threads that might be modifying SPTEs. 360a066e61fSBen Gardon * 361a066e61fSBen Gardon * Given a page table that has been removed from the TDP paging structure, 362a066e61fSBen Gardon * iterates through the page table to clear SPTEs and free child page tables. 36370fb3e41SBen Gardon * 36470fb3e41SBen Gardon * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU 36570fb3e41SBen Gardon * protection. Since this thread removed it from the paging structure, 36670fb3e41SBen Gardon * this thread will be responsible for ensuring the page is freed. Hence the 36770fb3e41SBen Gardon * early rcu_dereferences in the function. 368a066e61fSBen Gardon */ 3690f53dfa3SDavid Matlack static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) 370a066e61fSBen Gardon { 37170fb3e41SBen Gardon struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt)); 372a066e61fSBen Gardon int level = sp->role.level; 373e25f0e0cSBen Gardon gfn_t base_gfn = sp->gfn; 374a066e61fSBen Gardon int i; 375a066e61fSBen Gardon 376a066e61fSBen Gardon trace_kvm_mmu_prepare_zap_page(sp); 377a066e61fSBen Gardon 378c298a30cSDavid Matlack tdp_mmu_unlink_sp(kvm, sp, shared); 379a066e61fSBen Gardon 3802ca3129eSSean Christopherson for (i = 0; i < SPTE_ENT_PER_PAGE; i++) { 381ba3a6120SSean Christopherson tdp_ptep_t sptep = pt + i; 382574c3c55SBen Gardon gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level); 383ba3a6120SSean Christopherson u64 old_spte; 3849a77daacSBen Gardon 3859a77daacSBen Gardon if (shared) { 386e25f0e0cSBen Gardon /* 387e25f0e0cSBen Gardon * Set the SPTE to a nonpresent value that other 388e25f0e0cSBen Gardon * threads will not overwrite. If the SPTE was 389e25f0e0cSBen Gardon * already marked as removed then another thread 390e25f0e0cSBen Gardon * handling a page fault could overwrite it, so 391e25f0e0cSBen Gardon * set the SPTE until it is set from some other 392e25f0e0cSBen Gardon * value to the removed SPTE value. 393e25f0e0cSBen Gardon */ 394e25f0e0cSBen Gardon for (;;) { 395ba3a6120SSean Christopherson old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE); 396ba3a6120SSean Christopherson if (!is_removed_spte(old_spte)) 397e25f0e0cSBen Gardon break; 398e25f0e0cSBen Gardon cpu_relax(); 399e25f0e0cSBen Gardon } 4009a77daacSBen Gardon } else { 4018df9f1afSSean Christopherson /* 4028df9f1afSSean Christopherson * If the SPTE is not MMU-present, there is no backing 4038df9f1afSSean Christopherson * page associated with the SPTE and so no side effects 4048df9f1afSSean Christopherson * that need to be recorded, and exclusive ownership of 4058df9f1afSSean Christopherson * mmu_lock ensures the SPTE can't be made present. 4068df9f1afSSean Christopherson * Note, zapping MMIO SPTEs is also unnecessary as they 4078df9f1afSSean Christopherson * are guarded by the memslots generation, not by being 4088df9f1afSSean Christopherson * unreachable. 4098df9f1afSSean Christopherson */ 410ba3a6120SSean Christopherson old_spte = kvm_tdp_mmu_read_spte(sptep); 411ba3a6120SSean Christopherson if (!is_shadow_present_pte(old_spte)) 4128df9f1afSSean Christopherson continue; 413e25f0e0cSBen Gardon 414e25f0e0cSBen Gardon /* 415ba3a6120SSean Christopherson * Use the common helper instead of a raw WRITE_ONCE as 416ba3a6120SSean Christopherson * the SPTE needs to be updated atomically if it can be 417ba3a6120SSean Christopherson * modified by a different vCPU outside of mmu_lock. 418ba3a6120SSean Christopherson * Even though the parent SPTE is !PRESENT, the TLB 419ba3a6120SSean Christopherson * hasn't yet been flushed, and both Intel and AMD 420ba3a6120SSean Christopherson * document that A/D assists can use upper-level PxE 421ba3a6120SSean Christopherson * entries that are cached in the TLB, i.e. the CPU can 422ba3a6120SSean Christopherson * still access the page and mark it dirty. 423ba3a6120SSean Christopherson * 424ba3a6120SSean Christopherson * No retry is needed in the atomic update path as the 425ba3a6120SSean Christopherson * sole concern is dropping a Dirty bit, i.e. no other 426ba3a6120SSean Christopherson * task can zap/remove the SPTE as mmu_lock is held for 427ba3a6120SSean Christopherson * write. Marking the SPTE as a removed SPTE is not 428ba3a6120SSean Christopherson * strictly necessary for the same reason, but using 429ba3a6120SSean Christopherson * the remove SPTE value keeps the shared/exclusive 430ba3a6120SSean Christopherson * paths consistent and allows the handle_changed_spte() 431ba3a6120SSean Christopherson * call below to hardcode the new value to REMOVED_SPTE. 432ba3a6120SSean Christopherson * 433ba3a6120SSean Christopherson * Note, even though dropping a Dirty bit is the only 434ba3a6120SSean Christopherson * scenario where a non-atomic update could result in a 435ba3a6120SSean Christopherson * functional bug, simply checking the Dirty bit isn't 436ba3a6120SSean Christopherson * sufficient as a fast page fault could read the upper 437ba3a6120SSean Christopherson * level SPTE before it is zapped, and then make this 438ba3a6120SSean Christopherson * target SPTE writable, resume the guest, and set the 439ba3a6120SSean Christopherson * Dirty bit between reading the SPTE above and writing 440ba3a6120SSean Christopherson * it here. 441e25f0e0cSBen Gardon */ 442ba3a6120SSean Christopherson old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, 443ba3a6120SSean Christopherson REMOVED_SPTE, level); 4449a77daacSBen Gardon } 445e25f0e0cSBen Gardon handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn, 446ba3a6120SSean Christopherson old_spte, REMOVED_SPTE, level, shared); 447a066e61fSBen Gardon } 448a066e61fSBen Gardon 4497cca2d0bSBen Gardon call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback); 450a066e61fSBen Gardon } 451a066e61fSBen Gardon 452a066e61fSBen Gardon /** 45340fa907eSVipin Sharma * handle_changed_spte - handle bookkeeping associated with an SPTE change 4542f2fad08SBen Gardon * @kvm: kvm instance 4552f2fad08SBen Gardon * @as_id: the address space of the paging structure the SPTE was a part of 4562f2fad08SBen Gardon * @gfn: the base GFN that was mapped by the SPTE 4572f2fad08SBen Gardon * @old_spte: The value of the SPTE before the change 4582f2fad08SBen Gardon * @new_spte: The value of the SPTE after the change 4592f2fad08SBen Gardon * @level: the level of the PT the SPTE is part of in the paging structure 4609a77daacSBen Gardon * @shared: This operation may not be running under the exclusive use of 4619a77daacSBen Gardon * the MMU lock and the operation must synchronize with other 4629a77daacSBen Gardon * threads that might be modifying SPTEs. 4632f2fad08SBen Gardon * 4641f997345SVipin Sharma * Handle bookkeeping that might result from the modification of a SPTE. Note, 4651f997345SVipin Sharma * dirty logging updates are handled in common code, not here (see make_spte() 4661f997345SVipin Sharma * and fast_pf_fix_direct_spte()). 4672f2fad08SBen Gardon */ 46840fa907eSVipin Sharma static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, 4699a77daacSBen Gardon u64 old_spte, u64 new_spte, int level, 4709a77daacSBen Gardon bool shared) 4712f2fad08SBen Gardon { 4722f2fad08SBen Gardon bool was_present = is_shadow_present_pte(old_spte); 4732f2fad08SBen Gardon bool is_present = is_shadow_present_pte(new_spte); 4742f2fad08SBen Gardon bool was_leaf = was_present && is_last_spte(old_spte, level); 4752f2fad08SBen Gardon bool is_leaf = is_present && is_last_spte(new_spte, level); 4762f2fad08SBen Gardon bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); 4772f2fad08SBen Gardon 47820ba462dSSean Christopherson WARN_ON_ONCE(level > PT64_ROOT_MAX_LEVEL); 47920ba462dSSean Christopherson WARN_ON_ONCE(level < PG_LEVEL_4K); 48020ba462dSSean Christopherson WARN_ON_ONCE(gfn & (KVM_PAGES_PER_HPAGE(level) - 1)); 4812f2fad08SBen Gardon 4822f2fad08SBen Gardon /* 4832f2fad08SBen Gardon * If this warning were to trigger it would indicate that there was a 4842f2fad08SBen Gardon * missing MMU notifier or a race with some notifier handler. 4852f2fad08SBen Gardon * A present, leaf SPTE should never be directly replaced with another 486d9f6e12fSIngo Molnar * present leaf SPTE pointing to a different PFN. A notifier handler 4872f2fad08SBen Gardon * should be zapping the SPTE before the main MM's page table is 4882f2fad08SBen Gardon * changed, or the SPTE should be zeroed, and the TLBs flushed by the 4892f2fad08SBen Gardon * thread before replacement. 4902f2fad08SBen Gardon */ 4912f2fad08SBen Gardon if (was_leaf && is_leaf && pfn_changed) { 4922f2fad08SBen Gardon pr_err("Invalid SPTE change: cannot replace a present leaf\n" 4932f2fad08SBen Gardon "SPTE with another present leaf SPTE mapping a\n" 4942f2fad08SBen Gardon "different PFN!\n" 4952f2fad08SBen Gardon "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d", 4962f2fad08SBen Gardon as_id, gfn, old_spte, new_spte, level); 4972f2fad08SBen Gardon 4982f2fad08SBen Gardon /* 4992f2fad08SBen Gardon * Crash the host to prevent error propagation and guest data 500d9f6e12fSIngo Molnar * corruption. 5012f2fad08SBen Gardon */ 5022f2fad08SBen Gardon BUG(); 5032f2fad08SBen Gardon } 5042f2fad08SBen Gardon 5052f2fad08SBen Gardon if (old_spte == new_spte) 5062f2fad08SBen Gardon return; 5072f2fad08SBen Gardon 508b9a98c34SBen Gardon trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte); 509b9a98c34SBen Gardon 510115111efSDavid Matlack if (is_leaf) 511115111efSDavid Matlack check_spte_writable_invariants(new_spte); 512115111efSDavid Matlack 5132f2fad08SBen Gardon /* 5142f2fad08SBen Gardon * The only times a SPTE should be changed from a non-present to 5152f2fad08SBen Gardon * non-present state is when an MMIO entry is installed/modified/ 5162f2fad08SBen Gardon * removed. In that case, there is nothing to do here. 5172f2fad08SBen Gardon */ 5182f2fad08SBen Gardon if (!was_present && !is_present) { 5192f2fad08SBen Gardon /* 52008f07c80SBen Gardon * If this change does not involve a MMIO SPTE or removed SPTE, 52108f07c80SBen Gardon * it is unexpected. Log the change, though it should not 52208f07c80SBen Gardon * impact the guest since both the former and current SPTEs 52308f07c80SBen Gardon * are nonpresent. 5242f2fad08SBen Gardon */ 52520ba462dSSean Christopherson if (WARN_ON_ONCE(!is_mmio_spte(old_spte) && 52608f07c80SBen Gardon !is_mmio_spte(new_spte) && 52708f07c80SBen Gardon !is_removed_spte(new_spte))) 5282f2fad08SBen Gardon pr_err("Unexpected SPTE change! Nonpresent SPTEs\n" 5292f2fad08SBen Gardon "should not be replaced with another,\n" 5302f2fad08SBen Gardon "different nonpresent SPTE, unless one or both\n" 53108f07c80SBen Gardon "are MMIO SPTEs, or the new SPTE is\n" 53208f07c80SBen Gardon "a temporary removed SPTE.\n" 5332f2fad08SBen Gardon "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d", 5342f2fad08SBen Gardon as_id, gfn, old_spte, new_spte, level); 5352f2fad08SBen Gardon return; 5362f2fad08SBen Gardon } 5372f2fad08SBen Gardon 53871f51d2cSMingwei Zhang if (is_leaf != was_leaf) 53971f51d2cSMingwei Zhang kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1); 5402f2fad08SBen Gardon 5412f2fad08SBen Gardon if (was_leaf && is_dirty_spte(old_spte) && 54264bb2769SSean Christopherson (!is_present || !is_dirty_spte(new_spte) || pfn_changed)) 5432f2fad08SBen Gardon kvm_set_pfn_dirty(spte_to_pfn(old_spte)); 5442f2fad08SBen Gardon 5452f2fad08SBen Gardon /* 5462f2fad08SBen Gardon * Recursively handle child PTs if the change removed a subtree from 547c8e5a0d0SSean Christopherson * the paging structure. Note the WARN on the PFN changing without the 548c8e5a0d0SSean Christopherson * SPTE being converted to a hugepage (leaf) or being zapped. Shadow 549c8e5a0d0SSean Christopherson * pages are kernel allocations and should never be migrated. 5502f2fad08SBen Gardon */ 551c8e5a0d0SSean Christopherson if (was_present && !was_leaf && 552c8e5a0d0SSean Christopherson (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed))) 5530f53dfa3SDavid Matlack handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared); 5542f2fad08SBen Gardon 55540fa907eSVipin Sharma if (was_leaf && is_accessed_spte(old_spte) && 55640fa907eSVipin Sharma (!is_present || !is_accessed_spte(new_spte) || pfn_changed)) 55740fa907eSVipin Sharma kvm_set_pfn_accessed(spte_to_pfn(old_spte)); 5582f2fad08SBen Gardon } 559faaf05b0SBen Gardon 560fe43fa2fSBen Gardon /* 5616ccf4438SPaolo Bonzini * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically 5626ccf4438SPaolo Bonzini * and handle the associated bookkeeping. Do not mark the page dirty 56324ae4cfaSBen Gardon * in KVM's dirty bitmaps. 5649a77daacSBen Gardon * 5653255530aSDavid Matlack * If setting the SPTE fails because it has changed, iter->old_spte will be 5663255530aSDavid Matlack * refreshed to the current value of the spte. 5673255530aSDavid Matlack * 5689a77daacSBen Gardon * @kvm: kvm instance 5699a77daacSBen Gardon * @iter: a tdp_iter instance currently on the SPTE that should be set 5709a77daacSBen Gardon * @new_spte: The value the SPTE should be set to 5713e72c791SDavid Matlack * Return: 5723e72c791SDavid Matlack * * 0 - If the SPTE was set. 5733e72c791SDavid Matlack * * -EBUSY - If the SPTE cannot be set. In this case this function will have 5743e72c791SDavid Matlack * no side-effects other than setting iter->old_spte to the last 5753e72c791SDavid Matlack * known value of the spte. 5769a77daacSBen Gardon */ 5773e72c791SDavid Matlack static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm, 5789a77daacSBen Gardon struct tdp_iter *iter, 5799a77daacSBen Gardon u64 new_spte) 5809a77daacSBen Gardon { 5813255530aSDavid Matlack u64 *sptep = rcu_dereference(iter->sptep); 5823255530aSDavid Matlack 583396fd74dSSean Christopherson /* 584396fd74dSSean Christopherson * The caller is responsible for ensuring the old SPTE is not a REMOVED 585396fd74dSSean Christopherson * SPTE. KVM should never attempt to zap or manipulate a REMOVED SPTE, 586396fd74dSSean Christopherson * and pre-checking before inserting a new SPTE is advantageous as it 587396fd74dSSean Christopherson * avoids unnecessary work. 588396fd74dSSean Christopherson */ 589396fd74dSSean Christopherson WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte)); 5903a0f64deSSean Christopherson 5919a77daacSBen Gardon lockdep_assert_held_read(&kvm->mmu_lock); 5929a77daacSBen Gardon 59308f07c80SBen Gardon /* 5946e8eb206SDavid Matlack * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and 59512ced095SUros Bizjak * does not hold the mmu_lock. On failure, i.e. if a different logical 59612ced095SUros Bizjak * CPU modified the SPTE, try_cmpxchg64() updates iter->old_spte with 59712ced095SUros Bizjak * the current value, so the caller operates on fresh data, e.g. if it 59812ced095SUros Bizjak * retries tdp_mmu_set_spte_atomic() 5996e8eb206SDavid Matlack */ 600aee98a68SUros Bizjak if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte)) 6013e72c791SDavid Matlack return -EBUSY; 6029a77daacSBen Gardon 60340fa907eSVipin Sharma handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, 60408889894SSean Christopherson new_spte, iter->level, true); 6059a77daacSBen Gardon 6063e72c791SDavid Matlack return 0; 6079a77daacSBen Gardon } 6089a77daacSBen Gardon 6093e72c791SDavid Matlack static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm, 61008f07c80SBen Gardon struct tdp_iter *iter) 61108f07c80SBen Gardon { 6123e72c791SDavid Matlack int ret; 6133e72c791SDavid Matlack 61408f07c80SBen Gardon /* 61508f07c80SBen Gardon * Freeze the SPTE by setting it to a special, 61608f07c80SBen Gardon * non-present value. This will stop other threads from 61708f07c80SBen Gardon * immediately installing a present entry in its place 61808f07c80SBen Gardon * before the TLBs are flushed. 61908f07c80SBen Gardon */ 6203e72c791SDavid Matlack ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE); 6213e72c791SDavid Matlack if (ret) 6223e72c791SDavid Matlack return ret; 62308f07c80SBen Gardon 6244ad980aeSHou Wenlong kvm_flush_remote_tlbs_gfn(kvm, iter->gfn, iter->level); 62508f07c80SBen Gardon 62608f07c80SBen Gardon /* 627ba3a6120SSean Christopherson * No other thread can overwrite the removed SPTE as they must either 628ba3a6120SSean Christopherson * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not 629ba3a6120SSean Christopherson * overwrite the special removed SPTE value. No bookkeeping is needed 630ba3a6120SSean Christopherson * here since the SPTE is going from non-present to non-present. Use 631ba3a6120SSean Christopherson * the raw write helper to avoid an unnecessary check on volatile bits. 63208f07c80SBen Gardon */ 633ba3a6120SSean Christopherson __kvm_tdp_mmu_write_spte(iter->sptep, 0); 63408f07c80SBen Gardon 6353e72c791SDavid Matlack return 0; 63608f07c80SBen Gardon } 63708f07c80SBen Gardon 6389a77daacSBen Gardon 6399a77daacSBen Gardon /* 6400b7cc254SVipin Sharma * tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping 641626808d1SSean Christopherson * @kvm: KVM instance 642626808d1SSean Christopherson * @as_id: Address space ID, i.e. regular vs. SMM 643626808d1SSean Christopherson * @sptep: Pointer to the SPTE 644626808d1SSean Christopherson * @old_spte: The current value of the SPTE 645626808d1SSean Christopherson * @new_spte: The new value that will be set for the SPTE 646626808d1SSean Christopherson * @gfn: The base GFN that was (or will be) mapped by the SPTE 647626808d1SSean Christopherson * @level: The level _containing_ the SPTE (its parent PT's level) 648ba3a6120SSean Christopherson * 649ba3a6120SSean Christopherson * Returns the old SPTE value, which _may_ be different than @old_spte if the 650ba3a6120SSean Christopherson * SPTE had voldatile bits. 651fe43fa2fSBen Gardon */ 6520b7cc254SVipin Sharma static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep, 6530b7cc254SVipin Sharma u64 old_spte, u64 new_spte, gfn_t gfn, int level) 654faaf05b0SBen Gardon { 655531810caSBen Gardon lockdep_assert_held_write(&kvm->mmu_lock); 6563a9a4aa5SBen Gardon 65708f07c80SBen Gardon /* 658966da62aSSean Christopherson * No thread should be using this function to set SPTEs to or from the 65908f07c80SBen Gardon * temporary removed SPTE value. 66008f07c80SBen Gardon * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic 66108f07c80SBen Gardon * should be used. If operating under the MMU lock in write mode, the 66208f07c80SBen Gardon * use of the removed SPTE should not be necessary. 66308f07c80SBen Gardon */ 66420ba462dSSean Christopherson WARN_ON_ONCE(is_removed_spte(old_spte) || is_removed_spte(new_spte)); 66508f07c80SBen Gardon 666ba3a6120SSean Christopherson old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level); 667faaf05b0SBen Gardon 66840fa907eSVipin Sharma handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false); 669ba3a6120SSean Christopherson return old_spte; 670626808d1SSean Christopherson } 671626808d1SSean Christopherson 6720b7cc254SVipin Sharma static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter, 673f8e14497SBen Gardon u64 new_spte) 674f8e14497SBen Gardon { 6750b7cc254SVipin Sharma WARN_ON_ONCE(iter->yielded); 6760b7cc254SVipin Sharma iter->old_spte = tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, 6770b7cc254SVipin Sharma iter->old_spte, new_spte, 6780b7cc254SVipin Sharma iter->gfn, iter->level); 679f8e14497SBen Gardon } 680f8e14497SBen Gardon 681faaf05b0SBen Gardon #define tdp_root_for_each_pte(_iter, _root, _start, _end) \ 68277aa6075SDavid Matlack for_each_tdp_pte(_iter, _root, _start, _end) 683faaf05b0SBen Gardon 684f8e14497SBen Gardon #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \ 685f8e14497SBen Gardon tdp_root_for_each_pte(_iter, _root, _start, _end) \ 686f8e14497SBen Gardon if (!is_shadow_present_pte(_iter.old_spte) || \ 687f8e14497SBen Gardon !is_last_spte(_iter.old_spte, _iter.level)) \ 688f8e14497SBen Gardon continue; \ 689f8e14497SBen Gardon else 690f8e14497SBen Gardon 691bb18842eSBen Gardon #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \ 692*c5f2d564SSean Christopherson for_each_tdp_pte(_iter, root_to_sp(_mmu->root.hpa), _start, _end) 693bb18842eSBen Gardon 694faaf05b0SBen Gardon /* 695e28a436cSBen Gardon * Yield if the MMU lock is contended or this thread needs to return control 696e28a436cSBen Gardon * to the scheduler. 697e28a436cSBen Gardon * 698e139a34eSBen Gardon * If this function should yield and flush is set, it will perform a remote 699e139a34eSBen Gardon * TLB flush before yielding. 700e139a34eSBen Gardon * 7013a0f64deSSean Christopherson * If this function yields, iter->yielded is set and the caller must skip to 7023a0f64deSSean Christopherson * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk 7033a0f64deSSean Christopherson * over the paging structures to allow the iterator to continue its traversal 7043a0f64deSSean Christopherson * from the paging structure root. 705e28a436cSBen Gardon * 7063a0f64deSSean Christopherson * Returns true if this function yielded. 707e28a436cSBen Gardon */ 7083a0f64deSSean Christopherson static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm, 7093a0f64deSSean Christopherson struct tdp_iter *iter, 7103a0f64deSSean Christopherson bool flush, bool shared) 711a6a0b05dSBen Gardon { 71220ba462dSSean Christopherson WARN_ON_ONCE(iter->yielded); 7133a0f64deSSean Christopherson 714ed5e484bSBen Gardon /* Ensure forward progress has been made before yielding. */ 715ed5e484bSBen Gardon if (iter->next_last_level_gfn == iter->yielded_gfn) 716ed5e484bSBen Gardon return false; 717ed5e484bSBen Gardon 718531810caSBen Gardon if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { 719e139a34eSBen Gardon if (flush) 720e139a34eSBen Gardon kvm_flush_remote_tlbs(kvm); 721e139a34eSBen Gardon 722bd296779SSean Christopherson rcu_read_unlock(); 723bd296779SSean Christopherson 7246103bc07SBen Gardon if (shared) 7256103bc07SBen Gardon cond_resched_rwlock_read(&kvm->mmu_lock); 7266103bc07SBen Gardon else 727531810caSBen Gardon cond_resched_rwlock_write(&kvm->mmu_lock); 7286103bc07SBen Gardon 7297cca2d0bSBen Gardon rcu_read_lock(); 730ed5e484bSBen Gardon 73120ba462dSSean Christopherson WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn); 732ed5e484bSBen Gardon 7333a0f64deSSean Christopherson iter->yielded = true; 734a6a0b05dSBen Gardon } 735e28a436cSBen Gardon 7363a0f64deSSean Christopherson return iter->yielded; 737a6a0b05dSBen Gardon } 738a6a0b05dSBen Gardon 73986931ff7SSean Christopherson static inline gfn_t tdp_mmu_max_gfn_exclusive(void) 740e2b5b21dSSean Christopherson { 741e2b5b21dSSean Christopherson /* 74286931ff7SSean Christopherson * Bound TDP MMU walks at host.MAXPHYADDR. KVM disallows memslots with 74386931ff7SSean Christopherson * a gpa range that would exceed the max gfn, and KVM does not create 74486931ff7SSean Christopherson * MMIO SPTEs for "impossible" gfns, instead sending such accesses down 74586931ff7SSean Christopherson * the slow emulation path every time. 746e2b5b21dSSean Christopherson */ 74786931ff7SSean Christopherson return kvm_mmu_max_gfn() + 1; 748e2b5b21dSSean Christopherson } 749e2b5b21dSSean Christopherson 7501b6043e8SSean Christopherson static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root, 7511b6043e8SSean Christopherson bool shared, int zap_level) 752e2b5b21dSSean Christopherson { 753e2b5b21dSSean Christopherson struct tdp_iter iter; 754e2b5b21dSSean Christopherson 75586931ff7SSean Christopherson gfn_t end = tdp_mmu_max_gfn_exclusive(); 756e2b5b21dSSean Christopherson gfn_t start = 0; 757e2b5b21dSSean Christopherson 7581b6043e8SSean Christopherson for_each_tdp_pte_min_level(iter, root, zap_level, start, end) { 7591b6043e8SSean Christopherson retry: 7601b6043e8SSean Christopherson if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared)) 7611b6043e8SSean Christopherson continue; 7621b6043e8SSean Christopherson 7631b6043e8SSean Christopherson if (!is_shadow_present_pte(iter.old_spte)) 7641b6043e8SSean Christopherson continue; 7651b6043e8SSean Christopherson 7661b6043e8SSean Christopherson if (iter.level > zap_level) 7671b6043e8SSean Christopherson continue; 7681b6043e8SSean Christopherson 7691b6043e8SSean Christopherson if (!shared) 7700b7cc254SVipin Sharma tdp_mmu_iter_set_spte(kvm, &iter, 0); 7711b6043e8SSean Christopherson else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0)) 7721b6043e8SSean Christopherson goto retry; 7731b6043e8SSean Christopherson } 7741b6043e8SSean Christopherson } 7751b6043e8SSean Christopherson 7761b6043e8SSean Christopherson static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root, 7771b6043e8SSean Christopherson bool shared) 7781b6043e8SSean Christopherson { 7791b6043e8SSean Christopherson 7808351779cSPaolo Bonzini /* 7818351779cSPaolo Bonzini * The root must have an elevated refcount so that it's reachable via 7828351779cSPaolo Bonzini * mmu_notifier callbacks, which allows this path to yield and drop 7838351779cSPaolo Bonzini * mmu_lock. When handling an unmap/release mmu_notifier command, KVM 7848351779cSPaolo Bonzini * must drop all references to relevant pages prior to completing the 7858351779cSPaolo Bonzini * callback. Dropping mmu_lock with an unreachable root would result 7868351779cSPaolo Bonzini * in zapping SPTEs after a relevant mmu_notifier callback completes 7878351779cSPaolo Bonzini * and lead to use-after-free as zapping a SPTE triggers "writeback" of 7888351779cSPaolo Bonzini * dirty accessed bits to the SPTE's associated struct page. 7898351779cSPaolo Bonzini */ 7908351779cSPaolo Bonzini WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count)); 7918351779cSPaolo Bonzini 792e2b5b21dSSean Christopherson kvm_lockdep_assert_mmu_lock_held(kvm, shared); 793e2b5b21dSSean Christopherson 794e2b5b21dSSean Christopherson rcu_read_lock(); 795e2b5b21dSSean Christopherson 796e2b5b21dSSean Christopherson /* 7971b6043e8SSean Christopherson * To avoid RCU stalls due to recursively removing huge swaths of SPs, 7981b6043e8SSean Christopherson * split the zap into two passes. On the first pass, zap at the 1gb 7991b6043e8SSean Christopherson * level, and then zap top-level SPs on the second pass. "1gb" is not 8001b6043e8SSean Christopherson * arbitrary, as KVM must be able to zap a 1gb shadow page without 8011b6043e8SSean Christopherson * inducing a stall to allow in-place replacement with a 1gb hugepage. 8021b6043e8SSean Christopherson * 8031b6043e8SSean Christopherson * Because zapping a SP recurses on its children, stepping down to 8041b6043e8SSean Christopherson * PG_LEVEL_4K in the iterator itself is unnecessary. 805e2b5b21dSSean Christopherson */ 8061b6043e8SSean Christopherson __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G); 8071b6043e8SSean Christopherson __tdp_mmu_zap_root(kvm, root, shared, root->role.level); 808e2b5b21dSSean Christopherson 809e2b5b21dSSean Christopherson rcu_read_unlock(); 810e2b5b21dSSean Christopherson } 811e2b5b21dSSean Christopherson 812c10743a1SSean Christopherson bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) 813c10743a1SSean Christopherson { 814c10743a1SSean Christopherson u64 old_spte; 815c10743a1SSean Christopherson 816c10743a1SSean Christopherson /* 817c10743a1SSean Christopherson * This helper intentionally doesn't allow zapping a root shadow page, 818c10743a1SSean Christopherson * which doesn't have a parent page table and thus no associated entry. 819c10743a1SSean Christopherson */ 820c10743a1SSean Christopherson if (WARN_ON_ONCE(!sp->ptep)) 821c10743a1SSean Christopherson return false; 822c10743a1SSean Christopherson 823c10743a1SSean Christopherson old_spte = kvm_tdp_mmu_read_spte(sp->ptep); 824bb95dfb9SSean Christopherson if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte))) 825c10743a1SSean Christopherson return false; 826c10743a1SSean Christopherson 8270b7cc254SVipin Sharma tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0, 8280b7cc254SVipin Sharma sp->gfn, sp->role.level + 1); 829c10743a1SSean Christopherson 830c10743a1SSean Christopherson return true; 831c10743a1SSean Christopherson } 832c10743a1SSean Christopherson 833faaf05b0SBen Gardon /* 834063afacdSBen Gardon * If can_yield is true, will release the MMU lock and reschedule if the 835063afacdSBen Gardon * scheduler needs the CPU or there is contention on the MMU lock. If this 836063afacdSBen Gardon * function cannot yield, it will not release the MMU lock or reschedule and 837063afacdSBen Gardon * the caller must ensure it does not supply too large a GFN range, or the 8386103bc07SBen Gardon * operation can cause a soft lockup. 839faaf05b0SBen Gardon */ 840f47e5bbbSSean Christopherson static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root, 841acbda82aSSean Christopherson gfn_t start, gfn_t end, bool can_yield, bool flush) 842faaf05b0SBen Gardon { 843faaf05b0SBen Gardon struct tdp_iter iter; 844faaf05b0SBen Gardon 84586931ff7SSean Christopherson end = min(end, tdp_mmu_max_gfn_exclusive()); 846524a1e4eSSean Christopherson 847acbda82aSSean Christopherson lockdep_assert_held_write(&kvm->mmu_lock); 8486103bc07SBen Gardon 8497cca2d0bSBen Gardon rcu_read_lock(); 8507cca2d0bSBen Gardon 851f47e5bbbSSean Christopherson for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) { 8521af4a960SBen Gardon if (can_yield && 853acbda82aSSean Christopherson tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) { 854a835429cSSean Christopherson flush = false; 8551af4a960SBen Gardon continue; 8561af4a960SBen Gardon } 8571af4a960SBen Gardon 858f47e5bbbSSean Christopherson if (!is_shadow_present_pte(iter.old_spte) || 859faaf05b0SBen Gardon !is_last_spte(iter.old_spte, iter.level)) 860faaf05b0SBen Gardon continue; 861faaf05b0SBen Gardon 8620b7cc254SVipin Sharma tdp_mmu_iter_set_spte(kvm, &iter, 0); 863a835429cSSean Christopherson flush = true; 864faaf05b0SBen Gardon } 8657cca2d0bSBen Gardon 8667cca2d0bSBen Gardon rcu_read_unlock(); 867bb95dfb9SSean Christopherson 868f47e5bbbSSean Christopherson /* 869f47e5bbbSSean Christopherson * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need 870f47e5bbbSSean Christopherson * to provide RCU protection as no 'struct kvm_mmu_page' will be freed. 871f47e5bbbSSean Christopherson */ 872f47e5bbbSSean Christopherson return flush; 873faaf05b0SBen Gardon } 874faaf05b0SBen Gardon 875faaf05b0SBen Gardon /* 8767edc3a68SKai Huang * Zap leaf SPTEs for the range of gfns, [start, end), for all roots. Returns 8777edc3a68SKai Huang * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or 8787edc3a68SKai Huang * more SPTEs were zapped since the MMU lock was last acquired. 879faaf05b0SBen Gardon */ 880f47e5bbbSSean Christopherson bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end, 881f47e5bbbSSean Christopherson bool can_yield, bool flush) 882faaf05b0SBen Gardon { 883faaf05b0SBen Gardon struct kvm_mmu_page *root; 884faaf05b0SBen Gardon 885614f6970SPaolo Bonzini for_each_tdp_mmu_root_yield_safe(kvm, root, as_id) 886f47e5bbbSSean Christopherson flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush); 887faaf05b0SBen Gardon 888faaf05b0SBen Gardon return flush; 889faaf05b0SBen Gardon } 890faaf05b0SBen Gardon 891faaf05b0SBen Gardon void kvm_tdp_mmu_zap_all(struct kvm *kvm) 892faaf05b0SBen Gardon { 893e2b5b21dSSean Christopherson struct kvm_mmu_page *root; 8942b9663d8SSean Christopherson int i; 895faaf05b0SBen Gardon 89677c8cd6bSSean Christopherson /* 89722b94c4bSPaolo Bonzini * Zap all roots, including invalid roots, as all SPTEs must be dropped 89822b94c4bSPaolo Bonzini * before returning to the caller. Zap directly even if the root is 89922b94c4bSPaolo Bonzini * also being zapped by a worker. Walking zapped top-level SPTEs isn't 90022b94c4bSPaolo Bonzini * all that expensive and mmu_lock is already held, which means the 90122b94c4bSPaolo Bonzini * worker has yielded, i.e. flushing the work instead of zapping here 90222b94c4bSPaolo Bonzini * isn't guaranteed to be any faster. 90322b94c4bSPaolo Bonzini * 90477c8cd6bSSean Christopherson * A TLB flush is unnecessary, KVM zaps everything if and only the VM 90577c8cd6bSSean Christopherson * is being destroyed or the userspace VMM has exited. In both cases, 90677c8cd6bSSean Christopherson * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request. 90777c8cd6bSSean Christopherson */ 908e2b5b21dSSean Christopherson for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { 909e2b5b21dSSean Christopherson for_each_tdp_mmu_root_yield_safe(kvm, root, i) 910e2b5b21dSSean Christopherson tdp_mmu_zap_root(kvm, root, false); 911e2b5b21dSSean Christopherson } 912faaf05b0SBen Gardon } 913bb18842eSBen Gardon 9144c6654bdSBen Gardon /* 915f28e9c7fSSean Christopherson * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast 91622b94c4bSPaolo Bonzini * zap" completes. 9174c6654bdSBen Gardon */ 9184c6654bdSBen Gardon void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) 9194c6654bdSBen Gardon { 92022b94c4bSPaolo Bonzini flush_workqueue(kvm->arch.tdp_mmu_zap_wq); 9214c6654bdSBen Gardon } 9224c6654bdSBen Gardon 923bb18842eSBen Gardon /* 924f28e9c7fSSean Christopherson * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that 92522b94c4bSPaolo Bonzini * is about to be zapped, e.g. in response to a memslots update. The actual 926edbdb43fSSean Christopherson * zapping is performed asynchronously. Using a separate workqueue makes it 927edbdb43fSSean Christopherson * easy to ensure that the destruction is performed before the "fast zap" 928edbdb43fSSean Christopherson * completes, without keeping a separate list of invalidated roots; the list is 929edbdb43fSSean Christopherson * effectively the list of work items in the workqueue. 930b7cccd39SBen Gardon * 931edbdb43fSSean Christopherson * Note, the asynchronous worker is gifted the TDP MMU's reference. 932edbdb43fSSean Christopherson * See kvm_tdp_mmu_get_vcpu_root_hpa(). 933b7cccd39SBen Gardon */ 934b7cccd39SBen Gardon void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm) 935b7cccd39SBen Gardon { 936b7cccd39SBen Gardon struct kvm_mmu_page *root; 937b7cccd39SBen Gardon 938edbdb43fSSean Christopherson /* 939edbdb43fSSean Christopherson * mmu_lock must be held for write to ensure that a root doesn't become 940edbdb43fSSean Christopherson * invalid while there are active readers (invalidating a root while 941edbdb43fSSean Christopherson * there are active readers may or may not be problematic in practice, 942edbdb43fSSean Christopherson * but it's uncharted territory and not supported). 943edbdb43fSSean Christopherson * 944edbdb43fSSean Christopherson * Waive the assertion if there are no users of @kvm, i.e. the VM is 945edbdb43fSSean Christopherson * being destroyed after all references have been put, or if no vCPUs 946edbdb43fSSean Christopherson * have been created (which means there are no roots), i.e. the VM is 947edbdb43fSSean Christopherson * being destroyed in an error path of KVM_CREATE_VM. 948edbdb43fSSean Christopherson */ 949edbdb43fSSean Christopherson if (IS_ENABLED(CONFIG_PROVE_LOCKING) && 950edbdb43fSSean Christopherson refcount_read(&kvm->users_count) && kvm->created_vcpus) 951b7cccd39SBen Gardon lockdep_assert_held_write(&kvm->mmu_lock); 952edbdb43fSSean Christopherson 953edbdb43fSSean Christopherson /* 954edbdb43fSSean Christopherson * As above, mmu_lock isn't held when destroying the VM! There can't 955edbdb43fSSean Christopherson * be other references to @kvm, i.e. nothing else can invalidate roots 956edbdb43fSSean Christopherson * or be consuming roots, but walking the list of roots does need to be 957edbdb43fSSean Christopherson * guarded against roots being deleted by the asynchronous zap worker. 958edbdb43fSSean Christopherson */ 959edbdb43fSSean Christopherson rcu_read_lock(); 960edbdb43fSSean Christopherson 961edbdb43fSSean Christopherson list_for_each_entry_rcu(root, &kvm->arch.tdp_mmu_roots, link) { 962edbdb43fSSean Christopherson if (!root->role.invalid) { 963b7cccd39SBen Gardon root->role.invalid = true; 96422b94c4bSPaolo Bonzini tdp_mmu_schedule_zap_root(kvm, root); 96522b94c4bSPaolo Bonzini } 966b7cccd39SBen Gardon } 967edbdb43fSSean Christopherson 968edbdb43fSSean Christopherson rcu_read_unlock(); 969f28e9c7fSSean Christopherson } 970b7cccd39SBen Gardon 971bb18842eSBen Gardon /* 972bb18842eSBen Gardon * Installs a last-level SPTE to handle a TDP page fault. 973bb18842eSBen Gardon * (NPT/EPT violation/misconfiguration) 974bb18842eSBen Gardon */ 975cdc47767SPaolo Bonzini static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, 976cdc47767SPaolo Bonzini struct kvm_page_fault *fault, 977cdc47767SPaolo Bonzini struct tdp_iter *iter) 978bb18842eSBen Gardon { 979c435d4b7SSean Christopherson struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep)); 980bb18842eSBen Gardon u64 new_spte; 98157a3e96dSKai Huang int ret = RET_PF_FIXED; 982ad67e480SPaolo Bonzini bool wrprot = false; 983bb18842eSBen Gardon 98450a9ac25SSean Christopherson if (WARN_ON_ONCE(sp->role.level != fault->goal_level)) 98550a9ac25SSean Christopherson return RET_PF_RETRY; 98650a9ac25SSean Christopherson 987e710c5f6SDavid Matlack if (unlikely(!fault->slot)) 988bb18842eSBen Gardon new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); 9899a77daacSBen Gardon else 99053597858SDavid Matlack wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn, 9912839180cSPaolo Bonzini fault->pfn, iter->old_spte, fault->prefetch, true, 9927158bee4SPaolo Bonzini fault->map_writable, &new_spte); 993bb18842eSBen Gardon 994bb18842eSBen Gardon if (new_spte == iter->old_spte) 995bb18842eSBen Gardon ret = RET_PF_SPURIOUS; 9963e72c791SDavid Matlack else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte)) 9979a77daacSBen Gardon return RET_PF_RETRY; 998bb95dfb9SSean Christopherson else if (is_shadow_present_pte(iter->old_spte) && 999bb95dfb9SSean Christopherson !is_last_spte(iter->old_spte, iter->level)) 10001e203847SHou Wenlong kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level); 1001bb18842eSBen Gardon 1002bb18842eSBen Gardon /* 1003bb18842eSBen Gardon * If the page fault was caused by a write but the page is write 1004bb18842eSBen Gardon * protected, emulation is needed. If the emulation was skipped, 1005bb18842eSBen Gardon * the vCPU would have the same fault again. 1006bb18842eSBen Gardon */ 1007ad67e480SPaolo Bonzini if (wrprot) { 1008cdc47767SPaolo Bonzini if (fault->write) 1009bb18842eSBen Gardon ret = RET_PF_EMULATE; 1010bb18842eSBen Gardon } 1011bb18842eSBen Gardon 1012bb18842eSBen Gardon /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */ 10139a77daacSBen Gardon if (unlikely(is_mmio_spte(new_spte))) { 10141075d41eSSean Christopherson vcpu->stat.pf_mmio_spte_created++; 10159a77daacSBen Gardon trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn, 10169a77daacSBen Gardon new_spte); 1017bb18842eSBen Gardon ret = RET_PF_EMULATE; 10183849e092SSean Christopherson } else { 10199a77daacSBen Gardon trace_kvm_mmu_set_spte(iter->level, iter->gfn, 10209a77daacSBen Gardon rcu_dereference(iter->sptep)); 10213849e092SSean Christopherson } 1022bb18842eSBen Gardon 1023bb18842eSBen Gardon return ret; 1024bb18842eSBen Gardon } 1025bb18842eSBen Gardon 1026bb18842eSBen Gardon /* 1027cb00a70bSDavid Matlack * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the 1028cb00a70bSDavid Matlack * provided page table. 10297b7e1ab6SDavid Matlack * 10307b7e1ab6SDavid Matlack * @kvm: kvm instance 10317b7e1ab6SDavid Matlack * @iter: a tdp_iter instance currently on the SPTE that should be set 10327b7e1ab6SDavid Matlack * @sp: The new TDP page table to install. 1033cb00a70bSDavid Matlack * @shared: This operation is running under the MMU lock in read mode. 10347b7e1ab6SDavid Matlack * 10357b7e1ab6SDavid Matlack * Returns: 0 if the new page table was installed. Non-0 if the page table 10367b7e1ab6SDavid Matlack * could not be installed (e.g. the atomic compare-exchange failed). 10377b7e1ab6SDavid Matlack */ 1038cb00a70bSDavid Matlack static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter, 103961f94478SSean Christopherson struct kvm_mmu_page *sp, bool shared) 10407b7e1ab6SDavid Matlack { 104154275f74SSean Christopherson u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled()); 1042cb00a70bSDavid Matlack int ret = 0; 10437b7e1ab6SDavid Matlack 1044cb00a70bSDavid Matlack if (shared) { 10457b7e1ab6SDavid Matlack ret = tdp_mmu_set_spte_atomic(kvm, iter, spte); 10467b7e1ab6SDavid Matlack if (ret) 10477b7e1ab6SDavid Matlack return ret; 1048cb00a70bSDavid Matlack } else { 10490b7cc254SVipin Sharma tdp_mmu_iter_set_spte(kvm, iter, spte); 1050cb00a70bSDavid Matlack } 10517b7e1ab6SDavid Matlack 105243a063caSYosry Ahmed tdp_account_mmu_page(kvm, sp); 10537b7e1ab6SDavid Matlack 10547b7e1ab6SDavid Matlack return 0; 10557b7e1ab6SDavid Matlack } 10567b7e1ab6SDavid Matlack 1057c4b33d28SDavid Matlack static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, 1058c4b33d28SDavid Matlack struct kvm_mmu_page *sp, bool shared); 1059c4b33d28SDavid Matlack 10607b7e1ab6SDavid Matlack /* 1061bb18842eSBen Gardon * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing 1062bb18842eSBen Gardon * page tables and SPTEs to translate the faulting guest physical address. 1063bb18842eSBen Gardon */ 10642f6305ddSPaolo Bonzini int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) 1065bb18842eSBen Gardon { 1066bb18842eSBen Gardon struct kvm_mmu *mmu = vcpu->arch.mmu; 106761f94478SSean Christopherson struct kvm *kvm = vcpu->kvm; 1068bb18842eSBen Gardon struct tdp_iter iter; 106989c0fd49SBen Gardon struct kvm_mmu_page *sp; 107063d28a25SPaolo Bonzini int ret = RET_PF_RETRY; 1071bb18842eSBen Gardon 107273a3c659SPaolo Bonzini kvm_mmu_hugepage_adjust(vcpu, fault); 1073bb18842eSBen Gardon 1074f0066d94SPaolo Bonzini trace_kvm_mmu_spte_requested(fault); 10757cca2d0bSBen Gardon 10767cca2d0bSBen Gardon rcu_read_lock(); 10777cca2d0bSBen Gardon 10782f6305ddSPaolo Bonzini tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) { 107963d28a25SPaolo Bonzini int r; 108063d28a25SPaolo Bonzini 108173a3c659SPaolo Bonzini if (fault->nx_huge_page_workaround_enabled) 1082536f0e6aSPaolo Bonzini disallowed_hugepage_adjust(fault, iter.old_spte, iter.level); 1083bb18842eSBen Gardon 1084bb18842eSBen Gardon /* 1085c4b33d28SDavid Matlack * If SPTE has been frozen by another thread, just give up and 1086c4b33d28SDavid Matlack * retry, avoiding unnecessary page table allocation and free. 1087ff76d506SKai Huang */ 1088ff76d506SKai Huang if (is_removed_spte(iter.old_spte)) 108963d28a25SPaolo Bonzini goto retry; 109063d28a25SPaolo Bonzini 1091f5d16bb9SSean Christopherson if (iter.level == fault->goal_level) 109280a3e4aeSSean Christopherson goto map_target_level; 1093f5d16bb9SSean Christopherson 109463d28a25SPaolo Bonzini /* Step down into the lower level page table if it exists. */ 109563d28a25SPaolo Bonzini if (is_shadow_present_pte(iter.old_spte) && 109663d28a25SPaolo Bonzini !is_large_pte(iter.old_spte)) 109763d28a25SPaolo Bonzini continue; 1098ff76d506SKai Huang 1099c4b33d28SDavid Matlack /* 1100c4b33d28SDavid Matlack * The SPTE is either non-present or points to a huge page that 1101c4b33d28SDavid Matlack * needs to be split. 1102c4b33d28SDavid Matlack */ 1103a82070b6SDavid Matlack sp = tdp_mmu_alloc_sp(vcpu); 1104a82070b6SDavid Matlack tdp_mmu_init_child_sp(sp, &iter); 1105a82070b6SDavid Matlack 110661f94478SSean Christopherson sp->nx_huge_page_disallowed = fault->huge_page_disallowed; 110761f94478SSean Christopherson 1108c4b33d28SDavid Matlack if (is_shadow_present_pte(iter.old_spte)) 110963d28a25SPaolo Bonzini r = tdp_mmu_split_huge_page(kvm, &iter, sp, true); 1110c4b33d28SDavid Matlack else 111163d28a25SPaolo Bonzini r = tdp_mmu_link_sp(kvm, &iter, sp, true); 1112c4b33d28SDavid Matlack 111363d28a25SPaolo Bonzini /* 111480a3e4aeSSean Christopherson * Force the guest to retry if installing an upper level SPTE 111580a3e4aeSSean Christopherson * failed, e.g. because a different task modified the SPTE. 111663d28a25SPaolo Bonzini */ 111763d28a25SPaolo Bonzini if (r) { 11189a77daacSBen Gardon tdp_mmu_free_sp(sp); 111963d28a25SPaolo Bonzini goto retry; 11209a77daacSBen Gardon } 112161f94478SSean Christopherson 112261f94478SSean Christopherson if (fault->huge_page_disallowed && 112361f94478SSean Christopherson fault->req_level >= iter.level) { 112461f94478SSean Christopherson spin_lock(&kvm->arch.tdp_mmu_pages_lock); 112521a36ac6SSean Christopherson if (sp->nx_huge_page_disallowed) 112661f94478SSean Christopherson track_possible_nx_huge_page(kvm, sp); 112761f94478SSean Christopherson spin_unlock(&kvm->arch.tdp_mmu_pages_lock); 112861f94478SSean Christopherson } 1129bb18842eSBen Gardon } 1130bb18842eSBen Gardon 113180a3e4aeSSean Christopherson /* 113280a3e4aeSSean Christopherson * The walk aborted before reaching the target level, e.g. because the 113380a3e4aeSSean Christopherson * iterator detected an upper level SPTE was frozen during traversal. 113480a3e4aeSSean Christopherson */ 113580a3e4aeSSean Christopherson WARN_ON_ONCE(iter.level == fault->goal_level); 113680a3e4aeSSean Christopherson goto retry; 113780a3e4aeSSean Christopherson 113880a3e4aeSSean Christopherson map_target_level: 1139cdc47767SPaolo Bonzini ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter); 1140bb18842eSBen Gardon 114163d28a25SPaolo Bonzini retry: 114263d28a25SPaolo Bonzini rcu_read_unlock(); 1143bb18842eSBen Gardon return ret; 1144bb18842eSBen Gardon } 1145063afacdSBen Gardon 11463039bcc7SSean Christopherson bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, 11473039bcc7SSean Christopherson bool flush) 1148063afacdSBen Gardon { 1149f47e5bbbSSean Christopherson return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start, 115083b83a02SSean Christopherson range->end, range->may_block, flush); 11513039bcc7SSean Christopherson } 11523039bcc7SSean Christopherson 11533039bcc7SSean Christopherson typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter, 11543039bcc7SSean Christopherson struct kvm_gfn_range *range); 11553039bcc7SSean Christopherson 11563039bcc7SSean Christopherson static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm, 11573039bcc7SSean Christopherson struct kvm_gfn_range *range, 1158c1b91493SSean Christopherson tdp_handler_t handler) 1159063afacdSBen Gardon { 1160063afacdSBen Gardon struct kvm_mmu_page *root; 11613039bcc7SSean Christopherson struct tdp_iter iter; 11623039bcc7SSean Christopherson bool ret = false; 1163063afacdSBen Gardon 1164063afacdSBen Gardon /* 1165e1eed584SSean Christopherson * Don't support rescheduling, none of the MMU notifiers that funnel 1166e1eed584SSean Christopherson * into this helper allow blocking; it'd be dead, wasteful code. 1167063afacdSBen Gardon */ 11683039bcc7SSean Christopherson for_each_tdp_mmu_root(kvm, root, range->slot->as_id) { 1169a151acecSSean Christopherson rcu_read_lock(); 1170a151acecSSean Christopherson 11713039bcc7SSean Christopherson tdp_root_for_each_leaf_pte(iter, root, range->start, range->end) 11723039bcc7SSean Christopherson ret |= handler(kvm, &iter, range); 1173063afacdSBen Gardon 11743039bcc7SSean Christopherson rcu_read_unlock(); 1175a151acecSSean Christopherson } 1176063afacdSBen Gardon 1177063afacdSBen Gardon return ret; 1178063afacdSBen Gardon } 1179063afacdSBen Gardon 1180f8e14497SBen Gardon /* 1181f8e14497SBen Gardon * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero 1182f8e14497SBen Gardon * if any of the GFNs in the range have been accessed. 11837ee131e3SVipin Sharma * 11847ee131e3SVipin Sharma * No need to mark the corresponding PFN as accessed as this call is coming 11857ee131e3SVipin Sharma * from the clear_young() or clear_flush_young() notifier, which uses the 11867ee131e3SVipin Sharma * return value to determine if the page has been accessed. 1187f8e14497SBen Gardon */ 11883039bcc7SSean Christopherson static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter, 11893039bcc7SSean Christopherson struct kvm_gfn_range *range) 1190f8e14497SBen Gardon { 11917ee131e3SVipin Sharma u64 new_spte; 1192f8e14497SBen Gardon 11933039bcc7SSean Christopherson /* If we have a non-accessed entry we don't need to change the pte. */ 11943039bcc7SSean Christopherson if (!is_accessed_spte(iter->old_spte)) 11953039bcc7SSean Christopherson return false; 11967cca2d0bSBen Gardon 11977ee131e3SVipin Sharma if (spte_ad_enabled(iter->old_spte)) { 11987ee131e3SVipin Sharma iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep, 11997ee131e3SVipin Sharma iter->old_spte, 12007ee131e3SVipin Sharma shadow_accessed_mask, 12017ee131e3SVipin Sharma iter->level); 12027ee131e3SVipin Sharma new_spte = iter->old_spte & ~shadow_accessed_mask; 1203f8e14497SBen Gardon } else { 1204f8e14497SBen Gardon /* 1205f8e14497SBen Gardon * Capture the dirty status of the page, so that it doesn't get 1206f8e14497SBen Gardon * lost when the SPTE is marked for access tracking. 1207f8e14497SBen Gardon */ 12087ee131e3SVipin Sharma if (is_writable_pte(iter->old_spte)) 12097ee131e3SVipin Sharma kvm_set_pfn_dirty(spte_to_pfn(iter->old_spte)); 1210f8e14497SBen Gardon 12117ee131e3SVipin Sharma new_spte = mark_spte_for_access_track(iter->old_spte); 12127ee131e3SVipin Sharma iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep, 12137ee131e3SVipin Sharma iter->old_spte, new_spte, 12147ee131e3SVipin Sharma iter->level); 1215f8e14497SBen Gardon } 1216f8e14497SBen Gardon 1217891f1159SVipin Sharma trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level, 1218891f1159SVipin Sharma iter->old_spte, new_spte); 12193039bcc7SSean Christopherson return true; 1220f8e14497SBen Gardon } 1221f8e14497SBen Gardon 12223039bcc7SSean Christopherson bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) 1223f8e14497SBen Gardon { 12243039bcc7SSean Christopherson return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range); 1225f8e14497SBen Gardon } 1226f8e14497SBen Gardon 12273039bcc7SSean Christopherson static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter, 12283039bcc7SSean Christopherson struct kvm_gfn_range *range) 1229f8e14497SBen Gardon { 12303039bcc7SSean Christopherson return is_accessed_spte(iter->old_spte); 1231f8e14497SBen Gardon } 1232f8e14497SBen Gardon 12333039bcc7SSean Christopherson bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) 1234f8e14497SBen Gardon { 12353039bcc7SSean Christopherson return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn); 12363039bcc7SSean Christopherson } 12373039bcc7SSean Christopherson 12383039bcc7SSean Christopherson static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter, 12393039bcc7SSean Christopherson struct kvm_gfn_range *range) 12403039bcc7SSean Christopherson { 12413039bcc7SSean Christopherson u64 new_spte; 12423039bcc7SSean Christopherson 12433039bcc7SSean Christopherson /* Huge pages aren't expected to be modified without first being zapped. */ 124420ba462dSSean Christopherson WARN_ON_ONCE(pte_huge(range->arg.pte) || range->start + 1 != range->end); 12453039bcc7SSean Christopherson 12463039bcc7SSean Christopherson if (iter->level != PG_LEVEL_4K || 12473039bcc7SSean Christopherson !is_shadow_present_pte(iter->old_spte)) 12483039bcc7SSean Christopherson return false; 12493039bcc7SSean Christopherson 12503039bcc7SSean Christopherson /* 12513039bcc7SSean Christopherson * Note, when changing a read-only SPTE, it's not strictly necessary to 12523039bcc7SSean Christopherson * zero the SPTE before setting the new PFN, but doing so preserves the 12533039bcc7SSean Christopherson * invariant that the PFN of a present * leaf SPTE can never change. 125440fa907eSVipin Sharma * See handle_changed_spte(). 12553039bcc7SSean Christopherson */ 12560b7cc254SVipin Sharma tdp_mmu_iter_set_spte(kvm, iter, 0); 12573039bcc7SSean Christopherson 12583e1efe2bSSean Christopherson if (!pte_write(range->arg.pte)) { 12593039bcc7SSean Christopherson new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte, 12603e1efe2bSSean Christopherson pte_pfn(range->arg.pte)); 12613039bcc7SSean Christopherson 12620b7cc254SVipin Sharma tdp_mmu_iter_set_spte(kvm, iter, new_spte); 12633039bcc7SSean Christopherson } 12643039bcc7SSean Christopherson 12653039bcc7SSean Christopherson return true; 1266f8e14497SBen Gardon } 12671d8dd6b3SBen Gardon 12681d8dd6b3SBen Gardon /* 12691d8dd6b3SBen Gardon * Handle the changed_pte MMU notifier for the TDP MMU. 12701d8dd6b3SBen Gardon * data is a pointer to the new pte_t mapping the HVA specified by the MMU 12711d8dd6b3SBen Gardon * notifier. 12721d8dd6b3SBen Gardon * Returns non-zero if a flush is needed before releasing the MMU lock. 12731d8dd6b3SBen Gardon */ 12743039bcc7SSean Christopherson bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) 12751d8dd6b3SBen Gardon { 127693fa50f6SSean Christopherson /* 127793fa50f6SSean Christopherson * No need to handle the remote TLB flush under RCU protection, the 127893fa50f6SSean Christopherson * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a 127940fa907eSVipin Sharma * shadow page. See the WARN on pfn_changed in handle_changed_spte(). 128093fa50f6SSean Christopherson */ 128193fa50f6SSean Christopherson return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn); 12821d8dd6b3SBen Gardon } 12831d8dd6b3SBen Gardon 1284a6a0b05dSBen Gardon /* 1285bedd9195SDavid Matlack * Remove write access from all SPTEs at or above min_level that map GFNs 1286bedd9195SDavid Matlack * [start, end). Returns true if an SPTE has been changed and the TLBs need to 1287bedd9195SDavid Matlack * be flushed. 1288a6a0b05dSBen Gardon */ 1289a6a0b05dSBen Gardon static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, 1290a6a0b05dSBen Gardon gfn_t start, gfn_t end, int min_level) 1291a6a0b05dSBen Gardon { 1292a6a0b05dSBen Gardon struct tdp_iter iter; 1293a6a0b05dSBen Gardon u64 new_spte; 1294a6a0b05dSBen Gardon bool spte_set = false; 1295a6a0b05dSBen Gardon 12967cca2d0bSBen Gardon rcu_read_lock(); 12977cca2d0bSBen Gardon 1298a6a0b05dSBen Gardon BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); 1299a6a0b05dSBen Gardon 130077aa6075SDavid Matlack for_each_tdp_pte_min_level(iter, root, min_level, start, end) { 130124ae4cfaSBen Gardon retry: 130224ae4cfaSBen Gardon if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) 13031af4a960SBen Gardon continue; 13041af4a960SBen Gardon 1305a6a0b05dSBen Gardon if (!is_shadow_present_pte(iter.old_spte) || 13060f99ee2cSBen Gardon !is_last_spte(iter.old_spte, iter.level) || 13070f99ee2cSBen Gardon !(iter.old_spte & PT_WRITABLE_MASK)) 1308a6a0b05dSBen Gardon continue; 1309a6a0b05dSBen Gardon 1310a6a0b05dSBen Gardon new_spte = iter.old_spte & ~PT_WRITABLE_MASK; 1311a6a0b05dSBen Gardon 13123e72c791SDavid Matlack if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) 131324ae4cfaSBen Gardon goto retry; 13143255530aSDavid Matlack 1315a6a0b05dSBen Gardon spte_set = true; 1316a6a0b05dSBen Gardon } 13177cca2d0bSBen Gardon 13187cca2d0bSBen Gardon rcu_read_unlock(); 1319a6a0b05dSBen Gardon return spte_set; 1320a6a0b05dSBen Gardon } 1321a6a0b05dSBen Gardon 1322a6a0b05dSBen Gardon /* 1323a6a0b05dSBen Gardon * Remove write access from all the SPTEs mapping GFNs in the memslot. Will 1324a6a0b05dSBen Gardon * only affect leaf SPTEs down to min_level. 1325a6a0b05dSBen Gardon * Returns true if an SPTE has been changed and the TLBs need to be flushed. 1326a6a0b05dSBen Gardon */ 1327269e9552SHamza Mahfooz bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, 1328269e9552SHamza Mahfooz const struct kvm_memory_slot *slot, int min_level) 1329a6a0b05dSBen Gardon { 1330a6a0b05dSBen Gardon struct kvm_mmu_page *root; 1331a6a0b05dSBen Gardon bool spte_set = false; 1332a6a0b05dSBen Gardon 133324ae4cfaSBen Gardon lockdep_assert_held_read(&kvm->mmu_lock); 1334a6a0b05dSBen Gardon 1335d62007edSSean Christopherson for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) 1336a6a0b05dSBen Gardon spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn, 1337a6a0b05dSBen Gardon slot->base_gfn + slot->npages, min_level); 1338a6a0b05dSBen Gardon 1339a6a0b05dSBen Gardon return spte_set; 1340a6a0b05dSBen Gardon } 1341a6a0b05dSBen Gardon 1342a3fe5dbdSDavid Matlack static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp) 1343a3fe5dbdSDavid Matlack { 1344a3fe5dbdSDavid Matlack struct kvm_mmu_page *sp; 1345a3fe5dbdSDavid Matlack 1346a3fe5dbdSDavid Matlack gfp |= __GFP_ZERO; 1347a3fe5dbdSDavid Matlack 1348a3fe5dbdSDavid Matlack sp = kmem_cache_alloc(mmu_page_header_cache, gfp); 1349a3fe5dbdSDavid Matlack if (!sp) 1350a3fe5dbdSDavid Matlack return NULL; 1351a3fe5dbdSDavid Matlack 1352a3fe5dbdSDavid Matlack sp->spt = (void *)__get_free_page(gfp); 1353a3fe5dbdSDavid Matlack if (!sp->spt) { 1354a3fe5dbdSDavid Matlack kmem_cache_free(mmu_page_header_cache, sp); 1355a3fe5dbdSDavid Matlack return NULL; 1356a3fe5dbdSDavid Matlack } 1357a3fe5dbdSDavid Matlack 1358a3fe5dbdSDavid Matlack return sp; 1359a3fe5dbdSDavid Matlack } 1360a3fe5dbdSDavid Matlack 1361a3fe5dbdSDavid Matlack static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm, 1362cb00a70bSDavid Matlack struct tdp_iter *iter, 1363cb00a70bSDavid Matlack bool shared) 1364a3fe5dbdSDavid Matlack { 1365a3fe5dbdSDavid Matlack struct kvm_mmu_page *sp; 1366a3fe5dbdSDavid Matlack 1367a3fe5dbdSDavid Matlack /* 1368a3fe5dbdSDavid Matlack * Since we are allocating while under the MMU lock we have to be 1369a3fe5dbdSDavid Matlack * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct 1370a3fe5dbdSDavid Matlack * reclaim and to avoid making any filesystem callbacks (which can end 1371a3fe5dbdSDavid Matlack * up invoking KVM MMU notifiers, resulting in a deadlock). 1372a3fe5dbdSDavid Matlack * 1373a3fe5dbdSDavid Matlack * If this allocation fails we drop the lock and retry with reclaim 1374a3fe5dbdSDavid Matlack * allowed. 1375a3fe5dbdSDavid Matlack */ 1376a3fe5dbdSDavid Matlack sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT); 1377a3fe5dbdSDavid Matlack if (sp) 1378a3fe5dbdSDavid Matlack return sp; 1379a3fe5dbdSDavid Matlack 1380a3fe5dbdSDavid Matlack rcu_read_unlock(); 1381cb00a70bSDavid Matlack 1382cb00a70bSDavid Matlack if (shared) 1383a3fe5dbdSDavid Matlack read_unlock(&kvm->mmu_lock); 1384cb00a70bSDavid Matlack else 1385cb00a70bSDavid Matlack write_unlock(&kvm->mmu_lock); 1386a3fe5dbdSDavid Matlack 1387a3fe5dbdSDavid Matlack iter->yielded = true; 1388a3fe5dbdSDavid Matlack sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT); 1389a3fe5dbdSDavid Matlack 1390cb00a70bSDavid Matlack if (shared) 1391a3fe5dbdSDavid Matlack read_lock(&kvm->mmu_lock); 1392cb00a70bSDavid Matlack else 1393cb00a70bSDavid Matlack write_lock(&kvm->mmu_lock); 1394cb00a70bSDavid Matlack 1395a3fe5dbdSDavid Matlack rcu_read_lock(); 1396a3fe5dbdSDavid Matlack 1397a3fe5dbdSDavid Matlack return sp; 1398a3fe5dbdSDavid Matlack } 1399a3fe5dbdSDavid Matlack 1400c4b33d28SDavid Matlack /* Note, the caller is responsible for initializing @sp. */ 1401cb00a70bSDavid Matlack static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, 1402cb00a70bSDavid Matlack struct kvm_mmu_page *sp, bool shared) 1403a3fe5dbdSDavid Matlack { 1404a3fe5dbdSDavid Matlack const u64 huge_spte = iter->old_spte; 1405a3fe5dbdSDavid Matlack const int level = iter->level; 1406a3fe5dbdSDavid Matlack int ret, i; 1407a3fe5dbdSDavid Matlack 1408a3fe5dbdSDavid Matlack /* 1409a3fe5dbdSDavid Matlack * No need for atomics when writing to sp->spt since the page table has 1410a3fe5dbdSDavid Matlack * not been linked in yet and thus is not reachable from any other CPU. 1411a3fe5dbdSDavid Matlack */ 14122ca3129eSSean Christopherson for (i = 0; i < SPTE_ENT_PER_PAGE; i++) 141347855da0SDavid Matlack sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i); 1414a3fe5dbdSDavid Matlack 1415a3fe5dbdSDavid Matlack /* 1416a3fe5dbdSDavid Matlack * Replace the huge spte with a pointer to the populated lower level 1417a3fe5dbdSDavid Matlack * page table. Since we are making this change without a TLB flush vCPUs 1418a3fe5dbdSDavid Matlack * will see a mix of the split mappings and the original huge mapping, 1419a3fe5dbdSDavid Matlack * depending on what's currently in their TLB. This is fine from a 1420a3fe5dbdSDavid Matlack * correctness standpoint since the translation will be the same either 1421a3fe5dbdSDavid Matlack * way. 1422a3fe5dbdSDavid Matlack */ 142361f94478SSean Christopherson ret = tdp_mmu_link_sp(kvm, iter, sp, shared); 1424a3fe5dbdSDavid Matlack if (ret) 1425e0b728b1SDavid Matlack goto out; 1426a3fe5dbdSDavid Matlack 1427a3fe5dbdSDavid Matlack /* 1428a3fe5dbdSDavid Matlack * tdp_mmu_link_sp_atomic() will handle subtracting the huge page we 1429a3fe5dbdSDavid Matlack * are overwriting from the page stats. But we have to manually update 1430a3fe5dbdSDavid Matlack * the page stats with the new present child pages. 1431a3fe5dbdSDavid Matlack */ 14322ca3129eSSean Christopherson kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE); 1433a3fe5dbdSDavid Matlack 1434e0b728b1SDavid Matlack out: 1435e0b728b1SDavid Matlack trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret); 1436e0b728b1SDavid Matlack return ret; 1437a3fe5dbdSDavid Matlack } 1438a3fe5dbdSDavid Matlack 1439a3fe5dbdSDavid Matlack static int tdp_mmu_split_huge_pages_root(struct kvm *kvm, 1440a3fe5dbdSDavid Matlack struct kvm_mmu_page *root, 1441a3fe5dbdSDavid Matlack gfn_t start, gfn_t end, 1442cb00a70bSDavid Matlack int target_level, bool shared) 1443a3fe5dbdSDavid Matlack { 1444a3fe5dbdSDavid Matlack struct kvm_mmu_page *sp = NULL; 1445a3fe5dbdSDavid Matlack struct tdp_iter iter; 1446a3fe5dbdSDavid Matlack int ret = 0; 1447a3fe5dbdSDavid Matlack 1448a3fe5dbdSDavid Matlack rcu_read_lock(); 1449a3fe5dbdSDavid Matlack 1450a3fe5dbdSDavid Matlack /* 1451a3fe5dbdSDavid Matlack * Traverse the page table splitting all huge pages above the target 1452a3fe5dbdSDavid Matlack * level into one lower level. For example, if we encounter a 1GB page 1453a3fe5dbdSDavid Matlack * we split it into 512 2MB pages. 1454a3fe5dbdSDavid Matlack * 1455a3fe5dbdSDavid Matlack * Since the TDP iterator uses a pre-order traversal, we are guaranteed 1456a3fe5dbdSDavid Matlack * to visit an SPTE before ever visiting its children, which means we 1457a3fe5dbdSDavid Matlack * will correctly recursively split huge pages that are more than one 1458a3fe5dbdSDavid Matlack * level above the target level (e.g. splitting a 1GB to 512 2MB pages, 1459a3fe5dbdSDavid Matlack * and then splitting each of those to 512 4KB pages). 1460a3fe5dbdSDavid Matlack */ 1461a3fe5dbdSDavid Matlack for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) { 1462a3fe5dbdSDavid Matlack retry: 1463cb00a70bSDavid Matlack if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared)) 1464a3fe5dbdSDavid Matlack continue; 1465a3fe5dbdSDavid Matlack 1466a3fe5dbdSDavid Matlack if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte)) 1467a3fe5dbdSDavid Matlack continue; 1468a3fe5dbdSDavid Matlack 1469a3fe5dbdSDavid Matlack if (!sp) { 1470cb00a70bSDavid Matlack sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared); 1471a3fe5dbdSDavid Matlack if (!sp) { 1472a3fe5dbdSDavid Matlack ret = -ENOMEM; 1473e0b728b1SDavid Matlack trace_kvm_mmu_split_huge_page(iter.gfn, 1474e0b728b1SDavid Matlack iter.old_spte, 1475e0b728b1SDavid Matlack iter.level, ret); 1476a3fe5dbdSDavid Matlack break; 1477a3fe5dbdSDavid Matlack } 1478a3fe5dbdSDavid Matlack 1479a3fe5dbdSDavid Matlack if (iter.yielded) 1480a3fe5dbdSDavid Matlack continue; 1481a3fe5dbdSDavid Matlack } 1482a3fe5dbdSDavid Matlack 1483c4b33d28SDavid Matlack tdp_mmu_init_child_sp(sp, &iter); 1484c4b33d28SDavid Matlack 1485cb00a70bSDavid Matlack if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared)) 1486a3fe5dbdSDavid Matlack goto retry; 1487a3fe5dbdSDavid Matlack 1488a3fe5dbdSDavid Matlack sp = NULL; 1489a3fe5dbdSDavid Matlack } 1490a3fe5dbdSDavid Matlack 1491a3fe5dbdSDavid Matlack rcu_read_unlock(); 1492a3fe5dbdSDavid Matlack 1493a3fe5dbdSDavid Matlack /* 1494a3fe5dbdSDavid Matlack * It's possible to exit the loop having never used the last sp if, for 1495a3fe5dbdSDavid Matlack * example, a vCPU doing HugePage NX splitting wins the race and 1496a3fe5dbdSDavid Matlack * installs its own sp in place of the last sp we tried to split. 1497a3fe5dbdSDavid Matlack */ 1498a3fe5dbdSDavid Matlack if (sp) 1499a3fe5dbdSDavid Matlack tdp_mmu_free_sp(sp); 1500a3fe5dbdSDavid Matlack 1501a3fe5dbdSDavid Matlack return ret; 1502a3fe5dbdSDavid Matlack } 1503a3fe5dbdSDavid Matlack 1504cb00a70bSDavid Matlack 1505a3fe5dbdSDavid Matlack /* 1506a3fe5dbdSDavid Matlack * Try to split all huge pages mapped by the TDP MMU down to the target level. 1507a3fe5dbdSDavid Matlack */ 1508a3fe5dbdSDavid Matlack void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm, 1509a3fe5dbdSDavid Matlack const struct kvm_memory_slot *slot, 1510a3fe5dbdSDavid Matlack gfn_t start, gfn_t end, 1511cb00a70bSDavid Matlack int target_level, bool shared) 1512a3fe5dbdSDavid Matlack { 1513a3fe5dbdSDavid Matlack struct kvm_mmu_page *root; 1514a3fe5dbdSDavid Matlack int r = 0; 1515a3fe5dbdSDavid Matlack 1516cb00a70bSDavid Matlack kvm_lockdep_assert_mmu_lock_held(kvm, shared); 1517a3fe5dbdSDavid Matlack 15187c554d8eSPaolo Bonzini for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) { 1519cb00a70bSDavid Matlack r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared); 1520a3fe5dbdSDavid Matlack if (r) { 1521cb00a70bSDavid Matlack kvm_tdp_mmu_put_root(kvm, root, shared); 1522a3fe5dbdSDavid Matlack break; 1523a3fe5dbdSDavid Matlack } 1524a3fe5dbdSDavid Matlack } 1525a3fe5dbdSDavid Matlack } 1526a3fe5dbdSDavid Matlack 1527a6a0b05dSBen Gardon /* 1528a6a0b05dSBen Gardon * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If 1529a6a0b05dSBen Gardon * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. 1530a6a0b05dSBen Gardon * If AD bits are not enabled, this will require clearing the writable bit on 1531a6a0b05dSBen Gardon * each SPTE. Returns true if an SPTE has been changed and the TLBs need to 1532a6a0b05dSBen Gardon * be flushed. 1533a6a0b05dSBen Gardon */ 1534a6a0b05dSBen Gardon static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, 1535a6a0b05dSBen Gardon gfn_t start, gfn_t end) 1536a6a0b05dSBen Gardon { 1537697c89beSVipin Sharma u64 dbit = kvm_ad_enabled() ? shadow_dirty_mask : PT_WRITABLE_MASK; 1538a6a0b05dSBen Gardon struct tdp_iter iter; 1539a6a0b05dSBen Gardon bool spte_set = false; 1540a6a0b05dSBen Gardon 15417cca2d0bSBen Gardon rcu_read_lock(); 15427cca2d0bSBen Gardon 1543a6a0b05dSBen Gardon tdp_root_for_each_leaf_pte(iter, root, start, end) { 154424ae4cfaSBen Gardon retry: 154524ae4cfaSBen Gardon if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) 15461af4a960SBen Gardon continue; 15471af4a960SBen Gardon 15483354ef5aSSean Christopherson if (!is_shadow_present_pte(iter.old_spte)) 15493354ef5aSSean Christopherson continue; 15503354ef5aSSean Christopherson 15510fe6370eSSean Christopherson KVM_MMU_WARN_ON(kvm_ad_enabled() && 15525982a539SVipin Sharma spte_ad_need_write_protect(iter.old_spte)); 15535982a539SVipin Sharma 1554697c89beSVipin Sharma if (!(iter.old_spte & dbit)) 1555a6a0b05dSBen Gardon continue; 1556a6a0b05dSBen Gardon 1557697c89beSVipin Sharma if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit)) 155824ae4cfaSBen Gardon goto retry; 15593255530aSDavid Matlack 1560a6a0b05dSBen Gardon spte_set = true; 1561a6a0b05dSBen Gardon } 15627cca2d0bSBen Gardon 15637cca2d0bSBen Gardon rcu_read_unlock(); 1564a6a0b05dSBen Gardon return spte_set; 1565a6a0b05dSBen Gardon } 1566a6a0b05dSBen Gardon 1567a6a0b05dSBen Gardon /* 1568a6a0b05dSBen Gardon * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If 1569a6a0b05dSBen Gardon * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. 1570a6a0b05dSBen Gardon * If AD bits are not enabled, this will require clearing the writable bit on 1571a6a0b05dSBen Gardon * each SPTE. Returns true if an SPTE has been changed and the TLBs need to 1572a6a0b05dSBen Gardon * be flushed. 1573a6a0b05dSBen Gardon */ 1574269e9552SHamza Mahfooz bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, 1575269e9552SHamza Mahfooz const struct kvm_memory_slot *slot) 1576a6a0b05dSBen Gardon { 1577a6a0b05dSBen Gardon struct kvm_mmu_page *root; 1578a6a0b05dSBen Gardon bool spte_set = false; 1579a6a0b05dSBen Gardon 158024ae4cfaSBen Gardon lockdep_assert_held_read(&kvm->mmu_lock); 1581a6a0b05dSBen Gardon 1582d62007edSSean Christopherson for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) 1583a6a0b05dSBen Gardon spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn, 1584a6a0b05dSBen Gardon slot->base_gfn + slot->npages); 1585a6a0b05dSBen Gardon 1586a6a0b05dSBen Gardon return spte_set; 1587a6a0b05dSBen Gardon } 1588a6a0b05dSBen Gardon 1589a6a0b05dSBen Gardon /* 1590a6a0b05dSBen Gardon * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is 1591a6a0b05dSBen Gardon * set in mask, starting at gfn. The given memslot is expected to contain all 1592a6a0b05dSBen Gardon * the GFNs represented by set bits in the mask. If AD bits are enabled, 1593a6a0b05dSBen Gardon * clearing the dirty status will involve clearing the dirty bit on each SPTE 1594a6a0b05dSBen Gardon * or, if AD bits are not enabled, clearing the writable bit on each SPTE. 1595a6a0b05dSBen Gardon */ 1596a6a0b05dSBen Gardon static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, 1597a6a0b05dSBen Gardon gfn_t gfn, unsigned long mask, bool wrprot) 1598a6a0b05dSBen Gardon { 1599697c89beSVipin Sharma u64 dbit = (wrprot || !kvm_ad_enabled()) ? PT_WRITABLE_MASK : 1600697c89beSVipin Sharma shadow_dirty_mask; 1601a6a0b05dSBen Gardon struct tdp_iter iter; 1602a6a0b05dSBen Gardon 160391303f80SLike Xu lockdep_assert_held_write(&kvm->mmu_lock); 160491303f80SLike Xu 16057cca2d0bSBen Gardon rcu_read_lock(); 16067cca2d0bSBen Gardon 1607a6a0b05dSBen Gardon tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask), 1608a6a0b05dSBen Gardon gfn + BITS_PER_LONG) { 1609a6a0b05dSBen Gardon if (!mask) 1610a6a0b05dSBen Gardon break; 1611a6a0b05dSBen Gardon 16120fe6370eSSean Christopherson KVM_MMU_WARN_ON(kvm_ad_enabled() && 16135982a539SVipin Sharma spte_ad_need_write_protect(iter.old_spte)); 16145982a539SVipin Sharma 1615a6a0b05dSBen Gardon if (iter.level > PG_LEVEL_4K || 1616a6a0b05dSBen Gardon !(mask & (1UL << (iter.gfn - gfn)))) 1617a6a0b05dSBen Gardon continue; 1618a6a0b05dSBen Gardon 1619f1b3b06aSBen Gardon mask &= ~(1UL << (iter.gfn - gfn)); 1620f1b3b06aSBen Gardon 1621697c89beSVipin Sharma if (!(iter.old_spte & dbit)) 1622a6a0b05dSBen Gardon continue; 1623a6a0b05dSBen Gardon 162489c313f2SVipin Sharma iter.old_spte = tdp_mmu_clear_spte_bits(iter.sptep, 162589c313f2SVipin Sharma iter.old_spte, dbit, 162689c313f2SVipin Sharma iter.level); 162789c313f2SVipin Sharma 16281e0f4298SVipin Sharma trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level, 16291e0f4298SVipin Sharma iter.old_spte, 16301e0f4298SVipin Sharma iter.old_spte & ~dbit); 16311e0f4298SVipin Sharma kvm_set_pfn_dirty(spte_to_pfn(iter.old_spte)); 1632a6a0b05dSBen Gardon } 16337cca2d0bSBen Gardon 16347cca2d0bSBen Gardon rcu_read_unlock(); 1635a6a0b05dSBen Gardon } 1636a6a0b05dSBen Gardon 1637a6a0b05dSBen Gardon /* 1638a6a0b05dSBen Gardon * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is 1639a6a0b05dSBen Gardon * set in mask, starting at gfn. The given memslot is expected to contain all 1640a6a0b05dSBen Gardon * the GFNs represented by set bits in the mask. If AD bits are enabled, 1641a6a0b05dSBen Gardon * clearing the dirty status will involve clearing the dirty bit on each SPTE 1642a6a0b05dSBen Gardon * or, if AD bits are not enabled, clearing the writable bit on each SPTE. 1643a6a0b05dSBen Gardon */ 1644a6a0b05dSBen Gardon void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, 1645a6a0b05dSBen Gardon struct kvm_memory_slot *slot, 1646a6a0b05dSBen Gardon gfn_t gfn, unsigned long mask, 1647a6a0b05dSBen Gardon bool wrprot) 1648a6a0b05dSBen Gardon { 1649a6a0b05dSBen Gardon struct kvm_mmu_page *root; 1650a6a0b05dSBen Gardon 1651a3f15bdaSSean Christopherson for_each_tdp_mmu_root(kvm, root, slot->as_id) 1652a6a0b05dSBen Gardon clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot); 1653a6a0b05dSBen Gardon } 1654a6a0b05dSBen Gardon 16554b85c921SSean Christopherson static void zap_collapsible_spte_range(struct kvm *kvm, 165614881998SBen Gardon struct kvm_mmu_page *root, 16574b85c921SSean Christopherson const struct kvm_memory_slot *slot) 165814881998SBen Gardon { 16599eba50f8SSean Christopherson gfn_t start = slot->base_gfn; 16609eba50f8SSean Christopherson gfn_t end = start + slot->npages; 166114881998SBen Gardon struct tdp_iter iter; 16625ba7c4c6SBen Gardon int max_mapping_level; 166314881998SBen Gardon 16647cca2d0bSBen Gardon rcu_read_lock(); 16657cca2d0bSBen Gardon 166685f44f8cSSean Christopherson for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) { 166785f44f8cSSean Christopherson retry: 16684b85c921SSean Christopherson if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) 16691af4a960SBen Gardon continue; 16701af4a960SBen Gardon 167185f44f8cSSean Christopherson if (iter.level > KVM_MAX_HUGEPAGE_LEVEL || 167285f44f8cSSean Christopherson !is_shadow_present_pte(iter.old_spte)) 167385f44f8cSSean Christopherson continue; 167485f44f8cSSean Christopherson 167585f44f8cSSean Christopherson /* 167685f44f8cSSean Christopherson * Don't zap leaf SPTEs, if a leaf SPTE could be replaced with 167785f44f8cSSean Christopherson * a large page size, then its parent would have been zapped 167885f44f8cSSean Christopherson * instead of stepping down. 167985f44f8cSSean Christopherson */ 168085f44f8cSSean Christopherson if (is_last_spte(iter.old_spte, iter.level)) 168185f44f8cSSean Christopherson continue; 168285f44f8cSSean Christopherson 168385f44f8cSSean Christopherson /* 168485f44f8cSSean Christopherson * If iter.gfn resides outside of the slot, i.e. the page for 168585f44f8cSSean Christopherson * the current level overlaps but is not contained by the slot, 168685f44f8cSSean Christopherson * then the SPTE can't be made huge. More importantly, trying 168785f44f8cSSean Christopherson * to query that info from slot->arch.lpage_info will cause an 168885f44f8cSSean Christopherson * out-of-bounds access. 168985f44f8cSSean Christopherson */ 169085f44f8cSSean Christopherson if (iter.gfn < start || iter.gfn >= end) 169114881998SBen Gardon continue; 169214881998SBen Gardon 16935ba7c4c6SBen Gardon max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, 1694a8ac499bSSean Christopherson iter.gfn, PG_LEVEL_NUM); 169585f44f8cSSean Christopherson if (max_mapping_level < iter.level) 16965ba7c4c6SBen Gardon continue; 16975ba7c4c6SBen Gardon 16984b85c921SSean Christopherson /* Note, a successful atomic zap also does a remote TLB flush. */ 169985f44f8cSSean Christopherson if (tdp_mmu_zap_spte_atomic(kvm, &iter)) 170085f44f8cSSean Christopherson goto retry; 17012db6f772SBen Gardon } 170214881998SBen Gardon 17037cca2d0bSBen Gardon rcu_read_unlock(); 170414881998SBen Gardon } 170514881998SBen Gardon 170614881998SBen Gardon /* 170785f44f8cSSean Christopherson * Zap non-leaf SPTEs (and free their associated page tables) which could 170885f44f8cSSean Christopherson * be replaced by huge pages, for GFNs within the slot. 170914881998SBen Gardon */ 17104b85c921SSean Christopherson void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, 17114b85c921SSean Christopherson const struct kvm_memory_slot *slot) 171214881998SBen Gardon { 171314881998SBen Gardon struct kvm_mmu_page *root; 171414881998SBen Gardon 17152db6f772SBen Gardon lockdep_assert_held_read(&kvm->mmu_lock); 171614881998SBen Gardon 1717d62007edSSean Christopherson for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) 17184b85c921SSean Christopherson zap_collapsible_spte_range(kvm, root, slot); 171914881998SBen Gardon } 172046044f72SBen Gardon 172146044f72SBen Gardon /* 172246044f72SBen Gardon * Removes write access on the last level SPTE mapping this GFN and unsets the 17235fc3424fSSean Christopherson * MMU-writable bit to ensure future writes continue to be intercepted. 172446044f72SBen Gardon * Returns true if an SPTE was set and a TLB flush is needed. 172546044f72SBen Gardon */ 172646044f72SBen Gardon static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, 17273ad93562SKeqian Zhu gfn_t gfn, int min_level) 172846044f72SBen Gardon { 172946044f72SBen Gardon struct tdp_iter iter; 173046044f72SBen Gardon u64 new_spte; 173146044f72SBen Gardon bool spte_set = false; 173246044f72SBen Gardon 17333ad93562SKeqian Zhu BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); 17343ad93562SKeqian Zhu 17357cca2d0bSBen Gardon rcu_read_lock(); 17367cca2d0bSBen Gardon 173777aa6075SDavid Matlack for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) { 17383ad93562SKeqian Zhu if (!is_shadow_present_pte(iter.old_spte) || 17393ad93562SKeqian Zhu !is_last_spte(iter.old_spte, iter.level)) 17403ad93562SKeqian Zhu continue; 17413ad93562SKeqian Zhu 174246044f72SBen Gardon new_spte = iter.old_spte & 17435fc3424fSSean Christopherson ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask); 174446044f72SBen Gardon 17457c8a4742SDavid Matlack if (new_spte == iter.old_spte) 17467c8a4742SDavid Matlack break; 17477c8a4742SDavid Matlack 17480b7cc254SVipin Sharma tdp_mmu_iter_set_spte(kvm, &iter, new_spte); 174946044f72SBen Gardon spte_set = true; 175046044f72SBen Gardon } 175146044f72SBen Gardon 17527cca2d0bSBen Gardon rcu_read_unlock(); 17537cca2d0bSBen Gardon 175446044f72SBen Gardon return spte_set; 175546044f72SBen Gardon } 175646044f72SBen Gardon 175746044f72SBen Gardon /* 175846044f72SBen Gardon * Removes write access on the last level SPTE mapping this GFN and unsets the 17595fc3424fSSean Christopherson * MMU-writable bit to ensure future writes continue to be intercepted. 176046044f72SBen Gardon * Returns true if an SPTE was set and a TLB flush is needed. 176146044f72SBen Gardon */ 176246044f72SBen Gardon bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, 17633ad93562SKeqian Zhu struct kvm_memory_slot *slot, gfn_t gfn, 17643ad93562SKeqian Zhu int min_level) 176546044f72SBen Gardon { 176646044f72SBen Gardon struct kvm_mmu_page *root; 176746044f72SBen Gardon bool spte_set = false; 176846044f72SBen Gardon 1769531810caSBen Gardon lockdep_assert_held_write(&kvm->mmu_lock); 1770a3f15bdaSSean Christopherson for_each_tdp_mmu_root(kvm, root, slot->as_id) 17713ad93562SKeqian Zhu spte_set |= write_protect_gfn(kvm, root, gfn, min_level); 1772a3f15bdaSSean Christopherson 177346044f72SBen Gardon return spte_set; 177446044f72SBen Gardon } 177546044f72SBen Gardon 177695fb5b02SBen Gardon /* 177795fb5b02SBen Gardon * Return the level of the lowest level SPTE added to sptes. 177895fb5b02SBen Gardon * That SPTE may be non-present. 1779c5c8c7c5SDavid Matlack * 1780c5c8c7c5SDavid Matlack * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. 178195fb5b02SBen Gardon */ 178239b4d43eSSean Christopherson int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, 178339b4d43eSSean Christopherson int *root_level) 178495fb5b02SBen Gardon { 178595fb5b02SBen Gardon struct tdp_iter iter; 178695fb5b02SBen Gardon struct kvm_mmu *mmu = vcpu->arch.mmu; 178795fb5b02SBen Gardon gfn_t gfn = addr >> PAGE_SHIFT; 17882aa07893SSean Christopherson int leaf = -1; 178995fb5b02SBen Gardon 1790a972e29cSPaolo Bonzini *root_level = vcpu->arch.mmu->root_role.level; 179195fb5b02SBen Gardon 179295fb5b02SBen Gardon tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { 179395fb5b02SBen Gardon leaf = iter.level; 1794dde81f94SSean Christopherson sptes[leaf] = iter.old_spte; 179595fb5b02SBen Gardon } 179695fb5b02SBen Gardon 179795fb5b02SBen Gardon return leaf; 179895fb5b02SBen Gardon } 17996e8eb206SDavid Matlack 18006e8eb206SDavid Matlack /* 18016e8eb206SDavid Matlack * Returns the last level spte pointer of the shadow page walk for the given 18026e8eb206SDavid Matlack * gpa, and sets *spte to the spte value. This spte may be non-preset. If no 18036e8eb206SDavid Matlack * walk could be performed, returns NULL and *spte does not contain valid data. 18046e8eb206SDavid Matlack * 18056e8eb206SDavid Matlack * Contract: 18066e8eb206SDavid Matlack * - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. 18076e8eb206SDavid Matlack * - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end. 18086e8eb206SDavid Matlack * 18096e8eb206SDavid Matlack * WARNING: This function is only intended to be called during fast_page_fault. 18106e8eb206SDavid Matlack */ 18116e8eb206SDavid Matlack u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, 18126e8eb206SDavid Matlack u64 *spte) 18136e8eb206SDavid Matlack { 18146e8eb206SDavid Matlack struct tdp_iter iter; 18156e8eb206SDavid Matlack struct kvm_mmu *mmu = vcpu->arch.mmu; 18166e8eb206SDavid Matlack gfn_t gfn = addr >> PAGE_SHIFT; 18176e8eb206SDavid Matlack tdp_ptep_t sptep = NULL; 18186e8eb206SDavid Matlack 18196e8eb206SDavid Matlack tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { 18206e8eb206SDavid Matlack *spte = iter.old_spte; 18216e8eb206SDavid Matlack sptep = iter.sptep; 18226e8eb206SDavid Matlack } 18236e8eb206SDavid Matlack 18246e8eb206SDavid Matlack /* 18256e8eb206SDavid Matlack * Perform the rcu_dereference to get the raw spte pointer value since 18266e8eb206SDavid Matlack * we are passing it up to fast_page_fault, which is shared with the 18276e8eb206SDavid Matlack * legacy MMU and thus does not retain the TDP MMU-specific __rcu 18286e8eb206SDavid Matlack * annotation. 18296e8eb206SDavid Matlack * 18306e8eb206SDavid Matlack * This is safe since fast_page_fault obeys the contracts of this 18316e8eb206SDavid Matlack * function as well as all TDP MMU contracts around modifying SPTEs 18326e8eb206SDavid Matlack * outside of mmu_lock. 18336e8eb206SDavid Matlack */ 18346e8eb206SDavid Matlack return rcu_dereference(sptep); 18356e8eb206SDavid Matlack } 1836