// SPDX-License-Identifier: GPL-2.0

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>

#ifdef CONFIG_X86_64
static bool __read_mostly tdp_mmu_enabled = false;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
#endif

static bool is_tdp_mmu_enabled(void)
{
#ifdef CONFIG_X86_64
	return tdp_enabled && READ_ONCE(tdp_mmu_enabled);
#else
	return false;
#endif /* CONFIG_X86_64 */
}

/* Initializes the TDP MMU for the VM, if enabled. */
void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
	if (!is_tdp_mmu_enabled())
		return;

	/* This should not be changed for the lifetime of the VM. */
	kvm->arch.tdp_mmu_enabled = true;

	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
}

void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
	if (!kvm->arch.tdp_mmu_enabled)
		return;

	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

	/*
	 * Ensure that all the outstanding RCU callbacks to free shadow pages
	 * can run before the VM is torn down.
	 */
	rcu_barrier();
}

static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
	if (kvm_mmu_put_root(kvm, root))
		kvm_tdp_mmu_free_root(kvm, root);
}

static inline bool tdp_mmu_next_root_valid(struct kvm *kvm,
					   struct kvm_mmu_page *root)
{
	lockdep_assert_held_write(&kvm->mmu_lock);

	if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link))
		return false;

	kvm_mmu_get_root(kvm, root);
	return true;
}

static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
						     struct kvm_mmu_page *root)
{
	struct kvm_mmu_page *next_root;

	next_root = list_next_entry(root, link);
	tdp_mmu_put_root(kvm, root);
	return next_root;
}

/*
 * Note: this iterator gets and puts references to the roots it iterates over.
 * This makes it safe to release the MMU lock and yield within the loop, but
 * if exiting the loop early, the caller must drop the reference to the most
 * recent root. (Unless keeping a live reference is desirable.)
 */
#define for_each_tdp_mmu_root_yield_safe(_kvm, _root)			\
	for (_root = list_first_entry(&_kvm->arch.tdp_mmu_roots,	\
				      typeof(*_root), link);		\
	     tdp_mmu_next_root_valid(_kvm, _root);			\
	     _root = tdp_mmu_next_root(_kvm, _root))

#define for_each_tdp_mmu_root(_kvm, _root)				\
	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
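
/*
 * Illustrative usage of the yield-safe iterator above (a sketch, not a
 * caller that exists in this file): references are taken and dropped as the
 * walk advances, so a caller that bails out early must drop the reference
 * to the most recent root itself. some_condition() and process() are
 * placeholders for whatever work the caller performs.
 *
 *	for_each_tdp_mmu_root_yield_safe(kvm, root) {
 *		if (some_condition(root)) {
 *			tdp_mmu_put_root(kvm, root);
 *			break;
 *		}
 *		process(kvm, root);
 *	}
 */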

bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa)
{
	struct kvm_mmu_page *sp;

	if (!kvm->arch.tdp_mmu_enabled)
		return false;
	if (WARN_ON(!VALID_PAGE(hpa)))
		return false;

	sp = to_shadow_page(hpa);
	if (WARN_ON(!sp))
		return false;

	return sp->tdp_mmu_page && sp->root_count;
}

static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end, bool can_yield);

void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);

	lockdep_assert_held_write(&kvm->mmu_lock);

	WARN_ON(root->root_count);
	WARN_ON(!root->tdp_mmu_page);

	list_del(&root->link);

	zap_gfn_range(kvm, root, 0, max_gfn, false);

	free_page((unsigned long)root->spt);
	kmem_cache_free(mmu_page_header_cache, root);
}

static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
						   int level)
{
	union kvm_mmu_page_role role;

	role = vcpu->arch.mmu->mmu_role.base;
	role.level = level;
	role.direct = true;
	role.gpte_is_8_bytes = true;
	role.access = ACC_ALL;

	return role;
}

static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
					       int level)
{
	struct kvm_mmu_page *sp;

	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

	sp->role.word = page_role_for_level(vcpu, level).word;
	sp->gfn = gfn;
	sp->tdp_mmu_page = true;

	trace_kvm_mmu_get_page(sp, true);

	return sp;
}

static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
{
	union kvm_mmu_page_role role;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_page *root;

	role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);

	write_lock(&kvm->mmu_lock);

	/* Check for an existing root before allocating a new one. */
	for_each_tdp_mmu_root(kvm, root) {
		if (root->role.word == role.word) {
			kvm_mmu_get_root(kvm, root);
			write_unlock(&kvm->mmu_lock);
			return root;
		}
	}

	root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
	root->root_count = 1;

	list_add(&root->link, &kvm->arch.tdp_mmu_roots);

	write_unlock(&kvm->mmu_lock);

	return root;
}

hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_page *root;

	root = get_tdp_mmu_vcpu_root(vcpu);
	if (!root)
		return INVALID_PAGE;

	return __pa(root->spt);
}

static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
	free_page((unsigned long)sp->spt);
	kmem_cache_free(mmu_page_header_cache, sp);
}

/*
 * This is called through call_rcu in order to free TDP page table memory
 * safely with respect to other kernel threads that may be operating on
 * the memory.
 * By only accessing TDP MMU page table memory in an RCU read critical
 * section, and freeing it after a grace period, lockless access to that
 * memory won't use it after it is freed.
 */
static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
{
	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
					       rcu_head);

	tdp_mmu_free_sp(sp);
}
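
/*
 * Illustrative sketch of the lockless-walk pattern the RCU callback above
 * protects (an assumed shape, mirroring the readers later in this file):
 *
 *	rcu_read_lock();
 *	... walk page table memory through rcu_dereference()d SPTE pointers ...
 *	rcu_read_unlock();
 *
 * Because walkers only touch TDP MMU page table memory inside an RCU read
 * critical section, freeing through call_rcu() guarantees the memory is not
 * reused while such a walk is still in flight.
 */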

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared);

static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
{
	return sp->role.smm ? 1 : 0;
}

static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
{
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
		return;

	if (is_accessed_spte(old_spte) &&
	    (!is_accessed_spte(new_spte) || pfn_changed))
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}

static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
					  u64 old_spte, u64 new_spte, int level)
{
	bool pfn_changed;
	struct kvm_memory_slot *slot;

	if (level > PG_LEVEL_4K)
		return;

	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	if ((!is_writable_pte(old_spte) || pfn_changed) &&
	    is_writable_pte(new_spte)) {
		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
		mark_page_dirty_in_slot(kvm, slot, gfn);
	}
}

/**
 * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
 *
 * @kvm: kvm instance
 * @sp: the new page
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be adding or removing pages.
 * @account_nx: This page replaces a NX large page and should be marked for
 *		eventual reclaim.
 */
static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
			      bool shared, bool account_nx)
{
	if (shared)
		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
	if (account_nx)
		account_huge_nx_page(kvm, sp);

	if (shared)
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

/**
 * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
 *
 * @kvm: kvm instance
 * @sp: the page to be removed
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be adding or removing pages.
 */
static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
				bool shared)
{
	if (shared)
		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	list_del(&sp->link);
	if (sp->lpage_disallowed)
		unaccount_huge_nx_page(kvm, sp);

	if (shared)
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

/**
 * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
 *
 * @kvm: kvm instance
 * @pt: the page removed from the paging structure
 * @shared: This operation may not be running under the exclusive use
 *	    of the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Given a page table that has been removed from the TDP paging structure,
 * iterates through the page table to clear SPTEs and free child page tables.
 */
static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt,
					bool shared)
{
	struct kvm_mmu_page *sp = sptep_to_sp(pt);
	int level = sp->role.level;
	gfn_t gfn = sp->gfn;
	u64 old_child_spte;
	u64 *sptep;
	int i;

	trace_kvm_mmu_prepare_zap_page(sp);

	tdp_mmu_unlink_page(kvm, sp, shared);

	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
		sptep = pt + i;

		if (shared) {
			old_child_spte = xchg(sptep, 0);
		} else {
			old_child_spte = READ_ONCE(*sptep);
			WRITE_ONCE(*sptep, 0);
		}
		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp),
			gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)),
			old_child_spte, 0, level - 1, shared);
	}

	kvm_flush_remote_tlbs_with_address(kvm, gfn,
					   KVM_PAGES_PER_HPAGE(level));

	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}
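
/*
 * Note on the two clearing modes above (restating the code, not adding new
 * behavior): when @shared is true another thread may be writing the same
 * SPTE concurrently, so the old value must be obtained and zeroed in a
 * single atomic step:
 *
 *	old_child_spte = xchg(sptep, 0);
 *
 * With the MMU lock held for write no other writer can race, so a plain
 * READ_ONCE()/WRITE_ONCE() pair is sufficient and cheaper.
 */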

/**
 * handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Handle bookkeeping that might result from the modification of a SPTE.
 * This function must be called for all TDP SPTE modifications.
 */
static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				  u64 old_spte, u64 new_spte, int level,
				  bool shared)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);
	bool was_leaf = was_present && is_last_spte(old_spte, level);
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
	WARN_ON(level < PG_LEVEL_4K);
	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));

	/*
	 * If this warning were to trigger it would indicate that there was a
	 * missing MMU notifier or a race with some notifier handler.
	 * A present, leaf SPTE should never be directly replaced with another
	 * present leaf SPTE pointing to a different PFN. A notifier handler
	 * should be zapping the SPTE before the main MM's page table is
	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
	 * thread before replacement.
	 */
	if (was_leaf && is_leaf && pfn_changed) {
		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
		       "SPTE with another present leaf SPTE mapping a\n"
		       "different PFN!\n"
		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
		       as_id, gfn, old_spte, new_spte, level);

		/*
		 * Crash the host to prevent error propagation and guest data
		 * corruption.
		 */
		BUG();
	}

	if (old_spte == new_spte)
		return;

	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);

	/*
	 * The only times a SPTE should be changed from a non-present to
	 * non-present state is when an MMIO entry is installed/modified/
	 * removed. In that case, there is nothing to do here.
	 */
	if (!was_present && !is_present) {
		/*
		 * If this change does not involve a MMIO SPTE, it is
		 * unexpected. Log the change, though it should not impact the
		 * guest since both the former and current SPTEs are nonpresent.
		 */
		if (WARN_ON(!is_mmio_spte(old_spte) && !is_mmio_spte(new_spte)))
			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
			       "should not be replaced with another,\n"
			       "different nonpresent SPTE, unless one or both\n"
			       "are MMIO SPTEs.\n"
			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
			       as_id, gfn, old_spte, new_spte, level);
		return;
	}

	if (was_leaf && is_dirty_spte(old_spte) &&
	    (!is_dirty_spte(new_spte) || pfn_changed))
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));

	/*
	 * Recursively handle child PTs if the change removed a subtree from
	 * the paging structure.
	 */
	if (was_present && !was_leaf && (pfn_changed || !is_present))
		handle_removed_tdp_mmu_page(kvm,
				spte_to_child_pt(old_spte, level), shared);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared)
{
	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
			      shared);
	handle_changed_spte_acc_track(old_spte, new_spte, level);
	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
				      new_spte, level);
}

/*
 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the
 * associated bookkeeping
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * Returns: true if the SPTE was set, false if it was not. If false is returned,
 *	    this function will have no side-effects.
 */
static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
					   struct tdp_iter *iter,
					   u64 new_spte)
{
	u64 *root_pt = tdp_iter_root_pt(iter);
	struct kvm_mmu_page *root = sptep_to_sp(root_pt);
	int as_id = kvm_mmu_page_as_id(root);

	lockdep_assert_held_read(&kvm->mmu_lock);

	if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
		      new_spte) != iter->old_spte)
		return false;

	handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
			    iter->level, true);

	return true;
}
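
/*
 * Typical retry pattern for the helper above (an illustrative sketch; the
 * real callers appear later in this file): losing the cmpxchg64() race is
 * not an error, the caller simply backs off and lets the fault be retried.
 *
 *	if (!tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, new_spte))
 *		return RET_PF_RETRY;
 */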

/*
 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * @record_acc_track: Notify the MM subsystem of changes to the accessed state
 *		      of the page. Should be set unless handling an MMU
 *		      notifier for access tracking. Leaving record_acc_track
 *		      unset in that case prevents page accesses from being
 *		      double counted.
 * @record_dirty_log: Record the page as dirty in the dirty bitmap if
 *		      appropriate for the change being made. Should be set
 *		      unless performing certain dirty logging operations.
 *		      Leaving record_dirty_log unset in that case prevents page
 *		      writes from being double counted.
 */
static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				      u64 new_spte, bool record_acc_track,
				      bool record_dirty_log)
{
	tdp_ptep_t root_pt = tdp_iter_root_pt(iter);
	struct kvm_mmu_page *root = sptep_to_sp(root_pt);
	int as_id = kvm_mmu_page_as_id(root);

	lockdep_assert_held_write(&kvm->mmu_lock);

	WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);

	__handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
			      iter->level, false);
	if (record_acc_track)
		handle_changed_spte_acc_track(iter->old_spte, new_spte,
					      iter->level);
	if (record_dirty_log)
		handle_changed_spte_dirty_log(kvm, as_id, iter->gfn,
					      iter->old_spte, new_spte,
					      iter->level);
}

static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				    u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
}

static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
}

static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
}

#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
	for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)

#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
		if (!is_shadow_present_pte(_iter.old_spte) ||		\
		    !is_last_spte(_iter.old_spte, _iter.level))		\
			continue;					\
		else

#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
	for_each_tdp_pte(_iter, __va(_mmu->root_hpa),		\
			 _mmu->shadow_root_level, _start, _end)

/*
 * Yield if the MMU lock is contended or this thread needs to return control
 * to the scheduler.
 *
 * If this function should yield and flush is set, it will perform a remote
 * TLB flush before yielding.
 *
 * If this function yields, it will also reset the tdp_iter's walk over the
 * paging structure and the calling function should skip to the next
 * iteration to allow the iterator to continue its traversal from the
 * paging structure root.
 *
 * Return true if this function yielded and the iterator's traversal was reset.
 * Return false if a yield was not needed.
 */
static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
					     struct tdp_iter *iter, bool flush)
{
	/* Ensure forward progress has been made before yielding. */
	if (iter->next_last_level_gfn == iter->yielded_gfn)
		return false;

	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
		rcu_read_unlock();

		if (flush)
			kvm_flush_remote_tlbs(kvm);

		cond_resched_rwlock_write(&kvm->mmu_lock);
		rcu_read_lock();

		WARN_ON(iter->gfn > iter->next_last_level_gfn);

		tdp_iter_start(iter, iter->pt_path[iter->root_level - 1],
			       iter->root_level, iter->min_level,
			       iter->next_last_level_gfn);

		return true;
	}

	return false;
}
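
/*
 * Illustrative loop shape for the yield helper above (a sketch of the
 * pattern used by the zap/clear walkers below): when the helper yields it
 * restarts the iterator, so the loop body must skip straight to the next
 * iteration rather than acting on stale iterator state.
 *
 *	tdp_root_for_each_pte(iter, root, start, end) {
 *		if (tdp_mmu_iter_cond_resched(kvm, &iter, flush))
 *			continue;
 *		...
 *	}
 */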

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock. If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
 * operation can cause a soft lockup.
 */
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end, bool can_yield)
{
	struct tdp_iter iter;
	bool flush_needed = false;

	rcu_read_lock();

	tdp_root_for_each_pte(iter, root, start, end) {
		if (can_yield &&
		    tdp_mmu_iter_cond_resched(kvm, &iter, flush_needed)) {
			flush_needed = false;
			continue;
		}

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		/*
		 * If this is a non-last-level SPTE that covers a larger range
		 * than should be zapped, continue, and zap the mappings at a
		 * lower level.
		 */
		if ((iter.gfn < start ||
		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		tdp_mmu_set_spte(kvm, &iter, 0);
		flush_needed = true;
	}

	rcu_read_unlock();
	return flush_needed;
}

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 */
bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end)
{
	struct kvm_mmu_page *root;
	bool flush = false;

	for_each_tdp_mmu_root_yield_safe(kvm, root)
		flush |= zap_gfn_range(kvm, root, start, end, true);

	return flush;
}

void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
	bool flush;

	flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn);
	if (flush)
		kvm_flush_remote_tlbs(kvm);
}

/*
 * Installs a last-level SPTE to handle a TDP page fault.
 * (NPT/EPT violation/misconfiguration)
 */
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
					   int map_writable,
					   struct tdp_iter *iter,
					   kvm_pfn_t pfn, bool prefault)
{
	u64 new_spte;
	int ret = 0;
	int make_spte_ret = 0;

	if (unlikely(is_noslot_pfn(pfn)))
		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
	else
		make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
					  pfn, iter->old_spte, prefault, true,
					  map_writable, !shadow_accessed_mask,
					  &new_spte);

	if (new_spte == iter->old_spte)
		ret = RET_PF_SPURIOUS;
	else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
		return RET_PF_RETRY;

	/*
	 * If the page fault was caused by a write but the page is write
	 * protected, emulation is needed. If the emulation was skipped,
	 * the vCPU would have the same fault again.
	 */
	if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
		if (write)
			ret = RET_PF_EMULATE;
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
	}

	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
	if (unlikely(is_mmio_spte(new_spte))) {
		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
				     new_spte);
		ret = RET_PF_EMULATE;
	} else
		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
				       rcu_dereference(iter->sptep));

	trace_kvm_mmu_set_spte(iter->level, iter->gfn,
			       rcu_dereference(iter->sptep));
	if (!prefault)
		vcpu->stat.pf_fixed++;

	return ret;
}
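
/*
 * Note (summarizing the helper above, not describing new behavior): with
 * the MMU lock held only for read, two vCPUs can fault on the same GFN at
 * the same time. Whichever vCPU loses the atomic SPTE update simply returns
 * RET_PF_RETRY and re-runs the fault; by then the winning vCPU will usually
 * have installed the mapping.
 */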

/*
 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
 * page tables and SPTEs to translate the faulting guest physical address.
 */
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
		    int map_writable, int max_level, kvm_pfn_t pfn,
		    bool prefault)
{
	bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
	bool write = error_code & PFERR_WRITE_MASK;
	bool exec = error_code & PFERR_FETCH_MASK;
	bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	struct tdp_iter iter;
	struct kvm_mmu_page *sp;
	u64 *child_pt;
	u64 new_spte;
	int ret;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int level;
	int req_level;

	if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
		return RET_PF_RETRY;
	if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
		return RET_PF_RETRY;

	level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
					huge_page_disallowed, &req_level);

	trace_kvm_mmu_spte_requested(gpa, level, pfn);

	rcu_read_lock();

	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		if (nx_huge_page_workaround_enabled)
			disallowed_hugepage_adjust(iter.old_spte, gfn,
						   iter.level, &pfn, &level);

		if (iter.level == level)
			break;

		/*
		 * If there is an SPTE mapping a large page at a higher level
		 * than the target, that SPTE must be cleared and replaced
		 * with a non-leaf SPTE.
		 */
		if (is_shadow_present_pte(iter.old_spte) &&
		    is_large_pte(iter.old_spte)) {
			if (!tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, 0))
				break;

			kvm_flush_remote_tlbs_with_address(vcpu->kvm, iter.gfn,
					KVM_PAGES_PER_HPAGE(iter.level));

			/*
			 * The iter must explicitly re-read the spte here
			 * because the new value informs the !present
			 * path below.
			 */
			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
		}

		if (!is_shadow_present_pte(iter.old_spte)) {
			sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
			child_pt = sp->spt;

			new_spte = make_nonleaf_spte(child_pt,
						     !shadow_accessed_mask);

			if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
						    new_spte)) {
				tdp_mmu_link_page(vcpu->kvm, sp, true,
						  huge_page_disallowed &&
						  req_level >= iter.level);

				trace_kvm_mmu_get_page(sp, true);
			} else {
				tdp_mmu_free_sp(sp);
				break;
			}
		}
	}

	if (iter.level != level) {
		rcu_read_unlock();
		return RET_PF_RETRY;
	}

	ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
					      pfn, prefault);
	rcu_read_unlock();

	return ret;
}

static int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, unsigned long start,
		unsigned long end, unsigned long data,
		int (*handler)(struct kvm *kvm, struct kvm_memory_slot *slot,
			       struct kvm_mmu_page *root, gfn_t start,
			       gfn_t end, unsigned long data))
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	struct kvm_mmu_page *root;
	int ret = 0;
	int as_id;

	for_each_tdp_mmu_root_yield_safe(kvm, root) {
		as_id = kvm_mmu_page_as_id(root);
		slots = __kvm_memslots(kvm, as_id);
		kvm_for_each_memslot(memslot, slots) {
			unsigned long hva_start, hva_end;
			gfn_t gfn_start, gfn_end;

			hva_start = max(start, memslot->userspace_addr);
			hva_end = min(end, memslot->userspace_addr +
				      (memslot->npages << PAGE_SHIFT));
			if (hva_start >= hva_end)
				continue;
			/*
			 * {gfn(page) | page intersects with [hva_start, hva_end)} =
			 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
			 */
			gfn_start = hva_to_gfn_memslot(hva_start, memslot);
			gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);

			ret |= handler(kvm, memslot, root, gfn_start,
				       gfn_end, data);
		}
	}

	return ret;
}

static int zap_gfn_range_hva_wrapper(struct kvm *kvm,
				     struct kvm_memory_slot *slot,
				     struct kvm_mmu_page *root, gfn_t start,
				     gfn_t end, unsigned long unused)
{
	return zap_gfn_range(kvm, root, start, end, false);
}

int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
			      unsigned long end)
{
	return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
					    zap_gfn_range_hva_wrapper);
}
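
/*
 * Sketch of how an MMU-notifier operation plugs into
 * kvm_tdp_mmu_handle_hva_range() (my_gfn_handler is a hypothetical name,
 * shown only to illustrate the callback contract used by the wrappers in
 * this file):
 *
 *	static int my_gfn_handler(struct kvm *kvm, struct kvm_memory_slot *slot,
 *				  struct kvm_mmu_page *root, gfn_t start,
 *				  gfn_t end, unsigned long data);
 *
 *	return kvm_tdp_mmu_handle_hva_range(kvm, hva_start, hva_end, data,
 *					    my_gfn_handler);
 *
 * The handler is invoked once per (root, memslot) pair, with the GFN range
 * that overlaps the requested HVA range.
 */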

/*
 * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
 * if any of the GFNs in the range have been accessed.
 */
static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
			 struct kvm_mmu_page *root, gfn_t start, gfn_t end,
			 unsigned long unused)
{
	struct tdp_iter iter;
	int young = 0;
	u64 new_spte = 0;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, start, end) {
		/*
		 * If we have a non-accessed entry we don't need to change the
		 * pte.
		 */
		if (!is_accessed_spte(iter.old_spte))
			continue;

		new_spte = iter.old_spte;

		if (spte_ad_enabled(new_spte)) {
			clear_bit((ffs(shadow_accessed_mask) - 1),
				  (unsigned long *)&new_spte);
		} else {
			/*
			 * Capture the dirty status of the page, so that it
			 * doesn't get lost when the SPTE is marked for access
			 * tracking.
			 */
			if (is_writable_pte(new_spte))
				kvm_set_pfn_dirty(spte_to_pfn(new_spte));

			new_spte = mark_spte_for_access_track(new_spte);
		}
		new_spte &= ~shadow_dirty_mask;

		tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte);
		young = 1;

		trace_kvm_age_page(iter.gfn, iter.level, slot, young);
	}

	rcu_read_unlock();

	return young;
}

int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
			      unsigned long end)
{
	return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
					    age_gfn_range);
}

static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
			struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
			unsigned long unused2)
{
	struct tdp_iter iter;

	tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1)
		if (is_accessed_spte(iter.old_spte))
			return 1;

	return 0;
}

int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva)
{
	return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0,
					    test_age_gfn);
}

/*
 * Handle the changed_pte MMU notifier for the TDP MMU.
 * data is a pointer to the new pte_t mapping the HVA specified by the MMU
 * notifier.
 * Returns non-zero if a flush is needed before releasing the MMU lock.
 */
static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
			struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
			unsigned long data)
{
	struct tdp_iter iter;
	pte_t *ptep = (pte_t *)data;
	kvm_pfn_t new_pfn;
	u64 new_spte;
	int need_flush = 0;

	rcu_read_lock();

	WARN_ON(pte_huge(*ptep));

	new_pfn = pte_pfn(*ptep);

	tdp_root_for_each_pte(iter, root, gfn, gfn + 1) {
		if (iter.level != PG_LEVEL_4K)
			continue;

		if (!is_shadow_present_pte(iter.old_spte))
			break;

		tdp_mmu_set_spte(kvm, &iter, 0);

		kvm_flush_remote_tlbs_with_address(kvm, iter.gfn, 1);

		if (!pte_write(*ptep)) {
			new_spte = kvm_mmu_changed_pte_notifier_make_spte(
					iter.old_spte, new_pfn);

			tdp_mmu_set_spte(kvm, &iter, new_spte);
		}

		need_flush = 1;
	}

	if (need_flush)
		kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);

	rcu_read_unlock();

	return 0;
}

int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
			     pte_t *host_ptep)
{
	return kvm_tdp_mmu_handle_hva_range(kvm, address, address + 1,
					    (unsigned long)host_ptep,
					    set_tdp_spte);
}

/*
 * Remove write access from all the SPTEs mapping GFNs [start, end). Only
 * leaf SPTEs at or above min_level are write-protected.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			     gfn_t start, gfn_t end, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
				   min_level, start, end) {
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level) ||
		    !(iter.old_spte & PT_WRITABLE_MASK))
			continue;

		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;

		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}

/*
 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
 * only affect leaf SPTEs down to min_level.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
			     int min_level)
{
	struct kvm_mmu_page *root;
	int root_as_id;
	bool spte_set = false;

	for_each_tdp_mmu_root_yield_safe(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
			     slot->base_gfn + slot->npages, min_level);
	}

	return spte_set;
}
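
/*
 * Example (illustrative) of the min_level semantics above: passing
 * PG_LEVEL_4K write-protects every leaf SPTE in the slot, while passing a
 * larger level leaves 4k mappings writable and only strips write access
 * from leaf SPTEs at or above that level.
 *
 *	kvm_tdp_mmu_wrprot_slot(kvm, slot, PG_LEVEL_4K);
 */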

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t start, gfn_t end)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, start, end) {
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
			continue;

		if (spte_ad_need_write_protect(iter.old_spte)) {
			if (is_writable_pte(iter.old_spte))
				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
			else
				continue;
		} else {
			if (iter.old_spte & shadow_dirty_mask)
				new_spte = iter.old_spte & ~shadow_dirty_mask;
			else
				continue;
		}

		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;
	int root_as_id;
	bool spte_set = false;

	for_each_tdp_mmu_root_yield_safe(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
				slot->base_gfn + slot->npages);
	}

	return spte_set;
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1160a6a0b05dSBen Gardon */ 1161a6a0b05dSBen Gardon static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, 1162a6a0b05dSBen Gardon gfn_t gfn, unsigned long mask, bool wrprot) 1163a6a0b05dSBen Gardon { 1164a6a0b05dSBen Gardon struct tdp_iter iter; 1165a6a0b05dSBen Gardon u64 new_spte; 1166a6a0b05dSBen Gardon 11677cca2d0bSBen Gardon rcu_read_lock(); 11687cca2d0bSBen Gardon 1169a6a0b05dSBen Gardon tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask), 1170a6a0b05dSBen Gardon gfn + BITS_PER_LONG) { 1171a6a0b05dSBen Gardon if (!mask) 1172a6a0b05dSBen Gardon break; 1173a6a0b05dSBen Gardon 1174a6a0b05dSBen Gardon if (iter.level > PG_LEVEL_4K || 1175a6a0b05dSBen Gardon !(mask & (1UL << (iter.gfn - gfn)))) 1176a6a0b05dSBen Gardon continue; 1177a6a0b05dSBen Gardon 1178f1b3b06aSBen Gardon mask &= ~(1UL << (iter.gfn - gfn)); 1179f1b3b06aSBen Gardon 1180a6a0b05dSBen Gardon if (wrprot || spte_ad_need_write_protect(iter.old_spte)) { 1181a6a0b05dSBen Gardon if (is_writable_pte(iter.old_spte)) 1182a6a0b05dSBen Gardon new_spte = iter.old_spte & ~PT_WRITABLE_MASK; 1183a6a0b05dSBen Gardon else 1184a6a0b05dSBen Gardon continue; 1185a6a0b05dSBen Gardon } else { 1186a6a0b05dSBen Gardon if (iter.old_spte & shadow_dirty_mask) 1187a6a0b05dSBen Gardon new_spte = iter.old_spte & ~shadow_dirty_mask; 1188a6a0b05dSBen Gardon else 1189a6a0b05dSBen Gardon continue; 1190a6a0b05dSBen Gardon } 1191a6a0b05dSBen Gardon 1192a6a0b05dSBen Gardon tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); 1193a6a0b05dSBen Gardon } 11947cca2d0bSBen Gardon 11957cca2d0bSBen Gardon rcu_read_unlock(); 1196a6a0b05dSBen Gardon } 1197a6a0b05dSBen Gardon 1198a6a0b05dSBen Gardon /* 1199a6a0b05dSBen Gardon * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is 1200a6a0b05dSBen Gardon * set in mask, starting at gfn. The given memslot is expected to contain all 1201a6a0b05dSBen Gardon * the GFNs represented by set bits in the mask. If AD bits are enabled, 1202a6a0b05dSBen Gardon * clearing the dirty status will involve clearing the dirty bit on each SPTE 1203a6a0b05dSBen Gardon * or, if AD bits are not enabled, clearing the writable bit on each SPTE. 1204a6a0b05dSBen Gardon */ 1205a6a0b05dSBen Gardon void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, 1206a6a0b05dSBen Gardon struct kvm_memory_slot *slot, 1207a6a0b05dSBen Gardon gfn_t gfn, unsigned long mask, 1208a6a0b05dSBen Gardon bool wrprot) 1209a6a0b05dSBen Gardon { 1210a6a0b05dSBen Gardon struct kvm_mmu_page *root; 1211a6a0b05dSBen Gardon int root_as_id; 1212a6a0b05dSBen Gardon 1213531810caSBen Gardon lockdep_assert_held_write(&kvm->mmu_lock); 1214a6a0b05dSBen Gardon for_each_tdp_mmu_root(kvm, root) { 1215a6a0b05dSBen Gardon root_as_id = kvm_mmu_page_as_id(root); 1216a6a0b05dSBen Gardon if (root_as_id != slot->as_id) 1217a6a0b05dSBen Gardon continue; 1218a6a0b05dSBen Gardon 1219a6a0b05dSBen Gardon clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot); 1220a6a0b05dSBen Gardon } 1221a6a0b05dSBen Gardon } 1222a6a0b05dSBen Gardon 1223a6a0b05dSBen Gardon /* 1224a6a0b05dSBen Gardon * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is 1225a6a0b05dSBen Gardon * only used for PML, and so will involve setting the dirty bit on each SPTE. 1226a6a0b05dSBen Gardon * Returns true if an SPTE has been changed and the TLBs need to be flushed. 
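 *
 * Illustrative caller sketch (an assumption about the caller, not text from
 * this file): the exported wrapper below is expected to run with mmu_lock
 * held for write and to have the caller flush TLBs when a change is
 * reported, along the lines of
 *
 *	write_lock(&kvm->mmu_lock);
 *	flush = kvm_tdp_mmu_slot_set_dirty(kvm, slot);
 *	write_unlock(&kvm->mmu_lock);
 *	if (flush)
 *		kvm_flush_remote_tlbs(kvm);
 *
 * The lock requirement matches the lockdep_assert_held_write() checks used
 * elsewhere in this file; the flush follows from the comment above.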
1227a6a0b05dSBen Gardon */ 1228a6a0b05dSBen Gardon static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, 1229a6a0b05dSBen Gardon gfn_t start, gfn_t end) 1230a6a0b05dSBen Gardon { 1231a6a0b05dSBen Gardon struct tdp_iter iter; 1232a6a0b05dSBen Gardon u64 new_spte; 1233a6a0b05dSBen Gardon bool spte_set = false; 1234a6a0b05dSBen Gardon 12357cca2d0bSBen Gardon rcu_read_lock(); 12367cca2d0bSBen Gardon 1237a6a0b05dSBen Gardon tdp_root_for_each_pte(iter, root, start, end) { 12381af4a960SBen Gardon if (tdp_mmu_iter_cond_resched(kvm, &iter, false)) 12391af4a960SBen Gardon continue; 12401af4a960SBen Gardon 12410f99ee2cSBen Gardon if (!is_shadow_present_pte(iter.old_spte) || 12420f99ee2cSBen Gardon iter.old_spte & shadow_dirty_mask) 1243a6a0b05dSBen Gardon continue; 1244a6a0b05dSBen Gardon 1245a6a0b05dSBen Gardon new_spte = iter.old_spte | shadow_dirty_mask; 1246a6a0b05dSBen Gardon 1247a6a0b05dSBen Gardon tdp_mmu_set_spte(kvm, &iter, new_spte); 1248a6a0b05dSBen Gardon spte_set = true; 1249a6a0b05dSBen Gardon } 1250a6a0b05dSBen Gardon 12517cca2d0bSBen Gardon rcu_read_unlock(); 1252a6a0b05dSBen Gardon return spte_set; 1253a6a0b05dSBen Gardon } 1254a6a0b05dSBen Gardon 1255a6a0b05dSBen Gardon /* 1256a6a0b05dSBen Gardon * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is 1257a6a0b05dSBen Gardon * only used for PML, and so will involve setting the dirty bit on each SPTE. 1258a6a0b05dSBen Gardon * Returns true if an SPTE has been changed and the TLBs need to be flushed. 1259a6a0b05dSBen Gardon */ 1260a6a0b05dSBen Gardon bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot) 1261a6a0b05dSBen Gardon { 1262a6a0b05dSBen Gardon struct kvm_mmu_page *root; 1263a6a0b05dSBen Gardon int root_as_id; 1264a6a0b05dSBen Gardon bool spte_set = false; 1265a6a0b05dSBen Gardon 1266a889ea54SBen Gardon for_each_tdp_mmu_root_yield_safe(kvm, root) { 1267a6a0b05dSBen Gardon root_as_id = kvm_mmu_page_as_id(root); 1268a6a0b05dSBen Gardon if (root_as_id != slot->as_id) 1269a6a0b05dSBen Gardon continue; 1270a6a0b05dSBen Gardon 1271a6a0b05dSBen Gardon spte_set |= set_dirty_gfn_range(kvm, root, slot->base_gfn, 1272a6a0b05dSBen Gardon slot->base_gfn + slot->npages); 1273a6a0b05dSBen Gardon } 1274a6a0b05dSBen Gardon return spte_set; 1275a6a0b05dSBen Gardon } 1276a6a0b05dSBen Gardon 127714881998SBen Gardon /* 127887aa9ec9SBen Gardon * Clear leaf entries which could be replaced by large mappings, for 127987aa9ec9SBen Gardon * GFNs within the slot. 
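 *
 * Sketch of the per-SPTE test applied below (illustrative, mirroring the
 * code): a leaf SPTE is zapped only when its pfn is not reserved and the
 * backing page is part of a transparent huge page,
 *
 *	pfn = spte_to_pfn(old_spte);
 *	if (!kvm_is_reserved_pfn(pfn) &&
 *	    PageTransCompoundMap(pfn_to_page(pfn)))
 *		zap the SPTE (set it to 0)
 *
 * so that the next fault on the range can map it again at a larger page
 * size.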
128014881998SBen Gardon  */
128114881998SBen Gardon static void zap_collapsible_spte_range(struct kvm *kvm,
128214881998SBen Gardon 				       struct kvm_mmu_page *root,
128314881998SBen Gardon 				       gfn_t start, gfn_t end)
128414881998SBen Gardon {
128514881998SBen Gardon 	struct tdp_iter iter;
128614881998SBen Gardon 	kvm_pfn_t pfn;
128714881998SBen Gardon 	bool spte_set = false;
128814881998SBen Gardon 
12897cca2d0bSBen Gardon 	rcu_read_lock();
12907cca2d0bSBen Gardon 
129114881998SBen Gardon 	tdp_root_for_each_pte(iter, root, start, end) {
12921af4a960SBen Gardon 		if (tdp_mmu_iter_cond_resched(kvm, &iter, spte_set)) {
12931af4a960SBen Gardon 			spte_set = false;
12941af4a960SBen Gardon 			continue;
12951af4a960SBen Gardon 		}
12961af4a960SBen Gardon 
129714881998SBen Gardon 		if (!is_shadow_present_pte(iter.old_spte) ||
129887aa9ec9SBen Gardon 		    !is_last_spte(iter.old_spte, iter.level))
129914881998SBen Gardon 			continue;
130014881998SBen Gardon 
130114881998SBen Gardon 		pfn = spte_to_pfn(iter.old_spte);
130214881998SBen Gardon 		if (kvm_is_reserved_pfn(pfn) ||
130314881998SBen Gardon 		    !PageTransCompoundMap(pfn_to_page(pfn)))
130414881998SBen Gardon 			continue;
130514881998SBen Gardon 
130614881998SBen Gardon 		tdp_mmu_set_spte(kvm, &iter, 0);
130714881998SBen Gardon 
13081af4a960SBen Gardon 		spte_set = true;
130914881998SBen Gardon 	}
131014881998SBen Gardon 
13117cca2d0bSBen Gardon 	rcu_read_unlock();
131214881998SBen Gardon 	if (spte_set)
131314881998SBen Gardon 		kvm_flush_remote_tlbs(kvm);
131414881998SBen Gardon }
131514881998SBen Gardon 
131614881998SBen Gardon /*
131714881998SBen Gardon  * Clear leaf entries which could be replaced by large mappings, for
131814881998SBen Gardon  * GFNs within the slot.
131914881998SBen Gardon  */
132014881998SBen Gardon void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
132114881998SBen Gardon 				       const struct kvm_memory_slot *slot)
132214881998SBen Gardon {
132314881998SBen Gardon 	struct kvm_mmu_page *root;
132414881998SBen Gardon 	int root_as_id;
132514881998SBen Gardon 
1326a889ea54SBen Gardon 	for_each_tdp_mmu_root_yield_safe(kvm, root) {
132714881998SBen Gardon 		root_as_id = kvm_mmu_page_as_id(root);
132814881998SBen Gardon 		if (root_as_id != slot->as_id)
132914881998SBen Gardon 			continue;
133014881998SBen Gardon 
133114881998SBen Gardon 		zap_collapsible_spte_range(kvm, root, slot->base_gfn,
133214881998SBen Gardon 					   slot->base_gfn + slot->npages);
133314881998SBen Gardon 	}
133414881998SBen Gardon }
133546044f72SBen Gardon 
133646044f72SBen Gardon /*
133746044f72SBen Gardon  * Removes write access on the last level SPTE mapping this GFN and unsets the
133846044f72SBen Gardon  * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted.
133946044f72SBen Gardon  * Returns true if an SPTE was set and a TLB flush is needed.
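 *
 * Sketch of the per-SPTE update performed below (illustrative):
 *
 *	new_spte = old_spte & ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
 *
 * Clearing the MMU-writable bit as well means other paths will not consider
 * the SPTE eligible to be made writable again, so future writes keep
 * faulting, as the comment above requires.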
134046044f72SBen Gardon */ 134146044f72SBen Gardon static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, 134246044f72SBen Gardon gfn_t gfn) 134346044f72SBen Gardon { 134446044f72SBen Gardon struct tdp_iter iter; 134546044f72SBen Gardon u64 new_spte; 134646044f72SBen Gardon bool spte_set = false; 134746044f72SBen Gardon 13487cca2d0bSBen Gardon rcu_read_lock(); 13497cca2d0bSBen Gardon 135046044f72SBen Gardon tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) { 135146044f72SBen Gardon if (!is_writable_pte(iter.old_spte)) 135246044f72SBen Gardon break; 135346044f72SBen Gardon 135446044f72SBen Gardon new_spte = iter.old_spte & 135546044f72SBen Gardon ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE); 135646044f72SBen Gardon 135746044f72SBen Gardon tdp_mmu_set_spte(kvm, &iter, new_spte); 135846044f72SBen Gardon spte_set = true; 135946044f72SBen Gardon } 136046044f72SBen Gardon 13617cca2d0bSBen Gardon rcu_read_unlock(); 13627cca2d0bSBen Gardon 136346044f72SBen Gardon return spte_set; 136446044f72SBen Gardon } 136546044f72SBen Gardon 136646044f72SBen Gardon /* 136746044f72SBen Gardon * Removes write access on the last level SPTE mapping this GFN and unsets the 136846044f72SBen Gardon * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted. 136946044f72SBen Gardon * Returns true if an SPTE was set and a TLB flush is needed. 137046044f72SBen Gardon */ 137146044f72SBen Gardon bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, 137246044f72SBen Gardon struct kvm_memory_slot *slot, gfn_t gfn) 137346044f72SBen Gardon { 137446044f72SBen Gardon struct kvm_mmu_page *root; 137546044f72SBen Gardon int root_as_id; 137646044f72SBen Gardon bool spte_set = false; 137746044f72SBen Gardon 1378531810caSBen Gardon lockdep_assert_held_write(&kvm->mmu_lock); 137946044f72SBen Gardon for_each_tdp_mmu_root(kvm, root) { 138046044f72SBen Gardon root_as_id = kvm_mmu_page_as_id(root); 138146044f72SBen Gardon if (root_as_id != slot->as_id) 138246044f72SBen Gardon continue; 138346044f72SBen Gardon 138446044f72SBen Gardon spte_set |= write_protect_gfn(kvm, root, gfn); 138546044f72SBen Gardon } 138646044f72SBen Gardon return spte_set; 138746044f72SBen Gardon } 138846044f72SBen Gardon 138995fb5b02SBen Gardon /* 139095fb5b02SBen Gardon * Return the level of the lowest level SPTE added to sptes. 139195fb5b02SBen Gardon * That SPTE may be non-present. 139295fb5b02SBen Gardon */ 139339b4d43eSSean Christopherson int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, 139439b4d43eSSean Christopherson int *root_level) 139595fb5b02SBen Gardon { 139695fb5b02SBen Gardon struct tdp_iter iter; 139795fb5b02SBen Gardon struct kvm_mmu *mmu = vcpu->arch.mmu; 139895fb5b02SBen Gardon gfn_t gfn = addr >> PAGE_SHIFT; 13992aa07893SSean Christopherson int leaf = -1; 140095fb5b02SBen Gardon 140139b4d43eSSean Christopherson *root_level = vcpu->arch.mmu->shadow_root_level; 140295fb5b02SBen Gardon 14037cca2d0bSBen Gardon rcu_read_lock(); 14047cca2d0bSBen Gardon 140595fb5b02SBen Gardon tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { 140695fb5b02SBen Gardon leaf = iter.level; 1407dde81f94SSean Christopherson sptes[leaf] = iter.old_spte; 140895fb5b02SBen Gardon } 140995fb5b02SBen Gardon 14107cca2d0bSBen Gardon rcu_read_unlock(); 14117cca2d0bSBen Gardon 141295fb5b02SBen Gardon return leaf; 141395fb5b02SBen Gardon } 1414
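/*
 * Illustrative caller sketch (not part of the original file): one way a
 * diagnostic path could consume kvm_tdp_mmu_get_walk().  The function and
 * variable names below are assumptions made purely for illustration; the
 * sketch relies on sptes[] being indexed by level, as filled in above.
 */
#if 0
static void example_dump_tdp_walk(struct kvm_vcpu *vcpu, u64 addr)
{
	u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
	int root_level, leaf, level;

	leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root_level);
	if (leaf < 0)
		return;	/* no SPTEs were walked for this address */

	for (level = root_level; level >= leaf; level--)
		pr_info("level %d: spte = 0x%llx\n", level, sptes[level]);
}
#endif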