// SPDX-License-Identifier: GPL-2.0

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#include <trace/events/kvm.h>

#ifdef CONFIG_X86_64
static bool __read_mostly tdp_mmu_enabled = false;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
#endif

static bool is_tdp_mmu_enabled(void)
{
#ifdef CONFIG_X86_64
	return tdp_enabled && READ_ONCE(tdp_mmu_enabled);
#else
	return false;
#endif /* CONFIG_X86_64 */
}

/* Initializes the TDP MMU for the VM, if enabled. */
void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
	if (!is_tdp_mmu_enabled())
		return;

	/* This should not be changed for the lifetime of the VM. */
	kvm->arch.tdp_mmu_enabled = true;

	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
}

void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
	if (!kvm->arch.tdp_mmu_enabled)
		return;

	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

	/*
	 * Ensure that all the outstanding RCU callbacks to free shadow pages
	 * can run before the VM is torn down.
	 */
	rcu_barrier();
}

static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
	if (kvm_mmu_put_root(kvm, root))
		kvm_tdp_mmu_free_root(kvm, root);
}

static inline bool tdp_mmu_next_root_valid(struct kvm *kvm,
					   struct kvm_mmu_page *root)
{
	lockdep_assert_held(&kvm->mmu_lock);

	if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link))
		return false;

	kvm_mmu_get_root(kvm, root);
	return true;
}

static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
						     struct kvm_mmu_page *root)
{
	struct kvm_mmu_page *next_root;

	next_root = list_next_entry(root, link);
	tdp_mmu_put_root(kvm, root);
	return next_root;
}

/*
 * Note: this iterator gets and puts references to the roots it iterates over.
 * This makes it safe to release the MMU lock and yield within the loop, but
 * if exiting the loop early, the caller must drop the reference to the most
 * recent root. (Unless keeping a live reference is desirable.)
 */
#define for_each_tdp_mmu_root_yield_safe(_kvm, _root)			\
	for (_root = list_first_entry(&_kvm->arch.tdp_mmu_roots,	\
				      typeof(*_root), link);		\
	     tdp_mmu_next_root_valid(_kvm, _root);			\
	     _root = tdp_mmu_next_root(_kvm, _root))

#define for_each_tdp_mmu_root(_kvm, _root)				\
	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
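/*
 * Example (illustrative): the yield-safe variant is what the zap and dirty
 * logging paths below use, since the loop body may drop the MMU lock and
 * reschedule mid-walk while still holding a reference to the current root:
 *
 *	struct kvm_mmu_page *root;
 *	bool flush = false;
 *
 *	for_each_tdp_mmu_root_yield_safe(kvm, root)
 *		flush |= zap_gfn_range(kvm, root, start, end, true);
 */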
bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa)
{
	struct kvm_mmu_page *sp;

	if (!kvm->arch.tdp_mmu_enabled)
		return false;
	if (WARN_ON(!VALID_PAGE(hpa)))
		return false;

	sp = to_shadow_page(hpa);
	if (WARN_ON(!sp))
		return false;

	return sp->tdp_mmu_page && sp->root_count;
}

static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end, bool can_yield);

void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);

	lockdep_assert_held(&kvm->mmu_lock);

	WARN_ON(root->root_count);
	WARN_ON(!root->tdp_mmu_page);

	list_del(&root->link);

	zap_gfn_range(kvm, root, 0, max_gfn, false);

	free_page((unsigned long)root->spt);
	kmem_cache_free(mmu_page_header_cache, root);
}

static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
						   int level)
{
	union kvm_mmu_page_role role;

	role = vcpu->arch.mmu->mmu_role.base;
	role.level = level;
	role.direct = true;
	role.gpte_is_8_bytes = true;
	role.access = ACC_ALL;

	return role;
}

static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
					       int level)
{
	struct kvm_mmu_page *sp;

	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

	sp->role.word = page_role_for_level(vcpu, level).word;
	sp->gfn = gfn;
	sp->tdp_mmu_page = true;

	trace_kvm_mmu_get_page(sp, true);

	return sp;
}

static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
{
	union kvm_mmu_page_role role;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_page *root;

	role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);

	spin_lock(&kvm->mmu_lock);

	/* Check for an existing root before allocating a new one. */
	for_each_tdp_mmu_root(kvm, root) {
		if (root->role.word == role.word) {
			kvm_mmu_get_root(kvm, root);
			spin_unlock(&kvm->mmu_lock);
			return root;
		}
	}

	root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
	root->root_count = 1;

	list_add(&root->link, &kvm->arch.tdp_mmu_roots);

	spin_unlock(&kvm->mmu_lock);

	return root;
}

hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_page *root;

	root = get_tdp_mmu_vcpu_root(vcpu);
	if (!root)
		return INVALID_PAGE;

	return __pa(root->spt);
}

static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
	free_page((unsigned long)sp->spt);
	kmem_cache_free(mmu_page_header_cache, sp);
}

/*
 * This is called through call_rcu in order to free TDP page table memory
 * safely with respect to other kernel threads that may be operating on
 * the memory.
 * By only accessing TDP MMU page table memory in an RCU read critical
 * section, and freeing it after a grace period, lockless access to that
 * memory won't use it after it is freed.
 */
static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
{
	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
					       rcu_head);

	tdp_mmu_free_sp(sp);
}
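/*
 * Sketch (illustrative) of the contract the comment above describes. A
 * lockless walker only touches page table memory inside an RCU read-side
 * critical section:
 *
 *	rcu_read_lock();
 *	child_pt = spte_to_child_pt(spte, level);
 *	... dereference child_pt ...
 *	rcu_read_unlock();
 *
 * while a thread disconnecting a page table zeroes the parent SPTE and
 * defers the free, as handle_removed_tdp_mmu_page() below does:
 *
 *	WRITE_ONCE(*pt, 0);
 *	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
 *
 * so no walker can ever dereference freed page table memory.
 */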
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level);

static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
{
	return sp->role.smm ? 1 : 0;
}

static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
{
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
		return;

	if (is_accessed_spte(old_spte) &&
	    (!is_accessed_spte(new_spte) || pfn_changed))
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}

static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
					  u64 old_spte, u64 new_spte, int level)
{
	bool pfn_changed;
	struct kvm_memory_slot *slot;

	if (level > PG_LEVEL_4K)
		return;

	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	if ((!is_writable_pte(old_spte) || pfn_changed) &&
	    is_writable_pte(new_spte)) {
		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
		mark_page_dirty_in_slot(kvm, slot, gfn);
	}
}

/**
 * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
 *
 * @kvm: kvm instance
 * @pt: the page removed from the paging structure
 *
 * Given a page table that has been removed from the TDP paging structure,
 * iterates through the page table to clear SPTEs and free child page tables.
 */
static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt)
{
	struct kvm_mmu_page *sp = sptep_to_sp(pt);
	int level = sp->role.level;
	gfn_t gfn = sp->gfn;
	u64 old_child_spte;
	int i;

	trace_kvm_mmu_prepare_zap_page(sp);

	list_del(&sp->link);

	if (sp->lpage_disallowed)
		unaccount_huge_nx_page(kvm, sp);

	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
		old_child_spte = READ_ONCE(*(pt + i));
		WRITE_ONCE(*(pt + i), 0);
		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp),
			gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)),
			old_child_spte, 0, level - 1);
	}

	kvm_flush_remote_tlbs_with_address(kvm, gfn,
					   KVM_PAGES_PER_HPAGE(level));

	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}
/**
 * handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 *
 * Handle bookkeeping that might result from the modification of an SPTE.
 * This function must be called for all TDP SPTE modifications.
 */
static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				  u64 old_spte, u64 new_spte, int level)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);
	bool was_leaf = was_present && is_last_spte(old_spte, level);
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
	WARN_ON(level < PG_LEVEL_4K);
	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));

	/*
	 * If this warning were to trigger it would indicate that there was a
	 * missing MMU notifier or a race with some notifier handler.
	 * A present, leaf SPTE should never be directly replaced with another
	 * present leaf SPTE pointing to a different PFN. A notifier handler
	 * should be zapping the SPTE before the main MM's page table is
	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
	 * thread before replacement.
	 */
	if (was_leaf && is_leaf && pfn_changed) {
		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
		       "SPTE with another present leaf SPTE mapping a\n"
		       "different PFN!\n"
		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
		       as_id, gfn, old_spte, new_spte, level);

		/*
		 * Crash the host to prevent error propagation and guest data
		 * corruption.
		 */
		BUG();
	}

	if (old_spte == new_spte)
		return;

	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);

	/*
	 * The only times an SPTE should be changed from a non-present to
	 * non-present state is when an MMIO entry is installed/modified/
	 * removed. In that case, there is nothing to do here.
	 */
	if (!was_present && !is_present) {
		/*
		 * If this change does not involve an MMIO SPTE, it is
		 * unexpected. Log the change, though it should not impact the
		 * guest since both the former and current SPTEs are nonpresent.
		 */
		if (WARN_ON(!is_mmio_spte(old_spte) && !is_mmio_spte(new_spte)))
			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
			       "should not be replaced with another,\n"
			       "different nonpresent SPTE, unless one or both\n"
			       "are MMIO SPTEs.\n"
			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
			       as_id, gfn, old_spte, new_spte, level);
		return;
	}

	if (was_leaf && is_dirty_spte(old_spte) &&
	    (!is_dirty_spte(new_spte) || pfn_changed))
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));

	/*
	 * Recursively handle child PTs if the change removed a subtree from
	 * the paging structure.
	 */
	if (was_present && !was_leaf && (pfn_changed || !is_present))
		handle_removed_tdp_mmu_page(kvm,
				spte_to_child_pt(old_spte, level));
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level)
{
	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level);
	handle_changed_spte_acc_track(old_spte, new_spte, level);
	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
				      new_spte, level);
}

/*
 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * @record_acc_track: Notify the MM subsystem of changes to the accessed state
 *		      of the page. Should be set unless handling an MMU
 *		      notifier for access tracking. Leaving record_acc_track
 *		      unset in that case prevents page accesses from being
 *		      double counted.
 * @record_dirty_log: Record the page as dirty in the dirty bitmap if
 *		      appropriate for the change being made. Should be set
 *		      unless performing certain dirty logging operations.
 *		      Leaving record_dirty_log unset in that case prevents page
 *		      writes from being double counted.
 */
static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				      u64 new_spte, bool record_acc_track,
				      bool record_dirty_log)
{
	tdp_ptep_t root_pt = tdp_iter_root_pt(iter);
	struct kvm_mmu_page *root = sptep_to_sp(root_pt);
	int as_id = kvm_mmu_page_as_id(root);

	lockdep_assert_held(&kvm->mmu_lock);

	WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);

	__handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
			      iter->level);
	if (record_acc_track)
		handle_changed_spte_acc_track(iter->old_spte, new_spte,
					      iter->level);
	if (record_dirty_log)
		handle_changed_spte_dirty_log(kvm, as_id, iter->gfn,
					      iter->old_spte, new_spte,
					      iter->level);
}

static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				    u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
}

static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
}

static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
}

#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
	for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)

#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
		if (!is_shadow_present_pte(_iter.old_spte) ||		\
		    !is_last_spte(_iter.old_spte, _iter.level))		\
			continue;					\
		else

#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
	for_each_tdp_pte(_iter, __va(_mmu->root_hpa),		\
			 _mmu->shadow_root_level, _start, _end)
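/*
 * Example (illustrative): the walk helpers below all follow the same shape,
 * differing mainly in which SPTEs they skip and what new value they write.
 * For instance, clearing every mapped leaf SPTE in a GFN range looks like:
 *
 *	struct tdp_iter iter;
 *
 *	tdp_root_for_each_leaf_pte(iter, root, start, end)
 *		tdp_mmu_set_spte(kvm, &iter, 0);
 */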
/*
 * Yield if the MMU lock is contended or this thread needs to return control
 * to the scheduler.
 *
 * If this function should yield and flush is set, it will perform a remote
 * TLB flush before yielding.
 *
 * If this function yields, it will also reset the tdp_iter's walk over the
 * paging structure and the calling function should skip to the next
 * iteration to allow the iterator to continue its traversal from the
 * paging structure root.
 *
 * Return true if this function yielded and the iterator's traversal was reset.
 * Return false if a yield was not needed.
 */
static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
					     struct tdp_iter *iter, bool flush)
{
	/* Ensure forward progress has been made before yielding. */
	if (iter->next_last_level_gfn == iter->yielded_gfn)
		return false;

	if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
		rcu_read_unlock();

		if (flush)
			kvm_flush_remote_tlbs(kvm);

		cond_resched_lock(&kvm->mmu_lock);
		rcu_read_lock();

		WARN_ON(iter->gfn > iter->next_last_level_gfn);

		tdp_iter_start(iter, iter->pt_path[iter->root_level - 1],
			       iter->root_level, iter->min_level,
			       iter->next_last_level_gfn);

		return true;
	}

	return false;
}

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock. If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
 * operation can cause a soft lockup.
 */
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end, bool can_yield)
{
	struct tdp_iter iter;
	bool flush_needed = false;

	rcu_read_lock();

	tdp_root_for_each_pte(iter, root, start, end) {
		if (can_yield &&
		    tdp_mmu_iter_cond_resched(kvm, &iter, flush_needed)) {
			flush_needed = false;
			continue;
		}

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		/*
		 * If this is a non-last-level SPTE that covers a larger range
		 * than should be zapped, continue, and zap the mappings at a
		 * lower level.
		 */
		if ((iter.gfn < start ||
		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		tdp_mmu_set_spte(kvm, &iter, 0);
		flush_needed = true;
	}

	rcu_read_unlock();
	return flush_needed;
}

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 */
bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end)
{
	struct kvm_mmu_page *root;
	bool flush = false;

	for_each_tdp_mmu_root_yield_safe(kvm, root)
		flush |= zap_gfn_range(kvm, root, start, end, true);

	return flush;
}

void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
	bool flush;

	flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn);
	if (flush)
		kvm_flush_remote_tlbs(kvm);
}
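/*
 * Note on the max_gfn computation above (illustrative arithmetic): with
 * shadow_phys_bits == 52 and PAGE_SHIFT == 12, max_gfn is 1ULL << 40, so
 * zapping [0, max_gfn) covers every GFN the 52-bit physical address space
 * can contain.
 */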
/*
 * Installs a last-level SPTE to handle a TDP page fault.
 * (NPT/EPT violation/misconfiguration)
 */
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
					   int map_writable,
					   struct tdp_iter *iter,
					   kvm_pfn_t pfn, bool prefault)
{
	u64 new_spte;
	int ret = 0;
	int make_spte_ret = 0;

	if (unlikely(is_noslot_pfn(pfn))) {
		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
				     new_spte);
	} else {
		make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
					  pfn, iter->old_spte, prefault, true,
					  map_writable, !shadow_accessed_mask,
					  &new_spte);
		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
				       rcu_dereference(iter->sptep));
	}

	if (new_spte == iter->old_spte)
		ret = RET_PF_SPURIOUS;
	else
		tdp_mmu_set_spte(vcpu->kvm, iter, new_spte);

	/*
	 * If the page fault was caused by a write but the page is write
	 * protected, emulation is needed. If the emulation was skipped,
	 * the vCPU would have the same fault again.
	 */
	if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
		if (write)
			ret = RET_PF_EMULATE;
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
	}

	/* If an MMIO SPTE is installed, the MMIO will need to be emulated. */
	if (unlikely(is_mmio_spte(new_spte)))
		ret = RET_PF_EMULATE;

	trace_kvm_mmu_set_spte(iter->level, iter->gfn,
			       rcu_dereference(iter->sptep));
	if (!prefault)
		vcpu->stat.pf_fixed++;

	return ret;
}

/*
 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
 * page tables and SPTEs to translate the faulting guest physical address.
 */
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
		    int map_writable, int max_level, kvm_pfn_t pfn,
		    bool prefault)
{
	bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
	bool write = error_code & PFERR_WRITE_MASK;
	bool exec = error_code & PFERR_FETCH_MASK;
	bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	struct tdp_iter iter;
	struct kvm_mmu_page *sp;
	u64 *child_pt;
	u64 new_spte;
	int ret;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int level;
	int req_level;

	if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
		return RET_PF_RETRY;
	if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
		return RET_PF_RETRY;

	level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
					huge_page_disallowed, &req_level);

	trace_kvm_mmu_spte_requested(gpa, level, pfn);

	rcu_read_lock();

	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		if (nx_huge_page_workaround_enabled)
			disallowed_hugepage_adjust(iter.old_spte, gfn,
						   iter.level, &pfn, &level);

		if (iter.level == level)
			break;

		/*
		 * If there is an SPTE mapping a large page at a higher level
		 * than the target, that SPTE must be cleared and replaced
		 * with a non-leaf SPTE.
		 */
		if (is_shadow_present_pte(iter.old_spte) &&
		    is_large_pte(iter.old_spte)) {
			tdp_mmu_set_spte(vcpu->kvm, &iter, 0);

			kvm_flush_remote_tlbs_with_address(vcpu->kvm, iter.gfn,
					KVM_PAGES_PER_HPAGE(iter.level));

			/*
			 * The iter must explicitly re-read the spte here
			 * because the new value informs the !present
			 * path below.
			 */
			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
		}

		if (!is_shadow_present_pte(iter.old_spte)) {
			sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
			list_add(&sp->link, &vcpu->kvm->arch.tdp_mmu_pages);
			child_pt = sp->spt;
			new_spte = make_nonleaf_spte(child_pt,
						     !shadow_accessed_mask);

			trace_kvm_mmu_get_page(sp, true);
			if (huge_page_disallowed && req_level >= iter.level)
				account_huge_nx_page(vcpu->kvm, sp);

			tdp_mmu_set_spte(vcpu->kvm, &iter, new_spte);
		}
	}

	if (WARN_ON(iter.level != level)) {
		rcu_read_unlock();
		return RET_PF_RETRY;
	}

	ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
					      pfn, prefault);
	rcu_read_unlock();

	return ret;
}

static int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, unsigned long start,
		unsigned long end, unsigned long data,
		int (*handler)(struct kvm *kvm, struct kvm_memory_slot *slot,
			       struct kvm_mmu_page *root, gfn_t start,
			       gfn_t end, unsigned long data))
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	struct kvm_mmu_page *root;
	int ret = 0;
	int as_id;

	for_each_tdp_mmu_root_yield_safe(kvm, root) {
		as_id = kvm_mmu_page_as_id(root);
		slots = __kvm_memslots(kvm, as_id);
		kvm_for_each_memslot(memslot, slots) {
			unsigned long hva_start, hva_end;
			gfn_t gfn_start, gfn_end;

			hva_start = max(start, memslot->userspace_addr);
			hva_end = min(end, memslot->userspace_addr +
				      (memslot->npages << PAGE_SHIFT));
			if (hva_start >= hva_end)
				continue;
			/*
			 * {gfn(page) | page intersects with [hva_start, hva_end)} =
			 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
			 */
			gfn_start = hva_to_gfn_memslot(hva_start, memslot);
			gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);

			ret |= handler(kvm, memslot, root, gfn_start,
				       gfn_end, data);
		}
	}

	return ret;
}

static int zap_gfn_range_hva_wrapper(struct kvm *kvm,
				     struct kvm_memory_slot *slot,
				     struct kvm_mmu_page *root, gfn_t start,
				     gfn_t end, unsigned long unused)
{
	return zap_gfn_range(kvm, root, start, end, false);
}

int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
			      unsigned long end)
{
	return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
					    zap_gfn_range_hva_wrapper);
}
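/*
 * Example (illustrative): kvm_tdp_mmu_handle_hva_range() converts an HVA
 * range into per-root, per-memslot GFN ranges and passes them to a callback
 * with this shape, as zap_gfn_range_hva_wrapper() above and age_gfn_range()
 * below demonstrate:
 *
 *	static int my_handler(struct kvm *kvm, struct kvm_memory_slot *slot,
 *			      struct kvm_mmu_page *root, gfn_t start,
 *			      gfn_t end, unsigned long data)
 *	{
 *		... walk [start, end) on this root ...
 *	}
 *
 * (my_handler is a hypothetical name, used only for illustration.)
 */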
/*
 * Mark the SPTEs in the range of GFNs [start, end) unaccessed and return
 * non-zero if any of the GFNs in the range have been accessed.
 */
static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
			 struct kvm_mmu_page *root, gfn_t start, gfn_t end,
			 unsigned long unused)
{
	struct tdp_iter iter;
	int young = 0;
	u64 new_spte = 0;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, start, end) {
		/*
		 * If we have a non-accessed entry we don't need to change the
		 * pte.
		 */
		if (!is_accessed_spte(iter.old_spte))
			continue;

		new_spte = iter.old_spte;

		if (spte_ad_enabled(new_spte)) {
			clear_bit((ffs(shadow_accessed_mask) - 1),
				  (unsigned long *)&new_spte);
		} else {
			/*
			 * Capture the dirty status of the page, so that it
			 * doesn't get lost when the SPTE is marked for access
			 * tracking.
			 */
			if (is_writable_pte(new_spte))
				kvm_set_pfn_dirty(spte_to_pfn(new_spte));

			new_spte = mark_spte_for_access_track(new_spte);
		}
		new_spte &= ~shadow_dirty_mask;

		tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte);
		young = 1;

		trace_kvm_age_page(iter.gfn, iter.level, slot, young);
	}

	rcu_read_unlock();

	return young;
}

int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
			      unsigned long end)
{
	return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
					    age_gfn_range);
}

static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
			struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
			unsigned long unused2)
{
	struct tdp_iter iter;

	tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1)
		if (is_accessed_spte(iter.old_spte))
			return 1;

	return 0;
}

int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva)
{
	return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0,
					    test_age_gfn);
}

/*
 * Handle the changed_pte MMU notifier for the TDP MMU.
 * data is a pointer to the new pte_t mapping the HVA specified by the MMU
 * notifier.
 * Returns non-zero if a flush is needed before releasing the MMU lock.
 */
static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
			struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
			unsigned long data)
{
	struct tdp_iter iter;
	pte_t *ptep = (pte_t *)data;
	kvm_pfn_t new_pfn;
	u64 new_spte;
	int need_flush = 0;

	rcu_read_lock();

	WARN_ON(pte_huge(*ptep));

	new_pfn = pte_pfn(*ptep);

	tdp_root_for_each_pte(iter, root, gfn, gfn + 1) {
		if (iter.level != PG_LEVEL_4K)
			continue;

		if (!is_shadow_present_pte(iter.old_spte))
			break;

		tdp_mmu_set_spte(kvm, &iter, 0);

		kvm_flush_remote_tlbs_with_address(kvm, iter.gfn, 1);

		if (!pte_write(*ptep)) {
			new_spte = kvm_mmu_changed_pte_notifier_make_spte(
					iter.old_spte, new_pfn);

			tdp_mmu_set_spte(kvm, &iter, new_spte);
		}

		need_flush = 1;
	}

	if (need_flush)
		kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);

	rcu_read_unlock();

	return 0;
}

int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
			     pte_t *host_ptep)
{
	return kvm_tdp_mmu_handle_hva_range(kvm, address, address + 1,
					    (unsigned long)host_ptep,
					    set_tdp_spte);
}

/*
 * Remove write access from all SPTEs at or above min_level that map GFNs
 * [start, end). Returns true if an SPTE has been changed and the TLBs need
 * to be flushed.
 */
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			     gfn_t start, gfn_t end, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
				   min_level, start, end) {
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level) ||
		    !(iter.old_spte & PT_WRITABLE_MASK))
			continue;

		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;

		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}

/*
 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
 * only affect leaf SPTEs down to min_level.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
			     int min_level)
{
	struct kvm_mmu_page *root;
	int root_as_id;
	bool spte_set = false;

	for_each_tdp_mmu_root_yield_safe(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
			     slot->base_gfn + slot->npages, min_level);
	}

	return spte_set;
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t start, gfn_t end)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, start, end) {
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
			continue;

		if (spte_ad_need_write_protect(iter.old_spte)) {
			if (is_writable_pte(iter.old_spte))
				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
			else
				continue;
		} else {
			if (iter.old_spte & shadow_dirty_mask)
				new_spte = iter.old_spte & ~shadow_dirty_mask;
			else
				continue;
		}

		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;
	int root_as_id;
	bool spte_set = false;

	for_each_tdp_mmu_root_yield_safe(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
				slot->base_gfn + slot->npages);
	}

	return spte_set;
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t gfn, unsigned long mask, bool wrprot)
{
	struct tdp_iter iter;
	u64 new_spte;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
				   gfn + BITS_PER_LONG) {
		if (!mask)
			break;

		if (iter.level > PG_LEVEL_4K ||
		    !(mask & (1UL << (iter.gfn - gfn))))
			continue;

		mask &= ~(1UL << (iter.gfn - gfn));

		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
			if (is_writable_pte(iter.old_spte))
				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
			else
				continue;
		} else {
			if (iter.old_spte & shadow_dirty_mask)
				new_spte = iter.old_spte & ~shadow_dirty_mask;
			else
				continue;
		}

		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
	}

	rcu_read_unlock();
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
				       struct kvm_memory_slot *slot,
				       gfn_t gfn, unsigned long mask,
				       bool wrprot)
{
	struct kvm_mmu_page *root;
	int root_as_id;

	lockdep_assert_held(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
	}
}
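/*
 * Worked example (illustrative): a call with gfn == 0x1000 and mask == 0x5
 * (binary 101) clears the dirty status of exactly the pages mapped at GFNs
 * 0x1000 and 0x1002; bit i of mask selects gfn + i, and the walk stops as
 * soon as mask is exhausted.
 */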
1120a6a0b05dSBen Gardon */ 1121a6a0b05dSBen Gardon static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, 1122a6a0b05dSBen Gardon gfn_t start, gfn_t end) 1123a6a0b05dSBen Gardon { 1124a6a0b05dSBen Gardon struct tdp_iter iter; 1125a6a0b05dSBen Gardon u64 new_spte; 1126a6a0b05dSBen Gardon bool spte_set = false; 1127a6a0b05dSBen Gardon 1128*7cca2d0bSBen Gardon rcu_read_lock(); 1129*7cca2d0bSBen Gardon 1130a6a0b05dSBen Gardon tdp_root_for_each_pte(iter, root, start, end) { 11311af4a960SBen Gardon if (tdp_mmu_iter_cond_resched(kvm, &iter, false)) 11321af4a960SBen Gardon continue; 11331af4a960SBen Gardon 11340f99ee2cSBen Gardon if (!is_shadow_present_pte(iter.old_spte) || 11350f99ee2cSBen Gardon iter.old_spte & shadow_dirty_mask) 1136a6a0b05dSBen Gardon continue; 1137a6a0b05dSBen Gardon 1138a6a0b05dSBen Gardon new_spte = iter.old_spte | shadow_dirty_mask; 1139a6a0b05dSBen Gardon 1140a6a0b05dSBen Gardon tdp_mmu_set_spte(kvm, &iter, new_spte); 1141a6a0b05dSBen Gardon spte_set = true; 1142a6a0b05dSBen Gardon } 1143a6a0b05dSBen Gardon 1144*7cca2d0bSBen Gardon rcu_read_unlock(); 1145a6a0b05dSBen Gardon return spte_set; 1146a6a0b05dSBen Gardon } 1147a6a0b05dSBen Gardon 1148a6a0b05dSBen Gardon /* 1149a6a0b05dSBen Gardon * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is 1150a6a0b05dSBen Gardon * only used for PML, and so will involve setting the dirty bit on each SPTE. 1151a6a0b05dSBen Gardon * Returns true if an SPTE has been changed and the TLBs need to be flushed. 1152a6a0b05dSBen Gardon */ 1153a6a0b05dSBen Gardon bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot) 1154a6a0b05dSBen Gardon { 1155a6a0b05dSBen Gardon struct kvm_mmu_page *root; 1156a6a0b05dSBen Gardon int root_as_id; 1157a6a0b05dSBen Gardon bool spte_set = false; 1158a6a0b05dSBen Gardon 1159a889ea54SBen Gardon for_each_tdp_mmu_root_yield_safe(kvm, root) { 1160a6a0b05dSBen Gardon root_as_id = kvm_mmu_page_as_id(root); 1161a6a0b05dSBen Gardon if (root_as_id != slot->as_id) 1162a6a0b05dSBen Gardon continue; 1163a6a0b05dSBen Gardon 1164a6a0b05dSBen Gardon spte_set |= set_dirty_gfn_range(kvm, root, slot->base_gfn, 1165a6a0b05dSBen Gardon slot->base_gfn + slot->npages); 1166a6a0b05dSBen Gardon } 1167a6a0b05dSBen Gardon return spte_set; 1168a6a0b05dSBen Gardon } 1169a6a0b05dSBen Gardon 117014881998SBen Gardon /* 117187aa9ec9SBen Gardon * Clear leaf entries which could be replaced by large mappings, for 117287aa9ec9SBen Gardon * GFNs within the slot. 
/*
 * Clear leaf entries which could be replaced by large mappings, for
 * GFNs within the slot.
 */
static void zap_collapsible_spte_range(struct kvm *kvm,
				       struct kvm_mmu_page *root,
				       gfn_t start, gfn_t end)
{
	struct tdp_iter iter;
	kvm_pfn_t pfn;
	bool spte_set = false;

	rcu_read_lock();

	tdp_root_for_each_pte(iter, root, start, end) {
		if (tdp_mmu_iter_cond_resched(kvm, &iter, spte_set)) {
			spte_set = false;
			continue;
		}

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		pfn = spte_to_pfn(iter.old_spte);
		if (kvm_is_reserved_pfn(pfn) ||
		    !PageTransCompoundMap(pfn_to_page(pfn)))
			continue;

		tdp_mmu_set_spte(kvm, &iter, 0);

		spte_set = true;
	}

	rcu_read_unlock();
	if (spte_set)
		kvm_flush_remote_tlbs(kvm);
}

/*
 * Clear leaf entries which could be replaced by large mappings, for GFNs
 * within the slot.
 */
void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
				       const struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;
	int root_as_id;

	for_each_tdp_mmu_root_yield_safe(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		zap_collapsible_spte_range(kvm, root, slot->base_gfn,
					   slot->base_gfn + slot->npages);
	}
}
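/*
 * Illustrative context (an assumption about the caller, not stated in this
 * file): a typical use is after dirty logging is disabled on a slot, when
 * the 4k SPTEs that were forced for write tracking become collapsible:
 *
 *	kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
 *
 * Zapping leaves the range non-present, so the next guest access can fault
 * in a large mapping if the backing page (see the PageTransCompoundMap()
 * check in zap_collapsible_spte_range()) supports it.
 */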
/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * SPTE_MMU_WRITEABLE bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t gfn)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) {
		if (!is_writable_pte(iter.old_spte))
			break;

		new_spte = iter.old_spte &
			~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);

		tdp_mmu_set_spte(kvm, &iter, new_spte);
		spte_set = true;
	}

	rcu_read_unlock();

	return spte_set;
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * SPTE_MMU_WRITEABLE bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
				   struct kvm_memory_slot *slot, gfn_t gfn)
{
	struct kvm_mmu_page *root;
	int root_as_id;
	bool spte_set = false;

	lockdep_assert_held(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		spte_set |= write_protect_gfn(kvm, root, gfn);
	}
	return spte_set;
}

/*
 * Return the level of the lowest level SPTE added to sptes.
 * That SPTE may be non-present.
 */
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
			 int *root_level)
{
	struct tdp_iter iter;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	gfn_t gfn = addr >> PAGE_SHIFT;
	int leaf = -1;

	*root_level = vcpu->arch.mmu->shadow_root_level;

	rcu_read_lock();

	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		leaf = iter.level;
		sptes[leaf] = iter.old_spte;
	}

	rcu_read_unlock();

	return leaf;
}
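/*
 * Sketch of a consumer (illustrative; modeled on the shape of KVM's MMIO
 * SPTE lookup, not a definitive API contract): sptes[] is indexed by raw
 * level, filled from *root_level down to the returned leaf level, and the
 * walk may stop early at a non-present SPTE. trace_spte() below is a
 * hypothetical placeholder for whatever the caller does with each entry.
 *
 *	u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
 *	int root_level, level;
 *	int leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root_level);
 *
 *	if (leaf < 0)
 *		return;		// no paging structure was walked
 *	for (level = root_level; level >= leaf; level--)
 *		trace_spte(level, sptes[level]);
 */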