1fe5db27dSBen Gardon // SPDX-License-Identifier: GPL-2.0 2fe5db27dSBen Gardon 302c00b3aSBen Gardon #include "mmu.h" 402c00b3aSBen Gardon #include "mmu_internal.h" 5bb18842eSBen Gardon #include "mmutrace.h" 62f2fad08SBen Gardon #include "tdp_iter.h" 7fe5db27dSBen Gardon #include "tdp_mmu.h" 802c00b3aSBen Gardon #include "spte.h" 9fe5db27dSBen Gardon 109a77daacSBen Gardon #include <asm/cmpxchg.h> 1133dd3574SBen Gardon #include <trace/events/kvm.h> 1233dd3574SBen Gardon 1371ba3f31SPaolo Bonzini static bool __read_mostly tdp_mmu_enabled = true; 1495fb5b02SBen Gardon module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644); 15fe5db27dSBen Gardon 16fe5db27dSBen Gardon /* Initializes the TDP MMU for the VM, if enabled. */ 17d501f747SBen Gardon bool kvm_mmu_init_tdp_mmu(struct kvm *kvm) 18fe5db27dSBen Gardon { 19897218ffSPaolo Bonzini if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled)) 20d501f747SBen Gardon return false; 21fe5db27dSBen Gardon 22fe5db27dSBen Gardon /* This should not be changed for the lifetime of the VM. */ 23fe5db27dSBen Gardon kvm->arch.tdp_mmu_enabled = true; 2402c00b3aSBen Gardon 2502c00b3aSBen Gardon INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots); 269a77daacSBen Gardon spin_lock_init(&kvm->arch.tdp_mmu_pages_lock); 2789c0fd49SBen Gardon INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages); 2822b94c4bSPaolo Bonzini kvm->arch.tdp_mmu_zap_wq = 2922b94c4bSPaolo Bonzini alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0); 30d501f747SBen Gardon 31d501f747SBen Gardon return true; 32fe5db27dSBen Gardon } 33fe5db27dSBen Gardon 34226b8c8fSSean Christopherson /* Arbitrarily returns true so that this may be used in if statements. 
*/ 35226b8c8fSSean Christopherson static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm, 366103bc07SBen Gardon bool shared) 376103bc07SBen Gardon { 386103bc07SBen Gardon if (shared) 396103bc07SBen Gardon lockdep_assert_held_read(&kvm->mmu_lock); 406103bc07SBen Gardon else 416103bc07SBen Gardon lockdep_assert_held_write(&kvm->mmu_lock); 42226b8c8fSSean Christopherson 43226b8c8fSSean Christopherson return true; 446103bc07SBen Gardon } 456103bc07SBen Gardon 46fe5db27dSBen Gardon void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) 47fe5db27dSBen Gardon { 48fe5db27dSBen Gardon if (!kvm->arch.tdp_mmu_enabled) 49fe5db27dSBen Gardon return; 5002c00b3aSBen Gardon 5122b94c4bSPaolo Bonzini flush_workqueue(kvm->arch.tdp_mmu_zap_wq); 5222b94c4bSPaolo Bonzini destroy_workqueue(kvm->arch.tdp_mmu_zap_wq); 5322b94c4bSPaolo Bonzini 54524a1e4eSSean Christopherson WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages)); 5502c00b3aSBen Gardon WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); 567cca2d0bSBen Gardon 577cca2d0bSBen Gardon /* 587cca2d0bSBen Gardon * Ensure that all the outstanding RCU callbacks to free shadow pages 5922b94c4bSPaolo Bonzini * can run before the VM is torn down. Work items on tdp_mmu_zap_wq 6022b94c4bSPaolo Bonzini * can call kvm_tdp_mmu_put_root and create new callbacks. 617cca2d0bSBen Gardon */ 627cca2d0bSBen Gardon rcu_barrier(); 6302c00b3aSBen Gardon } 6402c00b3aSBen Gardon 652bdb3d84SBen Gardon static void tdp_mmu_free_sp(struct kvm_mmu_page *sp) 66a889ea54SBen Gardon { 672bdb3d84SBen Gardon free_page((unsigned long)sp->spt); 682bdb3d84SBen Gardon kmem_cache_free(mmu_page_header_cache, sp); 69a889ea54SBen Gardon } 70a889ea54SBen Gardon 71c0e64238SBen Gardon /* 72c0e64238SBen Gardon * This is called through call_rcu in order to free TDP page table memory 73c0e64238SBen Gardon * safely with respect to other kernel threads that may be operating on 74c0e64238SBen Gardon * the memory. 
75c0e64238SBen Gardon * By only accessing TDP MMU page table memory in an RCU read critical 76c0e64238SBen Gardon * section, and freeing it after a grace period, lockless access to that 77c0e64238SBen Gardon * memory won't use it after it is freed. 78c0e64238SBen Gardon */ 79c0e64238SBen Gardon static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head) 80a889ea54SBen Gardon { 81c0e64238SBen Gardon struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page, 82c0e64238SBen Gardon rcu_head); 83a889ea54SBen Gardon 84c0e64238SBen Gardon tdp_mmu_free_sp(sp); 85a889ea54SBen Gardon } 86a889ea54SBen Gardon 87e2b5b21dSSean Christopherson static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root, 88e2b5b21dSSean Christopherson bool shared); 89e2b5b21dSSean Christopherson 9022b94c4bSPaolo Bonzini static void tdp_mmu_zap_root_work(struct work_struct *work) 9122b94c4bSPaolo Bonzini { 9222b94c4bSPaolo Bonzini struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page, 9322b94c4bSPaolo Bonzini tdp_mmu_async_work); 9422b94c4bSPaolo Bonzini struct kvm *kvm = root->tdp_mmu_async_data; 9522b94c4bSPaolo Bonzini 9622b94c4bSPaolo Bonzini read_lock(&kvm->mmu_lock); 9722b94c4bSPaolo Bonzini 9822b94c4bSPaolo Bonzini /* 9922b94c4bSPaolo Bonzini * A TLB flush is not necessary as KVM performs a local TLB flush when 10022b94c4bSPaolo Bonzini * allocating a new root (see kvm_mmu_load()), and when migrating vCPU 10122b94c4bSPaolo Bonzini * to a different pCPU. Note, the local TLB flush on reuse also 10222b94c4bSPaolo Bonzini * invalidates any paging-structure-cache entries, i.e. TLB entries for 10322b94c4bSPaolo Bonzini * intermediate paging structures, that may be zapped, as such entries 10422b94c4bSPaolo Bonzini * are associated with the ASID on both VMX and SVM. 
10522b94c4bSPaolo Bonzini */ 10622b94c4bSPaolo Bonzini tdp_mmu_zap_root(kvm, root, true); 10722b94c4bSPaolo Bonzini 10822b94c4bSPaolo Bonzini /* 10922b94c4bSPaolo Bonzini * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for 11022b94c4bSPaolo Bonzini * avoiding an infinite loop. By design, the root is reachable while 11122b94c4bSPaolo Bonzini * it's being asynchronously zapped, thus a different task can put its 11222b94c4bSPaolo Bonzini * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an 11322b94c4bSPaolo Bonzini * asynchronously zapped root is unavoidable. 11422b94c4bSPaolo Bonzini */ 11522b94c4bSPaolo Bonzini kvm_tdp_mmu_put_root(kvm, root, true); 11622b94c4bSPaolo Bonzini 11722b94c4bSPaolo Bonzini read_unlock(&kvm->mmu_lock); 11822b94c4bSPaolo Bonzini } 11922b94c4bSPaolo Bonzini 12022b94c4bSPaolo Bonzini static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root) 12122b94c4bSPaolo Bonzini { 12222b94c4bSPaolo Bonzini root->tdp_mmu_async_data = kvm; 12322b94c4bSPaolo Bonzini INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work); 12422b94c4bSPaolo Bonzini queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work); 12522b94c4bSPaolo Bonzini } 12622b94c4bSPaolo Bonzini 1278351779cSPaolo Bonzini static inline bool kvm_tdp_root_mark_invalid(struct kvm_mmu_page *page) 1288351779cSPaolo Bonzini { 1298351779cSPaolo Bonzini union kvm_mmu_page_role role = page->role; 1308351779cSPaolo Bonzini role.invalid = true; 1318351779cSPaolo Bonzini 1328351779cSPaolo Bonzini /* No need to use cmpxchg, only the invalid bit can change. 
*/ 1338351779cSPaolo Bonzini role.word = xchg(&page->role.word, role.word); 1348351779cSPaolo Bonzini return role.invalid; 1358351779cSPaolo Bonzini } 1368351779cSPaolo Bonzini 1376103bc07SBen Gardon void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, 1386103bc07SBen Gardon bool shared) 1392bdb3d84SBen Gardon { 1406103bc07SBen Gardon kvm_lockdep_assert_mmu_lock_held(kvm, shared); 1412bdb3d84SBen Gardon 14211cccf5cSBen Gardon if (!refcount_dec_and_test(&root->tdp_mmu_root_count)) 1432bdb3d84SBen Gardon return; 1442bdb3d84SBen Gardon 1452bdb3d84SBen Gardon WARN_ON(!root->tdp_mmu_page); 1462bdb3d84SBen Gardon 1478351779cSPaolo Bonzini /* 1488351779cSPaolo Bonzini * The root now has refcount=0. It is valid, but readers already 1498351779cSPaolo Bonzini * cannot acquire a reference to it because kvm_tdp_mmu_get_root() 1508351779cSPaolo Bonzini * rejects it. This remains true for the rest of the execution 1518351779cSPaolo Bonzini * of this function, because readers visit valid roots only 1528351779cSPaolo Bonzini * (except for tdp_mmu_zap_root_work(), which however 1538351779cSPaolo Bonzini * does not acquire any reference itself). 1548351779cSPaolo Bonzini * 1558351779cSPaolo Bonzini * Even though there are flows that need to visit all roots for 1568351779cSPaolo Bonzini * correctness, they all take mmu_lock for write, so they cannot yet 1578351779cSPaolo Bonzini * run concurrently. The same is true after kvm_tdp_root_mark_invalid, 1588351779cSPaolo Bonzini * since the root still has refcount=0. 1598351779cSPaolo Bonzini * 1608351779cSPaolo Bonzini * However, tdp_mmu_zap_root can yield, and writers do not expect to 1618351779cSPaolo Bonzini * see refcount=0 (see for example kvm_tdp_mmu_invalidate_all_roots()). 1628351779cSPaolo Bonzini * So the root temporarily gets an extra reference, going to refcount=1 1638351779cSPaolo Bonzini * while staying invalid. 
Readers still cannot acquire any reference; 1648351779cSPaolo Bonzini * but writers are now allowed to run if tdp_mmu_zap_root yields and 1658351779cSPaolo Bonzini * they might take an extra reference if they themselves yield. Therefore, 1668351779cSPaolo Bonzini * when the reference is given back after tdp_mmu_zap_root terminates, 1678351779cSPaolo Bonzini * there is no guarantee that the refcount is still 1. If not, whoever 1688351779cSPaolo Bonzini * puts the last reference will free the page, but they will not have to 1698351779cSPaolo Bonzini * zap the root because a root cannot go from invalid to valid. 1708351779cSPaolo Bonzini */ 1718351779cSPaolo Bonzini if (!kvm_tdp_root_mark_invalid(root)) { 1728351779cSPaolo Bonzini refcount_set(&root->tdp_mmu_root_count, 1); 1738351779cSPaolo Bonzini tdp_mmu_zap_root(kvm, root, shared); 1748351779cSPaolo Bonzini 1758351779cSPaolo Bonzini /* 1768351779cSPaolo Bonzini * Give back the reference that was added back above. We now 1778351779cSPaolo Bonzini * know that the root is invalid, so go ahead and free it if 1788351779cSPaolo Bonzini * no one has taken a reference in the meanwhile. 1798351779cSPaolo Bonzini */ 1808351779cSPaolo Bonzini if (!refcount_dec_and_test(&root->tdp_mmu_root_count)) 1818351779cSPaolo Bonzini return; 1828351779cSPaolo Bonzini } 1838351779cSPaolo Bonzini 184c0e64238SBen Gardon spin_lock(&kvm->arch.tdp_mmu_pages_lock); 185c0e64238SBen Gardon list_del_rcu(&root->link); 186c0e64238SBen Gardon spin_unlock(&kvm->arch.tdp_mmu_pages_lock); 187c0e64238SBen Gardon call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback); 188a889ea54SBen Gardon } 189a889ea54SBen Gardon 190cfc10997SBen Gardon /* 191d62007edSSean Christopherson * Returns the next root after @prev_root (or the first root if @prev_root is 192d62007edSSean Christopherson * NULL). 
A reference to the returned root is acquired, and the reference to 193d62007edSSean Christopherson * @prev_root is released (the caller obviously must hold a reference to 194d62007edSSean Christopherson * @prev_root if it's non-NULL). 195d62007edSSean Christopherson * 196d62007edSSean Christopherson * If @only_valid is true, invalid roots are skipped. 197d62007edSSean Christopherson * 198d62007edSSean Christopherson * Returns NULL if the end of tdp_mmu_roots was reached. 199cfc10997SBen Gardon */ 200cfc10997SBen Gardon static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, 2016103bc07SBen Gardon struct kvm_mmu_page *prev_root, 202d62007edSSean Christopherson bool shared, bool only_valid) 203a889ea54SBen Gardon { 204a889ea54SBen Gardon struct kvm_mmu_page *next_root; 205a889ea54SBen Gardon 206c0e64238SBen Gardon rcu_read_lock(); 207c0e64238SBen Gardon 208cfc10997SBen Gardon if (prev_root) 209c0e64238SBen Gardon next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, 210c0e64238SBen Gardon &prev_root->link, 211c0e64238SBen Gardon typeof(*prev_root), link); 212cfc10997SBen Gardon else 213c0e64238SBen Gardon next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots, 214cfc10997SBen Gardon typeof(*next_root), link); 215cfc10997SBen Gardon 21604dc4e6cSSean Christopherson while (next_root) { 217d62007edSSean Christopherson if ((!only_valid || !next_root->role.invalid) && 218ad6d6b94SJinrong Liang kvm_tdp_mmu_get_root(next_root)) 21904dc4e6cSSean Christopherson break; 22004dc4e6cSSean Christopherson 221c0e64238SBen Gardon next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, 222c0e64238SBen Gardon &next_root->link, typeof(*next_root), link); 22304dc4e6cSSean Christopherson } 224fb101293SBen Gardon 225c0e64238SBen Gardon rcu_read_unlock(); 226cfc10997SBen Gardon 227cfc10997SBen Gardon if (prev_root) 2286103bc07SBen Gardon kvm_tdp_mmu_put_root(kvm, prev_root, shared); 229cfc10997SBen Gardon 230a889ea54SBen Gardon return next_root; 231a889ea54SBen Gardon } 
232a889ea54SBen Gardon 233a889ea54SBen Gardon /* 234a889ea54SBen Gardon * Note: this iterator gets and puts references to the roots it iterates over. 235a889ea54SBen Gardon * This makes it safe to release the MMU lock and yield within the loop, but 236a889ea54SBen Gardon * if exiting the loop early, the caller must drop the reference to the most 237a889ea54SBen Gardon * recent root. (Unless keeping a live reference is desirable.) 2386103bc07SBen Gardon * 2396103bc07SBen Gardon * If shared is set, this function is operating under the MMU lock in read 2406103bc07SBen Gardon * mode. In the unlikely event that this thread must free a root, the lock 2416103bc07SBen Gardon * will be temporarily dropped and reacquired in write mode. 242a889ea54SBen Gardon */ 243d62007edSSean Christopherson #define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\ 244d62007edSSean Christopherson for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid); \ 245cfc10997SBen Gardon _root; \ 246d62007edSSean Christopherson _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid)) \ 247614f6970SPaolo Bonzini if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) && \ 248614f6970SPaolo Bonzini kvm_mmu_page_as_id(_root) != _as_id) { \ 249a3f15bdaSSean Christopherson } else 250a889ea54SBen Gardon 251d62007edSSean Christopherson #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \ 252d62007edSSean Christopherson __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true) 253d62007edSSean Christopherson 254614f6970SPaolo Bonzini #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \ 255614f6970SPaolo Bonzini __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false) 256d62007edSSean Christopherson 257226b8c8fSSean Christopherson /* 258226b8c8fSSean Christopherson * Iterate over all TDP MMU roots. 
Requires that mmu_lock be held for write, 259226b8c8fSSean Christopherson * the implication being that any flow that holds mmu_lock for read is 260226b8c8fSSean Christopherson * inherently yield-friendly and should use the yield-safe variant above. 261226b8c8fSSean Christopherson * Holding mmu_lock for write obviates the need for RCU protection as the list 262226b8c8fSSean Christopherson * is guaranteed to be stable. 263226b8c8fSSean Christopherson */ 264a3f15bdaSSean Christopherson #define for_each_tdp_mmu_root(_kvm, _root, _as_id) \ 265226b8c8fSSean Christopherson list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \ 266226b8c8fSSean Christopherson if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \ 267226b8c8fSSean Christopherson kvm_mmu_page_as_id(_root) != _as_id) { \ 268a3f15bdaSSean Christopherson } else 26902c00b3aSBen Gardon 270a82070b6SDavid Matlack static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu) 27102c00b3aSBen Gardon { 27202c00b3aSBen Gardon struct kvm_mmu_page *sp; 27302c00b3aSBen Gardon 27402c00b3aSBen Gardon sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); 27502c00b3aSBen Gardon sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache); 276a82070b6SDavid Matlack 277a82070b6SDavid Matlack return sp; 278a82070b6SDavid Matlack } 279a82070b6SDavid Matlack 280c10743a1SSean Christopherson static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep, 281c10743a1SSean Christopherson gfn_t gfn, union kvm_mmu_page_role role) 282a82070b6SDavid Matlack { 28302c00b3aSBen Gardon set_page_private(virt_to_page(sp->spt), (unsigned long)sp); 28402c00b3aSBen Gardon 285a3aca4deSDavid Matlack sp->role = role; 28602c00b3aSBen Gardon sp->gfn = gfn; 287c10743a1SSean Christopherson sp->ptep = sptep; 28802c00b3aSBen Gardon sp->tdp_mmu_page = true; 28902c00b3aSBen Gardon 29033dd3574SBen Gardon trace_kvm_mmu_get_page(sp, true); 29102c00b3aSBen Gardon } 29202c00b3aSBen Gardon 293a82070b6SDavid Matlack 
static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp, 294a3aca4deSDavid Matlack struct tdp_iter *iter) 295a3aca4deSDavid Matlack { 296a3aca4deSDavid Matlack struct kvm_mmu_page *parent_sp; 297a3aca4deSDavid Matlack union kvm_mmu_page_role role; 298a3aca4deSDavid Matlack 299a3aca4deSDavid Matlack parent_sp = sptep_to_sp(rcu_dereference(iter->sptep)); 300a3aca4deSDavid Matlack 301a3aca4deSDavid Matlack role = parent_sp->role; 302a3aca4deSDavid Matlack role.level--; 303a3aca4deSDavid Matlack 304c10743a1SSean Christopherson tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role); 305a3aca4deSDavid Matlack } 306a3aca4deSDavid Matlack 3076e6ec584SSean Christopherson hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) 30802c00b3aSBen Gardon { 309a3aca4deSDavid Matlack union kvm_mmu_page_role role = vcpu->arch.mmu->mmu_role.base; 31002c00b3aSBen Gardon struct kvm *kvm = vcpu->kvm; 31102c00b3aSBen Gardon struct kvm_mmu_page *root; 31202c00b3aSBen Gardon 3136e6ec584SSean Christopherson lockdep_assert_held_write(&kvm->mmu_lock); 31402c00b3aSBen Gardon 31504dc4e6cSSean Christopherson /* 31604dc4e6cSSean Christopherson * Check for an existing root before allocating a new one. Note, the 31704dc4e6cSSean Christopherson * role check prevents consuming an invalid root. 
31804dc4e6cSSean Christopherson */ 319a3f15bdaSSean Christopherson for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) { 320fb101293SBen Gardon if (root->role.word == role.word && 321ad6d6b94SJinrong Liang kvm_tdp_mmu_get_root(root)) 3226e6ec584SSean Christopherson goto out; 32302c00b3aSBen Gardon } 32402c00b3aSBen Gardon 325a82070b6SDavid Matlack root = tdp_mmu_alloc_sp(vcpu); 326c10743a1SSean Christopherson tdp_mmu_init_sp(root, NULL, 0, role); 327a82070b6SDavid Matlack 32811cccf5cSBen Gardon refcount_set(&root->tdp_mmu_root_count, 1); 32902c00b3aSBen Gardon 330c0e64238SBen Gardon spin_lock(&kvm->arch.tdp_mmu_pages_lock); 331c0e64238SBen Gardon list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots); 332c0e64238SBen Gardon spin_unlock(&kvm->arch.tdp_mmu_pages_lock); 33302c00b3aSBen Gardon 3346e6ec584SSean Christopherson out: 33502c00b3aSBen Gardon return __pa(root->spt); 336fe5db27dSBen Gardon } 3372f2fad08SBen Gardon 3382f2fad08SBen Gardon static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, 3399a77daacSBen Gardon u64 old_spte, u64 new_spte, int level, 3409a77daacSBen Gardon bool shared); 3412f2fad08SBen Gardon 342f8e14497SBen Gardon static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level) 343f8e14497SBen Gardon { 344f8e14497SBen Gardon if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level)) 345f8e14497SBen Gardon return; 346f8e14497SBen Gardon 347f8e14497SBen Gardon if (is_accessed_spte(old_spte) && 34864bb2769SSean Christopherson (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) || 34964bb2769SSean Christopherson spte_to_pfn(old_spte) != spte_to_pfn(new_spte))) 350f8e14497SBen Gardon kvm_set_pfn_accessed(spte_to_pfn(old_spte)); 351f8e14497SBen Gardon } 352f8e14497SBen Gardon 353a6a0b05dSBen Gardon static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, 354a6a0b05dSBen Gardon u64 old_spte, u64 new_spte, int level) 355a6a0b05dSBen Gardon { 356a6a0b05dSBen Gardon 
bool pfn_changed; 357a6a0b05dSBen Gardon struct kvm_memory_slot *slot; 358a6a0b05dSBen Gardon 359a6a0b05dSBen Gardon if (level > PG_LEVEL_4K) 360a6a0b05dSBen Gardon return; 361a6a0b05dSBen Gardon 362a6a0b05dSBen Gardon pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); 363a6a0b05dSBen Gardon 364a6a0b05dSBen Gardon if ((!is_writable_pte(old_spte) || pfn_changed) && 365a6a0b05dSBen Gardon is_writable_pte(new_spte)) { 366a6a0b05dSBen Gardon slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn); 367fb04a1edSPeter Xu mark_page_dirty_in_slot(kvm, slot, gfn); 368a6a0b05dSBen Gardon } 369a6a0b05dSBen Gardon } 370a6a0b05dSBen Gardon 3712f2fad08SBen Gardon /** 372c298a30cSDavid Matlack * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages 373a9442f59SBen Gardon * 374a9442f59SBen Gardon * @kvm: kvm instance 375a9442f59SBen Gardon * @sp: the page to be removed 3769a77daacSBen Gardon * @shared: This operation may not be running under the exclusive use of 3779a77daacSBen Gardon * the MMU lock and the operation must synchronize with other 3789a77daacSBen Gardon * threads that might be adding or removing pages. 
379a9442f59SBen Gardon */ 380c298a30cSDavid Matlack static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp, 3819a77daacSBen Gardon bool shared) 382a9442f59SBen Gardon { 3839a77daacSBen Gardon if (shared) 3849a77daacSBen Gardon spin_lock(&kvm->arch.tdp_mmu_pages_lock); 3859a77daacSBen Gardon else 386a9442f59SBen Gardon lockdep_assert_held_write(&kvm->mmu_lock); 387a9442f59SBen Gardon 388a9442f59SBen Gardon list_del(&sp->link); 389a9442f59SBen Gardon if (sp->lpage_disallowed) 390a9442f59SBen Gardon unaccount_huge_nx_page(kvm, sp); 3919a77daacSBen Gardon 3929a77daacSBen Gardon if (shared) 3939a77daacSBen Gardon spin_unlock(&kvm->arch.tdp_mmu_pages_lock); 394a9442f59SBen Gardon } 395a9442f59SBen Gardon 396a9442f59SBen Gardon /** 3970f53dfa3SDavid Matlack * handle_removed_pt() - handle a page table removed from the TDP structure 398a066e61fSBen Gardon * 399a066e61fSBen Gardon * @kvm: kvm instance 400a066e61fSBen Gardon * @pt: the page removed from the paging structure 4019a77daacSBen Gardon * @shared: This operation may not be running under the exclusive use 4029a77daacSBen Gardon * of the MMU lock and the operation must synchronize with other 4039a77daacSBen Gardon * threads that might be modifying SPTEs. 404a066e61fSBen Gardon * 405a066e61fSBen Gardon * Given a page table that has been removed from the TDP paging structure, 406a066e61fSBen Gardon * iterates through the page table to clear SPTEs and free child page tables. 40770fb3e41SBen Gardon * 40870fb3e41SBen Gardon * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU 40970fb3e41SBen Gardon * protection. Since this thread removed it from the paging structure, 41070fb3e41SBen Gardon * this thread will be responsible for ensuring the page is freed. Hence the 41170fb3e41SBen Gardon * early rcu_dereferences in the function. 
412a066e61fSBen Gardon */ 4130f53dfa3SDavid Matlack static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) 414a066e61fSBen Gardon { 41570fb3e41SBen Gardon struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt)); 416a066e61fSBen Gardon int level = sp->role.level; 417e25f0e0cSBen Gardon gfn_t base_gfn = sp->gfn; 418a066e61fSBen Gardon int i; 419a066e61fSBen Gardon 420a066e61fSBen Gardon trace_kvm_mmu_prepare_zap_page(sp); 421a066e61fSBen Gardon 422c298a30cSDavid Matlack tdp_mmu_unlink_sp(kvm, sp, shared); 423a066e61fSBen Gardon 424a066e61fSBen Gardon for (i = 0; i < PT64_ENT_PER_PAGE; i++) { 425574c3c55SBen Gardon u64 *sptep = rcu_dereference(pt) + i; 426574c3c55SBen Gardon gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level); 427574c3c55SBen Gardon u64 old_child_spte; 4289a77daacSBen Gardon 4299a77daacSBen Gardon if (shared) { 430e25f0e0cSBen Gardon /* 431e25f0e0cSBen Gardon * Set the SPTE to a nonpresent value that other 432e25f0e0cSBen Gardon * threads will not overwrite. If the SPTE was 433e25f0e0cSBen Gardon * already marked as removed then another thread 434e25f0e0cSBen Gardon * handling a page fault could overwrite it, so 435e25f0e0cSBen Gardon * set the SPTE until it is set from some other 436e25f0e0cSBen Gardon * value to the removed SPTE value. 437e25f0e0cSBen Gardon */ 438e25f0e0cSBen Gardon for (;;) { 439e25f0e0cSBen Gardon old_child_spte = xchg(sptep, REMOVED_SPTE); 440e25f0e0cSBen Gardon if (!is_removed_spte(old_child_spte)) 441e25f0e0cSBen Gardon break; 442e25f0e0cSBen Gardon cpu_relax(); 443e25f0e0cSBen Gardon } 4449a77daacSBen Gardon } else { 4458df9f1afSSean Christopherson /* 4468df9f1afSSean Christopherson * If the SPTE is not MMU-present, there is no backing 4478df9f1afSSean Christopherson * page associated with the SPTE and so no side effects 4488df9f1afSSean Christopherson * that need to be recorded, and exclusive ownership of 4498df9f1afSSean Christopherson * mmu_lock ensures the SPTE can't be made present. 
4508df9f1afSSean Christopherson * Note, zapping MMIO SPTEs is also unnecessary as they 4518df9f1afSSean Christopherson * are guarded by the memslots generation, not by being 4528df9f1afSSean Christopherson * unreachable. 4538df9f1afSSean Christopherson */ 4549a77daacSBen Gardon old_child_spte = READ_ONCE(*sptep); 4558df9f1afSSean Christopherson if (!is_shadow_present_pte(old_child_spte)) 4568df9f1afSSean Christopherson continue; 457e25f0e0cSBen Gardon 458e25f0e0cSBen Gardon /* 459e25f0e0cSBen Gardon * Marking the SPTE as a removed SPTE is not 460e25f0e0cSBen Gardon * strictly necessary here as the MMU lock will 461e25f0e0cSBen Gardon * stop other threads from concurrently modifying 462e25f0e0cSBen Gardon * this SPTE. Using the removed SPTE value keeps 463e25f0e0cSBen Gardon * the two branches consistent and simplifies 464e25f0e0cSBen Gardon * the function. 465e25f0e0cSBen Gardon */ 466e25f0e0cSBen Gardon WRITE_ONCE(*sptep, REMOVED_SPTE); 4679a77daacSBen Gardon } 468e25f0e0cSBen Gardon handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn, 469f1b83255SKai Huang old_child_spte, REMOVED_SPTE, level, 470e25f0e0cSBen Gardon shared); 471a066e61fSBen Gardon } 472a066e61fSBen Gardon 4737cca2d0bSBen Gardon call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback); 474a066e61fSBen Gardon } 475a066e61fSBen Gardon 476a066e61fSBen Gardon /** 4777f6231a3SKai Huang * __handle_changed_spte - handle bookkeeping associated with an SPTE change 4782f2fad08SBen Gardon * @kvm: kvm instance 4792f2fad08SBen Gardon * @as_id: the address space of the paging structure the SPTE was a part of 4802f2fad08SBen Gardon * @gfn: the base GFN that was mapped by the SPTE 4812f2fad08SBen Gardon * @old_spte: The value of the SPTE before the change 4822f2fad08SBen Gardon * @new_spte: The value of the SPTE after the change 4832f2fad08SBen Gardon * @level: the level of the PT the SPTE is part of in the paging structure 4849a77daacSBen Gardon * @shared: This operation may not be running under the exclusive use 
of 4859a77daacSBen Gardon * the MMU lock and the operation must synchronize with other 4869a77daacSBen Gardon * threads that might be modifying SPTEs. 4872f2fad08SBen Gardon * 4882f2fad08SBen Gardon * Handle bookkeeping that might result from the modification of a SPTE. 4892f2fad08SBen Gardon * This function must be called for all TDP SPTE modifications. 4902f2fad08SBen Gardon */ 4912f2fad08SBen Gardon static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, 4929a77daacSBen Gardon u64 old_spte, u64 new_spte, int level, 4939a77daacSBen Gardon bool shared) 4942f2fad08SBen Gardon { 4952f2fad08SBen Gardon bool was_present = is_shadow_present_pte(old_spte); 4962f2fad08SBen Gardon bool is_present = is_shadow_present_pte(new_spte); 4972f2fad08SBen Gardon bool was_leaf = was_present && is_last_spte(old_spte, level); 4982f2fad08SBen Gardon bool is_leaf = is_present && is_last_spte(new_spte, level); 4992f2fad08SBen Gardon bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); 5002f2fad08SBen Gardon 5012f2fad08SBen Gardon WARN_ON(level > PT64_ROOT_MAX_LEVEL); 5022f2fad08SBen Gardon WARN_ON(level < PG_LEVEL_4K); 503764388ceSSean Christopherson WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1)); 5042f2fad08SBen Gardon 5052f2fad08SBen Gardon /* 5062f2fad08SBen Gardon * If this warning were to trigger it would indicate that there was a 5072f2fad08SBen Gardon * missing MMU notifier or a race with some notifier handler. 5082f2fad08SBen Gardon * A present, leaf SPTE should never be directly replaced with another 509d9f6e12fSIngo Molnar * present leaf SPTE pointing to a different PFN. A notifier handler 5102f2fad08SBen Gardon * should be zapping the SPTE before the main MM's page table is 5112f2fad08SBen Gardon * changed, or the SPTE should be zeroed, and the TLBs flushed by the 5122f2fad08SBen Gardon * thread before replacement. 
5132f2fad08SBen Gardon */ 5142f2fad08SBen Gardon if (was_leaf && is_leaf && pfn_changed) { 5152f2fad08SBen Gardon pr_err("Invalid SPTE change: cannot replace a present leaf\n" 5162f2fad08SBen Gardon "SPTE with another present leaf SPTE mapping a\n" 5172f2fad08SBen Gardon "different PFN!\n" 5182f2fad08SBen Gardon "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d", 5192f2fad08SBen Gardon as_id, gfn, old_spte, new_spte, level); 5202f2fad08SBen Gardon 5212f2fad08SBen Gardon /* 5222f2fad08SBen Gardon * Crash the host to prevent error propagation and guest data 523d9f6e12fSIngo Molnar * corruption. 5242f2fad08SBen Gardon */ 5252f2fad08SBen Gardon BUG(); 5262f2fad08SBen Gardon } 5272f2fad08SBen Gardon 5282f2fad08SBen Gardon if (old_spte == new_spte) 5292f2fad08SBen Gardon return; 5302f2fad08SBen Gardon 531b9a98c34SBen Gardon trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte); 532b9a98c34SBen Gardon 533115111efSDavid Matlack if (is_leaf) 534115111efSDavid Matlack check_spte_writable_invariants(new_spte); 535115111efSDavid Matlack 5362f2fad08SBen Gardon /* 5372f2fad08SBen Gardon * The only times a SPTE should be changed from a non-present to 5382f2fad08SBen Gardon * non-present state is when an MMIO entry is installed/modified/ 5392f2fad08SBen Gardon * removed. In that case, there is nothing to do here. 5402f2fad08SBen Gardon */ 5412f2fad08SBen Gardon if (!was_present && !is_present) { 5422f2fad08SBen Gardon /* 54308f07c80SBen Gardon * If this change does not involve a MMIO SPTE or removed SPTE, 54408f07c80SBen Gardon * it is unexpected. Log the change, though it should not 54508f07c80SBen Gardon * impact the guest since both the former and current SPTEs 54608f07c80SBen Gardon * are nonpresent. 5472f2fad08SBen Gardon */ 54808f07c80SBen Gardon if (WARN_ON(!is_mmio_spte(old_spte) && 54908f07c80SBen Gardon !is_mmio_spte(new_spte) && 55008f07c80SBen Gardon !is_removed_spte(new_spte))) 5512f2fad08SBen Gardon pr_err("Unexpected SPTE change! 
Nonpresent SPTEs\n" 5522f2fad08SBen Gardon "should not be replaced with another,\n" 5532f2fad08SBen Gardon "different nonpresent SPTE, unless one or both\n" 55408f07c80SBen Gardon "are MMIO SPTEs, or the new SPTE is\n" 55508f07c80SBen Gardon "a temporary removed SPTE.\n" 5562f2fad08SBen Gardon "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d", 5572f2fad08SBen Gardon as_id, gfn, old_spte, new_spte, level); 5582f2fad08SBen Gardon return; 5592f2fad08SBen Gardon } 5602f2fad08SBen Gardon 56171f51d2cSMingwei Zhang if (is_leaf != was_leaf) 56271f51d2cSMingwei Zhang kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1); 5632f2fad08SBen Gardon 5642f2fad08SBen Gardon if (was_leaf && is_dirty_spte(old_spte) && 56564bb2769SSean Christopherson (!is_present || !is_dirty_spte(new_spte) || pfn_changed)) 5662f2fad08SBen Gardon kvm_set_pfn_dirty(spte_to_pfn(old_spte)); 5672f2fad08SBen Gardon 5682f2fad08SBen Gardon /* 5692f2fad08SBen Gardon * Recursively handle child PTs if the change removed a subtree from 570c8e5a0d0SSean Christopherson * the paging structure. Note the WARN on the PFN changing without the 571c8e5a0d0SSean Christopherson * SPTE being converted to a hugepage (leaf) or being zapped. Shadow 572c8e5a0d0SSean Christopherson * pages are kernel allocations and should never be migrated. 
5732f2fad08SBen Gardon */ 574c8e5a0d0SSean Christopherson if (was_present && !was_leaf && 575c8e5a0d0SSean Christopherson (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed))) 5760f53dfa3SDavid Matlack handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared); 5772f2fad08SBen Gardon } 5782f2fad08SBen Gardon 5792f2fad08SBen Gardon static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, 5809a77daacSBen Gardon u64 old_spte, u64 new_spte, int level, 5819a77daacSBen Gardon bool shared) 5822f2fad08SBen Gardon { 5839a77daacSBen Gardon __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, 5849a77daacSBen Gardon shared); 585f8e14497SBen Gardon handle_changed_spte_acc_track(old_spte, new_spte, level); 586a6a0b05dSBen Gardon handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte, 587a6a0b05dSBen Gardon new_spte, level); 5882f2fad08SBen Gardon } 589faaf05b0SBen Gardon 590fe43fa2fSBen Gardon /* 5916ccf4438SPaolo Bonzini * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically 5926ccf4438SPaolo Bonzini * and handle the associated bookkeeping. Do not mark the page dirty 59324ae4cfaSBen Gardon * in KVM's dirty bitmaps. 5949a77daacSBen Gardon * 5953255530aSDavid Matlack * If setting the SPTE fails because it has changed, iter->old_spte will be 5963255530aSDavid Matlack * refreshed to the current value of the spte. 5973255530aSDavid Matlack * 5989a77daacSBen Gardon * @kvm: kvm instance 5999a77daacSBen Gardon * @iter: a tdp_iter instance currently on the SPTE that should be set 6009a77daacSBen Gardon * @new_spte: The value the SPTE should be set to 6013e72c791SDavid Matlack * Return: 6023e72c791SDavid Matlack * * 0 - If the SPTE was set. 6033e72c791SDavid Matlack * * -EBUSY - If the SPTE cannot be set. In this case this function will have 6043e72c791SDavid Matlack * no side-effects other than setting iter->old_spte to the last 6053e72c791SDavid Matlack * known value of the spte. 
6069a77daacSBen Gardon */ 6073e72c791SDavid Matlack static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm, 6089a77daacSBen Gardon struct tdp_iter *iter, 6099a77daacSBen Gardon u64 new_spte) 6109a77daacSBen Gardon { 6113255530aSDavid Matlack u64 *sptep = rcu_dereference(iter->sptep); 6123255530aSDavid Matlack u64 old_spte; 6133255530aSDavid Matlack 6143a0f64deSSean Christopherson WARN_ON_ONCE(iter->yielded); 6153a0f64deSSean Christopherson 6169a77daacSBen Gardon lockdep_assert_held_read(&kvm->mmu_lock); 6179a77daacSBen Gardon 61808f07c80SBen Gardon /* 61908f07c80SBen Gardon * Do not change removed SPTEs. Only the thread that froze the SPTE 62008f07c80SBen Gardon * may modify it. 62108f07c80SBen Gardon */ 6227a51393aSSean Christopherson if (is_removed_spte(iter->old_spte)) 6233e72c791SDavid Matlack return -EBUSY; 62408f07c80SBen Gardon 6256e8eb206SDavid Matlack /* 6266e8eb206SDavid Matlack * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and 6276e8eb206SDavid Matlack * does not hold the mmu_lock. 6286e8eb206SDavid Matlack */ 6293255530aSDavid Matlack old_spte = cmpxchg64(sptep, iter->old_spte, new_spte); 6303255530aSDavid Matlack if (old_spte != iter->old_spte) { 6313255530aSDavid Matlack /* 6323255530aSDavid Matlack * The page table entry was modified by a different logical 6333255530aSDavid Matlack * CPU. Refresh iter->old_spte with the current value so the 6343255530aSDavid Matlack * caller operates on fresh data, e.g. if it retries 6353255530aSDavid Matlack * tdp_mmu_set_spte_atomic(). 
6363255530aSDavid Matlack */ 6373255530aSDavid Matlack iter->old_spte = old_spte; 6383e72c791SDavid Matlack return -EBUSY; 6393255530aSDavid Matlack } 6409a77daacSBen Gardon 64124ae4cfaSBen Gardon __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, 64208889894SSean Christopherson new_spte, iter->level, true); 64324ae4cfaSBen Gardon handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level); 6449a77daacSBen Gardon 6453e72c791SDavid Matlack return 0; 6469a77daacSBen Gardon } 6479a77daacSBen Gardon 6483e72c791SDavid Matlack static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm, 64908f07c80SBen Gardon struct tdp_iter *iter) 65008f07c80SBen Gardon { 6513e72c791SDavid Matlack int ret; 6523e72c791SDavid Matlack 65308f07c80SBen Gardon /* 65408f07c80SBen Gardon * Freeze the SPTE by setting it to a special, 65508f07c80SBen Gardon * non-present value. This will stop other threads from 65608f07c80SBen Gardon * immediately installing a present entry in its place 65708f07c80SBen Gardon * before the TLBs are flushed. 65808f07c80SBen Gardon */ 6593e72c791SDavid Matlack ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE); 6603e72c791SDavid Matlack if (ret) 6613e72c791SDavid Matlack return ret; 66208f07c80SBen Gardon 66308f07c80SBen Gardon kvm_flush_remote_tlbs_with_address(kvm, iter->gfn, 66408f07c80SBen Gardon KVM_PAGES_PER_HPAGE(iter->level)); 66508f07c80SBen Gardon 66608f07c80SBen Gardon /* 66708f07c80SBen Gardon * No other thread can overwrite the removed SPTE as they 66808f07c80SBen Gardon * must either wait on the MMU lock or use 669d9f6e12fSIngo Molnar * tdp_mmu_set_spte_atomic which will not overwrite the 67008f07c80SBen Gardon * special removed SPTE value. No bookkeeping is needed 67108f07c80SBen Gardon * here since the SPTE is going from non-present 67208f07c80SBen Gardon * to non-present. 
67308f07c80SBen Gardon */ 6740e587aa7SSean Christopherson kvm_tdp_mmu_write_spte(iter->sptep, 0); 67508f07c80SBen Gardon 6763e72c791SDavid Matlack return 0; 67708f07c80SBen Gardon } 67808f07c80SBen Gardon 6799a77daacSBen Gardon 6809a77daacSBen Gardon /* 681fe43fa2fSBen Gardon * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping 682626808d1SSean Christopherson * @kvm: KVM instance 683626808d1SSean Christopherson * @as_id: Address space ID, i.e. regular vs. SMM 684626808d1SSean Christopherson * @sptep: Pointer to the SPTE 685626808d1SSean Christopherson * @old_spte: The current value of the SPTE 686626808d1SSean Christopherson * @new_spte: The new value that will be set for the SPTE 687626808d1SSean Christopherson * @gfn: The base GFN that was (or will be) mapped by the SPTE 688626808d1SSean Christopherson * @level: The level _containing_ the SPTE (its parent PT's level) 689fe43fa2fSBen Gardon * @record_acc_track: Notify the MM subsystem of changes to the accessed state 690fe43fa2fSBen Gardon * of the page. Should be set unless handling an MMU 691fe43fa2fSBen Gardon * notifier for access tracking. Leaving record_acc_track 692fe43fa2fSBen Gardon * unset in that case prevents page accesses from being 693fe43fa2fSBen Gardon * double counted. 694fe43fa2fSBen Gardon * @record_dirty_log: Record the page as dirty in the dirty bitmap if 695fe43fa2fSBen Gardon * appropriate for the change being made. Should be set 696fe43fa2fSBen Gardon * unless performing certain dirty logging operations. 697fe43fa2fSBen Gardon * Leaving record_dirty_log unset in that case prevents page 698fe43fa2fSBen Gardon * writes from being double counted. 
699fe43fa2fSBen Gardon */ 700626808d1SSean Christopherson static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep, 701626808d1SSean Christopherson u64 old_spte, u64 new_spte, gfn_t gfn, int level, 702626808d1SSean Christopherson bool record_acc_track, bool record_dirty_log) 703faaf05b0SBen Gardon { 704531810caSBen Gardon lockdep_assert_held_write(&kvm->mmu_lock); 7053a9a4aa5SBen Gardon 70608f07c80SBen Gardon /* 707966da62aSSean Christopherson * No thread should be using this function to set SPTEs to or from the 70808f07c80SBen Gardon * temporary removed SPTE value. 70908f07c80SBen Gardon * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic 71008f07c80SBen Gardon * should be used. If operating under the MMU lock in write mode, the 71108f07c80SBen Gardon * use of the removed SPTE should not be necessary. 71208f07c80SBen Gardon */ 713626808d1SSean Christopherson WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte)); 71408f07c80SBen Gardon 715626808d1SSean Christopherson kvm_tdp_mmu_write_spte(sptep, new_spte); 716faaf05b0SBen Gardon 717626808d1SSean Christopherson __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false); 718626808d1SSean Christopherson 719f8e14497SBen Gardon if (record_acc_track) 720626808d1SSean Christopherson handle_changed_spte_acc_track(old_spte, new_spte, level); 721a6a0b05dSBen Gardon if (record_dirty_log) 722626808d1SSean Christopherson handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte, 723626808d1SSean Christopherson new_spte, level); 724626808d1SSean Christopherson } 725626808d1SSean Christopherson 726626808d1SSean Christopherson static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, 727626808d1SSean Christopherson u64 new_spte, bool record_acc_track, 728626808d1SSean Christopherson bool record_dirty_log) 729626808d1SSean Christopherson { 730626808d1SSean Christopherson WARN_ON_ONCE(iter->yielded); 731626808d1SSean Christopherson 732626808d1SSean 
Christopherson __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, iter->old_spte, 733626808d1SSean Christopherson new_spte, iter->gfn, iter->level, 734626808d1SSean Christopherson record_acc_track, record_dirty_log); 735f8e14497SBen Gardon } 736f8e14497SBen Gardon 737f8e14497SBen Gardon static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, 738f8e14497SBen Gardon u64 new_spte) 739f8e14497SBen Gardon { 740626808d1SSean Christopherson _tdp_mmu_set_spte(kvm, iter, new_spte, true, true); 741f8e14497SBen Gardon } 742f8e14497SBen Gardon 743f8e14497SBen Gardon static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm, 744f8e14497SBen Gardon struct tdp_iter *iter, 745f8e14497SBen Gardon u64 new_spte) 746f8e14497SBen Gardon { 747626808d1SSean Christopherson _tdp_mmu_set_spte(kvm, iter, new_spte, false, true); 748a6a0b05dSBen Gardon } 749a6a0b05dSBen Gardon 750a6a0b05dSBen Gardon static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm, 751a6a0b05dSBen Gardon struct tdp_iter *iter, 752a6a0b05dSBen Gardon u64 new_spte) 753a6a0b05dSBen Gardon { 754626808d1SSean Christopherson _tdp_mmu_set_spte(kvm, iter, new_spte, true, false); 755faaf05b0SBen Gardon } 756faaf05b0SBen Gardon 757faaf05b0SBen Gardon #define tdp_root_for_each_pte(_iter, _root, _start, _end) \ 75877aa6075SDavid Matlack for_each_tdp_pte(_iter, _root, _start, _end) 759faaf05b0SBen Gardon 760f8e14497SBen Gardon #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \ 761f8e14497SBen Gardon tdp_root_for_each_pte(_iter, _root, _start, _end) \ 762f8e14497SBen Gardon if (!is_shadow_present_pte(_iter.old_spte) || \ 763f8e14497SBen Gardon !is_last_spte(_iter.old_spte, _iter.level)) \ 764f8e14497SBen Gardon continue; \ 765f8e14497SBen Gardon else 766f8e14497SBen Gardon 767bb18842eSBen Gardon #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \ 768b9e5603cSPaolo Bonzini for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end) 769bb18842eSBen Gardon 770faaf05b0SBen 
Gardon /* 771e28a436cSBen Gardon * Yield if the MMU lock is contended or this thread needs to return control 772e28a436cSBen Gardon * to the scheduler. 773e28a436cSBen Gardon * 774e139a34eSBen Gardon * If this function should yield and flush is set, it will perform a remote 775e139a34eSBen Gardon * TLB flush before yielding. 776e139a34eSBen Gardon * 7773a0f64deSSean Christopherson * If this function yields, iter->yielded is set and the caller must skip to 7783a0f64deSSean Christopherson * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk 7793a0f64deSSean Christopherson * over the paging structures to allow the iterator to continue its traversal 7803a0f64deSSean Christopherson * from the paging structure root. 781e28a436cSBen Gardon * 7823a0f64deSSean Christopherson * Returns true if this function yielded. 783e28a436cSBen Gardon */ 7843a0f64deSSean Christopherson static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm, 7853a0f64deSSean Christopherson struct tdp_iter *iter, 7863a0f64deSSean Christopherson bool flush, bool shared) 787a6a0b05dSBen Gardon { 7883a0f64deSSean Christopherson WARN_ON(iter->yielded); 7893a0f64deSSean Christopherson 790ed5e484bSBen Gardon /* Ensure forward progress has been made before yielding. 
*/ 791ed5e484bSBen Gardon if (iter->next_last_level_gfn == iter->yielded_gfn) 792ed5e484bSBen Gardon return false; 793ed5e484bSBen Gardon 794531810caSBen Gardon if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { 795e139a34eSBen Gardon if (flush) 796e139a34eSBen Gardon kvm_flush_remote_tlbs(kvm); 797e139a34eSBen Gardon 798bd296779SSean Christopherson rcu_read_unlock(); 799bd296779SSean Christopherson 8006103bc07SBen Gardon if (shared) 8016103bc07SBen Gardon cond_resched_rwlock_read(&kvm->mmu_lock); 8026103bc07SBen Gardon else 803531810caSBen Gardon cond_resched_rwlock_write(&kvm->mmu_lock); 8046103bc07SBen Gardon 8057cca2d0bSBen Gardon rcu_read_lock(); 806ed5e484bSBen Gardon 807ed5e484bSBen Gardon WARN_ON(iter->gfn > iter->next_last_level_gfn); 808ed5e484bSBen Gardon 8093a0f64deSSean Christopherson iter->yielded = true; 810a6a0b05dSBen Gardon } 811e28a436cSBen Gardon 8123a0f64deSSean Christopherson return iter->yielded; 813a6a0b05dSBen Gardon } 814a6a0b05dSBen Gardon 815e2b5b21dSSean Christopherson static inline gfn_t tdp_mmu_max_gfn_host(void) 816e2b5b21dSSean Christopherson { 817e2b5b21dSSean Christopherson /* 818e2b5b21dSSean Christopherson * Bound TDP MMU walks at host.MAXPHYADDR, guest accesses beyond that 819e2b5b21dSSean Christopherson * will hit a #PF(RSVD) and never hit an EPT Violation/Misconfig / #NPF, 820e2b5b21dSSean Christopherson * and so KVM will never install a SPTE for such addresses. 
821e2b5b21dSSean Christopherson */ 822e2b5b21dSSean Christopherson return 1ULL << (shadow_phys_bits - PAGE_SHIFT); 823e2b5b21dSSean Christopherson } 824e2b5b21dSSean Christopherson 825*1b6043e8SSean Christopherson static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root, 826*1b6043e8SSean Christopherson bool shared, int zap_level) 827e2b5b21dSSean Christopherson { 828e2b5b21dSSean Christopherson struct tdp_iter iter; 829e2b5b21dSSean Christopherson 830e2b5b21dSSean Christopherson gfn_t end = tdp_mmu_max_gfn_host(); 831e2b5b21dSSean Christopherson gfn_t start = 0; 832e2b5b21dSSean Christopherson 833*1b6043e8SSean Christopherson for_each_tdp_pte_min_level(iter, root, zap_level, start, end) { 834*1b6043e8SSean Christopherson retry: 835*1b6043e8SSean Christopherson if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared)) 836*1b6043e8SSean Christopherson continue; 837*1b6043e8SSean Christopherson 838*1b6043e8SSean Christopherson if (!is_shadow_present_pte(iter.old_spte)) 839*1b6043e8SSean Christopherson continue; 840*1b6043e8SSean Christopherson 841*1b6043e8SSean Christopherson if (iter.level > zap_level) 842*1b6043e8SSean Christopherson continue; 843*1b6043e8SSean Christopherson 844*1b6043e8SSean Christopherson if (!shared) 845*1b6043e8SSean Christopherson tdp_mmu_set_spte(kvm, &iter, 0); 846*1b6043e8SSean Christopherson else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0)) 847*1b6043e8SSean Christopherson goto retry; 848*1b6043e8SSean Christopherson } 849*1b6043e8SSean Christopherson } 850*1b6043e8SSean Christopherson 851*1b6043e8SSean Christopherson static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root, 852*1b6043e8SSean Christopherson bool shared) 853*1b6043e8SSean Christopherson { 854*1b6043e8SSean Christopherson 8558351779cSPaolo Bonzini /* 8568351779cSPaolo Bonzini * The root must have an elevated refcount so that it's reachable via 8578351779cSPaolo Bonzini * mmu_notifier callbacks, which allows this path to yield and drop 
8588351779cSPaolo Bonzini * mmu_lock. When handling an unmap/release mmu_notifier command, KVM 8598351779cSPaolo Bonzini * must drop all references to relevant pages prior to completing the 8608351779cSPaolo Bonzini * callback. Dropping mmu_lock with an unreachable root would result 8618351779cSPaolo Bonzini * in zapping SPTEs after a relevant mmu_notifier callback completes 8628351779cSPaolo Bonzini * and lead to use-after-free as zapping a SPTE triggers "writeback" of 8638351779cSPaolo Bonzini * dirty accessed bits to the SPTE's associated struct page. 8648351779cSPaolo Bonzini */ 8658351779cSPaolo Bonzini WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count)); 8668351779cSPaolo Bonzini 867e2b5b21dSSean Christopherson kvm_lockdep_assert_mmu_lock_held(kvm, shared); 868e2b5b21dSSean Christopherson 869e2b5b21dSSean Christopherson rcu_read_lock(); 870e2b5b21dSSean Christopherson 871e2b5b21dSSean Christopherson /* 872*1b6043e8SSean Christopherson * To avoid RCU stalls due to recursively removing huge swaths of SPs, 873*1b6043e8SSean Christopherson * split the zap into two passes. On the first pass, zap at the 1gb 874*1b6043e8SSean Christopherson * level, and then zap top-level SPs on the second pass. "1gb" is not 875*1b6043e8SSean Christopherson * arbitrary, as KVM must be able to zap a 1gb shadow page without 876*1b6043e8SSean Christopherson * inducing a stall to allow in-place replacement with a 1gb hugepage. 877*1b6043e8SSean Christopherson * 878*1b6043e8SSean Christopherson * Because zapping a SP recurses on its children, stepping down to 879*1b6043e8SSean Christopherson * PG_LEVEL_4K in the iterator itself is unnecessary. 
880e2b5b21dSSean Christopherson */ 881*1b6043e8SSean Christopherson __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G); 882*1b6043e8SSean Christopherson __tdp_mmu_zap_root(kvm, root, shared, root->role.level); 883e2b5b21dSSean Christopherson 884e2b5b21dSSean Christopherson rcu_read_unlock(); 885e2b5b21dSSean Christopherson } 886e2b5b21dSSean Christopherson 887c10743a1SSean Christopherson bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) 888c10743a1SSean Christopherson { 889c10743a1SSean Christopherson u64 old_spte; 890c10743a1SSean Christopherson 891c10743a1SSean Christopherson /* 892c10743a1SSean Christopherson * This helper intentionally doesn't allow zapping a root shadow page, 893c10743a1SSean Christopherson * which doesn't have a parent page table and thus no associated entry. 894c10743a1SSean Christopherson */ 895c10743a1SSean Christopherson if (WARN_ON_ONCE(!sp->ptep)) 896c10743a1SSean Christopherson return false; 897c10743a1SSean Christopherson 898c10743a1SSean Christopherson old_spte = kvm_tdp_mmu_read_spte(sp->ptep); 899bb95dfb9SSean Christopherson if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte))) 900c10743a1SSean Christopherson return false; 901c10743a1SSean Christopherson 902c10743a1SSean Christopherson __tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0, 903c10743a1SSean Christopherson sp->gfn, sp->role.level + 1, true, true); 904c10743a1SSean Christopherson 905c10743a1SSean Christopherson return true; 906c10743a1SSean Christopherson } 907c10743a1SSean Christopherson 908faaf05b0SBen Gardon /* 909cf3e2642SSean Christopherson * Zap leafs SPTEs for the range of gfns, [start, end). Returns true if SPTEs 910cf3e2642SSean Christopherson * have been cleared and a TLB flush is needed before releasing the MMU lock. 9116103bc07SBen Gardon * 912063afacdSBen Gardon * If can_yield is true, will release the MMU lock and reschedule if the 913063afacdSBen Gardon * scheduler needs the CPU or there is contention on the MMU lock. 
If this 914063afacdSBen Gardon * function cannot yield, it will not release the MMU lock or reschedule and 915063afacdSBen Gardon * the caller must ensure it does not supply too large a GFN range, or the 9166103bc07SBen Gardon * operation can cause a soft lockup. 917faaf05b0SBen Gardon */ 918cf3e2642SSean Christopherson static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root, 919acbda82aSSean Christopherson gfn_t start, gfn_t end, bool can_yield, bool flush) 920faaf05b0SBen Gardon { 921faaf05b0SBen Gardon struct tdp_iter iter; 922faaf05b0SBen Gardon 923e2b5b21dSSean Christopherson end = min(end, tdp_mmu_max_gfn_host()); 924524a1e4eSSean Christopherson 925acbda82aSSean Christopherson lockdep_assert_held_write(&kvm->mmu_lock); 9266103bc07SBen Gardon 9277cca2d0bSBen Gardon rcu_read_lock(); 9287cca2d0bSBen Gardon 929cf3e2642SSean Christopherson for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) { 9301af4a960SBen Gardon if (can_yield && 931acbda82aSSean Christopherson tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) { 932a835429cSSean Christopherson flush = false; 9331af4a960SBen Gardon continue; 9341af4a960SBen Gardon } 9351af4a960SBen Gardon 936cf3e2642SSean Christopherson if (!is_shadow_present_pte(iter.old_spte) || 937faaf05b0SBen Gardon !is_last_spte(iter.old_spte, iter.level)) 938faaf05b0SBen Gardon continue; 939faaf05b0SBen Gardon 940faaf05b0SBen Gardon tdp_mmu_set_spte(kvm, &iter, 0); 941a835429cSSean Christopherson flush = true; 942faaf05b0SBen Gardon } 9437cca2d0bSBen Gardon 9447cca2d0bSBen Gardon rcu_read_unlock(); 945bb95dfb9SSean Christopherson 946bb95dfb9SSean Christopherson /* 947bb95dfb9SSean Christopherson * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need 948bb95dfb9SSean Christopherson * to provide RCU protection as no 'struct kvm_mmu_page' will be freed. 
949bb95dfb9SSean Christopherson */ 950a835429cSSean Christopherson return flush; 951faaf05b0SBen Gardon } 952faaf05b0SBen Gardon 953faaf05b0SBen Gardon /* 954faaf05b0SBen Gardon * Tears down the mappings for the range of gfns, [start, end), and frees the 955faaf05b0SBen Gardon * non-root pages mapping GFNs strictly within that range. Returns true if 956faaf05b0SBen Gardon * SPTEs have been cleared and a TLB flush is needed before releasing the 957faaf05b0SBen Gardon * MMU lock. 958faaf05b0SBen Gardon */ 959cf3e2642SSean Christopherson bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end, 960cf3e2642SSean Christopherson bool can_yield, bool flush) 961faaf05b0SBen Gardon { 962faaf05b0SBen Gardon struct kvm_mmu_page *root; 963faaf05b0SBen Gardon 964614f6970SPaolo Bonzini for_each_tdp_mmu_root_yield_safe(kvm, root, as_id) 965cf3e2642SSean Christopherson flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, false); 966faaf05b0SBen Gardon 967faaf05b0SBen Gardon return flush; 968faaf05b0SBen Gardon } 969faaf05b0SBen Gardon 970faaf05b0SBen Gardon void kvm_tdp_mmu_zap_all(struct kvm *kvm) 971faaf05b0SBen Gardon { 972e2b5b21dSSean Christopherson struct kvm_mmu_page *root; 9732b9663d8SSean Christopherson int i; 974faaf05b0SBen Gardon 97577c8cd6bSSean Christopherson /* 97622b94c4bSPaolo Bonzini * Zap all roots, including invalid roots, as all SPTEs must be dropped 97722b94c4bSPaolo Bonzini * before returning to the caller. Zap directly even if the root is 97822b94c4bSPaolo Bonzini * also being zapped by a worker. Walking zapped top-level SPTEs isn't 97922b94c4bSPaolo Bonzini * all that expensive and mmu_lock is already held, which means the 98022b94c4bSPaolo Bonzini * worker has yielded, i.e. flushing the work instead of zapping here 98122b94c4bSPaolo Bonzini * isn't guaranteed to be any faster. 
98222b94c4bSPaolo Bonzini * 98377c8cd6bSSean Christopherson * A TLB flush is unnecessary, KVM zaps everything if and only the VM 98477c8cd6bSSean Christopherson * is being destroyed or the userspace VMM has exited. In both cases, 98577c8cd6bSSean Christopherson * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request. 98677c8cd6bSSean Christopherson */ 987e2b5b21dSSean Christopherson for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { 988e2b5b21dSSean Christopherson for_each_tdp_mmu_root_yield_safe(kvm, root, i) 989e2b5b21dSSean Christopherson tdp_mmu_zap_root(kvm, root, false); 990e2b5b21dSSean Christopherson } 991faaf05b0SBen Gardon } 992bb18842eSBen Gardon 9934c6654bdSBen Gardon /* 994f28e9c7fSSean Christopherson * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast 99522b94c4bSPaolo Bonzini * zap" completes. 9964c6654bdSBen Gardon */ 9974c6654bdSBen Gardon void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) 9984c6654bdSBen Gardon { 99922b94c4bSPaolo Bonzini flush_workqueue(kvm->arch.tdp_mmu_zap_wq); 10004c6654bdSBen Gardon } 10014c6654bdSBen Gardon 1002bb18842eSBen Gardon /* 1003f28e9c7fSSean Christopherson * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that 100422b94c4bSPaolo Bonzini * is about to be zapped, e.g. in response to a memslots update. The actual 100522b94c4bSPaolo Bonzini * zapping is performed asynchronously, so a reference is taken on all roots. 100622b94c4bSPaolo Bonzini * Using a separate workqueue makes it easy to ensure that the destruction is 100722b94c4bSPaolo Bonzini * performed before the "fast zap" completes, without keeping a separate list 100822b94c4bSPaolo Bonzini * of invalidated roots; the list is effectively the list of work items in 100922b94c4bSPaolo Bonzini * the workqueue. 
1010b7cccd39SBen Gardon * 101122b94c4bSPaolo Bonzini * Get a reference even if the root is already invalid, the asynchronous worker 101222b94c4bSPaolo Bonzini * assumes it was gifted a reference to the root it processes. Because mmu_lock 101322b94c4bSPaolo Bonzini * is held for write, it should be impossible to observe a root with zero refcount, 101422b94c4bSPaolo Bonzini * i.e. the list of roots cannot be stale. 10154c6654bdSBen Gardon * 1016b7cccd39SBen Gardon * This has essentially the same effect for the TDP MMU 1017b7cccd39SBen Gardon * as updating mmu_valid_gen does for the shadow MMU. 1018b7cccd39SBen Gardon */ 1019b7cccd39SBen Gardon void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm) 1020b7cccd39SBen Gardon { 1021b7cccd39SBen Gardon struct kvm_mmu_page *root; 1022b7cccd39SBen Gardon 1023b7cccd39SBen Gardon lockdep_assert_held_write(&kvm->mmu_lock); 1024f28e9c7fSSean Christopherson list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) { 102522b94c4bSPaolo Bonzini if (!WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) { 1026b7cccd39SBen Gardon root->role.invalid = true; 102722b94c4bSPaolo Bonzini tdp_mmu_schedule_zap_root(kvm, root); 102822b94c4bSPaolo Bonzini } 1029b7cccd39SBen Gardon } 1030f28e9c7fSSean Christopherson } 1031b7cccd39SBen Gardon 1032bb18842eSBen Gardon /* 1033bb18842eSBen Gardon * Installs a last-level SPTE to handle a TDP page fault. 
1034bb18842eSBen Gardon * (NPT/EPT violation/misconfiguration) 1035bb18842eSBen Gardon */ 1036cdc47767SPaolo Bonzini static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, 1037cdc47767SPaolo Bonzini struct kvm_page_fault *fault, 1038cdc47767SPaolo Bonzini struct tdp_iter *iter) 1039bb18842eSBen Gardon { 1040c435d4b7SSean Christopherson struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep)); 1041bb18842eSBen Gardon u64 new_spte; 104257a3e96dSKai Huang int ret = RET_PF_FIXED; 1043ad67e480SPaolo Bonzini bool wrprot = false; 1044bb18842eSBen Gardon 10457158bee4SPaolo Bonzini WARN_ON(sp->role.level != fault->goal_level); 1046e710c5f6SDavid Matlack if (unlikely(!fault->slot)) 1047bb18842eSBen Gardon new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); 10489a77daacSBen Gardon else 104953597858SDavid Matlack wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn, 10502839180cSPaolo Bonzini fault->pfn, iter->old_spte, fault->prefetch, true, 10517158bee4SPaolo Bonzini fault->map_writable, &new_spte); 1052bb18842eSBen Gardon 1053bb18842eSBen Gardon if (new_spte == iter->old_spte) 1054bb18842eSBen Gardon ret = RET_PF_SPURIOUS; 10553e72c791SDavid Matlack else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte)) 10569a77daacSBen Gardon return RET_PF_RETRY; 1057bb95dfb9SSean Christopherson else if (is_shadow_present_pte(iter->old_spte) && 1058bb95dfb9SSean Christopherson !is_last_spte(iter->old_spte, iter->level)) 1059bb95dfb9SSean Christopherson kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, 1060bb95dfb9SSean Christopherson KVM_PAGES_PER_HPAGE(iter->level + 1)); 1061bb18842eSBen Gardon 1062bb18842eSBen Gardon /* 1063bb18842eSBen Gardon * If the page fault was caused by a write but the page is write 1064bb18842eSBen Gardon * protected, emulation is needed. If the emulation was skipped, 1065bb18842eSBen Gardon * the vCPU would have the same fault again. 
1066bb18842eSBen Gardon */ 1067ad67e480SPaolo Bonzini if (wrprot) { 1068cdc47767SPaolo Bonzini if (fault->write) 1069bb18842eSBen Gardon ret = RET_PF_EMULATE; 1070bb18842eSBen Gardon } 1071bb18842eSBen Gardon 1072bb18842eSBen Gardon /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */ 10739a77daacSBen Gardon if (unlikely(is_mmio_spte(new_spte))) { 10749a77daacSBen Gardon trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn, 10759a77daacSBen Gardon new_spte); 1076bb18842eSBen Gardon ret = RET_PF_EMULATE; 10773849e092SSean Christopherson } else { 10789a77daacSBen Gardon trace_kvm_mmu_set_spte(iter->level, iter->gfn, 10799a77daacSBen Gardon rcu_dereference(iter->sptep)); 10803849e092SSean Christopherson } 1081bb18842eSBen Gardon 1082857f8474SKai Huang /* 1083857f8474SKai Huang * Increase pf_fixed in both RET_PF_EMULATE and RET_PF_FIXED to be 1084857f8474SKai Huang * consistent with legacy MMU behavior. 1085857f8474SKai Huang */ 1086857f8474SKai Huang if (ret != RET_PF_SPURIOUS) 1087bb18842eSBen Gardon vcpu->stat.pf_fixed++; 1088bb18842eSBen Gardon 1089bb18842eSBen Gardon return ret; 1090bb18842eSBen Gardon } 1091bb18842eSBen Gardon 1092bb18842eSBen Gardon /* 1093cb00a70bSDavid Matlack * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the 1094cb00a70bSDavid Matlack * provided page table. 10957b7e1ab6SDavid Matlack * 10967b7e1ab6SDavid Matlack * @kvm: kvm instance 10977b7e1ab6SDavid Matlack * @iter: a tdp_iter instance currently on the SPTE that should be set 10987b7e1ab6SDavid Matlack * @sp: The new TDP page table to install. 10997b7e1ab6SDavid Matlack * @account_nx: True if this page table is being installed to split a 11007b7e1ab6SDavid Matlack * non-executable huge page. 1101cb00a70bSDavid Matlack * @shared: This operation is running under the MMU lock in read mode. 11027b7e1ab6SDavid Matlack * 11037b7e1ab6SDavid Matlack * Returns: 0 if the new page table was installed. 
Non-0 if the page table 11047b7e1ab6SDavid Matlack * could not be installed (e.g. the atomic compare-exchange failed). 11057b7e1ab6SDavid Matlack */ 1106cb00a70bSDavid Matlack static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter, 1107cb00a70bSDavid Matlack struct kvm_mmu_page *sp, bool account_nx, 1108cb00a70bSDavid Matlack bool shared) 11097b7e1ab6SDavid Matlack { 11107b7e1ab6SDavid Matlack u64 spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask); 1111cb00a70bSDavid Matlack int ret = 0; 11127b7e1ab6SDavid Matlack 1113cb00a70bSDavid Matlack if (shared) { 11147b7e1ab6SDavid Matlack ret = tdp_mmu_set_spte_atomic(kvm, iter, spte); 11157b7e1ab6SDavid Matlack if (ret) 11167b7e1ab6SDavid Matlack return ret; 1117cb00a70bSDavid Matlack } else { 1118cb00a70bSDavid Matlack tdp_mmu_set_spte(kvm, iter, spte); 1119cb00a70bSDavid Matlack } 11207b7e1ab6SDavid Matlack 11217b7e1ab6SDavid Matlack spin_lock(&kvm->arch.tdp_mmu_pages_lock); 11227b7e1ab6SDavid Matlack list_add(&sp->link, &kvm->arch.tdp_mmu_pages); 11237b7e1ab6SDavid Matlack if (account_nx) 11247b7e1ab6SDavid Matlack account_huge_nx_page(kvm, sp); 11257b7e1ab6SDavid Matlack spin_unlock(&kvm->arch.tdp_mmu_pages_lock); 11267b7e1ab6SDavid Matlack 11277b7e1ab6SDavid Matlack return 0; 11287b7e1ab6SDavid Matlack } 11297b7e1ab6SDavid Matlack 11307b7e1ab6SDavid Matlack /* 1131bb18842eSBen Gardon * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing 1132bb18842eSBen Gardon * page tables and SPTEs to translate the faulting guest physical address. 
1133bb18842eSBen Gardon */ 11342f6305ddSPaolo Bonzini int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) 1135bb18842eSBen Gardon { 1136bb18842eSBen Gardon struct kvm_mmu *mmu = vcpu->arch.mmu; 1137bb18842eSBen Gardon struct tdp_iter iter; 113889c0fd49SBen Gardon struct kvm_mmu_page *sp; 1139bb18842eSBen Gardon int ret; 1140bb18842eSBen Gardon 114173a3c659SPaolo Bonzini kvm_mmu_hugepage_adjust(vcpu, fault); 1142bb18842eSBen Gardon 1143f0066d94SPaolo Bonzini trace_kvm_mmu_spte_requested(fault); 11447cca2d0bSBen Gardon 11457cca2d0bSBen Gardon rcu_read_lock(); 11467cca2d0bSBen Gardon 11472f6305ddSPaolo Bonzini tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) { 114873a3c659SPaolo Bonzini if (fault->nx_huge_page_workaround_enabled) 1149536f0e6aSPaolo Bonzini disallowed_hugepage_adjust(fault, iter.old_spte, iter.level); 1150bb18842eSBen Gardon 115173a3c659SPaolo Bonzini if (iter.level == fault->goal_level) 1152bb18842eSBen Gardon break; 1153bb18842eSBen Gardon 1154bb18842eSBen Gardon /* 1155bb18842eSBen Gardon * If there is an SPTE mapping a large page at a higher level 1156bb18842eSBen Gardon * than the target, that SPTE must be cleared and replaced 1157bb18842eSBen Gardon * with a non-leaf SPTE. 1158bb18842eSBen Gardon */ 1159bb18842eSBen Gardon if (is_shadow_present_pte(iter.old_spte) && 1160bb18842eSBen Gardon is_large_pte(iter.old_spte)) { 11613e72c791SDavid Matlack if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter)) 11629a77daacSBen Gardon break; 1163bb18842eSBen Gardon 1164bb18842eSBen Gardon /* 1165bb18842eSBen Gardon * The iter must explicitly re-read the spte here 1166bb18842eSBen Gardon * because the new value informs the !present 1167bb18842eSBen Gardon * path below. 
1168bb18842eSBen Gardon */ 11690e587aa7SSean Christopherson iter.old_spte = kvm_tdp_mmu_read_spte(iter.sptep); 1170bb18842eSBen Gardon } 1171bb18842eSBen Gardon 1172bb18842eSBen Gardon if (!is_shadow_present_pte(iter.old_spte)) { 11737b7e1ab6SDavid Matlack bool account_nx = fault->huge_page_disallowed && 11747b7e1ab6SDavid Matlack fault->req_level >= iter.level; 11757b7e1ab6SDavid Matlack 1176ff76d506SKai Huang /* 1177c4342633SIngo Molnar * If SPTE has been frozen by another thread, just 1178ff76d506SKai Huang * give up and retry, avoiding unnecessary page table 1179ff76d506SKai Huang * allocation and free. 1180ff76d506SKai Huang */ 1181ff76d506SKai Huang if (is_removed_spte(iter.old_spte)) 1182ff76d506SKai Huang break; 1183ff76d506SKai Huang 1184a82070b6SDavid Matlack sp = tdp_mmu_alloc_sp(vcpu); 1185a82070b6SDavid Matlack tdp_mmu_init_child_sp(sp, &iter); 1186a82070b6SDavid Matlack 1187cb00a70bSDavid Matlack if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) { 11889a77daacSBen Gardon tdp_mmu_free_sp(sp); 11899a77daacSBen Gardon break; 11909a77daacSBen Gardon } 1191bb18842eSBen Gardon } 1192bb18842eSBen Gardon } 1193bb18842eSBen Gardon 119473a3c659SPaolo Bonzini if (iter.level != fault->goal_level) { 11957cca2d0bSBen Gardon rcu_read_unlock(); 1196bb18842eSBen Gardon return RET_PF_RETRY; 11977cca2d0bSBen Gardon } 1198bb18842eSBen Gardon 1199cdc47767SPaolo Bonzini ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter); 12007cca2d0bSBen Gardon rcu_read_unlock(); 1201bb18842eSBen Gardon 1202bb18842eSBen Gardon return ret; 1203bb18842eSBen Gardon } 1204063afacdSBen Gardon 12053039bcc7SSean Christopherson bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, 12063039bcc7SSean Christopherson bool flush) 1207063afacdSBen Gardon { 1208cf3e2642SSean Christopherson return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start, 120983b83a02SSean Christopherson range->end, range->may_block, flush); 12103039bcc7SSean Christopherson } 
12113039bcc7SSean Christopherson 12123039bcc7SSean Christopherson typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter, 12133039bcc7SSean Christopherson struct kvm_gfn_range *range); 12143039bcc7SSean Christopherson 12153039bcc7SSean Christopherson static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm, 12163039bcc7SSean Christopherson struct kvm_gfn_range *range, 1217c1b91493SSean Christopherson tdp_handler_t handler) 1218063afacdSBen Gardon { 1219063afacdSBen Gardon struct kvm_mmu_page *root; 12203039bcc7SSean Christopherson struct tdp_iter iter; 12213039bcc7SSean Christopherson bool ret = false; 1222063afacdSBen Gardon 1223063afacdSBen Gardon /* 1224e1eed584SSean Christopherson * Don't support rescheduling, none of the MMU notifiers that funnel 1225e1eed584SSean Christopherson * into this helper allow blocking; it'd be dead, wasteful code. 1226063afacdSBen Gardon */ 12273039bcc7SSean Christopherson for_each_tdp_mmu_root(kvm, root, range->slot->as_id) { 1228a151acecSSean Christopherson rcu_read_lock(); 1229a151acecSSean Christopherson 12303039bcc7SSean Christopherson tdp_root_for_each_leaf_pte(iter, root, range->start, range->end) 12313039bcc7SSean Christopherson ret |= handler(kvm, &iter, range); 1232063afacdSBen Gardon 12333039bcc7SSean Christopherson rcu_read_unlock(); 1234a151acecSSean Christopherson } 1235063afacdSBen Gardon 1236063afacdSBen Gardon return ret; 1237063afacdSBen Gardon } 1238063afacdSBen Gardon 1239f8e14497SBen Gardon /* 1240f8e14497SBen Gardon * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero 1241f8e14497SBen Gardon * if any of the GFNs in the range have been accessed. 
1242f8e14497SBen Gardon */ 12433039bcc7SSean Christopherson static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter, 12443039bcc7SSean Christopherson struct kvm_gfn_range *range) 1245f8e14497SBen Gardon { 1246f8e14497SBen Gardon u64 new_spte = 0; 1247f8e14497SBen Gardon 12483039bcc7SSean Christopherson /* If we have a non-accessed entry we don't need to change the pte. */ 12493039bcc7SSean Christopherson if (!is_accessed_spte(iter->old_spte)) 12503039bcc7SSean Christopherson return false; 12517cca2d0bSBen Gardon 12523039bcc7SSean Christopherson new_spte = iter->old_spte; 1253f8e14497SBen Gardon 1254f8e14497SBen Gardon if (spte_ad_enabled(new_spte)) { 12558f8f52a4SSean Christopherson new_spte &= ~shadow_accessed_mask; 1256f8e14497SBen Gardon } else { 1257f8e14497SBen Gardon /* 1258f8e14497SBen Gardon * Capture the dirty status of the page, so that it doesn't get 1259f8e14497SBen Gardon * lost when the SPTE is marked for access tracking. 1260f8e14497SBen Gardon */ 1261f8e14497SBen Gardon if (is_writable_pte(new_spte)) 1262f8e14497SBen Gardon kvm_set_pfn_dirty(spte_to_pfn(new_spte)); 1263f8e14497SBen Gardon 1264f8e14497SBen Gardon new_spte = mark_spte_for_access_track(new_spte); 1265f8e14497SBen Gardon } 1266f8e14497SBen Gardon 12673039bcc7SSean Christopherson tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte); 126833dd3574SBen Gardon 12693039bcc7SSean Christopherson return true; 1270f8e14497SBen Gardon } 1271f8e14497SBen Gardon 12723039bcc7SSean Christopherson bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) 1273f8e14497SBen Gardon { 12743039bcc7SSean Christopherson return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range); 1275f8e14497SBen Gardon } 1276f8e14497SBen Gardon 12773039bcc7SSean Christopherson static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter, 12783039bcc7SSean Christopherson struct kvm_gfn_range *range) 1279f8e14497SBen Gardon { 12803039bcc7SSean Christopherson return is_accessed_spte(iter->old_spte); 
1281f8e14497SBen Gardon } 1282f8e14497SBen Gardon 12833039bcc7SSean Christopherson bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) 1284f8e14497SBen Gardon { 12853039bcc7SSean Christopherson return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn); 12863039bcc7SSean Christopherson } 12873039bcc7SSean Christopherson 12883039bcc7SSean Christopherson static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter, 12893039bcc7SSean Christopherson struct kvm_gfn_range *range) 12903039bcc7SSean Christopherson { 12913039bcc7SSean Christopherson u64 new_spte; 12923039bcc7SSean Christopherson 12933039bcc7SSean Christopherson /* Huge pages aren't expected to be modified without first being zapped. */ 12943039bcc7SSean Christopherson WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end); 12953039bcc7SSean Christopherson 12963039bcc7SSean Christopherson if (iter->level != PG_LEVEL_4K || 12973039bcc7SSean Christopherson !is_shadow_present_pte(iter->old_spte)) 12983039bcc7SSean Christopherson return false; 12993039bcc7SSean Christopherson 13003039bcc7SSean Christopherson /* 13013039bcc7SSean Christopherson * Note, when changing a read-only SPTE, it's not strictly necessary to 13023039bcc7SSean Christopherson * zero the SPTE before setting the new PFN, but doing so preserves the 13033039bcc7SSean Christopherson * invariant that the PFN of a present * leaf SPTE can never change. 13043039bcc7SSean Christopherson * See __handle_changed_spte(). 
13053039bcc7SSean Christopherson */ 13063039bcc7SSean Christopherson tdp_mmu_set_spte(kvm, iter, 0); 13073039bcc7SSean Christopherson 13083039bcc7SSean Christopherson if (!pte_write(range->pte)) { 13093039bcc7SSean Christopherson new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte, 13103039bcc7SSean Christopherson pte_pfn(range->pte)); 13113039bcc7SSean Christopherson 13123039bcc7SSean Christopherson tdp_mmu_set_spte(kvm, iter, new_spte); 13133039bcc7SSean Christopherson } 13143039bcc7SSean Christopherson 13153039bcc7SSean Christopherson return true; 1316f8e14497SBen Gardon } 13171d8dd6b3SBen Gardon 13181d8dd6b3SBen Gardon /* 13191d8dd6b3SBen Gardon * Handle the changed_pte MMU notifier for the TDP MMU. 13201d8dd6b3SBen Gardon * data is a pointer to the new pte_t mapping the HVA specified by the MMU 13211d8dd6b3SBen Gardon * notifier. 13221d8dd6b3SBen Gardon * Returns non-zero if a flush is needed before releasing the MMU lock. 13231d8dd6b3SBen Gardon */ 13243039bcc7SSean Christopherson bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) 13251d8dd6b3SBen Gardon { 132693fa50f6SSean Christopherson /* 132793fa50f6SSean Christopherson * No need to handle the remote TLB flush under RCU protection, the 132893fa50f6SSean Christopherson * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a 132993fa50f6SSean Christopherson * shadow page. See the WARN on pfn_changed in __handle_changed_spte(). 133093fa50f6SSean Christopherson */ 133193fa50f6SSean Christopherson return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn); 13321d8dd6b3SBen Gardon } 13331d8dd6b3SBen Gardon 1334a6a0b05dSBen Gardon /* 1335bedd9195SDavid Matlack * Remove write access from all SPTEs at or above min_level that map GFNs 1336bedd9195SDavid Matlack * [start, end). Returns true if an SPTE has been changed and the TLBs need to 1337bedd9195SDavid Matlack * be flushed. 
1338a6a0b05dSBen Gardon */ 1339a6a0b05dSBen Gardon static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, 1340a6a0b05dSBen Gardon gfn_t start, gfn_t end, int min_level) 1341a6a0b05dSBen Gardon { 1342a6a0b05dSBen Gardon struct tdp_iter iter; 1343a6a0b05dSBen Gardon u64 new_spte; 1344a6a0b05dSBen Gardon bool spte_set = false; 1345a6a0b05dSBen Gardon 13467cca2d0bSBen Gardon rcu_read_lock(); 13477cca2d0bSBen Gardon 1348a6a0b05dSBen Gardon BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); 1349a6a0b05dSBen Gardon 135077aa6075SDavid Matlack for_each_tdp_pte_min_level(iter, root, min_level, start, end) { 135124ae4cfaSBen Gardon retry: 135224ae4cfaSBen Gardon if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) 13531af4a960SBen Gardon continue; 13541af4a960SBen Gardon 1355a6a0b05dSBen Gardon if (!is_shadow_present_pte(iter.old_spte) || 13560f99ee2cSBen Gardon !is_last_spte(iter.old_spte, iter.level) || 13570f99ee2cSBen Gardon !(iter.old_spte & PT_WRITABLE_MASK)) 1358a6a0b05dSBen Gardon continue; 1359a6a0b05dSBen Gardon 1360a6a0b05dSBen Gardon new_spte = iter.old_spte & ~PT_WRITABLE_MASK; 1361a6a0b05dSBen Gardon 13623e72c791SDavid Matlack if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) 136324ae4cfaSBen Gardon goto retry; 13643255530aSDavid Matlack 1365a6a0b05dSBen Gardon spte_set = true; 1366a6a0b05dSBen Gardon } 13677cca2d0bSBen Gardon 13687cca2d0bSBen Gardon rcu_read_unlock(); 1369a6a0b05dSBen Gardon return spte_set; 1370a6a0b05dSBen Gardon } 1371a6a0b05dSBen Gardon 1372a6a0b05dSBen Gardon /* 1373a6a0b05dSBen Gardon * Remove write access from all the SPTEs mapping GFNs in the memslot. Will 1374a6a0b05dSBen Gardon * only affect leaf SPTEs down to min_level. 1375a6a0b05dSBen Gardon * Returns true if an SPTE has been changed and the TLBs need to be flushed. 
1376a6a0b05dSBen Gardon */ 1377269e9552SHamza Mahfooz bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, 1378269e9552SHamza Mahfooz const struct kvm_memory_slot *slot, int min_level) 1379a6a0b05dSBen Gardon { 1380a6a0b05dSBen Gardon struct kvm_mmu_page *root; 1381a6a0b05dSBen Gardon bool spte_set = false; 1382a6a0b05dSBen Gardon 138324ae4cfaSBen Gardon lockdep_assert_held_read(&kvm->mmu_lock); 1384a6a0b05dSBen Gardon 1385d62007edSSean Christopherson for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) 1386a6a0b05dSBen Gardon spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn, 1387a6a0b05dSBen Gardon slot->base_gfn + slot->npages, min_level); 1388a6a0b05dSBen Gardon 1389a6a0b05dSBen Gardon return spte_set; 1390a6a0b05dSBen Gardon } 1391a6a0b05dSBen Gardon 1392a3fe5dbdSDavid Matlack static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp) 1393a3fe5dbdSDavid Matlack { 1394a3fe5dbdSDavid Matlack struct kvm_mmu_page *sp; 1395a3fe5dbdSDavid Matlack 1396a3fe5dbdSDavid Matlack gfp |= __GFP_ZERO; 1397a3fe5dbdSDavid Matlack 1398a3fe5dbdSDavid Matlack sp = kmem_cache_alloc(mmu_page_header_cache, gfp); 1399a3fe5dbdSDavid Matlack if (!sp) 1400a3fe5dbdSDavid Matlack return NULL; 1401a3fe5dbdSDavid Matlack 1402a3fe5dbdSDavid Matlack sp->spt = (void *)__get_free_page(gfp); 1403a3fe5dbdSDavid Matlack if (!sp->spt) { 1404a3fe5dbdSDavid Matlack kmem_cache_free(mmu_page_header_cache, sp); 1405a3fe5dbdSDavid Matlack return NULL; 1406a3fe5dbdSDavid Matlack } 1407a3fe5dbdSDavid Matlack 1408a3fe5dbdSDavid Matlack return sp; 1409a3fe5dbdSDavid Matlack } 1410a3fe5dbdSDavid Matlack 1411a3fe5dbdSDavid Matlack static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm, 1412cb00a70bSDavid Matlack struct tdp_iter *iter, 1413cb00a70bSDavid Matlack bool shared) 1414a3fe5dbdSDavid Matlack { 1415a3fe5dbdSDavid Matlack struct kvm_mmu_page *sp; 1416a3fe5dbdSDavid Matlack 1417a3fe5dbdSDavid Matlack /* 1418a3fe5dbdSDavid Matlack * Since we are allocating while under 
the MMU lock we have to be 1419a3fe5dbdSDavid Matlack * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct 1420a3fe5dbdSDavid Matlack * reclaim and to avoid making any filesystem callbacks (which can end 1421a3fe5dbdSDavid Matlack * up invoking KVM MMU notifiers, resulting in a deadlock). 1422a3fe5dbdSDavid Matlack * 1423a3fe5dbdSDavid Matlack * If this allocation fails we drop the lock and retry with reclaim 1424a3fe5dbdSDavid Matlack * allowed. 1425a3fe5dbdSDavid Matlack */ 1426a3fe5dbdSDavid Matlack sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT); 1427a3fe5dbdSDavid Matlack if (sp) 1428a3fe5dbdSDavid Matlack return sp; 1429a3fe5dbdSDavid Matlack 1430a3fe5dbdSDavid Matlack rcu_read_unlock(); 1431cb00a70bSDavid Matlack 1432cb00a70bSDavid Matlack if (shared) 1433a3fe5dbdSDavid Matlack read_unlock(&kvm->mmu_lock); 1434cb00a70bSDavid Matlack else 1435cb00a70bSDavid Matlack write_unlock(&kvm->mmu_lock); 1436a3fe5dbdSDavid Matlack 1437a3fe5dbdSDavid Matlack iter->yielded = true; 1438a3fe5dbdSDavid Matlack sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT); 1439a3fe5dbdSDavid Matlack 1440cb00a70bSDavid Matlack if (shared) 1441a3fe5dbdSDavid Matlack read_lock(&kvm->mmu_lock); 1442cb00a70bSDavid Matlack else 1443cb00a70bSDavid Matlack write_lock(&kvm->mmu_lock); 1444cb00a70bSDavid Matlack 1445a3fe5dbdSDavid Matlack rcu_read_lock(); 1446a3fe5dbdSDavid Matlack 1447a3fe5dbdSDavid Matlack return sp; 1448a3fe5dbdSDavid Matlack } 1449a3fe5dbdSDavid Matlack 1450cb00a70bSDavid Matlack static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, 1451cb00a70bSDavid Matlack struct kvm_mmu_page *sp, bool shared) 1452a3fe5dbdSDavid Matlack { 1453a3fe5dbdSDavid Matlack const u64 huge_spte = iter->old_spte; 1454a3fe5dbdSDavid Matlack const int level = iter->level; 1455a3fe5dbdSDavid Matlack int ret, i; 1456a3fe5dbdSDavid Matlack 1457a3fe5dbdSDavid Matlack tdp_mmu_init_child_sp(sp, iter); 1458a3fe5dbdSDavid Matlack 1459a3fe5dbdSDavid 
Matlack /* 1460a3fe5dbdSDavid Matlack * No need for atomics when writing to sp->spt since the page table has 1461a3fe5dbdSDavid Matlack * not been linked in yet and thus is not reachable from any other CPU. 1462a3fe5dbdSDavid Matlack */ 1463a3fe5dbdSDavid Matlack for (i = 0; i < PT64_ENT_PER_PAGE; i++) 1464a3fe5dbdSDavid Matlack sp->spt[i] = make_huge_page_split_spte(huge_spte, level, i); 1465a3fe5dbdSDavid Matlack 1466a3fe5dbdSDavid Matlack /* 1467a3fe5dbdSDavid Matlack * Replace the huge spte with a pointer to the populated lower level 1468a3fe5dbdSDavid Matlack * page table. Since we are making this change without a TLB flush vCPUs 1469a3fe5dbdSDavid Matlack * will see a mix of the split mappings and the original huge mapping, 1470a3fe5dbdSDavid Matlack * depending on what's currently in their TLB. This is fine from a 1471a3fe5dbdSDavid Matlack * correctness standpoint since the translation will be the same either 1472a3fe5dbdSDavid Matlack * way. 1473a3fe5dbdSDavid Matlack */ 1474cb00a70bSDavid Matlack ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared); 1475a3fe5dbdSDavid Matlack if (ret) 1476e0b728b1SDavid Matlack goto out; 1477a3fe5dbdSDavid Matlack 1478a3fe5dbdSDavid Matlack /* 1479a3fe5dbdSDavid Matlack * tdp_mmu_link_sp_atomic() will handle subtracting the huge page we 1480a3fe5dbdSDavid Matlack * are overwriting from the page stats. But we have to manually update 1481a3fe5dbdSDavid Matlack * the page stats with the new present child pages. 
1482a3fe5dbdSDavid Matlack */ 1483a3fe5dbdSDavid Matlack kvm_update_page_stats(kvm, level - 1, PT64_ENT_PER_PAGE); 1484a3fe5dbdSDavid Matlack 1485e0b728b1SDavid Matlack out: 1486e0b728b1SDavid Matlack trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret); 1487e0b728b1SDavid Matlack return ret; 1488a3fe5dbdSDavid Matlack } 1489a3fe5dbdSDavid Matlack 1490a3fe5dbdSDavid Matlack static int tdp_mmu_split_huge_pages_root(struct kvm *kvm, 1491a3fe5dbdSDavid Matlack struct kvm_mmu_page *root, 1492a3fe5dbdSDavid Matlack gfn_t start, gfn_t end, 1493cb00a70bSDavid Matlack int target_level, bool shared) 1494a3fe5dbdSDavid Matlack { 1495a3fe5dbdSDavid Matlack struct kvm_mmu_page *sp = NULL; 1496a3fe5dbdSDavid Matlack struct tdp_iter iter; 1497a3fe5dbdSDavid Matlack int ret = 0; 1498a3fe5dbdSDavid Matlack 1499a3fe5dbdSDavid Matlack rcu_read_lock(); 1500a3fe5dbdSDavid Matlack 1501a3fe5dbdSDavid Matlack /* 1502a3fe5dbdSDavid Matlack * Traverse the page table splitting all huge pages above the target 1503a3fe5dbdSDavid Matlack * level into one lower level. For example, if we encounter a 1GB page 1504a3fe5dbdSDavid Matlack * we split it into 512 2MB pages. 1505a3fe5dbdSDavid Matlack * 1506a3fe5dbdSDavid Matlack * Since the TDP iterator uses a pre-order traversal, we are guaranteed 1507a3fe5dbdSDavid Matlack * to visit an SPTE before ever visiting its children, which means we 1508a3fe5dbdSDavid Matlack * will correctly recursively split huge pages that are more than one 1509a3fe5dbdSDavid Matlack * level above the target level (e.g. splitting a 1GB to 512 2MB pages, 1510a3fe5dbdSDavid Matlack * and then splitting each of those to 512 4KB pages). 
1511a3fe5dbdSDavid Matlack */ 1512a3fe5dbdSDavid Matlack for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) { 1513a3fe5dbdSDavid Matlack retry: 1514cb00a70bSDavid Matlack if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared)) 1515a3fe5dbdSDavid Matlack continue; 1516a3fe5dbdSDavid Matlack 1517a3fe5dbdSDavid Matlack if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte)) 1518a3fe5dbdSDavid Matlack continue; 1519a3fe5dbdSDavid Matlack 1520a3fe5dbdSDavid Matlack if (!sp) { 1521cb00a70bSDavid Matlack sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared); 1522a3fe5dbdSDavid Matlack if (!sp) { 1523a3fe5dbdSDavid Matlack ret = -ENOMEM; 1524e0b728b1SDavid Matlack trace_kvm_mmu_split_huge_page(iter.gfn, 1525e0b728b1SDavid Matlack iter.old_spte, 1526e0b728b1SDavid Matlack iter.level, ret); 1527a3fe5dbdSDavid Matlack break; 1528a3fe5dbdSDavid Matlack } 1529a3fe5dbdSDavid Matlack 1530a3fe5dbdSDavid Matlack if (iter.yielded) 1531a3fe5dbdSDavid Matlack continue; 1532a3fe5dbdSDavid Matlack } 1533a3fe5dbdSDavid Matlack 1534cb00a70bSDavid Matlack if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared)) 1535a3fe5dbdSDavid Matlack goto retry; 1536a3fe5dbdSDavid Matlack 1537a3fe5dbdSDavid Matlack sp = NULL; 1538a3fe5dbdSDavid Matlack } 1539a3fe5dbdSDavid Matlack 1540a3fe5dbdSDavid Matlack rcu_read_unlock(); 1541a3fe5dbdSDavid Matlack 1542a3fe5dbdSDavid Matlack /* 1543a3fe5dbdSDavid Matlack * It's possible to exit the loop having never used the last sp if, for 1544a3fe5dbdSDavid Matlack * example, a vCPU doing HugePage NX splitting wins the race and 1545a3fe5dbdSDavid Matlack * installs its own sp in place of the last sp we tried to split. 
1546a3fe5dbdSDavid Matlack */ 1547a3fe5dbdSDavid Matlack if (sp) 1548a3fe5dbdSDavid Matlack tdp_mmu_free_sp(sp); 1549a3fe5dbdSDavid Matlack 1550a3fe5dbdSDavid Matlack return ret; 1551a3fe5dbdSDavid Matlack } 1552a3fe5dbdSDavid Matlack 1553cb00a70bSDavid Matlack 1554a3fe5dbdSDavid Matlack /* 1555a3fe5dbdSDavid Matlack * Try to split all huge pages mapped by the TDP MMU down to the target level. 1556a3fe5dbdSDavid Matlack */ 1557a3fe5dbdSDavid Matlack void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm, 1558a3fe5dbdSDavid Matlack const struct kvm_memory_slot *slot, 1559a3fe5dbdSDavid Matlack gfn_t start, gfn_t end, 1560cb00a70bSDavid Matlack int target_level, bool shared) 1561a3fe5dbdSDavid Matlack { 1562a3fe5dbdSDavid Matlack struct kvm_mmu_page *root; 1563a3fe5dbdSDavid Matlack int r = 0; 1564a3fe5dbdSDavid Matlack 1565cb00a70bSDavid Matlack kvm_lockdep_assert_mmu_lock_held(kvm, shared); 1566a3fe5dbdSDavid Matlack 15677c554d8eSPaolo Bonzini for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) { 1568cb00a70bSDavid Matlack r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared); 1569a3fe5dbdSDavid Matlack if (r) { 1570cb00a70bSDavid Matlack kvm_tdp_mmu_put_root(kvm, root, shared); 1571a3fe5dbdSDavid Matlack break; 1572a3fe5dbdSDavid Matlack } 1573a3fe5dbdSDavid Matlack } 1574a3fe5dbdSDavid Matlack } 1575a3fe5dbdSDavid Matlack 1576a6a0b05dSBen Gardon /* 1577a6a0b05dSBen Gardon * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If 1578a6a0b05dSBen Gardon * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. 1579a6a0b05dSBen Gardon * If AD bits are not enabled, this will require clearing the writable bit on 1580a6a0b05dSBen Gardon * each SPTE. Returns true if an SPTE has been changed and the TLBs need to 1581a6a0b05dSBen Gardon * be flushed. 
1582a6a0b05dSBen Gardon */ 1583a6a0b05dSBen Gardon static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, 1584a6a0b05dSBen Gardon gfn_t start, gfn_t end) 1585a6a0b05dSBen Gardon { 1586a6a0b05dSBen Gardon struct tdp_iter iter; 1587a6a0b05dSBen Gardon u64 new_spte; 1588a6a0b05dSBen Gardon bool spte_set = false; 1589a6a0b05dSBen Gardon 15907cca2d0bSBen Gardon rcu_read_lock(); 15917cca2d0bSBen Gardon 1592a6a0b05dSBen Gardon tdp_root_for_each_leaf_pte(iter, root, start, end) { 159324ae4cfaSBen Gardon retry: 159424ae4cfaSBen Gardon if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) 15951af4a960SBen Gardon continue; 15961af4a960SBen Gardon 15973354ef5aSSean Christopherson if (!is_shadow_present_pte(iter.old_spte)) 15983354ef5aSSean Christopherson continue; 15993354ef5aSSean Christopherson 1600a6a0b05dSBen Gardon if (spte_ad_need_write_protect(iter.old_spte)) { 1601a6a0b05dSBen Gardon if (is_writable_pte(iter.old_spte)) 1602a6a0b05dSBen Gardon new_spte = iter.old_spte & ~PT_WRITABLE_MASK; 1603a6a0b05dSBen Gardon else 1604a6a0b05dSBen Gardon continue; 1605a6a0b05dSBen Gardon } else { 1606a6a0b05dSBen Gardon if (iter.old_spte & shadow_dirty_mask) 1607a6a0b05dSBen Gardon new_spte = iter.old_spte & ~shadow_dirty_mask; 1608a6a0b05dSBen Gardon else 1609a6a0b05dSBen Gardon continue; 1610a6a0b05dSBen Gardon } 1611a6a0b05dSBen Gardon 16123e72c791SDavid Matlack if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) 161324ae4cfaSBen Gardon goto retry; 16143255530aSDavid Matlack 1615a6a0b05dSBen Gardon spte_set = true; 1616a6a0b05dSBen Gardon } 16177cca2d0bSBen Gardon 16187cca2d0bSBen Gardon rcu_read_unlock(); 1619a6a0b05dSBen Gardon return spte_set; 1620a6a0b05dSBen Gardon } 1621a6a0b05dSBen Gardon 1622a6a0b05dSBen Gardon /* 1623a6a0b05dSBen Gardon * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If 1624a6a0b05dSBen Gardon * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. 
1625a6a0b05dSBen Gardon * If AD bits are not enabled, this will require clearing the writable bit on 1626a6a0b05dSBen Gardon * each SPTE. Returns true if an SPTE has been changed and the TLBs need to 1627a6a0b05dSBen Gardon * be flushed. 1628a6a0b05dSBen Gardon */ 1629269e9552SHamza Mahfooz bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, 1630269e9552SHamza Mahfooz const struct kvm_memory_slot *slot) 1631a6a0b05dSBen Gardon { 1632a6a0b05dSBen Gardon struct kvm_mmu_page *root; 1633a6a0b05dSBen Gardon bool spte_set = false; 1634a6a0b05dSBen Gardon 163524ae4cfaSBen Gardon lockdep_assert_held_read(&kvm->mmu_lock); 1636a6a0b05dSBen Gardon 1637d62007edSSean Christopherson for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) 1638a6a0b05dSBen Gardon spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn, 1639a6a0b05dSBen Gardon slot->base_gfn + slot->npages); 1640a6a0b05dSBen Gardon 1641a6a0b05dSBen Gardon return spte_set; 1642a6a0b05dSBen Gardon } 1643a6a0b05dSBen Gardon 1644a6a0b05dSBen Gardon /* 1645a6a0b05dSBen Gardon * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is 1646a6a0b05dSBen Gardon * set in mask, starting at gfn. The given memslot is expected to contain all 1647a6a0b05dSBen Gardon * the GFNs represented by set bits in the mask. If AD bits are enabled, 1648a6a0b05dSBen Gardon * clearing the dirty status will involve clearing the dirty bit on each SPTE 1649a6a0b05dSBen Gardon * or, if AD bits are not enabled, clearing the writable bit on each SPTE. 
1650a6a0b05dSBen Gardon */ 1651a6a0b05dSBen Gardon static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, 1652a6a0b05dSBen Gardon gfn_t gfn, unsigned long mask, bool wrprot) 1653a6a0b05dSBen Gardon { 1654a6a0b05dSBen Gardon struct tdp_iter iter; 1655a6a0b05dSBen Gardon u64 new_spte; 1656a6a0b05dSBen Gardon 16577cca2d0bSBen Gardon rcu_read_lock(); 16587cca2d0bSBen Gardon 1659a6a0b05dSBen Gardon tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask), 1660a6a0b05dSBen Gardon gfn + BITS_PER_LONG) { 1661a6a0b05dSBen Gardon if (!mask) 1662a6a0b05dSBen Gardon break; 1663a6a0b05dSBen Gardon 1664a6a0b05dSBen Gardon if (iter.level > PG_LEVEL_4K || 1665a6a0b05dSBen Gardon !(mask & (1UL << (iter.gfn - gfn)))) 1666a6a0b05dSBen Gardon continue; 1667a6a0b05dSBen Gardon 1668f1b3b06aSBen Gardon mask &= ~(1UL << (iter.gfn - gfn)); 1669f1b3b06aSBen Gardon 1670a6a0b05dSBen Gardon if (wrprot || spte_ad_need_write_protect(iter.old_spte)) { 1671a6a0b05dSBen Gardon if (is_writable_pte(iter.old_spte)) 1672a6a0b05dSBen Gardon new_spte = iter.old_spte & ~PT_WRITABLE_MASK; 1673a6a0b05dSBen Gardon else 1674a6a0b05dSBen Gardon continue; 1675a6a0b05dSBen Gardon } else { 1676a6a0b05dSBen Gardon if (iter.old_spte & shadow_dirty_mask) 1677a6a0b05dSBen Gardon new_spte = iter.old_spte & ~shadow_dirty_mask; 1678a6a0b05dSBen Gardon else 1679a6a0b05dSBen Gardon continue; 1680a6a0b05dSBen Gardon } 1681a6a0b05dSBen Gardon 1682a6a0b05dSBen Gardon tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); 1683a6a0b05dSBen Gardon } 16847cca2d0bSBen Gardon 16857cca2d0bSBen Gardon rcu_read_unlock(); 1686a6a0b05dSBen Gardon } 1687a6a0b05dSBen Gardon 1688a6a0b05dSBen Gardon /* 1689a6a0b05dSBen Gardon * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is 1690a6a0b05dSBen Gardon * set in mask, starting at gfn. The given memslot is expected to contain all 1691a6a0b05dSBen Gardon * the GFNs represented by set bits in the mask. 
If AD bits are enabled, 1692a6a0b05dSBen Gardon * clearing the dirty status will involve clearing the dirty bit on each SPTE 1693a6a0b05dSBen Gardon * or, if AD bits are not enabled, clearing the writable bit on each SPTE. 1694a6a0b05dSBen Gardon */ 1695a6a0b05dSBen Gardon void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, 1696a6a0b05dSBen Gardon struct kvm_memory_slot *slot, 1697a6a0b05dSBen Gardon gfn_t gfn, unsigned long mask, 1698a6a0b05dSBen Gardon bool wrprot) 1699a6a0b05dSBen Gardon { 1700a6a0b05dSBen Gardon struct kvm_mmu_page *root; 1701a6a0b05dSBen Gardon 1702531810caSBen Gardon lockdep_assert_held_write(&kvm->mmu_lock); 1703a3f15bdaSSean Christopherson for_each_tdp_mmu_root(kvm, root, slot->as_id) 1704a6a0b05dSBen Gardon clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot); 1705a6a0b05dSBen Gardon } 1706a6a0b05dSBen Gardon 1707a6a0b05dSBen Gardon /* 170887aa9ec9SBen Gardon * Clear leaf entries which could be replaced by large mappings, for 170987aa9ec9SBen Gardon * GFNs within the slot. 
171014881998SBen Gardon */ 17114b85c921SSean Christopherson static void zap_collapsible_spte_range(struct kvm *kvm, 171214881998SBen Gardon struct kvm_mmu_page *root, 17134b85c921SSean Christopherson const struct kvm_memory_slot *slot) 171414881998SBen Gardon { 17159eba50f8SSean Christopherson gfn_t start = slot->base_gfn; 17169eba50f8SSean Christopherson gfn_t end = start + slot->npages; 171714881998SBen Gardon struct tdp_iter iter; 171814881998SBen Gardon kvm_pfn_t pfn; 171914881998SBen Gardon 17207cca2d0bSBen Gardon rcu_read_lock(); 17217cca2d0bSBen Gardon 172214881998SBen Gardon tdp_root_for_each_pte(iter, root, start, end) { 17232db6f772SBen Gardon retry: 17244b85c921SSean Christopherson if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) 17251af4a960SBen Gardon continue; 17261af4a960SBen Gardon 172714881998SBen Gardon if (!is_shadow_present_pte(iter.old_spte) || 172887aa9ec9SBen Gardon !is_last_spte(iter.old_spte, iter.level)) 172914881998SBen Gardon continue; 173014881998SBen Gardon 173114881998SBen Gardon pfn = spte_to_pfn(iter.old_spte); 173214881998SBen Gardon if (kvm_is_reserved_pfn(pfn) || 17339eba50f8SSean Christopherson iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn, 17349eba50f8SSean Christopherson pfn, PG_LEVEL_NUM)) 173514881998SBen Gardon continue; 173614881998SBen Gardon 17374b85c921SSean Christopherson /* Note, a successful atomic zap also does a remote TLB flush. */ 17383e72c791SDavid Matlack if (tdp_mmu_zap_spte_atomic(kvm, &iter)) 17392db6f772SBen Gardon goto retry; 17402db6f772SBen Gardon } 174114881998SBen Gardon 17427cca2d0bSBen Gardon rcu_read_unlock(); 174314881998SBen Gardon } 174414881998SBen Gardon 174514881998SBen Gardon /* 174614881998SBen Gardon * Clear non-leaf entries (and free associated page tables) which could 174714881998SBen Gardon * be replaced by large mappings, for GFNs within the slot. 
174814881998SBen Gardon */ 17494b85c921SSean Christopherson void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, 17504b85c921SSean Christopherson const struct kvm_memory_slot *slot) 175114881998SBen Gardon { 175214881998SBen Gardon struct kvm_mmu_page *root; 175314881998SBen Gardon 17542db6f772SBen Gardon lockdep_assert_held_read(&kvm->mmu_lock); 175514881998SBen Gardon 1756d62007edSSean Christopherson for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) 17574b85c921SSean Christopherson zap_collapsible_spte_range(kvm, root, slot); 175814881998SBen Gardon } 175946044f72SBen Gardon 176046044f72SBen Gardon /* 176146044f72SBen Gardon * Removes write access on the last level SPTE mapping this GFN and unsets the 17625fc3424fSSean Christopherson * MMU-writable bit to ensure future writes continue to be intercepted. 176346044f72SBen Gardon * Returns true if an SPTE was set and a TLB flush is needed. 176446044f72SBen Gardon */ 176546044f72SBen Gardon static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, 17663ad93562SKeqian Zhu gfn_t gfn, int min_level) 176746044f72SBen Gardon { 176846044f72SBen Gardon struct tdp_iter iter; 176946044f72SBen Gardon u64 new_spte; 177046044f72SBen Gardon bool spte_set = false; 177146044f72SBen Gardon 17723ad93562SKeqian Zhu BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); 17733ad93562SKeqian Zhu 17747cca2d0bSBen Gardon rcu_read_lock(); 17757cca2d0bSBen Gardon 177677aa6075SDavid Matlack for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) { 17773ad93562SKeqian Zhu if (!is_shadow_present_pte(iter.old_spte) || 17783ad93562SKeqian Zhu !is_last_spte(iter.old_spte, iter.level)) 17793ad93562SKeqian Zhu continue; 17803ad93562SKeqian Zhu 178146044f72SBen Gardon new_spte = iter.old_spte & 17825fc3424fSSean Christopherson ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask); 178346044f72SBen Gardon 17847c8a4742SDavid Matlack if (new_spte == iter.old_spte) 17857c8a4742SDavid Matlack break; 17867c8a4742SDavid Matlack 
178746044f72SBen Gardon tdp_mmu_set_spte(kvm, &iter, new_spte); 178846044f72SBen Gardon spte_set = true; 178946044f72SBen Gardon } 179046044f72SBen Gardon 17917cca2d0bSBen Gardon rcu_read_unlock(); 17927cca2d0bSBen Gardon 179346044f72SBen Gardon return spte_set; 179446044f72SBen Gardon } 179546044f72SBen Gardon 179646044f72SBen Gardon /* 179746044f72SBen Gardon * Removes write access on the last level SPTE mapping this GFN and unsets the 17985fc3424fSSean Christopherson * MMU-writable bit to ensure future writes continue to be intercepted. 179946044f72SBen Gardon * Returns true if an SPTE was set and a TLB flush is needed. 180046044f72SBen Gardon */ 180146044f72SBen Gardon bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, 18023ad93562SKeqian Zhu struct kvm_memory_slot *slot, gfn_t gfn, 18033ad93562SKeqian Zhu int min_level) 180446044f72SBen Gardon { 180546044f72SBen Gardon struct kvm_mmu_page *root; 180646044f72SBen Gardon bool spte_set = false; 180746044f72SBen Gardon 1808531810caSBen Gardon lockdep_assert_held_write(&kvm->mmu_lock); 1809a3f15bdaSSean Christopherson for_each_tdp_mmu_root(kvm, root, slot->as_id) 18103ad93562SKeqian Zhu spte_set |= write_protect_gfn(kvm, root, gfn, min_level); 1811a3f15bdaSSean Christopherson 181246044f72SBen Gardon return spte_set; 181346044f72SBen Gardon } 181446044f72SBen Gardon 181595fb5b02SBen Gardon /* 181695fb5b02SBen Gardon * Return the level of the lowest level SPTE added to sptes. 181795fb5b02SBen Gardon * That SPTE may be non-present. 1818c5c8c7c5SDavid Matlack * 1819c5c8c7c5SDavid Matlack * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. 
182095fb5b02SBen Gardon */ 182139b4d43eSSean Christopherson int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, 182239b4d43eSSean Christopherson int *root_level) 182395fb5b02SBen Gardon { 182495fb5b02SBen Gardon struct tdp_iter iter; 182595fb5b02SBen Gardon struct kvm_mmu *mmu = vcpu->arch.mmu; 182695fb5b02SBen Gardon gfn_t gfn = addr >> PAGE_SHIFT; 18272aa07893SSean Christopherson int leaf = -1; 182895fb5b02SBen Gardon 182939b4d43eSSean Christopherson *root_level = vcpu->arch.mmu->shadow_root_level; 183095fb5b02SBen Gardon 183195fb5b02SBen Gardon tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { 183295fb5b02SBen Gardon leaf = iter.level; 1833dde81f94SSean Christopherson sptes[leaf] = iter.old_spte; 183495fb5b02SBen Gardon } 183595fb5b02SBen Gardon 183695fb5b02SBen Gardon return leaf; 183795fb5b02SBen Gardon } 18386e8eb206SDavid Matlack 18396e8eb206SDavid Matlack /* 18406e8eb206SDavid Matlack * Returns the last level spte pointer of the shadow page walk for the given 18416e8eb206SDavid Matlack * gpa, and sets *spte to the spte value. This spte may be non-preset. If no 18426e8eb206SDavid Matlack * walk could be performed, returns NULL and *spte does not contain valid data. 18436e8eb206SDavid Matlack * 18446e8eb206SDavid Matlack * Contract: 18456e8eb206SDavid Matlack * - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. 18466e8eb206SDavid Matlack * - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end. 18476e8eb206SDavid Matlack * 18486e8eb206SDavid Matlack * WARNING: This function is only intended to be called during fast_page_fault. 
*/
u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
					u64 *spte)
{
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	gfn_t target_gfn = addr >> PAGE_SHIFT;
	tdp_ptep_t last_sptep = NULL;
	struct tdp_iter it;

	/* Each step overwrites the previous one; the final entry visited wins. */
	tdp_mmu_for_each_pte(it, mmu, target_gfn, target_gfn + 1) {
		last_sptep = it.sptep;
		*spte = it.old_spte;
	}

	/*
	 * The pointer is handed up to fast_page_fault, which is shared with
	 * the legacy MMU and therefore does not carry the TDP MMU-specific
	 * __rcu annotation, so do the rcu_dereference here to obtain the raw
	 * spte pointer value.
	 *
	 * This is safe since fast_page_fault obeys the contracts of this
	 * function as well as all TDP MMU contracts around modifying SPTEs
	 * outside of mmu_lock.
	 */
	return rcu_dereference(last_sptep);
}