// SPDX-License-Identifier: GPL-2.0

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>

/*
 * Global opt-out knob for the TDP MMU.  Writable at runtime, but each VM
 * samples it exactly once at creation (kvm_mmu_init_tdp_mmu()), so flipping
 * it never affects an existing VM.
 */
static bool __read_mostly tdp_mmu_enabled = true;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);

/* Initializes the TDP MMU for the VM, if enabled. */
bool kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
		return false;

	/* This should not be changed for the lifetime of the VM. */
	kvm->arch.tdp_mmu_enabled = true;

	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
	kvm->arch.tdp_mmu_zap_wq =
		alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);

	return true;
}

/* Arbitrarily returns true so that this may be used in if statements. */
static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
							     bool shared)
{
	if (shared)
		lockdep_assert_held_read(&kvm->mmu_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	return true;
}

void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
	if (!kvm->arch.tdp_mmu_enabled)
		return;

	/*
	 * Drain in-flight asynchronous root zaps before destroying the
	 * workqueue; the work items reference this VM's kvm struct.
	 */
	flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
	destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);

	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

	/*
	 * Ensure that all the outstanding RCU callbacks to free shadow pages
	 * can run before the VM is torn down.  Work items on tdp_mmu_zap_wq
	 * can call kvm_tdp_mmu_put_root and create new callbacks.
	 */
	rcu_barrier();
}

/* Free a shadow page: its backing page-table page, then its header. */
static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
	free_page((unsigned long)sp->spt);
	kmem_cache_free(mmu_page_header_cache, sp);
}

/*
 * This is called through call_rcu in order to free TDP page table memory
 * safely with respect to other kernel threads that may be operating on
 * the memory.
 * By only accessing TDP MMU page table memory in an RCU read critical
 * section, and freeing it after a grace period, lockless access to that
 * memory won't use it after it is freed.
 */
static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
{
	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
					       rcu_head);

	tdp_mmu_free_sp(sp);
}

static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			     bool shared);

/* Workqueue callback: zap an invalidated root and drop its gifted ref. */
static void tdp_mmu_zap_root_work(struct work_struct *work)
{
	struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page,
						 tdp_mmu_async_work);
	struct kvm *kvm = root->tdp_mmu_async_data;

	read_lock(&kvm->mmu_lock);

	/*
	 * A TLB flush is not necessary as KVM performs a local TLB flush when
	 * allocating a new root (see kvm_mmu_load()), and when migrating vCPU
	 * to a different pCPU.  Note, the local TLB flush on reuse also
	 * invalidates any paging-structure-cache entries, i.e. TLB entries for
	 * intermediate paging structures, that may be zapped, as such entries
	 * are associated with the ASID on both VMX and SVM.
	 */
	tdp_mmu_zap_root(kvm, root, true);

	/*
	 * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for
	 * avoiding an infinite loop.  By design, the root is reachable while
	 * it's being asynchronously zapped, thus a different task can put its
	 * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an
	 * asynchronously zapped root is unavoidable.
	 */
	kvm_tdp_mmu_put_root(kvm, root, true);

	read_unlock(&kvm->mmu_lock);
}

/* Hand the root (and the caller's reference to it) off to the zap worker. */
static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
	root->tdp_mmu_async_data = kvm;
	INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work);
	queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
}

/*
 * Atomically set role.invalid; returns the PREVIOUS invalid state so the
 * caller can detect the valid->invalid transition exactly once.
 */
static inline bool kvm_tdp_root_mark_invalid(struct kvm_mmu_page *page)
{
	union kvm_mmu_page_role role = page->role;
	role.invalid = true;

	/* No need to use cmpxchg, only the invalid bit can change. */
	role.word = xchg(&page->role.word, role.word);
	return role.invalid;
}

void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
			  bool shared)
{
	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
		return;

	WARN_ON(!root->tdp_mmu_page);

	/*
	 * The root now has refcount=0.  It is valid, but readers already
	 * cannot acquire a reference to it because kvm_tdp_mmu_get_root()
	 * rejects it.  This remains true for the rest of the execution
	 * of this function, because readers visit valid roots only
	 * (except for tdp_mmu_zap_root_work(), which however
	 * does not acquire any reference itself).
	 *
	 * Even though there are flows that need to visit all roots for
	 * correctness, they all take mmu_lock for write, so they cannot yet
	 * run concurrently.  The same is true after kvm_tdp_root_mark_invalid,
	 * since the root still has refcount=0.
	 *
	 * However, tdp_mmu_zap_root can yield, and writers do not expect to
	 * see refcount=0 (see for example kvm_tdp_mmu_invalidate_all_roots()).
	 * So the root temporarily gets an extra reference, going to refcount=1
	 * while staying invalid.  Readers still cannot acquire any reference;
	 * but writers are now allowed to run if tdp_mmu_zap_root yields and
	 * they might take an extra reference if they themselves yield.
	 * Therefore, when the reference is given back by the worker,
	 * there is no guarantee that the refcount is still 1.  If not, whoever
	 * puts the last reference will free the page, but they will not have to
	 * zap the root because a root cannot go from invalid to valid.
	 */
	if (!kvm_tdp_root_mark_invalid(root)) {
		refcount_set(&root->tdp_mmu_root_count, 1);

		/*
		 * Zapping the root in a worker is not just "nice to have";
		 * it is required because kvm_tdp_mmu_invalidate_all_roots()
		 * skips already-invalid roots.  If kvm_tdp_mmu_put_root() did
		 * not add the root to the workqueue, kvm_tdp_mmu_zap_all_fast()
		 * might return with some roots not zapped yet.
		 */
		tdp_mmu_schedule_zap_root(kvm, root);
		return;
	}

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_del_rcu(&root->link);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/*
 * Returns the next root after @prev_root (or the first root if @prev_root is
 * NULL).  A reference to the returned root is acquired, and the reference to
 * @prev_root is released (the caller obviously must hold a reference to
 * @prev_root if it's non-NULL).
 *
 * If @only_valid is true, invalid roots are skipped.
 *
 * Returns NULL if the end of tdp_mmu_roots was reached.
 */
static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
					      struct kvm_mmu_page *prev_root,
					      bool shared, bool only_valid)
{
	struct kvm_mmu_page *next_root;

	/* RCU protects the list walk; the list is modified via list_*_rcu. */
	rcu_read_lock();

	if (prev_root)
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						  &prev_root->link,
						  typeof(*prev_root), link);
	else
		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						   typeof(*next_root), link);

	/*
	 * Skip roots that can't be referenced: invalid roots (when asked to)
	 * and roots whose refcount has already hit zero.
	 */
	while (next_root) {
		if ((!only_valid || !next_root->role.invalid) &&
		    kvm_tdp_mmu_get_root(next_root))
			break;

		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
				&next_root->link, typeof(*next_root), link);
	}

	rcu_read_unlock();

	/* Drop the caller's reference only after the next root is pinned. */
	if (prev_root)
		kvm_tdp_mmu_put_root(kvm, prev_root, shared);

	return next_root;
}

/*
 * Note: this iterator gets and puts references to the roots it iterates over.
 * This makes it safe to release the MMU lock and yield within the loop, but
 * if exiting the loop early, the caller must drop the reference to the most
 * recent root. (Unless keeping a live reference is desirable.)
 *
 * If shared is set, this function is operating under the MMU lock in read
 * mode. In the unlikely event that this thread must free a root, the lock
 * will be temporarily dropped and reacquired in write mode.
 */
#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid);	\
	     _root;								\
	     _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid))	\
		if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) &&		\
		    kvm_mmu_page_as_id(_root) != _as_id) {			\
		} else

#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)

#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)			\
	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false)

/*
 * Iterate over all TDP MMU roots.  Requires that mmu_lock be held for write,
 * the implication being that any flow that holds mmu_lock for read is
 * inherently yield-friendly and should use the yield-safe variant above.
 * Holding mmu_lock for write obviates the need for RCU protection as the list
 * is guaranteed to be stable.
 */
#define for_each_tdp_mmu_root(_kvm, _root, _as_id)			\
	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)	\
		if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&	\
		    kvm_mmu_page_as_id(_root) != _as_id) {		\
		} else

/* Allocate a shadow page header + its page-table page from vCPU caches. */
static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_page *sp;

	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);

	return sp;
}

/* Initialize a freshly allocated shadow page and publish the spt->sp link. */
static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
			    gfn_t gfn, union kvm_mmu_page_role role)
{
	/* Allow sptep_to_sp() to recover the header from the table page. */
	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

	sp->role = role;
	sp->gfn = gfn;
	sp->ptep = sptep;
	sp->tdp_mmu_page = true;

	trace_kvm_mmu_get_page(sp, true);
}

/* Initialize a child shadow page: inherit the parent's role, one level down. */
static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
				  struct tdp_iter *iter)
{
	struct kvm_mmu_page *parent_sp;
	union kvm_mmu_page_role role;

	parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));

	role = parent_sp->role;
	role.level--;

	tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
}

hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
	union kvm_mmu_page_role role = vcpu->arch.mmu->mmu_role.base;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * Check for an existing root before allocating a new one.  Note, the
	 * role check prevents consuming an invalid root.
	 */
	for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
		if (root->role.word == role.word &&
		    kvm_tdp_mmu_get_root(root))
			goto out;
	}

	root = tdp_mmu_alloc_sp(vcpu);
	tdp_mmu_init_sp(root, NULL, 0, role);

	refcount_set(&root->tdp_mmu_root_count, 1);

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

out:
	return __pa(root->spt);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared);

/*
 * Propagate the accessed bit to the primary MM when a leaf SPTE loses its
 * accessed state (or its backing pfn) as a result of an SPTE change.
 */
static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
{
	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
		return;

	if (is_accessed_spte(old_spte) &&
	    (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
	     spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}

/*
 * Mark the gfn dirty in its memslot when a 4K SPTE becomes newly writable
 * (or changes pfn while writable) as a result of an SPTE change.
 */
static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
					  u64 old_spte, u64 new_spte, int level)
{
	bool pfn_changed;
	struct kvm_memory_slot *slot;

	if (level > PG_LEVEL_4K)
		return;

	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	if ((!is_writable_pte(old_spte) || pfn_changed) &&
	    is_writable_pte(new_spte)) {
		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
		mark_page_dirty_in_slot(kvm, slot, gfn);
	}
}

/**
 * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
 *
 * @kvm: kvm instance
 * @sp: the page to be removed
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be adding or removing pages.
 */
static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
			      bool shared)
{
	/* With mmu_lock held for read, the pages list needs its own lock. */
	if (shared)
		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	list_del(&sp->link);
	if (sp->lpage_disallowed)
		unaccount_huge_nx_page(kvm, sp);

	if (shared)
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

/**
 * handle_removed_pt() - handle a page table removed from the TDP structure
 *
 * @kvm: kvm instance
 * @pt: the page removed from the paging structure
 * @shared: This operation may not be running under the exclusive use
 *	    of the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Given a page table that has been removed from the TDP paging structure,
 * iterates through the page table to clear SPTEs and free child page tables.
 *
 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
 * protection. Since this thread removed it from the paging structure,
 * this thread will be responsible for ensuring the page is freed. Hence the
 * early rcu_dereferences in the function.
 */
static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
	int level = sp->role.level;
	gfn_t base_gfn = sp->gfn;
	int i;

	trace_kvm_mmu_prepare_zap_page(sp);

	tdp_mmu_unlink_sp(kvm, sp, shared);

	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
		u64 *sptep = rcu_dereference(pt) + i;
		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
		u64 old_child_spte;

		if (shared) {
			/*
			 * Set the SPTE to a nonpresent value that other
			 * threads will not overwrite. If the SPTE was
			 * already marked as removed then another thread
			 * handling a page fault could overwrite it, so
			 * set the SPTE until it is set from some other
			 * value to the removed SPTE value.
			 */
			for (;;) {
				old_child_spte = xchg(sptep, REMOVED_SPTE);
				if (!is_removed_spte(old_child_spte))
					break;
				cpu_relax();
			}
		} else {
			/*
			 * If the SPTE is not MMU-present, there is no backing
			 * page associated with the SPTE and so no side effects
			 * that need to be recorded, and exclusive ownership of
			 * mmu_lock ensures the SPTE can't be made present.
			 * Note, zapping MMIO SPTEs is also unnecessary as they
			 * are guarded by the memslots generation, not by being
			 * unreachable.
			 */
			old_child_spte = READ_ONCE(*sptep);
			if (!is_shadow_present_pte(old_child_spte))
				continue;

			/*
			 * Marking the SPTE as a removed SPTE is not
			 * strictly necessary here as the MMU lock will
			 * stop other threads from concurrently modifying
			 * this SPTE. Using the removed SPTE value keeps
			 * the two branches consistent and simplifies
			 * the function.
			 */
			WRITE_ONCE(*sptep, REMOVED_SPTE);
		}
		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
				    old_child_spte, REMOVED_SPTE, level,
				    shared);
	}

	/* Free the table page only after a grace period; see callback. */
	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/**
 * __handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Handle bookkeeping that might result from the modification of a SPTE.
 * This function must be called for all TDP SPTE modifications.
 */
static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				  u64 old_spte, u64 new_spte, int level,
				  bool shared)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);
	bool was_leaf = was_present && is_last_spte(old_spte, level);
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
	WARN_ON(level < PG_LEVEL_4K);
	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));

	/*
	 * If this warning were to trigger it would indicate that there was a
	 * missing MMU notifier or a race with some notifier handler.
	 * A present, leaf SPTE should never be directly replaced with another
	 * present leaf SPTE pointing to a different PFN. A notifier handler
	 * should be zapping the SPTE before the main MM's page table is
	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
	 * thread before replacement.
	 */
	if (was_leaf && is_leaf && pfn_changed) {
		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
		       "SPTE with another present leaf SPTE mapping a\n"
		       "different PFN!\n"
		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
		       as_id, gfn, old_spte, new_spte, level);

		/*
		 * Crash the host to prevent error propagation and guest data
		 * corruption.
		 */
		BUG();
	}

	if (old_spte == new_spte)
		return;

	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);

	if (is_leaf)
		check_spte_writable_invariants(new_spte);

	/*
	 * The only times a SPTE should be changed from a non-present to
	 * non-present state is when an MMIO entry is installed/modified/
	 * removed. In that case, there is nothing to do here.
	 */
	if (!was_present && !is_present) {
		/*
		 * If this change does not involve a MMIO SPTE or removed SPTE,
		 * it is unexpected. Log the change, though it should not
		 * impact the guest since both the former and current SPTEs
		 * are nonpresent.
		 */
		if (WARN_ON(!is_mmio_spte(old_spte) &&
			    !is_mmio_spte(new_spte) &&
			    !is_removed_spte(new_spte)))
			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
			       "should not be replaced with another,\n"
			       "different nonpresent SPTE, unless one or both\n"
			       "are MMIO SPTEs, or the new SPTE is\n"
			       "a temporary removed SPTE.\n"
			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
			       as_id, gfn, old_spte, new_spte, level);
		return;
	}

	if (is_leaf != was_leaf)
		kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);

	if (was_leaf && is_dirty_spte(old_spte) &&
	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));

	/*
	 * Recursively handle child PTs if the change removed a subtree from
	 * the paging structure. Note the WARN on the PFN changing without the
	 * SPTE being converted to a hugepage (leaf) or being zapped. Shadow
	 * pages are kernel allocations and should never be migrated.
	 */
	if (was_present && !was_leaf &&
	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
}

/* Full bookkeeping wrapper: core handling plus acc-track and dirty log. */
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared)
{
	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
			      shared);
	handle_changed_spte_acc_track(old_spte, new_spte, level);
	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
				      new_spte, level);
}

/*
 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
 * and handle the associated bookkeeping. Do not mark the page dirty
 * in KVM's dirty bitmaps.
 *
 * If setting the SPTE fails because it has changed, iter->old_spte will be
 * refreshed to the current value of the spte.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * Return:
 * * 0 - If the SPTE was set.
 * * -EBUSY - If the SPTE cannot be set. In this case this function will have
 *            no side-effects other than setting iter->old_spte to the last
 *            known value of the spte.
 */
static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
					  struct tdp_iter *iter,
					  u64 new_spte)
{
	u64 *sptep = rcu_dereference(iter->sptep);
	u64 old_spte;

	/*
	 * The caller is responsible for ensuring the old SPTE is not a REMOVED
	 * SPTE.  KVM should never attempt to zap or manipulate a REMOVED SPTE,
	 * and pre-checking before inserting a new SPTE is advantageous as it
	 * avoids unnecessary work.
	 */
	WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));

	lockdep_assert_held_read(&kvm->mmu_lock);

	/*
	 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
	 * does not hold the mmu_lock.
	 */
	old_spte = cmpxchg64(sptep, iter->old_spte, new_spte);
	if (old_spte != iter->old_spte) {
		/*
		 * The page table entry was modified by a different logical
		 * CPU.  Refresh iter->old_spte with the current value so the
		 * caller operates on fresh data, e.g. if it retries
		 * tdp_mmu_set_spte_atomic().
		 */
		iter->old_spte = old_spte;
		return -EBUSY;
	}

	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
			      new_spte, iter->level, true);
	handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);

	return 0;
}

/*
 * Atomically zap an SPTE: freeze it with REMOVED_SPTE, flush TLBs, then
 * clear it.  Returns 0 on success, -EBUSY if the SPTE changed underneath.
 */
static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
					  struct tdp_iter *iter)
{
	int ret;

	/*
	 * Freeze the SPTE by setting it to a special,
	 * non-present value. This will stop other threads from
	 * immediately installing a present entry in its place
	 * before the TLBs are flushed.
	 */
	ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
	if (ret)
		return ret;

	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
					   KVM_PAGES_PER_HPAGE(iter->level));

	/*
	 * No other thread can overwrite the removed SPTE as they
	 * must either wait on the MMU lock or use
	 * tdp_mmu_set_spte_atomic which will not overwrite the
	 * special removed SPTE value. No bookkeeping is needed
	 * here since the SPTE is going from non-present
	 * to non-present.
	 */
	kvm_tdp_mmu_write_spte(iter->sptep, 0);

	return 0;
}


/*
 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
 * @kvm: KVM instance
 * @as_id: Address space ID, i.e. regular vs.
 *	   SMM
 * @sptep: Pointer to the SPTE
 * @old_spte: The current value of the SPTE
 * @new_spte: The new value that will be set for the SPTE
 * @gfn: The base GFN that was (or will be) mapped by the SPTE
 * @level: The level _containing_ the SPTE (its parent PT's level)
 * @record_acc_track: Notify the MM subsystem of changes to the accessed state
 *		      of the page. Should be set unless handling an MMU
 *		      notifier for access tracking. Leaving record_acc_track
 *		      unset in that case prevents page accesses from being
 *		      double counted.
 * @record_dirty_log: Record the page as dirty in the dirty bitmap if
 *		      appropriate for the change being made. Should be set
 *		      unless performing certain dirty logging operations.
 *		      Leaving record_dirty_log unset in that case prevents page
 *		      writes from being double counted.
 */
static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
			       u64 old_spte, u64 new_spte, gfn_t gfn, int level,
			       bool record_acc_track, bool record_dirty_log)
{
	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * No thread should be using this function to set SPTEs to or from the
	 * temporary removed SPTE value.
	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
	 * should be used. If operating under the MMU lock in write mode, the
	 * use of the removed SPTE should not be necessary.
	 */
	WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));

	kvm_tdp_mmu_write_spte(sptep, new_spte);

	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);

	if (record_acc_track)
		handle_changed_spte_acc_track(old_spte, new_spte, level);
	if (record_dirty_log)
		handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
					      new_spte, level);
}

/* Iterator-based convenience wrapper around __tdp_mmu_set_spte(). */
static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				     u64 new_spte, bool record_acc_track,
				     bool record_dirty_log)
{
	WARN_ON_ONCE(iter->yielded);

	__tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, iter->old_spte,
			   new_spte, iter->gfn, iter->level,
			   record_acc_track, record_dirty_log);
}

/* Set an SPTE with full accessed/dirty bookkeeping. */
static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				    u64 new_spte)
{
	_tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
}

/* Set an SPTE without accessed-state bookkeeping (access-track notifiers). */
static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	_tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
}

/* Set an SPTE without dirty-log bookkeeping (dirty-logging operations). */
static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	_tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
}

#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
	for_each_tdp_pte(_iter, _root, _start, _end)

#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
		if (!is_shadow_present_pte(_iter.old_spte) ||		\
		    !is_last_spte(_iter.old_spte, _iter.level))		\
			continue;					\
		else

#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
	for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end)

/*
 * Yield if the MMU lock is contended or this thread needs to return control
 * to the scheduler.
773 * 774 * If this function should yield and flush is set, it will perform a remote 775 * TLB flush before yielding. 776 * 777 * If this function yields, iter->yielded is set and the caller must skip to 778 * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk 779 * over the paging structures to allow the iterator to continue its traversal 780 * from the paging structure root. 781 * 782 * Returns true if this function yielded. 783 */ 784 static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm, 785 struct tdp_iter *iter, 786 bool flush, bool shared) 787 { 788 WARN_ON(iter->yielded); 789 790 /* Ensure forward progress has been made before yielding. */ 791 if (iter->next_last_level_gfn == iter->yielded_gfn) 792 return false; 793 794 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { 795 if (flush) 796 kvm_flush_remote_tlbs(kvm); 797 798 rcu_read_unlock(); 799 800 if (shared) 801 cond_resched_rwlock_read(&kvm->mmu_lock); 802 else 803 cond_resched_rwlock_write(&kvm->mmu_lock); 804 805 rcu_read_lock(); 806 807 WARN_ON(iter->gfn > iter->next_last_level_gfn); 808 809 iter->yielded = true; 810 } 811 812 return iter->yielded; 813 } 814 815 static inline gfn_t tdp_mmu_max_gfn_host(void) 816 { 817 /* 818 * Bound TDP MMU walks at host.MAXPHYADDR, guest accesses beyond that 819 * will hit a #PF(RSVD) and never hit an EPT Violation/Misconfig / #NPF, 820 * and so KVM will never install a SPTE for such addresses. 
821 */ 822 return 1ULL << (shadow_phys_bits - PAGE_SHIFT); 823 } 824 825 static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root, 826 bool shared, int zap_level) 827 { 828 struct tdp_iter iter; 829 830 gfn_t end = tdp_mmu_max_gfn_host(); 831 gfn_t start = 0; 832 833 for_each_tdp_pte_min_level(iter, root, zap_level, start, end) { 834 retry: 835 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared)) 836 continue; 837 838 if (!is_shadow_present_pte(iter.old_spte)) 839 continue; 840 841 if (iter.level > zap_level) 842 continue; 843 844 if (!shared) 845 tdp_mmu_set_spte(kvm, &iter, 0); 846 else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0)) 847 goto retry; 848 } 849 } 850 851 static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root, 852 bool shared) 853 { 854 855 /* 856 * The root must have an elevated refcount so that it's reachable via 857 * mmu_notifier callbacks, which allows this path to yield and drop 858 * mmu_lock. When handling an unmap/release mmu_notifier command, KVM 859 * must drop all references to relevant pages prior to completing the 860 * callback. Dropping mmu_lock with an unreachable root would result 861 * in zapping SPTEs after a relevant mmu_notifier callback completes 862 * and lead to use-after-free as zapping a SPTE triggers "writeback" of 863 * dirty accessed bits to the SPTE's associated struct page. 864 */ 865 WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count)); 866 867 kvm_lockdep_assert_mmu_lock_held(kvm, shared); 868 869 rcu_read_lock(); 870 871 /* 872 * To avoid RCU stalls due to recursively removing huge swaths of SPs, 873 * split the zap into two passes. On the first pass, zap at the 1gb 874 * level, and then zap top-level SPs on the second pass. "1gb" is not 875 * arbitrary, as KVM must be able to zap a 1gb shadow page without 876 * inducing a stall to allow in-place replacement with a 1gb hugepage. 
	 *
	 * Because zapping a SP recurses on its children, stepping down to
	 * PG_LEVEL_4K in the iterator itself is unnecessary.
	 */
	__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
	__tdp_mmu_zap_root(kvm, root, shared, root->role.level);

	rcu_read_unlock();
}

/* Zap the parent SPTE pointing at @sp, i.e. unlink @sp from its parent. */
bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	u64 old_spte;

	/*
	 * This helper intentionally doesn't allow zapping a root shadow page,
	 * which doesn't have a parent page table and thus no associated entry.
	 */
	if (WARN_ON_ONCE(!sp->ptep))
		return false;

	old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
	if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
		return false;

	/* The entry lives in the parent table, hence level + 1. */
	__tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
			   sp->gfn, sp->role.level + 1, true, true);

	return true;
}

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 *
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock. If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
 * operation can cause a soft lockup.
 */
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end, bool can_yield, bool flush)
{
	bool zap_all = (start == 0 && end >= tdp_mmu_max_gfn_host());
	struct tdp_iter iter;

	/*
	 * No need to try to step down in the iterator when zapping all SPTEs,
	 * zapping the top-level non-leaf SPTEs will recurse on their children.
	 */
	int min_level = zap_all ? root->role.level : PG_LEVEL_4K;

	end = min(end, tdp_mmu_max_gfn_host());

	lockdep_assert_held_write(&kvm->mmu_lock);

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
		if (can_yield &&
		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
			/* The resched helper flushed on our behalf. */
			flush = false;
			continue;
		}

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		/*
		 * If this is a non-last-level SPTE that covers a larger range
		 * than should be zapped, continue, and zap the mappings at a
		 * lower level, except when zapping all SPTEs.
		 */
		if (!zap_all &&
		    (iter.gfn < start ||
		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		tdp_mmu_set_spte(kvm, &iter, 0);
		flush = true;
	}

	/*
	 * Need to flush before releasing RCU. TODO: do it only if intermediate
	 * page tables were zapped; there is no need to flush under RCU protection
	 * if no 'struct kvm_mmu_page' is freed.
	 */
	if (flush)
		kvm_flush_remote_tlbs_with_address(kvm, start, end - start);

	rcu_read_unlock();

	/* Any needed flush was performed above, so none is pending. */
	return false;
}

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 */
bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
				 gfn_t end, bool can_yield, bool flush)
{
	struct kvm_mmu_page *root;

	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
		flush = zap_gfn_range(kvm, root, start, end, can_yield, flush);

	return flush;
}

/* Zap all SPTEs in all roots, in all address spaces. */
void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
	struct kvm_mmu_page *root;
	int i;

	/*
	 * Zap all roots, including invalid roots, as all SPTEs must be dropped
	 * before returning to the caller.
	 * Zap directly even if the root is
	 * also being zapped by a worker. Walking zapped top-level SPTEs isn't
	 * all that expensive and mmu_lock is already held, which means the
	 * worker has yielded, i.e. flushing the work instead of zapping here
	 * isn't guaranteed to be any faster.
	 *
	 * A TLB flush is unnecessary, KVM zaps everything if and only the VM
	 * is being destroyed or the userspace VMM has exited. In both cases,
	 * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
	 */
	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
		for_each_tdp_mmu_root_yield_safe(kvm, root, i)
			tdp_mmu_zap_root(kvm, root, false);
	}
}

/*
 * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
 * zap" completes.
 */
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
{
	/* The zapping itself is done by queued work items; just wait for them. */
	flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
}

/*
 * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
 * is about to be zapped, e.g. in response to a memslots update. The actual
 * zapping is performed asynchronously, so a reference is taken on all roots.
 * Using a separate workqueue makes it easy to ensure that the destruction is
 * performed before the "fast zap" completes, without keeping a separate list
 * of invalidated roots; the list is effectively the list of work items in
 * the workqueue.
 *
 * Get a reference even if the root is already invalid, the asynchronous worker
 * assumes it was gifted a reference to the root it processes. Because mmu_lock
 * is held for write, it should be impossible to observe a root with zero refcount,
 * i.e. the list of roots cannot be stale.
 *
 * This has essentially the same effect for the TDP MMU
 * as updating mmu_valid_gen does for the shadow MMU.
 */
void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);
	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
		/*
		 * Skip roots already marked invalid; invalidation and queuing
		 * is done at most once per root.
		 */
		if (!root->role.invalid &&
		    !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) {
			root->role.invalid = true;
			tdp_mmu_schedule_zap_root(kvm, root);
		}
	}
}

/*
 * Installs a last-level SPTE to handle a TDP page fault.
 * (NPT/EPT violation/misconfiguration)
 */
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
					   struct kvm_page_fault *fault,
					   struct tdp_iter *iter)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
	u64 new_spte;
	int ret = RET_PF_FIXED;
	bool wrprot = false;

	WARN_ON(sp->role.level != fault->goal_level);
	if (unlikely(!fault->slot))
		/* No backing memslot: install an MMIO SPTE instead. */
		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
	else
		wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
				   fault->pfn, iter->old_spte, fault->prefetch, true,
				   fault->map_writable, &new_spte);

	if (new_spte == iter->old_spte)
		ret = RET_PF_SPURIOUS;
	else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
		/* Lost the race to update the SPTE; let the vCPU re-fault. */
		return RET_PF_RETRY;
	else if (is_shadow_present_pte(iter->old_spte) &&
		 !is_last_spte(iter->old_spte, iter->level))
		kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
						   KVM_PAGES_PER_HPAGE(iter->level + 1));

	/*
	 * If the page fault was caused by a write but the page is write
	 * protected, emulation is needed. If the emulation was skipped,
	 * the vCPU would have the same fault again.
	 */
	if (wrprot) {
		if (fault->write)
			ret = RET_PF_EMULATE;
	}

	/* If a MMIO SPTE is installed, the MMIO will need to be emulated.
 */
	if (unlikely(is_mmio_spte(new_spte))) {
		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
				     new_spte);
		ret = RET_PF_EMULATE;
	} else {
		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
				       rcu_dereference(iter->sptep));
	}

	/*
	 * Increase pf_fixed in both RET_PF_EMULATE and RET_PF_FIXED to be
	 * consistent with legacy MMU behavior.
	 */
	if (ret != RET_PF_SPURIOUS)
		vcpu->stat.pf_fixed++;

	return ret;
}

/*
 * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
 * provided page table.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @sp: The new TDP page table to install.
 * @account_nx: True if this page table is being installed to split a
 *              non-executable huge page.
 * @shared: This operation is running under the MMU lock in read mode.
 *
 * Returns: 0 if the new page table was installed. Non-0 if the page table
 *          could not be installed (e.g. the atomic compare-exchange failed).
 */
static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
			   struct kvm_mmu_page *sp, bool account_nx,
			   bool shared)
{
	u64 spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask);
	int ret = 0;

	if (shared) {
		ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
		if (ret)
			return ret;
	} else {
		tdp_mmu_set_spte(kvm, iter, spte);
	}

	/* Track the new page table; tdp_mmu_pages_lock guards the list. */
	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
	if (account_nx)
		account_huge_nx_page(kvm, sp);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

	return 0;
}

/*
 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
 * page tables and SPTEs to translate the faulting guest physical address.
 */
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	struct tdp_iter iter;
	struct kvm_mmu_page *sp;
	int ret;

	kvm_mmu_hugepage_adjust(vcpu, fault);

	trace_kvm_mmu_spte_requested(fault);

	rcu_read_lock();

	/* Walk down toward the target level, building missing page tables. */
	tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
		if (fault->nx_huge_page_workaround_enabled)
			disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);

		if (iter.level == fault->goal_level)
			break;

		/*
		 * If there is an SPTE mapping a large page at a higher level
		 * than the target, that SPTE must be cleared and replaced
		 * with a non-leaf SPTE.
		 */
		if (is_shadow_present_pte(iter.old_spte) &&
		    is_large_pte(iter.old_spte)) {
			if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
				break;

			/*
			 * The iter must explicitly re-read the spte here
			 * because the new value informs the !present
			 * path below.
			 */
			iter.old_spte = kvm_tdp_mmu_read_spte(iter.sptep);
		}

		if (!is_shadow_present_pte(iter.old_spte)) {
			bool account_nx = fault->huge_page_disallowed &&
					  fault->req_level >= iter.level;

			/*
			 * If SPTE has been frozen by another thread, just
			 * give up and retry, avoiding unnecessary page table
			 * allocation and free.
			 */
			if (is_removed_spte(iter.old_spte))
				break;

			sp = tdp_mmu_alloc_sp(vcpu);
			tdp_mmu_init_child_sp(sp, &iter);

			if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) {
				/* Another task linked a table first; free ours. */
				tdp_mmu_free_sp(sp);
				break;
			}
		}
	}

	/*
	 * Force the guest to retry the access if the upper level SPTEs aren't
	 * in place, or if the target leaf SPTE is frozen by another CPU.
	 */
	if (iter.level != fault->goal_level || is_removed_spte(iter.old_spte)) {
		rcu_read_unlock();
		return RET_PF_RETRY;
	}

	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
	rcu_read_unlock();

	return ret;
}

/* MMU notifier unmap hook: zap the given GFN range in the slot's address space. */
bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
				 bool flush)
{
	return __kvm_tdp_mmu_zap_gfn_range(kvm, range->slot->as_id, range->start,
					   range->end, range->may_block, flush);
}

typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
			      struct kvm_gfn_range *range);

/*
 * Invoke @handler on every leaf SPTE in @range, across every root in the
 * range's address space, and OR together the handlers' return values.
 */
static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
						   struct kvm_gfn_range *range,
						   tdp_handler_t handler)
{
	struct kvm_mmu_page *root;
	struct tdp_iter iter;
	bool ret = false;

	/*
	 * Don't support rescheduling, none of the MMU notifiers that funnel
	 * into this helper allow blocking; it'd be dead, wasteful code.
	 */
	for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
		rcu_read_lock();

		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
			ret |= handler(kvm, &iter, range);

		rcu_read_unlock();
	}

	return ret;
}

/*
 * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
 * if any of the GFNs in the range have been accessed.
 */
static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
			  struct kvm_gfn_range *range)
{
	u64 new_spte = 0;

	/* If we have a non-accessed entry we don't need to change the pte. */
	if (!is_accessed_spte(iter->old_spte))
		return false;

	new_spte = iter->old_spte;

	if (spte_ad_enabled(new_spte)) {
		/* A/D bits in use: just clear the hardware Accessed bit. */
		new_spte &= ~shadow_accessed_mask;
	} else {
		/*
		 * Capture the dirty status of the page, so that it doesn't get
		 * lost when the SPTE is marked for access tracking.
1288 */ 1289 if (is_writable_pte(new_spte)) 1290 kvm_set_pfn_dirty(spte_to_pfn(new_spte)); 1291 1292 new_spte = mark_spte_for_access_track(new_spte); 1293 } 1294 1295 tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte); 1296 1297 return true; 1298 } 1299 1300 bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) 1301 { 1302 return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range); 1303 } 1304 1305 static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter, 1306 struct kvm_gfn_range *range) 1307 { 1308 return is_accessed_spte(iter->old_spte); 1309 } 1310 1311 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) 1312 { 1313 return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn); 1314 } 1315 1316 static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter, 1317 struct kvm_gfn_range *range) 1318 { 1319 u64 new_spte; 1320 1321 /* Huge pages aren't expected to be modified without first being zapped. */ 1322 WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end); 1323 1324 if (iter->level != PG_LEVEL_4K || 1325 !is_shadow_present_pte(iter->old_spte)) 1326 return false; 1327 1328 /* 1329 * Note, when changing a read-only SPTE, it's not strictly necessary to 1330 * zero the SPTE before setting the new PFN, but doing so preserves the 1331 * invariant that the PFN of a present * leaf SPTE can never change. 1332 * See __handle_changed_spte(). 1333 */ 1334 tdp_mmu_set_spte(kvm, iter, 0); 1335 1336 if (!pte_write(range->pte)) { 1337 new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte, 1338 pte_pfn(range->pte)); 1339 1340 tdp_mmu_set_spte(kvm, iter, new_spte); 1341 } 1342 1343 return true; 1344 } 1345 1346 /* 1347 * Handle the changed_pte MMU notifier for the TDP MMU. 1348 * data is a pointer to the new pte_t mapping the HVA specified by the MMU 1349 * notifier. 1350 * Returns non-zero if a flush is needed before releasing the MMU lock. 
 */
bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	/*
	 * No need to handle the remote TLB flush under RCU protection, the
	 * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
	 * shadow page. See the WARN on pfn_changed in __handle_changed_spte().
	 */
	return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
}

/*
 * Remove write access from all SPTEs at or above min_level that map GFNs
 * [start, end). Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			     gfn_t start, gfn_t end, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		/* Only writable leaf SPTEs need their writable bit cleared. */
		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level) ||
		    !(iter.old_spte & PT_WRITABLE_MASK))
			continue;

		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;

		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
			/* Lost a race updating this SPTE; try again. */
			goto retry;

		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}

/*
 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
 * only affect leaf SPTEs down to min_level.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
			     const struct kvm_memory_slot *slot, int min_level)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_read(&kvm->mmu_lock);

	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
			     slot->base_gfn + slot->npages, min_level);

	return spte_set;
}

/*
 * Allocate a shadow page (header plus zeroed table page) with @gfp.
 * Returns NULL if either allocation fails; never leaks the header.
 */
static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
{
	struct kvm_mmu_page *sp;

	gfp |= __GFP_ZERO;

	sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
	if (!sp)
		return NULL;

	sp->spt = (void *)__get_free_page(gfp);
	if (!sp->spt) {
		kmem_cache_free(mmu_page_header_cache, sp);
		return NULL;
	}

	return sp;
}

static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
						       struct tdp_iter *iter,
						       bool shared)
{
	struct kvm_mmu_page *sp;

	/*
	 * Since we are allocating while under the MMU lock we have to be
	 * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
	 * reclaim and to avoid making any filesystem callbacks (which can end
	 * up invoking KVM MMU notifiers, resulting in a deadlock).
	 *
	 * If this allocation fails we drop the lock and retry with reclaim
	 * allowed.
	 */
	sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
	if (sp)
		return sp;

	rcu_read_unlock();

	if (shared)
		read_unlock(&kvm->mmu_lock);
	else
		write_unlock(&kvm->mmu_lock);

	/*
	 * The lock is being dropped; flag the caller's iterator so the walk
	 * is restarted, as any snapshotted state may now be stale.
	 */
	iter->yielded = true;
	sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);

	if (shared)
		read_lock(&kvm->mmu_lock);
	else
		write_lock(&kvm->mmu_lock);

	rcu_read_lock();

	return sp;
}

/* Split the huge SPTE at @iter into a table of next-lower-level SPTEs in @sp. */
static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
				   struct kvm_mmu_page *sp, bool shared)
{
	const u64 huge_spte = iter->old_spte;
	const int level = iter->level;
	int ret, i;

	tdp_mmu_init_child_sp(sp, iter);

	/*
	 * No need for atomics when writing to sp->spt since the page table has
	 * not been linked in yet and thus is not reachable from any other CPU.
	 */
	for (i = 0; i < PT64_ENT_PER_PAGE; i++)
		sp->spt[i] = make_huge_page_split_spte(huge_spte, level, i);

	/*
	 * Replace the huge spte with a pointer to the populated lower level
	 * page table. Since we are making this change without a TLB flush vCPUs
	 * will see a mix of the split mappings and the original huge mapping,
	 * depending on what's currently in their TLB. This is fine from a
	 * correctness standpoint since the translation will be the same either
	 * way.
	 */
	ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared);
	if (ret)
		goto out;

	/*
	 * tdp_mmu_link_sp() will handle subtracting the huge page we
	 * are overwriting from the page stats. But we have to manually update
	 * the page stats with the new present child pages.
	 */
	kvm_update_page_stats(kvm, level - 1, PT64_ENT_PER_PAGE);

out:
	trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
	return ret;
}

/* Split all huge pages in [start, end) of @root down to @target_level. */
static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
					 struct kvm_mmu_page *root,
					 gfn_t start, gfn_t end,
					 int target_level, bool shared)
{
	struct kvm_mmu_page *sp = NULL;
	struct tdp_iter iter;
	int ret = 0;

	rcu_read_lock();

	/*
	 * Traverse the page table splitting all huge pages above the target
	 * level into one lower level. For example, if we encounter a 1GB page
	 * we split it into 512 2MB pages.
	 *
	 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
	 * to visit an SPTE before ever visiting its children, which means we
	 * will correctly recursively split huge pages that are more than one
	 * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
	 * and then splitting each of those to 512 4KB pages).
	 */
	for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
			continue;

		/* Reuse a previously allocated sp if the last split lost a race. */
		if (!sp) {
			sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
			if (!sp) {
				ret = -ENOMEM;
				trace_kvm_mmu_split_huge_page(iter.gfn,
							      iter.old_spte,
							      iter.level, ret);
				break;
			}

			/* The allocation dropped mmu_lock; restart the walk. */
			if (iter.yielded)
				continue;
		}

		if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
			goto retry;

		sp = NULL;
	}

	rcu_read_unlock();

	/*
	 * It's possible to exit the loop having never used the last sp if, for
	 * example, a vCPU doing HugePage NX splitting wins the race and
	 * installs its own sp in place of the last sp we tried to split.
	 */
	if (sp)
		tdp_mmu_free_sp(sp);

	return ret;
}


/*
 * Try to split all huge pages mapped by the TDP MMU down to the target level.
 */
void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
				      const struct kvm_memory_slot *slot,
				      gfn_t start, gfn_t end,
				      int target_level, bool shared)
{
	struct kvm_mmu_page *root;
	int r = 0;

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
		r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
		if (r) {
			/* Put the reference on the root before breaking out early. */
			kvm_tdp_mmu_put_root(kvm, root, shared);
			break;
		}
	}
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t start, gfn_t end)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		if (spte_ad_need_write_protect(iter.old_spte)) {
			/* No usable D bit: track dirtiness via write protection. */
			if (is_writable_pte(iter.old_spte))
				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
			else
				continue;
		} else {
			if (iter.old_spte & shadow_dirty_mask)
				new_spte = iter.old_spte & ~shadow_dirty_mask;
			else
				continue;
		}

		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
			/* Lost a race updating this SPTE; try again. */
			goto retry;

		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
				  const struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_read(&kvm->mmu_lock);

	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
				slot->base_gfn + slot->npages);

	return spte_set;
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask.
 * If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t gfn, unsigned long mask, bool wrprot)
{
	struct tdp_iter iter;
	u64 new_spte;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
				   gfn + BITS_PER_LONG) {
		/* Stop early once every bit in the mask has been consumed. */
		if (!mask)
			break;

		if (iter.level > PG_LEVEL_4K ||
		    !(mask & (1UL << (iter.gfn - gfn))))
			continue;

		mask &= ~(1UL << (iter.gfn - gfn));

		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
			/* Track dirtiness by write-protecting the SPTE. */
			if (is_writable_pte(iter.old_spte))
				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
			else
				continue;
		} else {
			if (iter.old_spte & shadow_dirty_mask)
				new_spte = iter.old_spte & ~shadow_dirty_mask;
			else
				continue;
		}

		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
	}

	rcu_read_unlock();
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
				       struct kvm_memory_slot *slot,
				       gfn_t gfn, unsigned long mask,
				       bool wrprot)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root, slot->as_id)
		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
}

/*
 * Clear leaf entries which could be replaced by large mappings, for
 * GFNs within the slot.
 */
static void zap_collapsible_spte_range(struct kvm *kvm,
				       struct kvm_mmu_page *root,
				       const struct kvm_memory_slot *slot)
{
	gfn_t start = slot->base_gfn;
	gfn_t end = start + slot->npages;
	struct tdp_iter iter;
	kvm_pfn_t pfn;

	rcu_read_lock();

	tdp_root_for_each_pte(iter, root, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		/* Skip SPTEs that already map at the largest possible level. */
		pfn = spte_to_pfn(iter.old_spte);
		if (kvm_is_reserved_pfn(pfn) ||
		    iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
							    pfn, PG_LEVEL_NUM))
			continue;

		/* Note, a successful atomic zap also does a remote TLB flush. */
		if (tdp_mmu_zap_spte_atomic(kvm, &iter))
			goto retry;
	}

	rcu_read_unlock();
}

/*
 * Clear non-leaf entries (and free associated page tables) which could
 * be replaced by large mappings, for GFNs within the slot.
 */
void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
				       const struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_read(&kvm->mmu_lock);

	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
		zap_collapsible_spte_range(kvm, root, slot);
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t gfn, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		new_spte = iter.old_spte &
			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);

		/* Both bits were already clear; nothing to change. */
		if (new_spte == iter.old_spte)
			break;

		tdp_mmu_set_spte(kvm, &iter, new_spte);
		spte_set = true;
	}

	rcu_read_unlock();

	return spte_set;
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
				   struct kvm_memory_slot *slot, gfn_t gfn,
				   int min_level)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root, slot->as_id)
		spte_set |= write_protect_gfn(kvm, root, gfn, min_level);

	return spte_set;
}

/*
 * Return the level of the lowest level SPTE added to sptes.
 * That SPTE may be non-present.
 *
 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
 */
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
			 int *root_level)
{
	struct tdp_iter iter;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	gfn_t gfn = addr >> PAGE_SHIFT;
	int leaf = -1;

	*root_level = vcpu->arch.mmu->shadow_root_level;

	/* Record the SPTE at each level visited; leaf ends at the lowest. */
	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		leaf = iter.level;
		sptes[leaf] = iter.old_spte;
	}

	return leaf;
}

/*
 * Returns the last level spte pointer of the shadow page walk for the given
 * gpa, and sets *spte to the spte value. This spte may be non-present. If no
 * walk could be performed, returns NULL and *spte does not contain valid data.
 *
 * Contract:
 *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
 *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
 *
 * WARNING: This function is only intended to be called during fast_page_fault.
 */
u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
					u64 *spte)
{
	struct tdp_iter iter;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	gfn_t gfn = addr >> PAGE_SHIFT;
	tdp_ptep_t sptep = NULL;

	/* Walk to the lowest reachable SPTE, remembering the last one seen. */
	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		*spte = iter.old_spte;
		sptep = iter.sptep;
	}

	/*
	 * Perform the rcu_dereference to get the raw spte pointer value since
	 * we are passing it up to fast_page_fault, which is shared with the
	 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
	 * annotation.
	 *
	 * This is safe since fast_page_fault obeys the contracts of this
	 * function as well as all TDP MMU contracts around modifying SPTEs
	 * outside of mmu_lock.
	 */
	return rcu_dereference(sptep);
}