// SPDX-License-Identifier: GPL-2.0

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>

static bool __read_mostly tdp_mmu_enabled = true;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);

/*
 * Initializes the TDP MMU for the VM, if enabled.
 *
 * Returns 0 without touching @kvm when the TDP MMU is disabled, -ENOMEM if
 * the zap workqueue cannot be allocated, and 1 once the per-VM state (root
 * list, page list, pages lock and zap workqueue) has been initialized.
 */
int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
	struct workqueue_struct *wq;

	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
		return 0;

	wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);
	if (!wq)
		return -ENOMEM;

	/* This should not be changed for the lifetime of the VM. */
	kvm->arch.tdp_mmu_enabled = true;
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
	kvm->arch.tdp_mmu_zap_wq = wq;
	return 1;
}

/*
 * Asserts that mmu_lock is held for read (@shared) or for write (!@shared).
 * Arbitrarily returns true so that this may be used in if statements.
 */
static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
							     bool shared)
{
	if (shared)
		lockdep_assert_held_read(&kvm->mmu_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	return true;
}

/*
 * Tears down the per-VM TDP MMU state.  Does nothing if the TDP MMU was not
 * enabled for @kvm.  Warns if any pages or roots are still on the lists after
 * the zap workqueue has been destroyed.
 */
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
	if (!kvm->arch.tdp_mmu_enabled)
		return;

	/* Also waits for any queued work items. */
	destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);

	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

	/*
	 * Ensure that all the outstanding RCU callbacks to free shadow pages
	 * can run before the VM is torn down.  Work items on tdp_mmu_zap_wq
	 * can call kvm_tdp_mmu_put_root and create new callbacks.
	 */
	rcu_barrier();
}

/*
 * Frees the page table page backing @sp and then returns the @sp header
 * itself to mmu_page_header_cache.
 */
static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
	free_page((unsigned long)sp->spt);
	kmem_cache_free(mmu_page_header_cache, sp);
}

/*
 * This is called through call_rcu in order to free TDP page table memory
 * safely with respect to other kernel threads that may be operating on
 * the memory.
 * By only accessing TDP MMU page table memory in an RCU read critical
 * section, and freeing it after a grace period, lockless access to that
 * memory won't use it after it is freed.
 */
static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
{
	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
					       rcu_head);

	tdp_mmu_free_sp(sp);
}

static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			     bool shared);

/*
 * Work item queued by tdp_mmu_schedule_zap_root().  Zaps the root with
 * mmu_lock held for read, then drops one reference to the root.
 */
static void tdp_mmu_zap_root_work(struct work_struct *work)
{
	struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page,
						 tdp_mmu_async_work);
	struct kvm *kvm = root->tdp_mmu_async_data;

	read_lock(&kvm->mmu_lock);

	/*
	 * A TLB flush is not necessary as KVM performs a local TLB flush when
	 * allocating a new root (see kvm_mmu_load()), and when migrating vCPU
	 * to a different pCPU.  Note, the local TLB flush on reuse also
	 * invalidates any paging-structure-cache entries, i.e. TLB entries for
	 * intermediate paging structures, that may be zapped, as such entries
	 * are associated with the ASID on both VMX and SVM.
	 */
	tdp_mmu_zap_root(kvm, root, true);

	/*
	 * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for
	 * avoiding an infinite loop.  By design, the root is reachable while
	 * it's being asynchronously zapped, thus a different task can put its
	 * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an
	 * asynchronously zapped root is unavoidable.
	 */
	kvm_tdp_mmu_put_root(kvm, root, true);

	read_unlock(&kvm->mmu_lock);
}

/*
 * Records @kvm in @root for the worker's benefit and queues
 * tdp_mmu_zap_root_work() for @root on the VM's zap workqueue.
 */
static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
	root->tdp_mmu_async_data = kvm;
	INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work);
	queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
}

/*
 * Atomically sets the invalid bit in @page's role.  Returns the previous
 * value of the invalid bit, i.e. true if the page was already invalid.
 */
static inline bool kvm_tdp_root_mark_invalid(struct kvm_mmu_page *page)
{
	union kvm_mmu_page_role role = page->role;
	role.invalid = true;

	/* No need to use cmpxchg, only the invalid bit can change. */
	role.word = xchg(&page->role.word, role.word);
	return role.invalid;
}

/*
 * Drops a reference to @root.  mmu_lock must be held for read if @shared is
 * true and for write otherwise.
 *
 * When the last reference is dropped on a root that was still valid, the root
 * is marked invalid, handed a fresh reference and queued for asynchronous
 * zapping.  When the last reference is dropped on a root that was already
 * invalid, the root is unlinked from the list of roots and freed after an RCU
 * grace period.
 */
void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
			  bool shared)
{
	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
		return;

	WARN_ON(!root->tdp_mmu_page);

	/*
	 * The root now has refcount=0.  It is valid, but readers already
	 * cannot acquire a reference to it because kvm_tdp_mmu_get_root()
	 * rejects it.  This remains true for the rest of the execution
	 * of this function, because readers visit valid roots only
	 * (except for tdp_mmu_zap_root_work(), which however
	 * does not acquire any reference itself).
	 *
	 * Even though there are flows that need to visit all roots for
	 * correctness, they all take mmu_lock for write, so they cannot yet
	 * run concurrently.  The same is true after kvm_tdp_root_mark_invalid,
	 * since the root still has refcount=0.
	 *
	 * However, tdp_mmu_zap_root can yield, and writers do not expect to
	 * see refcount=0 (see for example kvm_tdp_mmu_invalidate_all_roots()).
	 * So the root temporarily gets an extra reference, going to refcount=1
	 * while staying invalid.  Readers still cannot acquire any reference;
	 * but writers are now allowed to run if tdp_mmu_zap_root yields and
	 * they might take an extra reference if they themselves yield.
	 * Therefore, when the reference is given back by the worker,
	 * there is no guarantee that the refcount is still 1.  If not, whoever
	 * puts the last reference will free the page, but they will not have to
	 * zap the root because a root cannot go from invalid to valid.
	 */
	if (!kvm_tdp_root_mark_invalid(root)) {
		refcount_set(&root->tdp_mmu_root_count, 1);

		/*
		 * Zapping the root in a worker is not just "nice to have";
		 * it is required because kvm_tdp_mmu_invalidate_all_roots()
		 * skips already-invalid roots.  If kvm_tdp_mmu_put_root() did
		 * not add the root to the workqueue, kvm_tdp_mmu_zap_all_fast()
		 * might return with some roots not zapped yet.
		 */
		tdp_mmu_schedule_zap_root(kvm, root);
		return;
	}

	/* The root was already invalid: unlink it and free it after RCU. */
	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_del_rcu(&root->link);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/*
 * Returns the next root after @prev_root (or the first root if @prev_root is
 * NULL).  A reference to the returned root is acquired, and the reference to
 * @prev_root is released (the caller obviously must hold a reference to
 * @prev_root if it's non-NULL).
 *
 * If @only_valid is true, invalid roots are skipped.
 *
 * Returns NULL if the end of tdp_mmu_roots was reached.
203 */ 204 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, 205 struct kvm_mmu_page *prev_root, 206 bool shared, bool only_valid) 207 { 208 struct kvm_mmu_page *next_root; 209 210 rcu_read_lock(); 211 212 if (prev_root) 213 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, 214 &prev_root->link, 215 typeof(*prev_root), link); 216 else 217 next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots, 218 typeof(*next_root), link); 219 220 while (next_root) { 221 if ((!only_valid || !next_root->role.invalid) && 222 kvm_tdp_mmu_get_root(next_root)) 223 break; 224 225 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, 226 &next_root->link, typeof(*next_root), link); 227 } 228 229 rcu_read_unlock(); 230 231 if (prev_root) 232 kvm_tdp_mmu_put_root(kvm, prev_root, shared); 233 234 return next_root; 235 } 236 237 /* 238 * Note: this iterator gets and puts references to the roots it iterates over. 239 * This makes it safe to release the MMU lock and yield within the loop, but 240 * if exiting the loop early, the caller must drop the reference to the most 241 * recent root. (Unless keeping a live reference is desirable.) 242 * 243 * If shared is set, this function is operating under the MMU lock in read 244 * mode. In the unlikely event that this thread must free a root, the lock 245 * will be temporarily dropped and reacquired in write mode. 
 */
/*
 * The trailing "if (...) { } else" makes the statement that follows the macro
 * the loop body, and skips roots whose address space ID is not _as_id.
 */
#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid);	\
	     _root;								\
	     _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid))	\
		if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) &&		\
		    kvm_mmu_page_as_id(_root) != _as_id) {			\
		} else

/* Yield-safe walk over valid roots only; the caller chooses the lock mode. */
#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)

/* Yield-safe walk over all roots, valid or not, with mmu_lock held for write. */
#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)			\
	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false)

/*
 * Iterate over all TDP MMU roots.  Requires that mmu_lock be held for write,
 * the implication being that any flow that holds mmu_lock for read is
 * inherently yield-friendly and should use the yield-safe variant above.
 * Holding mmu_lock for write obviates the need for RCU protection as the list
 * is guaranteed to be stable.
267 */ 268 #define for_each_tdp_mmu_root(_kvm, _root, _as_id) \ 269 list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \ 270 if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \ 271 kvm_mmu_page_as_id(_root) != _as_id) { \ 272 } else 273 274 static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu) 275 { 276 struct kvm_mmu_page *sp; 277 278 sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); 279 sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache); 280 281 return sp; 282 } 283 284 static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep, 285 gfn_t gfn, union kvm_mmu_page_role role) 286 { 287 set_page_private(virt_to_page(sp->spt), (unsigned long)sp); 288 289 sp->role = role; 290 sp->gfn = gfn; 291 sp->ptep = sptep; 292 sp->tdp_mmu_page = true; 293 294 trace_kvm_mmu_get_page(sp, true); 295 } 296 297 static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp, 298 struct tdp_iter *iter) 299 { 300 struct kvm_mmu_page *parent_sp; 301 union kvm_mmu_page_role role; 302 303 parent_sp = sptep_to_sp(rcu_dereference(iter->sptep)); 304 305 role = parent_sp->role; 306 role.level--; 307 308 tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role); 309 } 310 311 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) 312 { 313 union kvm_mmu_page_role role = vcpu->arch.mmu->mmu_role.base; 314 struct kvm *kvm = vcpu->kvm; 315 struct kvm_mmu_page *root; 316 317 lockdep_assert_held_write(&kvm->mmu_lock); 318 319 /* 320 * Check for an existing root before allocating a new one. Note, the 321 * role check prevents consuming an invalid root. 
322 */ 323 for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) { 324 if (root->role.word == role.word && 325 kvm_tdp_mmu_get_root(root)) 326 goto out; 327 } 328 329 root = tdp_mmu_alloc_sp(vcpu); 330 tdp_mmu_init_sp(root, NULL, 0, role); 331 332 refcount_set(&root->tdp_mmu_root_count, 1); 333 334 spin_lock(&kvm->arch.tdp_mmu_pages_lock); 335 list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots); 336 spin_unlock(&kvm->arch.tdp_mmu_pages_lock); 337 338 out: 339 return __pa(root->spt); 340 } 341 342 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, 343 u64 old_spte, u64 new_spte, int level, 344 bool shared); 345 346 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level) 347 { 348 if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level)) 349 return; 350 351 if (is_accessed_spte(old_spte) && 352 (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) || 353 spte_to_pfn(old_spte) != spte_to_pfn(new_spte))) 354 kvm_set_pfn_accessed(spte_to_pfn(old_spte)); 355 } 356 357 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, 358 u64 old_spte, u64 new_spte, int level) 359 { 360 bool pfn_changed; 361 struct kvm_memory_slot *slot; 362 363 if (level > PG_LEVEL_4K) 364 return; 365 366 pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); 367 368 if ((!is_writable_pte(old_spte) || pfn_changed) && 369 is_writable_pte(new_spte)) { 370 slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn); 371 mark_page_dirty_in_slot(kvm, slot, gfn); 372 } 373 } 374 375 /** 376 * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages 377 * 378 * @kvm: kvm instance 379 * @sp: the page to be removed 380 * @shared: This operation may not be running under the exclusive use of 381 * the MMU lock and the operation must synchronize with other 382 * threads that might be adding or removing pages. 
 */
static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
			      bool shared)
{
	/*
	 * In shared mode the pages lock serializes list updates; otherwise
	 * mmu_lock held for write is what protects the list.
	 */
	if (shared)
		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	list_del(&sp->link);
	if (sp->lpage_disallowed)
		unaccount_huge_nx_page(kvm, sp);

	if (shared)
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

/**
 * handle_removed_pt() - handle a page table removed from the TDP structure
 *
 * @kvm: kvm instance
 * @pt: the page removed from the paging structure
 * @shared: This operation may not be running under the exclusive use
 *	    of the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Given a page table that has been removed from the TDP paging structure,
 * iterates through the page table to clear SPTEs and free child page tables.
 *
 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
 * protection. Since this thread removed it from the paging structure,
 * this thread will be responsible for ensuring the page is freed. Hence the
 * early rcu_dereferences in the function.
 */
static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
	int level = sp->role.level;
	gfn_t base_gfn = sp->gfn;
	int i;

	trace_kvm_mmu_prepare_zap_page(sp);

	tdp_mmu_unlink_sp(kvm, sp, shared);

	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
		u64 *sptep = rcu_dereference(pt) + i;
		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
		u64 old_child_spte;

		if (shared) {
			/*
			 * Set the SPTE to a nonpresent value that other
			 * threads will not overwrite. If the SPTE was
			 * already marked as removed then another thread
			 * handling a page fault could overwrite it, so
			 * set the SPTE until it is set from some other
			 * value to the removed SPTE value.
			 */
			for (;;) {
				old_child_spte = xchg(sptep, REMOVED_SPTE);
				if (!is_removed_spte(old_child_spte))
					break;
				cpu_relax();
			}
		} else {
			/*
			 * If the SPTE is not MMU-present, there is no backing
			 * page associated with the SPTE and so no side effects
			 * that need to be recorded, and exclusive ownership of
			 * mmu_lock ensures the SPTE can't be made present.
			 * Note, zapping MMIO SPTEs is also unnecessary as they
			 * are guarded by the memslots generation, not by being
			 * unreachable.
			 */
			old_child_spte = READ_ONCE(*sptep);
			if (!is_shadow_present_pte(old_child_spte))
				continue;

			/*
			 * Marking the SPTE as a removed SPTE is not
			 * strictly necessary here as the MMU lock will
			 * stop other threads from concurrently modifying
			 * this SPTE. Using the removed SPTE value keeps
			 * the two branches consistent and simplifies
			 * the function.
			 */
			WRITE_ONCE(*sptep, REMOVED_SPTE);
		}
		/* Do the bookkeeping for the entry that was just replaced. */
		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
				    old_child_spte, REMOVED_SPTE, level,
				    shared);
	}

	/* The page table itself is freed only after an RCU grace period. */
	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/**
 * __handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Handle bookkeeping that might result from the modification of a SPTE.
 * This function must be called for all TDP SPTE modifications.
 */
static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				  u64 old_spte, u64 new_spte, int level,
				  bool shared)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);
	bool was_leaf = was_present && is_last_spte(old_spte, level);
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
	WARN_ON(level < PG_LEVEL_4K);
	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));

	/*
	 * If this warning were to trigger it would indicate that there was a
	 * missing MMU notifier or a race with some notifier handler.
	 * A present, leaf SPTE should never be directly replaced with another
	 * present leaf SPTE pointing to a different PFN. A notifier handler
	 * should be zapping the SPTE before the main MM's page table is
	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
	 * thread before replacement.
	 */
	if (was_leaf && is_leaf && pfn_changed) {
		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
		       "SPTE with another present leaf SPTE mapping a\n"
		       "different PFN!\n"
		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
		       as_id, gfn, old_spte, new_spte, level);

		/*
		 * Crash the host to prevent error propagation and guest data
		 * corruption.
		 */
		BUG();
	}

	/* Nothing changed, so there is nothing to account for. */
	if (old_spte == new_spte)
		return;

	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);

	if (is_leaf)
		check_spte_writable_invariants(new_spte);

	/*
	 * The only times a SPTE should be changed from a non-present to
	 * non-present state is when an MMIO entry is installed/modified/
	 * removed. In that case, there is nothing to do here.
	 */
	if (!was_present && !is_present) {
		/*
		 * If this change does not involve a MMIO SPTE or removed SPTE,
		 * it is unexpected. Log the change, though it should not
		 * impact the guest since both the former and current SPTEs
		 * are nonpresent.
		 */
		if (WARN_ON(!is_mmio_spte(old_spte) &&
			    !is_mmio_spte(new_spte) &&
			    !is_removed_spte(new_spte)))
			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
			       "should not be replaced with another,\n"
			       "different nonpresent SPTE, unless one or both\n"
			       "are MMIO SPTEs, or the new SPTE is\n"
			       "a temporary removed SPTE.\n"
			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
			       as_id, gfn, old_spte, new_spte, level);
		return;
	}

	/* Adjust the page stats when a leaf mapping appears or disappears. */
	if (is_leaf != was_leaf)
		kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);

	/* Propagate dirty state of the old pfn before that state is lost. */
	if (was_leaf && is_dirty_spte(old_spte) &&
	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));

	/*
	 * Recursively handle child PTs if the change removed a subtree from
	 * the paging structure.  Note the WARN on the PFN changing without the
	 * SPTE being converted to a hugepage (leaf) or being zapped.  Shadow
	 * pages are kernel allocations and should never be migrated.
	 */
	if (was_present && !was_leaf &&
	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
}

/*
 * Full bookkeeping for an SPTE change: the common handling in
 * __handle_changed_spte() followed by accessed tracking and dirty logging.
 */
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared)
{
	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
			      shared);
	handle_changed_spte_acc_track(old_spte, new_spte, level);
	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
				      new_spte, level);
}

/*
 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
 * and handle the associated bookkeeping.  Do not mark the page dirty
 * in KVM's dirty bitmaps.
 *
 * If setting the SPTE fails because it has changed, iter->old_spte will be
 * refreshed to the current value of the spte.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * Return:
 * * 0      - If the SPTE was set.
 * * -EBUSY - If the SPTE cannot be set. In this case this function will have
 *            no side-effects other than setting iter->old_spte to the last
 *            known value of the spte.
 */
static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
					  struct tdp_iter *iter,
					  u64 new_spte)
{
	u64 *sptep = rcu_dereference(iter->sptep);
	u64 old_spte;

	/*
	 * The caller is responsible for ensuring the old SPTE is not a REMOVED
	 * SPTE.  KVM should never attempt to zap or manipulate a REMOVED SPTE,
	 * and pre-checking before inserting a new SPTE is advantageous as it
	 * avoids unnecessary work.
	 */
	WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));

	lockdep_assert_held_read(&kvm->mmu_lock);

	/*
	 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
	 * does not hold the mmu_lock.
	 */
	old_spte = cmpxchg64(sptep, iter->old_spte, new_spte);
	if (old_spte != iter->old_spte) {
		/*
		 * The page table entry was modified by a different logical
		 * CPU. Refresh iter->old_spte with the current value so the
		 * caller operates on fresh data, e.g. if it retries
		 * tdp_mmu_set_spte_atomic().
		 */
		iter->old_spte = old_spte;
		return -EBUSY;
	}

	/* Dirty logging is deliberately left out, see the function comment. */
	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
			      new_spte, iter->level, true);
	handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);

	return 0;
}

/*
 * Atomically zaps the SPTE @iter points at: freezes it with the removed SPTE
 * value, flushes remote TLBs for the range it covered, then clears it.
 * Returns 0 on success, or the error from tdp_mmu_set_spte_atomic() if the
 * SPTE could not be frozen.
 */
static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
					  struct tdp_iter *iter)
{
	int ret;

	/*
	 * Freeze the SPTE by setting it to a special,
	 * non-present value. This will stop other threads from
	 * immediately installing a present entry in its place
	 * before the TLBs are flushed.
	 */
	ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
	if (ret)
		return ret;

	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
					   KVM_PAGES_PER_HPAGE(iter->level));

	/*
	 * No other thread can overwrite the removed SPTE as they
	 * must either wait on the MMU lock or use
	 * tdp_mmu_set_spte_atomic which will not overwrite the
	 * special removed SPTE value. No bookkeeping is needed
	 * here since the SPTE is going from non-present
	 * to non-present.
	 */
	kvm_tdp_mmu_write_spte(iter->sptep, 0);

	return 0;
}


/*
 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
 * @kvm:	      KVM instance
 * @as_id:	      Address space ID, i.e. regular vs.
 *		      SMM
 * @sptep:	      Pointer to the SPTE
 * @old_spte:	      The current value of the SPTE
 * @new_spte:	      The new value that will be set for the SPTE
 * @gfn:	      The base GFN that was (or will be) mapped by the SPTE
 * @level:	      The level _containing_ the SPTE (its parent PT's level)
 * @record_acc_track: Notify the MM subsystem of changes to the accessed state
 *		      of the page. Should be set unless handling an MMU
 *		      notifier for access tracking. Leaving record_acc_track
 *		      unset in that case prevents page accesses from being
 *		      double counted.
 * @record_dirty_log: Record the page as dirty in the dirty bitmap if
 *		      appropriate for the change being made. Should be set
 *		      unless performing certain dirty logging operations.
 *		      Leaving record_dirty_log unset in that case prevents page
 *		      writes from being double counted.
 */
static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
			       u64 old_spte, u64 new_spte, gfn_t gfn, int level,
			       bool record_acc_track, bool record_dirty_log)
{
	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * No thread should be using this function to set SPTEs to or from the
	 * temporary removed SPTE value.
	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
	 * should be used. If operating under the MMU lock in write mode, the
	 * use of the removed SPTE should not be necessary.
	 */
	WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));

	/* Install the new value first, then account for the change. */
	kvm_tdp_mmu_write_spte(sptep, new_spte);

	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);

	if (record_acc_track)
		handle_changed_spte_acc_track(old_spte, new_spte, level);
	if (record_dirty_log)
		handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
					      new_spte, level);
}

/*
 * Iterator flavor of __tdp_mmu_set_spte(): takes the address space, SPTE
 * pointer, old value, gfn and level from @iter.  Warns if the iterator has
 * yielded since it last read the SPTE.
 */
static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				     u64 new_spte, bool record_acc_track,
				     bool record_dirty_log)
{
	WARN_ON_ONCE(iter->yielded);

	__tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, iter->old_spte,
			   new_spte, iter->gfn, iter->level,
			   record_acc_track, record_dirty_log);
}

/* Sets the SPTE with both accessed tracking and dirty logging recorded. */
static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				    u64 new_spte)
{
	_tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
}

/* Sets the SPTE without recording accessed state; dirty logging still runs. */
static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	_tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
}

/* Sets the SPTE without dirty logging; accessed tracking still runs. */
static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	_tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
}

/* Walk every SPTE under @_root that maps a gfn in [_start, _end). */
#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
	for_each_tdp_pte(_iter, _root, _start, _end)

/* As above, but the loop body only runs for present leaf SPTEs. */
#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
		if (!is_shadow_present_pte(_iter.old_spte) ||		\
		    !is_last_spte(_iter.old_spte, _iter.level))		\
			continue;					\
		else

/* Walk the SPTEs under the root that @_mmu currently points at. */
#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
	for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end)

/*
 * Yield if the MMU lock is contended or this thread needs to return control
 * to the scheduler.
 *
 * If this function should yield and flush is set, it will perform a remote
 * TLB flush before yielding.
 *
 * If this function yields, iter->yielded is set and the caller must skip to
 * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
 * over the paging structures to allow the iterator to continue its traversal
 * from the paging structure root.
 *
 * Returns true if this function yielded.
 */
static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
							   struct tdp_iter *iter,
							   bool flush, bool shared)
{
	WARN_ON(iter->yielded);

	/* Ensure forward progress has been made before yielding. */
	if (iter->next_last_level_gfn == iter->yielded_gfn)
		return false;

	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
		if (flush)
			kvm_flush_remote_tlbs(kvm);

		/* Leave the RCU read-side section while mmu_lock may be dropped. */
		rcu_read_unlock();

		if (shared)
			cond_resched_rwlock_read(&kvm->mmu_lock);
		else
			cond_resched_rwlock_write(&kvm->mmu_lock);

		rcu_read_lock();

		WARN_ON(iter->gfn > iter->next_last_level_gfn);

		iter->yielded = true;
	}

	return iter->yielded;
}

/* Returns the exclusive upper bound on gfns that TDP MMU walks cover. */
static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
{
	/*
	 * Bound TDP MMU walks at host.MAXPHYADDR.  KVM disallows memslots with
	 * a gpa range that would exceed the max gfn, and KVM does not create
	 * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
	 * the slow emulation path every time.
	 */
	return kvm_mmu_max_gfn() + 1;
}

/*
 * Zaps every present SPTE of @root that sits at or below @zap_level, walking
 * the full gfn range and yielding when needed.  With @shared set the SPTEs
 * are cleared atomically and a failed update is retried on the same entry.
 */
static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			       bool shared, int zap_level)
{
	struct tdp_iter iter;

	gfn_t end = tdp_mmu_max_gfn_exclusive();
	gfn_t start = 0;

	for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
			continue;

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		if (iter.level > zap_level)
			continue;

		/*
		 * A failed atomic update has refreshed iter.old_spte, so
		 * retrying re-evaluates the same entry with its new value.
		 */
		if (!shared)
			tdp_mmu_set_spte(kvm, &iter, 0);
		else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
			goto retry;
	}
}

/*
 * Zaps all SPTEs of @root.  mmu_lock must be held for read if @shared is
 * true and for write otherwise, and the caller must hold a reference to
 * @root.
 */
static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			     bool shared)
{

	/*
	 * The root must have an elevated refcount so that it's reachable via
	 * mmu_notifier callbacks, which allows this path to yield and drop
	 * mmu_lock.  When handling an unmap/release mmu_notifier command, KVM
	 * must drop all references to relevant pages prior to completing the
	 * callback.  Dropping mmu_lock with an unreachable root would result
	 * in zapping SPTEs after a relevant mmu_notifier callback completes
	 * and lead to use-after-free as zapping a SPTE triggers "writeback" of
	 * dirty accessed bits to the SPTE's associated struct page.
	 */
	WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	rcu_read_lock();

	/*
	 * To avoid RCU stalls due to recursively removing huge swaths of SPs,
	 * split the zap into two passes.  On the first pass, zap at the 1gb
	 * level, and then zap top-level SPs on the second pass.  "1gb" is not
	 * arbitrary, as KVM must be able to zap a 1gb shadow page without
	 * inducing a stall to allow in-place replacement with a 1gb hugepage.
	 *
	 * Because zapping a SP recurses on its children, stepping down to
	 * PG_LEVEL_4K in the iterator itself is unnecessary.
	 */
	__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
	__tdp_mmu_zap_root(kvm, root, shared, root->role.level);

	rcu_read_unlock();
}

/*
 * Zaps the SPTE in the parent page table that points at @sp.  Returns true
 * if the SPTE was zapped, false (after a one-time warning) if @sp has no
 * parent entry or that entry is not present.
 */
bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	u64 old_spte;

	/*
	 * This helper intentionally doesn't allow zapping a root shadow page,
	 * which doesn't have a parent page table and thus no associated entry.
	 */
	if (WARN_ON_ONCE(!sp->ptep))
		return false;

	old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
	if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
		return false;

	/* The SPTE lives in the parent page table, one level above @sp. */
	__tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
			   sp->gfn, sp->role.level + 1, true, true);

	return true;
}

/*
 * Zap leaf SPTEs for the range of gfns, [start, end).  Returns true if SPTEs
 * have been cleared and a TLB flush is needed before releasing the MMU lock.
 *
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock. If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
 * operation can cause a soft lockup.
*/
static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t start, gfn_t end, bool can_yield, bool flush)
{
	struct tdp_iter iter;

	/* Clamp the range to the largest GFN the TDP MMU can map. */
	end = min(end, tdp_mmu_max_gfn_exclusive());

	lockdep_assert_held_write(&kvm->mmu_lock);

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
		/*
		 * The pending flush state is handed to the resched helper;
		 * presumably it performs the flush before yielding, which is
		 * why flush is reset here -- confirm against the helper.
		 */
		if (can_yield &&
		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
			flush = false;
			continue;
		}

		/* Only present leaf SPTEs are zapped; page tables are kept. */
		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		tdp_mmu_set_spte(kvm, &iter, 0);
		flush = true;
	}

	rcu_read_unlock();

	/*
	 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
	 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
	 */
	return flush;
}

/*
 * Zap leaf SPTEs for the range of gfns, [start, end), in every root of
 * address space @as_id.  Only leaf SPTEs are zapped; no shadow pages are
 * freed.  @flush carries the caller's pending-flush state in; the return
 * value is true if a TLB flush is needed before releasing the MMU lock.
 */
bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
			   bool can_yield, bool flush)
{
	struct kvm_mmu_page *root;

	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
		flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush);

	return flush;
}

/*
 * Zap every TDP MMU root, in every address space, with mmu_lock treated as
 * held for write (the per-root zap is invoked with shared == false).
 */
void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
	struct kvm_mmu_page *root;
	int i;

	/*
	 * Zap all roots, including invalid roots, as all SPTEs must be dropped
	 * before returning to the caller.  Zap directly even if the root is
	 * also being zapped by a worker.  Walking zapped top-level SPTEs isn't
	 * all that expensive and mmu_lock is already held, which means the
	 * worker has yielded, i.e. flushing the work instead of zapping here
	 * isn't guaranteed to be any faster.
	 *
	 * A TLB flush is unnecessary, KVM zaps everything if and only if the
	 * VM is being destroyed or the userspace VMM has exited.  In both
	 * cases, KVM_RUN is unreachable, i.e. no vCPUs will ever service the
	 * request.
	 */
	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
		for_each_tdp_mmu_root_yield_safe(kvm, root, i)
			tdp_mmu_zap_root(kvm, root, false);
	}
}

/*
 * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
 * zap" completes.  The zapping itself is done by work items queued on
 * tdp_mmu_zap_wq, so waiting for the workqueue to drain is sufficient.
 */
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
{
	flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
}

/*
 * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
 * is about to be zapped, e.g. in response to a memslots update.  The actual
 * zapping is performed asynchronously, so a reference is taken on each root
 * that is newly invalidated.  Using a separate workqueue makes it easy to
 * ensure that the destruction is performed before the "fast zap" completes,
 * without keeping a separate list of invalidated roots; the list is
 * effectively the list of work items in the workqueue.
 *
 * Roots that are already marked invalid are skipped.  For every other root a
 * reference is acquired before it is marked invalid and queued, as the
 * asynchronous worker assumes it was gifted a reference to the root it
 * processes.  Because mmu_lock is held for write, it should be impossible to
 * observe a root with zero refcount, i.e. the list of roots cannot be stale.
 *
 * This has essentially the same effect for the TDP MMU
 * as updating mmu_valid_gen does for the shadow MMU.
*/
void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);
	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
		/*
		 * Skip roots that are already invalid.  Failing to get a
		 * reference is unexpected and WARNs; such a root is left
		 * untouched.
		 */
		if (!root->role.invalid &&
		    !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) {
			root->role.invalid = true;
			tdp_mmu_schedule_zap_root(kvm, root);
		}
	}
}

/*
 * Installs a last-level SPTE to handle a TDP page fault.
 * (NPT/EPT violation/misconfiguration)
 *
 * Returns:
 *  RET_PF_RETRY    - the atomic SPTE update failed;
 *  RET_PF_SPURIOUS - the computed SPTE equals the existing one;
 *  RET_PF_EMULATE  - a write fault hit a write-protected mapping, or an MMIO
 *                    SPTE was installed;
 *  RET_PF_FIXED    - otherwise.
 */
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
					  struct kvm_page_fault *fault,
					  struct tdp_iter *iter)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
	u64 new_spte;
	int ret = RET_PF_FIXED;
	bool wrprot = false;

	WARN_ON(sp->role.level != fault->goal_level);
	/* No memslot backs the faulting GFN: install an MMIO SPTE instead. */
	if (unlikely(!fault->slot))
		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
	else
		wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
					 fault->pfn, iter->old_spte, fault->prefetch, true,
					 fault->map_writable, &new_spte);

	if (new_spte == iter->old_spte)
		ret = RET_PF_SPURIOUS;
	else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
		return RET_PF_RETRY;
	/* A present non-leaf SPTE was replaced: flush the range it covered. */
	else if (is_shadow_present_pte(iter->old_spte) &&
		 !is_last_spte(iter->old_spte, iter->level))
		kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
						   KVM_PAGES_PER_HPAGE(iter->level + 1));

	/*
	 * If the page fault was caused by a write but the page is write
	 * protected, emulation is needed. If the emulation was skipped,
	 * the vCPU would have the same fault again.
	 */
	if (wrprot) {
		if (fault->write)
			ret = RET_PF_EMULATE;
	}

	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
	if (unlikely(is_mmio_spte(new_spte))) {
		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
				     new_spte);
		ret = RET_PF_EMULATE;
	} else {
		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
				       rcu_dereference(iter->sptep));
	}

	/*
	 * Increase pf_fixed in both RET_PF_EMULATE and RET_PF_FIXED to be
	 * consistent with legacy MMU behavior.
	 */
	if (ret != RET_PF_SPURIOUS)
		vcpu->stat.pf_fixed++;

	return ret;
}

/*
 * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
 * provided page table.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @sp: The new TDP page table to install.
 * @account_nx: True if this page table is being installed to split a
 *              non-executable huge page.
 * @shared: This operation is running under the MMU lock in read mode.
 *
 * Returns: 0 if the new page table was installed. Non-0 if the page table
 *          could not be installed (e.g. the atomic compare-exchange failed).
 */
static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
			   struct kvm_mmu_page *sp, bool account_nx,
			   bool shared)
{
	u64 spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask);
	int ret = 0;

	if (shared) {
		/* On failure @sp is not linked; the caller still owns it. */
		ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
		if (ret)
			return ret;
	} else {
		tdp_mmu_set_spte(kvm, iter, spte);
	}

	/* The page list and NX accounting are updated under tdp_mmu_pages_lock. */
	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
	if (account_nx)
		account_huge_nx_page(kvm, sp);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

	return 0;
}

/*
 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
 * page tables and SPTEs to translate the faulting guest physical address.
*/
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	struct tdp_iter iter;
	struct kvm_mmu_page *sp;
	int ret;

	kvm_mmu_hugepage_adjust(vcpu, fault);

	trace_kvm_mmu_spte_requested(fault);

	rcu_read_lock();

	tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
		if (fault->nx_huge_page_workaround_enabled)
			disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);

		/* Reached the level at which the leaf SPTE will be installed. */
		if (iter.level == fault->goal_level)
			break;

		/*
		 * If there is an SPTE mapping a large page at a higher level
		 * than the target, that SPTE must be cleared and replaced
		 * with a non-leaf SPTE.
		 */
		if (is_shadow_present_pte(iter.old_spte) &&
		    is_large_pte(iter.old_spte)) {
			if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
				break;

			/*
			 * The iter must explicitly re-read the spte here
			 * because the new value informs the !present
			 * path below.
			 */
			iter.old_spte = kvm_tdp_mmu_read_spte(iter.sptep);
		}

		if (!is_shadow_present_pte(iter.old_spte)) {
			bool account_nx = fault->huge_page_disallowed &&
					  fault->req_level >= iter.level;

			/*
			 * If SPTE has been frozen by another thread, just
			 * give up and retry, avoiding unnecessary page table
			 * allocation and free.
			 */
			if (is_removed_spte(iter.old_spte))
				break;

			sp = tdp_mmu_alloc_sp(vcpu);
			tdp_mmu_init_child_sp(sp, &iter);

			/* Linking failed, so @sp was never published: free it now. */
			if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) {
				tdp_mmu_free_sp(sp);
				break;
			}
		}
	}

	/*
	 * Force the guest to retry the access if the upper level SPTEs aren't
	 * in place, or if the target leaf SPTE is frozen by another CPU.
	 */
	if (iter.level != fault->goal_level || is_removed_spte(iter.old_spte)) {
		rcu_read_unlock();
		return RET_PF_RETRY;
	}

	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
	rcu_read_unlock();

	return ret;
}

/*
 * Zap the leaf SPTEs covering @range in the range's address space.  Yielding
 * is allowed only if range->may_block is set.  @flush is the caller's pending
 * flush state; returns true if a TLB flush is needed.
 */
bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
				 bool flush)
{
	return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start,
				     range->end, range->may_block, flush);
}

/* Per-SPTE callback used by kvm_tdp_mmu_handle_gfn(). */
typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
			      struct kvm_gfn_range *range);

/*
 * Invoke @handler on each entry visited by the leaf iterator for GFNs
 * [range->start, range->end), in every root of the range's address space.
 * Returns the logical OR of the handler results.
 */
static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
						   struct kvm_gfn_range *range,
						   tdp_handler_t handler)
{
	struct kvm_mmu_page *root;
	struct tdp_iter iter;
	bool ret = false;

	/*
	 * Don't support rescheduling, none of the MMU notifiers that funnel
	 * into this helper allow blocking; it'd be dead, wasteful code.
	 */
	for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
		rcu_read_lock();

		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
			ret |= handler(kvm, &iter, range);

		rcu_read_unlock();
	}

	return ret;
}

/*
 * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
 * if any of the GFNs in the range have been accessed.
 */
static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
			  struct kvm_gfn_range *range)
{
	u64 new_spte = 0;

	/* If we have a non-accessed entry we don't need to change the pte. */
	if (!is_accessed_spte(iter->old_spte))
		return false;

	new_spte = iter->old_spte;

	if (spte_ad_enabled(new_spte)) {
		/* A/D bits in use: simply clear the accessed bit. */
		new_spte &= ~shadow_accessed_mask;
	} else {
		/*
		 * Capture the dirty status of the page, so that it doesn't get
		 * lost when the SPTE is marked for access tracking.
		 */
		if (is_writable_pte(new_spte))
			kvm_set_pfn_dirty(spte_to_pfn(new_spte));

		new_spte = mark_spte_for_access_track(new_spte);
	}

	tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);

	return true;
}

/* Age all SPTEs covering @range; returns true if any had been accessed. */
bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
}

/* Report whether the SPTE is marked accessed, without modifying it. */
static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
			 struct kvm_gfn_range *range)
{
	return is_accessed_spte(iter->old_spte);
}

/* Returns true if any SPTE covering @range is marked accessed. */
bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
}

/*
 * Update the 4K SPTE at @iter for a changed host PTE (range->pte): the SPTE
 * is always zapped, and re-created from the new PFN only when the new host
 * PTE is not writable.  Returns true if the SPTE was modified.
 */
static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
			 struct kvm_gfn_range *range)
{
	u64 new_spte;

	/* Huge pages aren't expected to be modified without first being zapped. */
	WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);

	if (iter->level != PG_LEVEL_4K ||
	    !is_shadow_present_pte(iter->old_spte))
		return false;

	/*
	 * Note, when changing a read-only SPTE, it's not strictly necessary to
	 * zero the SPTE before setting the new PFN, but doing so preserves the
	 * invariant that the PFN of a present leaf SPTE can never change.
	 * See __handle_changed_spte().
	 */
	tdp_mmu_set_spte(kvm, iter, 0);

	if (!pte_write(range->pte)) {
		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
								  pte_pfn(range->pte));

		tdp_mmu_set_spte(kvm, iter, new_spte);
	}

	return true;
}

/*
 * Handle the changed_pte MMU notifier for the TDP MMU.
 * The new pte_t mapping the HVA specified by the MMU notifier is passed in
 * range->pte.
 * Returns non-zero if a flush is needed before releasing the MMU lock.
*/
bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	/*
	 * No need to handle the remote TLB flush under RCU protection, the
	 * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
	 * shadow page.  See the WARN on pfn_changed in __handle_changed_spte().
	 */
	return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
}

/*
 * Remove write access from all SPTEs at or above min_level that map GFNs
 * [start, end).  Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 *
 * SPTEs are updated atomically (the resched helper is called with
 * shared == true); a failed update is retried on the same entry.
 */
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			     gfn_t start, gfn_t end, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		/* Only present, writable leaf SPTEs need to be changed. */
		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level) ||
		    !(iter.old_spte & PT_WRITABLE_MASK))
			continue;

		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;

		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
			goto retry;

		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}

/*
 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
 * only affect leaf SPTEs down to min_level.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
*/
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
			     const struct kvm_memory_slot *slot, int min_level)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_read(&kvm->mmu_lock);

	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
			     slot->base_gfn + slot->npages, min_level);

	return spte_set;
}

/*
 * Allocate a zeroed shadow page header plus a zeroed page table page using
 * @gfp.  Returns NULL if either allocation fails; nothing is leaked in that
 * case.
 */
static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
{
	struct kvm_mmu_page *sp;

	gfp |= __GFP_ZERO;

	sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
	if (!sp)
		return NULL;

	sp->spt = (void *)__get_free_page(gfp);
	if (!sp->spt) {
		kmem_cache_free(mmu_page_header_cache, sp);
		return NULL;
	}

	return sp;
}

/*
 * Allocate a shadow page for splitting a huge page.  A non-blocking
 * allocation is attempted first; if it fails, the RCU read lock and mmu_lock
 * are dropped, iter->yielded is set, a blocking allocation is attempted, and
 * both locks are retaken before returning.  Returns NULL if the blocking
 * allocation also fails.
 */
static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
						       struct tdp_iter *iter,
						       bool shared)
{
	struct kvm_mmu_page *sp;

	/*
	 * Since we are allocating while under the MMU lock we have to be
	 * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
	 * reclaim and to avoid making any filesystem callbacks (which can end
	 * up invoking KVM MMU notifiers, resulting in a deadlock).
	 *
	 * If this allocation fails we drop the lock and retry with reclaim
	 * allowed.
	 */
	sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
	if (sp)
		return sp;

	rcu_read_unlock();

	if (shared)
		read_unlock(&kvm->mmu_lock);
	else
		write_unlock(&kvm->mmu_lock);

	/* Tell the caller the walk must be restarted: the locks were dropped. */
	iter->yielded = true;
	sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);

	if (shared)
		read_lock(&kvm->mmu_lock);
	else
		write_lock(&kvm->mmu_lock);

	rcu_read_lock();

	return sp;
}

/*
 * Split the huge SPTE at @iter by filling @sp with PT64_ENT_PER_PAGE child
 * SPTEs derived from it and linking @sp in its place.
 *
 * Returns 0 on success, or the non-zero result of tdp_mmu_link_sp() if the
 * new page table could not be installed; @sp is not freed on failure.
 */
static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
				   struct kvm_mmu_page *sp, bool shared)
{
	const u64 huge_spte = iter->old_spte;
	const int level = iter->level;
	int ret, i;

	tdp_mmu_init_child_sp(sp, iter);

	/*
	 * No need for atomics when writing to sp->spt since the page table has
	 * not been linked in yet and thus is not reachable from any other CPU.
	 */
	for (i = 0; i < PT64_ENT_PER_PAGE; i++)
		sp->spt[i] = make_huge_page_split_spte(huge_spte, level, i);

	/*
	 * Replace the huge spte with a pointer to the populated lower level
	 * page table. Since we are making this change without a TLB flush vCPUs
	 * will see a mix of the split mappings and the original huge mapping,
	 * depending on what's currently in their TLB. This is fine from a
	 * correctness standpoint since the translation will be the same either
	 * way.
	 */
	ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared);
	if (ret)
		goto out;

	/*
	 * tdp_mmu_link_sp() will handle subtracting the huge page we
	 * are overwriting from the page stats.  But we have to manually update
	 * the page stats with the new present child pages.
	 */
	kvm_update_page_stats(kvm, level - 1, PT64_ENT_PER_PAGE);

out:
	trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
	return ret;
}

/*
 * Split every huge page above @target_level that maps GFNs [start, end) under
 * @root.  Returns 0 on success or -ENOMEM if a shadow page could not be
 * allocated.
 */
static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
					 struct kvm_mmu_page *root,
					 gfn_t start, gfn_t end,
					 int target_level, bool shared)
{
	struct kvm_mmu_page *sp = NULL;
	struct tdp_iter iter;
	int ret = 0;

	rcu_read_lock();

	/*
	 * Traverse the page table splitting all huge pages above the target
	 * level into one lower level. For example, if we encounter a 1GB page
	 * we split it into 512 2MB pages.
	 *
	 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
	 * to visit an SPTE before ever visiting its children, which means we
	 * will correctly recursively split huge pages that are more than one
	 * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
	 * and then splitting each of those to 512 4KB pages).
	 */
	for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
			continue;

		/* An sp left over from a failed split attempt is reused. */
		if (!sp) {
			sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
			if (!sp) {
				ret = -ENOMEM;
				trace_kvm_mmu_split_huge_page(iter.gfn,
							      iter.old_spte,
							      iter.level, ret);
				break;
			}

			/*
			 * The allocation dropped the locks, so the current
			 * entry may be stale; let the iterator restart.
			 */
			if (iter.yielded)
				continue;
		}

		if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
			goto retry;

		/* Ownership of sp passed to the page tables. */
		sp = NULL;
	}

	rcu_read_unlock();

	/*
	 * It's possible to exit the loop having never used the last sp if, for
	 * example, a vCPU doing HugePage NX splitting wins the race and
	 * installs its own sp in place of the last sp we tried to split.
	 */
	if (sp)
		tdp_mmu_free_sp(sp);

	return ret;
}


/*
 * Try to split all huge pages mapped by the TDP MMU down to the target level.
 * Stops at the first root for which splitting fails; the failure is not
 * reported to the caller.
 */
void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
				      const struct kvm_memory_slot *slot,
				      gfn_t start, gfn_t end,
				      int target_level, bool shared)
{
	struct kvm_mmu_page *root;
	int r = 0;

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
		r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
		if (r) {
			/*
			 * NOTE(review): presumably the yield-safe iterator
			 * holds a reference on the current root that it would
			 * drop when advancing; leaving the loop early requires
			 * dropping it by hand -- confirm against the macro.
			 */
			kvm_tdp_mmu_put_root(kvm, root, shared);
			break;
		}
	}
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
*/
static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			   gfn_t start, gfn_t end)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		/*
		 * Pick the bit that tracks dirtiness for this SPTE: the
		 * writable bit if the SPTE needs write protection for dirty
		 * logging, the dirty bit otherwise.  SPTEs that already have
		 * the bit clear are left alone.
		 */
		if (spte_ad_need_write_protect(iter.old_spte)) {
			if (is_writable_pte(iter.old_spte))
				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
			else
				continue;
		} else {
			if (iter.old_spte & shadow_dirty_mask)
				new_spte = iter.old_spte & ~shadow_dirty_mask;
			else
				continue;
		}

		/* Lost a race on this SPTE: re-evaluate the same entry. */
		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
			goto retry;

		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 *
 * Must be called with mmu_lock held for read; every valid root in the slot's
 * address space is processed.
 */
bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
				  const struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_read(&kvm->mmu_lock);

	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
				slot->base_gfn + slot->npages);

	return spte_set;
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask.
* If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 *
 * If @wrprot is true the writable bit is cleared regardless of whether the
 * SPTE would otherwise use the dirty bit.
 */
static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t gfn, unsigned long mask, bool wrprot)
{
	struct tdp_iter iter;
	u64 new_spte;

	rcu_read_lock();

	/*
	 * Start at the first GFN with a bit set in the mask.
	 * NOTE(review): __ffs() is, as far as I know, undefined for a zero
	 * argument; presumably callers never pass an empty mask -- confirm.
	 */
	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
				    gfn + BITS_PER_LONG) {
		/* Every requested GFN has been handled: stop early. */
		if (!mask)
			break;

		/* Skip huge mappings and GFNs whose bit is not (or no longer) set. */
		if (iter.level > PG_LEVEL_4K ||
		    !(mask & (1UL << (iter.gfn - gfn))))
			continue;

		mask &= ~(1UL << (iter.gfn - gfn));

		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
			if (is_writable_pte(iter.old_spte))
				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
			else
				continue;
		} else {
			if (iter.old_spte & shadow_dirty_mask)
				new_spte = iter.old_spte & ~shadow_dirty_mask;
			else
				continue;
		}

		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
	}

	rcu_read_unlock();
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask.  If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
*/
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
				       struct kvm_memory_slot *slot,
				       gfn_t gfn, unsigned long mask,
				       bool wrprot)
{
	struct kvm_mmu_page *root;

	/* Unlike the slot-wide variant, this path requires mmu_lock held for write. */
	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root, slot->as_id)
		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
}

/*
 * Clear leaf entries which could be replaced by large mappings, for
 * GFNs within the slot.
 *
 * A leaf SPTE is zapped only if its PFN is not reserved and its level is
 * below the maximum mapping level reported for its GFN.
 */
static void zap_collapsible_spte_range(struct kvm *kvm,
				       struct kvm_mmu_page *root,
				       const struct kvm_memory_slot *slot)
{
	gfn_t start = slot->base_gfn;
	gfn_t end = start + slot->npages;
	struct tdp_iter iter;
	kvm_pfn_t pfn;

	rcu_read_lock();

	tdp_root_for_each_pte(iter, root, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		/* Keep mappings that are already as large as they can be. */
		pfn = spte_to_pfn(iter.old_spte);
		if (kvm_is_reserved_pfn(pfn) ||
		    iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
							    pfn, PG_LEVEL_NUM))
			continue;

		/* Note, a successful atomic zap also does a remote TLB flush. */
		if (tdp_mmu_zap_spte_atomic(kvm, &iter))
			goto retry;
	}

	rcu_read_unlock();
}

/*
 * Clear leaf entries which could be replaced by large mappings, for GFNs
 * within the slot, in each valid TDP MMU root for the slot's address space.
1757 */ 1758 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, 1759 const struct kvm_memory_slot *slot) 1760 { 1761 struct kvm_mmu_page *root; 1762 1763 lockdep_assert_held_read(&kvm->mmu_lock); 1764 1765 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) 1766 zap_collapsible_spte_range(kvm, root, slot); 1767 } 1768 1769 /* 1770 * Removes write access on the last level SPTE mapping this GFN and unsets the 1771 * MMU-writable bit to ensure future writes continue to be intercepted. 1772 * Returns true if an SPTE was set and a TLB flush is needed. 1773 */ 1774 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, 1775 gfn_t gfn, int min_level) 1776 { 1777 struct tdp_iter iter; 1778 u64 new_spte; 1779 bool spte_set = false; 1780 1781 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); 1782 1783 rcu_read_lock(); 1784 1785 for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) { 1786 if (!is_shadow_present_pte(iter.old_spte) || 1787 !is_last_spte(iter.old_spte, iter.level)) 1788 continue; 1789 1790 new_spte = iter.old_spte & 1791 ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask); 1792 1793 if (new_spte == iter.old_spte) 1794 break; 1795 1796 tdp_mmu_set_spte(kvm, &iter, new_spte); 1797 spte_set = true; 1798 } 1799 1800 rcu_read_unlock(); 1801 1802 return spte_set; 1803 } 1804 1805 /* 1806 * Removes write access on the last level SPTE mapping this GFN and unsets the 1807 * MMU-writable bit to ensure future writes continue to be intercepted. 1808 * Returns true if an SPTE was set and a TLB flush is needed. 
1809 */ 1810 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, 1811 struct kvm_memory_slot *slot, gfn_t gfn, 1812 int min_level) 1813 { 1814 struct kvm_mmu_page *root; 1815 bool spte_set = false; 1816 1817 lockdep_assert_held_write(&kvm->mmu_lock); 1818 for_each_tdp_mmu_root(kvm, root, slot->as_id) 1819 spte_set |= write_protect_gfn(kvm, root, gfn, min_level); 1820 1821 return spte_set; 1822 } 1823 1824 /* 1825 * Return the level of the lowest level SPTE added to sptes. 1826 * That SPTE may be non-present. 1827 * 1828 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. 1829 */ 1830 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, 1831 int *root_level) 1832 { 1833 struct tdp_iter iter; 1834 struct kvm_mmu *mmu = vcpu->arch.mmu; 1835 gfn_t gfn = addr >> PAGE_SHIFT; 1836 int leaf = -1; 1837 1838 *root_level = vcpu->arch.mmu->shadow_root_level; 1839 1840 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { 1841 leaf = iter.level; 1842 sptes[leaf] = iter.old_spte; 1843 } 1844 1845 return leaf; 1846 } 1847 1848 /* 1849 * Returns the last level spte pointer of the shadow page walk for the given 1850 * gpa, and sets *spte to the spte value. This spte may be non-preset. If no 1851 * walk could be performed, returns NULL and *spte does not contain valid data. 1852 * 1853 * Contract: 1854 * - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. 1855 * - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end. 1856 * 1857 * WARNING: This function is only intended to be called during fast_page_fault. 
1858 */ 1859 u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, 1860 u64 *spte) 1861 { 1862 struct tdp_iter iter; 1863 struct kvm_mmu *mmu = vcpu->arch.mmu; 1864 gfn_t gfn = addr >> PAGE_SHIFT; 1865 tdp_ptep_t sptep = NULL; 1866 1867 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { 1868 *spte = iter.old_spte; 1869 sptep = iter.sptep; 1870 } 1871 1872 /* 1873 * Perform the rcu_dereference to get the raw spte pointer value since 1874 * we are passing it up to fast_page_fault, which is shared with the 1875 * legacy MMU and thus does not retain the TDP MMU-specific __rcu 1876 * annotation. 1877 * 1878 * This is safe since fast_page_fault obeys the contracts of this 1879 * function as well as all TDP MMU contracts around modifying SPTEs 1880 * outside of mmu_lock. 1881 */ 1882 return rcu_dereference(sptep); 1883 } 1884