// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>

/* Initializes the TDP MMU for the VM, if enabled. */
int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
	struct workqueue_struct *wq;

	wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);
	if (!wq)
		return -ENOMEM;

	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
	kvm->arch.tdp_mmu_zap_wq = wq;
	return 1;
}

/* Arbitrarily returns true so that this may be used in if statements. */
static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
							     bool shared)
{
	if (shared)
		lockdep_assert_held_read(&kvm->mmu_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	return true;
}

void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
	/* Also waits for any queued work items. */
	destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);

	WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

	/*
	 * Ensure that all the outstanding RCU callbacks to free shadow pages
	 * can run before the VM is torn down.  Work items on tdp_mmu_zap_wq
	 * can call kvm_tdp_mmu_put_root and create new callbacks.
	 */
	rcu_barrier();
}

static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
	free_page((unsigned long)sp->spt);
	kmem_cache_free(mmu_page_header_cache, sp);
}

/*
 * This is called through call_rcu in order to free TDP page table memory
 * safely with respect to other kernel threads that may be operating on
 * the memory.
 * By only accessing TDP MMU page table memory in an RCU read critical
 * section, and freeing it after a grace period, lockless access to that
 * memory won't use it after it is freed.
 */
static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
{
	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
					       rcu_head);

	tdp_mmu_free_sp(sp);
}

static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			     bool shared);

static void tdp_mmu_zap_root_work(struct work_struct *work)
{
	struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page,
						 tdp_mmu_async_work);
	struct kvm *kvm = root->tdp_mmu_async_data;

	read_lock(&kvm->mmu_lock);

	/*
	 * A TLB flush is not necessary as KVM performs a local TLB flush when
	 * allocating a new root (see kvm_mmu_load()), and when migrating a
	 * vCPU to a different pCPU.  Note, the local TLB flush on reuse also
	 * invalidates any paging-structure-cache entries, i.e. TLB entries for
	 * intermediate paging structures, that may be zapped, as such entries
	 * are associated with the ASID on both VMX and SVM.
	 */
	tdp_mmu_zap_root(kvm, root, true);

	/*
	 * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for
	 * avoiding an infinite loop.  By design, the root is reachable while
	 * it's being asynchronously zapped, thus a different task can put its
	 * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an
	 * asynchronously zapped root is unavoidable.
	 */
	kvm_tdp_mmu_put_root(kvm, root, true);

	read_unlock(&kvm->mmu_lock);
}

static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
	root->tdp_mmu_async_data = kvm;
	INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work);
	queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
}

static inline bool kvm_tdp_root_mark_invalid(struct kvm_mmu_page *page)
{
	union kvm_mmu_page_role role = page->role;

	role.invalid = true;

	/* No need to use cmpxchg, only the invalid bit can change. */
	role.word = xchg(&page->role.word, role.word);
	return role.invalid;
}

void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
			  bool shared)
{
	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
		return;

	WARN_ON(!is_tdp_mmu_page(root));

	/*
	 * The root now has refcount=0.  It is valid, but readers already
	 * cannot acquire a reference to it because kvm_tdp_mmu_get_root()
	 * rejects it.  This remains true for the rest of the execution
	 * of this function, because readers visit valid roots only
	 * (except for tdp_mmu_zap_root_work(), which however
	 * does not acquire any reference itself).
	 *
	 * Even though there are flows that need to visit all roots for
	 * correctness, they all take mmu_lock for write, so they cannot yet
	 * run concurrently.  The same is true after kvm_tdp_root_mark_invalid,
	 * since the root still has refcount=0.
	 *
	 * However, tdp_mmu_zap_root can yield, and writers do not expect to
	 * see refcount=0 (see for example kvm_tdp_mmu_invalidate_all_roots()).
	 * So the root temporarily gets an extra reference, going to refcount=1
	 * while staying invalid.  Readers still cannot acquire any reference;
	 * but writers are now allowed to run if tdp_mmu_zap_root yields and
	 * they might take an extra reference if they themselves yield.
	 * Therefore, when the reference is given back by the worker,
	 * there is no guarantee that the refcount is still 1.  If not, whoever
	 * puts the last reference will free the page, but they will not have
	 * to zap the root because a root cannot go from invalid to valid.
	 */
	if (!kvm_tdp_root_mark_invalid(root)) {
		refcount_set(&root->tdp_mmu_root_count, 1);

		/*
		 * Zapping the root in a worker is not just "nice to have";
		 * it is required because kvm_tdp_mmu_invalidate_all_roots()
		 * skips already-invalid roots.  If kvm_tdp_mmu_put_root() did
		 * not add the root to the workqueue, kvm_mmu_zap_all_fast()
		 * might return with some roots not zapped yet.
		 */
		tdp_mmu_schedule_zap_root(kvm, root);
		return;
	}

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_del_rcu(&root->link);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/*
 * Returns the next root after @prev_root (or the first root if @prev_root is
 * NULL).  A reference to the returned root is acquired, and the reference to
 * @prev_root is released (the caller obviously must hold a reference to
 * @prev_root if it's non-NULL).
 *
 * If @only_valid is true, invalid roots are skipped.
 *
 * Returns NULL if the end of tdp_mmu_roots was reached.
 */
static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
					      struct kvm_mmu_page *prev_root,
					      bool shared, bool only_valid)
{
	struct kvm_mmu_page *next_root;

	rcu_read_lock();

	if (prev_root)
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						  &prev_root->link,
						  typeof(*prev_root), link);
	else
		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						   typeof(*next_root), link);

	while (next_root) {
		if ((!only_valid || !next_root->role.invalid) &&
		    kvm_tdp_mmu_get_root(next_root))
			break;

		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
				&next_root->link, typeof(*next_root), link);
	}

	rcu_read_unlock();

	if (prev_root)
		kvm_tdp_mmu_put_root(kvm, prev_root, shared);

	return next_root;
}

/*
 * Note: this iterator gets and puts references to the roots it iterates over.
 * This makes it safe to release the MMU lock and yield within the loop, but
 * if exiting the loop early, the caller must drop the reference to the most
 * recent root. (Unless keeping a live reference is desirable.)
 *
 * If shared is set, this function is operating under the MMU lock in read
 * mode.  In the unlikely event that this thread must free a root, the lock
 * will be temporarily dropped and reacquired in write mode.
 */
#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid);	\
	     _root;								\
	     _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid))	\
		if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) &&		\
		    kvm_mmu_page_as_id(_root) != _as_id) {			\
		} else

#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)

#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)			\
	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false)

/*
 * Iterate over all TDP MMU roots.  Requires that mmu_lock be held for write,
 * the implication being that any flow that holds mmu_lock for read is
 * inherently yield-friendly and should use the yield-safe variant above.
 * Holding mmu_lock for write obviates the need for RCU protection as the list
 * is guaranteed to be stable.
 */
#define for_each_tdp_mmu_root(_kvm, _root, _as_id)			\
	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)	\
		if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&	\
		    kvm_mmu_page_as_id(_root) != _as_id) {		\
		} else
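
/*
 * Illustrative sketch (not part of the kernel source): a hypothetical caller
 * of the yield-safe iterators above.  It demonstrates the contract spelled out
 * in the comment: the iterator acquires and releases root references on the
 * caller's behalf, so a caller that breaks out of the loop early must drop the
 * reference to the root it stopped on.  The helper example_wants() is an
 * assumption used purely for illustration.
 */
#if 0
static bool example_walk_roots(struct kvm *kvm, int as_id)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_read(&kvm->mmu_lock);

	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, as_id, true) {
		if (example_wants(root)) {
			/* Exiting early: drop the reference the iterator took. */
			kvm_tdp_mmu_put_root(kvm, root, true);
			return true;
		}
	}

	return false;
}
#endif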

static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_page *sp;

	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);

	return sp;
}

static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
			    gfn_t gfn, union kvm_mmu_page_role role)
{
	INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);

	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

	sp->role = role;
	sp->gfn = gfn;
	sp->ptep = sptep;
	sp->tdp_mmu_page = true;

	trace_kvm_mmu_get_page(sp, true);
}

static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
				  struct tdp_iter *iter)
{
	struct kvm_mmu_page *parent_sp;
	union kvm_mmu_page_role role;

	parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));

	role = parent_sp->role;
	role.level--;

	tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
}

hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
	union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * Check for an existing root before allocating a new one.  Note, the
	 * role check prevents consuming an invalid root.
	 */
	for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
		if (root->role.word == role.word &&
		    kvm_tdp_mmu_get_root(root))
			goto out;
	}

	root = tdp_mmu_alloc_sp(vcpu);
	tdp_mmu_init_sp(root, NULL, 0, role);

	refcount_set(&root->tdp_mmu_root_count, 1);

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

out:
	return __pa(root->spt);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared);

static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
{
	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
		return;

	if (is_accessed_spte(old_spte) &&
	    (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
	     spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}

static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
					  u64 old_spte, u64 new_spte, int level)
{
	bool pfn_changed;
	struct kvm_memory_slot *slot;

	if (level > PG_LEVEL_4K)
		return;

	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	if ((!is_writable_pte(old_spte) || pfn_changed) &&
	    is_writable_pte(new_spte)) {
		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
		mark_page_dirty_in_slot(kvm, slot, gfn);
	}
}

static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	kvm_account_pgtable_pages((void *)sp->spt, +1);
	atomic64_inc(&kvm->arch.tdp_mmu_pages);
}

static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	kvm_account_pgtable_pages((void *)sp->spt, -1);
	atomic64_dec(&kvm->arch.tdp_mmu_pages);
}

/**
 * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
 *
 * @kvm: kvm instance
 * @sp: the page to be removed
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be adding or removing pages.
 */
static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
			      bool shared)
{
	tdp_unaccount_mmu_page(kvm, sp);

	if (!sp->nx_huge_page_disallowed)
		return;

	if (shared)
		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	sp->nx_huge_page_disallowed = false;
	untrack_possible_nx_huge_page(kvm, sp);

	if (shared)
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

/**
 * handle_removed_pt() - handle a page table removed from the TDP structure
 *
 * @kvm: kvm instance
 * @pt: the page removed from the paging structure
 * @shared: This operation may not be running under the exclusive use
 *	    of the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Given a page table that has been removed from the TDP paging structure,
 * iterates through the page table to clear SPTEs and free child page tables.
 *
 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
 * protection.  Since this thread removed it from the paging structure,
 * this thread will be responsible for ensuring the page is freed.  Hence the
 * early rcu_dereferences in the function.
 */
static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
	int level = sp->role.level;
	gfn_t base_gfn = sp->gfn;
	int i;

	trace_kvm_mmu_prepare_zap_page(sp);

	tdp_mmu_unlink_sp(kvm, sp, shared);

	for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
		tdp_ptep_t sptep = pt + i;
		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
		u64 old_spte;

		if (shared) {
			/*
			 * Set the SPTE to a nonpresent value that other
			 * threads will not overwrite.  If the SPTE was
			 * already marked as removed then another thread
			 * handling a page fault could overwrite it, so
			 * set the SPTE until it is set from some other
			 * value to the removed SPTE value.
			 */
			for (;;) {
				old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
				if (!is_removed_spte(old_spte))
					break;
				cpu_relax();
			}
		} else {
			/*
			 * If the SPTE is not MMU-present, there is no backing
			 * page associated with the SPTE and so no side effects
			 * that need to be recorded, and exclusive ownership of
			 * mmu_lock ensures the SPTE can't be made present.
			 * Note, zapping MMIO SPTEs is also unnecessary as they
			 * are guarded by the memslots generation, not by being
			 * unreachable.
			 */
			old_spte = kvm_tdp_mmu_read_spte(sptep);
			if (!is_shadow_present_pte(old_spte))
				continue;

			/*
			 * Use the common helper instead of a raw WRITE_ONCE as
			 * the SPTE needs to be updated atomically if it can be
			 * modified by a different vCPU outside of mmu_lock.
			 * Even though the parent SPTE is !PRESENT, the TLB
			 * hasn't yet been flushed, and both Intel and AMD
			 * document that A/D assists can use upper-level PxE
			 * entries that are cached in the TLB, i.e. the CPU can
			 * still access the page and mark it dirty.
			 *
			 * No retry is needed in the atomic update path as the
			 * sole concern is dropping a Dirty bit, i.e. no other
			 * task can zap/remove the SPTE as mmu_lock is held for
			 * write.  Marking the SPTE as a removed SPTE is not
			 * strictly necessary for the same reason, but using
			 * the removed SPTE value keeps the shared/exclusive
			 * paths consistent and allows the handle_changed_spte()
			 * call below to hardcode the new value to REMOVED_SPTE.
			 *
			 * Note, even though dropping a Dirty bit is the only
			 * scenario where a non-atomic update could result in a
			 * functional bug, simply checking the Dirty bit isn't
			 * sufficient as a fast page fault could read the upper
			 * level SPTE before it is zapped, and then make this
			 * target SPTE writable, resume the guest, and set the
			 * Dirty bit between reading the SPTE above and writing
			 * it here.
			 */
			old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
							  REMOVED_SPTE, level);
		}
		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
				    old_spte, REMOVED_SPTE, level, shared);
	}

	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/**
 * __handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Handle bookkeeping that might result from the modification of a SPTE.
 */
static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				  u64 old_spte, u64 new_spte, int level,
				  bool shared)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);
	bool was_leaf = was_present && is_last_spte(old_spte, level);
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
	WARN_ON(level < PG_LEVEL_4K);
	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));

	/*
	 * If this warning were to trigger it would indicate that there was a
	 * missing MMU notifier or a race with some notifier handler.
	 * A present, leaf SPTE should never be directly replaced with another
	 * present leaf SPTE pointing to a different PFN.  A notifier handler
	 * should be zapping the SPTE before the main MM's page table is
	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
	 * thread before replacement.
	 */
	if (was_leaf && is_leaf && pfn_changed) {
		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
		       "SPTE with another present leaf SPTE mapping a\n"
		       "different PFN!\n"
		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
		       as_id, gfn, old_spte, new_spte, level);

		/*
		 * Crash the host to prevent error propagation and guest data
		 * corruption.
		 */
		BUG();
	}

	if (old_spte == new_spte)
		return;

	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);

	if (is_leaf)
		check_spte_writable_invariants(new_spte);

	/*
	 * The only times a SPTE should be changed from a non-present to
	 * non-present state is when an MMIO entry is installed/modified/
	 * removed.  In that case, there is nothing to do here.
	 */
	if (!was_present && !is_present) {
		/*
		 * If this change does not involve a MMIO SPTE or removed SPTE,
		 * it is unexpected.  Log the change, though it should not
		 * impact the guest since both the former and current SPTEs
		 * are nonpresent.
		 */
		if (WARN_ON(!is_mmio_spte(old_spte) &&
			    !is_mmio_spte(new_spte) &&
			    !is_removed_spte(new_spte)))
			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
			       "should not be replaced with another,\n"
			       "different nonpresent SPTE, unless one or both\n"
			       "are MMIO SPTEs, or the new SPTE is\n"
			       "a temporary removed SPTE.\n"
			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
			       as_id, gfn, old_spte, new_spte, level);
		return;
	}

	if (is_leaf != was_leaf)
		kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);

	if (was_leaf && is_dirty_spte(old_spte) &&
	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));

	/*
	 * Recursively handle child PTs if the change removed a subtree from
	 * the paging structure.  Note the WARN on the PFN changing without the
	 * SPTE being converted to a hugepage (leaf) or being zapped.  Shadow
	 * pages are kernel allocations and should never be migrated.
	 */
	if (was_present && !was_leaf &&
	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared)
{
	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
			      shared);
	handle_changed_spte_acc_track(old_spte, new_spte, level);
	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
				      new_spte, level);
}

/*
 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
 * and handle the associated bookkeeping.  Do not mark the page dirty
 * in KVM's dirty bitmaps.
 *
 * If setting the SPTE fails because it has changed, iter->old_spte will be
 * refreshed to the current value of the spte.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * Return:
 * * 0      - If the SPTE was set.
 * * -EBUSY - If the SPTE cannot be set.  In this case this function will have
 *            no side-effects other than setting iter->old_spte to the last
 *            known value of the spte.
 */
static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
					  struct tdp_iter *iter,
					  u64 new_spte)
{
	u64 *sptep = rcu_dereference(iter->sptep);

	/*
	 * The caller is responsible for ensuring the old SPTE is not a REMOVED
	 * SPTE.  KVM should never attempt to zap or manipulate a REMOVED SPTE,
	 * and pre-checking before inserting a new SPTE is advantageous as it
	 * avoids unnecessary work.
	 */
	WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));

	lockdep_assert_held_read(&kvm->mmu_lock);

	/*
	 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
	 * does not hold the mmu_lock.
	 */
	if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
		return -EBUSY;

	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
			      new_spte, iter->level, true);
	handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);

	return 0;
}

static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
					  struct tdp_iter *iter)
{
	int ret;

	/*
	 * Freeze the SPTE by setting it to a special,
	 * non-present value.  This will stop other threads from
	 * immediately installing a present entry in its place
	 * before the TLBs are flushed.
	 */
	ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
	if (ret)
		return ret;

	kvm_flush_remote_tlbs_gfn(kvm, iter->gfn, iter->level);

	/*
	 * No other thread can overwrite the removed SPTE as they must either
	 * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
	 * overwrite the special removed SPTE value.  No bookkeeping is needed
	 * here since the SPTE is going from non-present to non-present.  Use
	 * the raw write helper to avoid an unnecessary check on volatile bits.
	 */
	__kvm_tdp_mmu_write_spte(iter->sptep, 0);

	return 0;
}
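
/*
 * Illustrative sketch (not part of the kernel source): how callers in this
 * file are expected to consume the -EBUSY contract of tdp_mmu_set_spte_atomic()
 * (see e.g. wrprot_gfn_range() below).  On failure the cmpxchg has refreshed
 * iter.old_spte, so the caller recomputes the new value and retries without
 * re-reading the SPTE by hand.  The surrounding loop body is elided.
 */
#if 0
	/* Inside a for_each_tdp_pte-style walk, with mmu_lock held for read: */
retry:
	new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
	if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
		goto retry;	/* iter.old_spte now holds the current value. */
#endif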

/*
 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
 * @kvm:	      KVM instance
 * @as_id:	      Address space ID, i.e. regular vs. SMM
 * @sptep:	      Pointer to the SPTE
 * @old_spte:	      The current value of the SPTE
 * @new_spte:	      The new value that will be set for the SPTE
 * @gfn:	      The base GFN that was (or will be) mapped by the SPTE
 * @level:	      The level _containing_ the SPTE (its parent PT's level)
 * @record_acc_track: Notify the MM subsystem of changes to the accessed state
 *		      of the page.  Should be set unless handling an MMU
 *		      notifier for access tracking.  Leaving record_acc_track
 *		      unset in that case prevents page accesses from being
 *		      double counted.
 *
 * Returns the old SPTE value, which _may_ be different than @old_spte if the
 * SPTE had volatile bits.
 */
static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
			      u64 old_spte, u64 new_spte, gfn_t gfn, int level,
			      bool record_acc_track)
{
	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * No thread should be using this function to set SPTEs to or from the
	 * temporary removed SPTE value.
	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
	 * should be used.  If operating under the MMU lock in write mode, the
	 * use of the removed SPTE should not be necessary.
	 */
	WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));

	old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);

	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);

	if (record_acc_track)
		handle_changed_spte_acc_track(old_spte, new_spte, level);

	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte, new_spte,
				      level);
	return old_spte;
}

static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				     u64 new_spte, bool record_acc_track)
{
	WARN_ON_ONCE(iter->yielded);

	iter->old_spte = __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
					    iter->old_spte, new_spte,
					    iter->gfn, iter->level,
					    record_acc_track);
}

static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				    u64 new_spte)
{
	_tdp_mmu_set_spte(kvm, iter, new_spte, true);
}

static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	_tdp_mmu_set_spte(kvm, iter, new_spte, false);
}

#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
	for_each_tdp_pte(_iter, _root, _start, _end)

#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
		if (!is_shadow_present_pte(_iter.old_spte) ||		\
		    !is_last_spte(_iter.old_spte, _iter.level))		\
			continue;					\
		else

#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
	for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end)

/*
 * Yield if the MMU lock is contended or this thread needs to return control
 * to the scheduler.
 *
 * If this function should yield and flush is set, it will perform a remote
 * TLB flush before yielding.
 *
 * If this function yields, iter->yielded is set and the caller must skip to
 * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
 * over the paging structures to allow the iterator to continue its traversal
 * from the paging structure root.
 *
 * Returns true if this function yielded.
 */
static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
							  struct tdp_iter *iter,
							  bool flush, bool shared)
{
	WARN_ON(iter->yielded);

	/* Ensure forward progress has been made before yielding. */
	if (iter->next_last_level_gfn == iter->yielded_gfn)
		return false;

	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
		if (flush)
			kvm_flush_remote_tlbs(kvm);

		rcu_read_unlock();

		if (shared)
			cond_resched_rwlock_read(&kvm->mmu_lock);
		else
			cond_resched_rwlock_write(&kvm->mmu_lock);

		rcu_read_lock();

		WARN_ON(iter->gfn > iter->next_last_level_gfn);

		iter->yielded = true;
	}

	return iter->yielded;
}
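
/*
 * Illustrative sketch (not part of the kernel source): the canonical shape of
 * a yielding walk built on tdp_mmu_iter_cond_resched(), mirroring
 * tdp_mmu_zap_leafs() below.  When the helper yields it sets iter.yielded, so
 * the caller simply continues and lets tdp_iter_next() restart the walk from
 * the paging structure root on the next iteration.
 */
#if 0
	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
		if (can_yield &&
		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
			flush = false;
			continue;
		}

		/* ... process iter.old_spte ... */
	}
#endif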

static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
{
	/*
	 * Bound TDP MMU walks at host.MAXPHYADDR.  KVM disallows memslots with
	 * a gpa range that would exceed the max gfn, and KVM does not create
	 * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
	 * the slow emulation path every time.
	 */
	return kvm_mmu_max_gfn() + 1;
}

static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			       bool shared, int zap_level)
{
	struct tdp_iter iter;

	gfn_t end = tdp_mmu_max_gfn_exclusive();
	gfn_t start = 0;

	for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
			continue;

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		if (iter.level > zap_level)
			continue;

		if (!shared)
			tdp_mmu_set_spte(kvm, &iter, 0);
		else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
			goto retry;
	}
}

static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			     bool shared)
{

	/*
	 * The root must have an elevated refcount so that it's reachable via
	 * mmu_notifier callbacks, which allows this path to yield and drop
	 * mmu_lock.  When handling an unmap/release mmu_notifier command, KVM
	 * must drop all references to relevant pages prior to completing the
	 * callback.  Dropping mmu_lock with an unreachable root would result
	 * in zapping SPTEs after a relevant mmu_notifier callback completes
	 * and lead to use-after-free as zapping a SPTE triggers "writeback" of
	 * dirty accessed bits to the SPTE's associated struct page.
	 */
	WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	rcu_read_lock();

	/*
	 * To avoid RCU stalls due to recursively removing huge swaths of SPs,
	 * split the zap into two passes.  On the first pass, zap at the 1gb
	 * level, and then zap top-level SPs on the second pass.  "1gb" is not
	 * arbitrary, as KVM must be able to zap a 1gb shadow page without
	 * inducing a stall to allow in-place replacement with a 1gb hugepage.
	 *
	 * Because zapping a SP recurses on its children, stepping down to
	 * PG_LEVEL_4K in the iterator itself is unnecessary.
	 */
	__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
	__tdp_mmu_zap_root(kvm, root, shared, root->role.level);

	rcu_read_unlock();
}

bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	u64 old_spte;

	/*
	 * This helper intentionally doesn't allow zapping a root shadow page,
	 * which doesn't have a parent page table and thus no associated entry.
	 */
	if (WARN_ON_ONCE(!sp->ptep))
		return false;

	old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
	if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
		return false;

	__tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
			   sp->gfn, sp->role.level + 1, true);

	return true;
}

/*
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock.  If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
 * operation can cause a soft lockup.
 */
static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t start, gfn_t end, bool can_yield, bool flush)
{
	struct tdp_iter iter;

	end = min(end, tdp_mmu_max_gfn_exclusive());

	lockdep_assert_held_write(&kvm->mmu_lock);

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
		if (can_yield &&
		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
			flush = false;
			continue;
		}

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		tdp_mmu_set_spte(kvm, &iter, 0);
		flush = true;
	}

	rcu_read_unlock();

	/*
	 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
	 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
	 */
	return flush;
}

/*
 * Zap leaf SPTEs for the range of gfns, [start, end), for all roots.  Returns
 * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or
 * more SPTEs were zapped since the MMU lock was last acquired.
 */
bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
			   bool can_yield, bool flush)
{
	struct kvm_mmu_page *root;

	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
		flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush);

	return flush;
}

void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
	struct kvm_mmu_page *root;
	int i;

	/*
	 * Zap all roots, including invalid roots, as all SPTEs must be dropped
	 * before returning to the caller.  Zap directly even if the root is
	 * also being zapped by a worker.  Walking zapped top-level SPTEs isn't
	 * all that expensive and mmu_lock is already held, which means the
	 * worker has yielded, i.e. flushing the work instead of zapping here
	 * isn't guaranteed to be any faster.
	 *
	 * A TLB flush is unnecessary, KVM zaps everything if and only if the
	 * VM is being destroyed or the userspace VMM has exited.  In both
	 * cases, KVM_RUN is unreachable, i.e. no vCPUs will ever service the
	 * request.
	 */
	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
		for_each_tdp_mmu_root_yield_safe(kvm, root, i)
			tdp_mmu_zap_root(kvm, root, false);
	}
}

/*
 * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
 * zap" completes.
 */
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
{
	flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
}

/*
 * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
 * is about to be zapped, e.g. in response to a memslots update.  The actual
 * zapping is performed asynchronously, so a reference is taken on all roots.
 * Using a separate workqueue makes it easy to ensure that the destruction is
 * performed before the "fast zap" completes, without keeping a separate list
 * of invalidated roots; the list is effectively the list of work items in
 * the workqueue.
 *
 * Get a reference even if the root is already invalid, the asynchronous worker
 * assumes it was gifted a reference to the root it processes.  Because mmu_lock
 * is held for write, it should be impossible to observe a root with zero
 * refcount, i.e. the list of roots cannot be stale.
 *
 * This has essentially the same effect for the TDP MMU
 * as updating mmu_valid_gen does for the shadow MMU.
 */
void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);
	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
		if (!root->role.invalid &&
		    !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) {
			root->role.invalid = true;
			tdp_mmu_schedule_zap_root(kvm, root);
		}
	}
}
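
/*
 * Illustrative sketch (not part of the kernel source): the rough order of
 * operations the "fast zap" caller (kvm_mmu_zap_all_fast() in mmu.c) follows,
 * e.g. on a memslots update.  Caller-side details are elided; this only shows
 * how the two helpers above pair up.
 */
#if 0
	write_lock(&kvm->mmu_lock);
	/* Mark every valid root invalid and queue it for asynchronous zapping. */
	kvm_tdp_mmu_invalidate_all_roots(kvm);
	write_unlock(&kvm->mmu_lock);

	/* ... vCPUs load new roots, TLBs are flushed, etc. ... */

	/* Wait for the queued workers so that all old SPTEs are really gone. */
	kvm_tdp_mmu_zap_invalidated_roots(kvm);
#endif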

/*
 * Installs a last-level SPTE to handle a TDP page fault.
 * (NPT/EPT violation/misconfiguration)
 */
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
					   struct kvm_page_fault *fault,
					   struct tdp_iter *iter)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
	u64 new_spte;
	int ret = RET_PF_FIXED;
	bool wrprot = false;

	if (WARN_ON_ONCE(sp->role.level != fault->goal_level))
		return RET_PF_RETRY;

	if (unlikely(!fault->slot))
		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
	else
		wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
				   fault->pfn, iter->old_spte, fault->prefetch, true,
				   fault->map_writable, &new_spte);

	if (new_spte == iter->old_spte)
		ret = RET_PF_SPURIOUS;
	else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
		return RET_PF_RETRY;
	else if (is_shadow_present_pte(iter->old_spte) &&
		 !is_last_spte(iter->old_spte, iter->level))
		kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level);

	/*
	 * If the page fault was caused by a write but the page is write
	 * protected, emulation is needed.  If the emulation was skipped,
	 * the vCPU would have the same fault again.
	 */
	if (wrprot) {
		if (fault->write)
			ret = RET_PF_EMULATE;
	}

	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
	if (unlikely(is_mmio_spte(new_spte))) {
		vcpu->stat.pf_mmio_spte_created++;
		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
				     new_spte);
		ret = RET_PF_EMULATE;
	} else {
		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
				       rcu_dereference(iter->sptep));
	}

	return ret;
}

/*
 * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
 * provided page table.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @sp: The new TDP page table to install.
 * @shared: This operation is running under the MMU lock in read mode.
 *
 * Returns: 0 if the new page table was installed.  Non-0 if the page table
 *          could not be installed (e.g. the atomic compare-exchange failed).
 */
static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
			   struct kvm_mmu_page *sp, bool shared)
{
	u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled());
	int ret = 0;

	if (shared) {
		ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
		if (ret)
			return ret;
	} else {
		tdp_mmu_set_spte(kvm, iter, spte);
	}

	tdp_account_mmu_page(kvm, sp);

	return 0;
}

static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
				   struct kvm_mmu_page *sp, bool shared);

/*
 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
 * page tables and SPTEs to translate the faulting guest physical address.
 */
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	struct kvm *kvm = vcpu->kvm;
	struct tdp_iter iter;
	struct kvm_mmu_page *sp;
	int ret = RET_PF_RETRY;

	kvm_mmu_hugepage_adjust(vcpu, fault);

	trace_kvm_mmu_spte_requested(fault);

	rcu_read_lock();

	tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
		int r;

		if (fault->nx_huge_page_workaround_enabled)
			disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);

		/*
		 * If SPTE has been frozen by another thread, just give up and
		 * retry, avoiding unnecessary page table allocation and free.
		 */
		if (is_removed_spte(iter.old_spte))
			goto retry;

		if (iter.level == fault->goal_level)
			goto map_target_level;

		/* Step down into the lower level page table if it exists. */
		if (is_shadow_present_pte(iter.old_spte) &&
		    !is_large_pte(iter.old_spte))
			continue;

		/*
		 * The SPTE is either non-present or points to a huge page that
		 * needs to be split.
		 */
		sp = tdp_mmu_alloc_sp(vcpu);
		tdp_mmu_init_child_sp(sp, &iter);

		sp->nx_huge_page_disallowed = fault->huge_page_disallowed;

		if (is_shadow_present_pte(iter.old_spte))
			r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
		else
			r = tdp_mmu_link_sp(kvm, &iter, sp, true);

		/*
		 * Force the guest to retry if installing an upper level SPTE
		 * failed, e.g. because a different task modified the SPTE.
		 */
		if (r) {
			tdp_mmu_free_sp(sp);
			goto retry;
		}

		if (fault->huge_page_disallowed &&
		    fault->req_level >= iter.level) {
			spin_lock(&kvm->arch.tdp_mmu_pages_lock);
			if (sp->nx_huge_page_disallowed)
				track_possible_nx_huge_page(kvm, sp);
			spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
		}
	}

	/*
	 * The walk aborted before reaching the target level, e.g. because the
	 * iterator detected an upper level SPTE was frozen during traversal.
	 */
	WARN_ON_ONCE(iter.level == fault->goal_level);
	goto retry;

map_target_level:
	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);

retry:
	rcu_read_unlock();
	return ret;
}

bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
				 bool flush)
{
	return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start,
				     range->end, range->may_block, flush);
}

typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
			      struct kvm_gfn_range *range);

static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
						   struct kvm_gfn_range *range,
						   tdp_handler_t handler)
{
	struct kvm_mmu_page *root;
	struct tdp_iter iter;
	bool ret = false;

	/*
	 * Don't support rescheduling, none of the MMU notifiers that funnel
	 * into this helper allow blocking; it'd be dead, wasteful code.
	 */
	for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
		rcu_read_lock();

		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
			ret |= handler(kvm, &iter, range);

		rcu_read_unlock();
	}

	return ret;
}

/*
 * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
 * if any of the GFNs in the range have been accessed.
 */
static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
			  struct kvm_gfn_range *range)
{
	u64 new_spte = 0;

	/* If we have a non-accessed entry we don't need to change the pte. */
	if (!is_accessed_spte(iter->old_spte))
		return false;

	new_spte = iter->old_spte;

	if (spte_ad_enabled(new_spte)) {
		new_spte &= ~shadow_accessed_mask;
	} else {
		/*
		 * Capture the dirty status of the page, so that it doesn't get
		 * lost when the SPTE is marked for access tracking.
		 */
		if (is_writable_pte(new_spte))
			kvm_set_pfn_dirty(spte_to_pfn(new_spte));

		new_spte = mark_spte_for_access_track(new_spte);
	}

	tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);

	return true;
}

bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
}

static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
			 struct kvm_gfn_range *range)
{
	return is_accessed_spte(iter->old_spte);
}

bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
}

static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
			 struct kvm_gfn_range *range)
{
	u64 new_spte;

	/* Huge pages aren't expected to be modified without first being zapped. */
	WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);

	if (iter->level != PG_LEVEL_4K ||
	    !is_shadow_present_pte(iter->old_spte))
		return false;

	/*
	 * Note, when changing a read-only SPTE, it's not strictly necessary to
	 * zero the SPTE before setting the new PFN, but doing so preserves the
	 * invariant that the PFN of a present leaf SPTE can never change.
	 * See __handle_changed_spte().
	 */
	tdp_mmu_set_spte(kvm, iter, 0);

	if (!pte_write(range->pte)) {
		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
								  pte_pfn(range->pte));

		tdp_mmu_set_spte(kvm, iter, new_spte);
	}

	return true;
}

/*
 * Handle the changed_pte MMU notifier for the TDP MMU.
 * data is a pointer to the new pte_t mapping the HVA specified by the MMU
 * notifier.
 * Returns non-zero if a flush is needed before releasing the MMU lock.
 */
bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	/*
	 * No need to handle the remote TLB flush under RCU protection, the
	 * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
	 * shadow page.  See the WARN on pfn_changed in __handle_changed_spte().
	 */
	return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
}

/*
 * Remove write access from all SPTEs at or above min_level that map GFNs
 * [start, end).  Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			     gfn_t start, gfn_t end, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level) ||
		    !(iter.old_spte & PT_WRITABLE_MASK))
			continue;

		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;

		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
			goto retry;

		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}

/*
 * Remove write access from all the SPTEs mapping GFNs in the memslot.  Will
 * only affect leaf SPTEs down to min_level.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
			     const struct kvm_memory_slot *slot, int min_level)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_read(&kvm->mmu_lock);

	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
			     slot->base_gfn + slot->npages, min_level);

	return spte_set;
}

static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
{
	struct kvm_mmu_page *sp;

	gfp |= __GFP_ZERO;

	sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
	if (!sp)
		return NULL;

	sp->spt = (void *)__get_free_page(gfp);
	if (!sp->spt) {
		kmem_cache_free(mmu_page_header_cache, sp);
		return NULL;
	}

	return sp;
}

static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
						       struct tdp_iter *iter,
						       bool shared)
{
	struct kvm_mmu_page *sp;

	/*
	 * Since we are allocating while under the MMU lock we have to be
	 * careful about GFP flags.  Use GFP_NOWAIT to avoid blocking on direct
	 * reclaim and to avoid making any filesystem callbacks (which can end
	 * up invoking KVM MMU notifiers, resulting in a deadlock).
	 *
	 * If this allocation fails we drop the lock and retry with reclaim
	 * allowed.
	 */
	sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
	if (sp)
		return sp;

	rcu_read_unlock();

	if (shared)
		read_unlock(&kvm->mmu_lock);
	else
		write_unlock(&kvm->mmu_lock);

	iter->yielded = true;
	sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);

	if (shared)
		read_lock(&kvm->mmu_lock);
	else
		write_lock(&kvm->mmu_lock);

	rcu_read_lock();

	return sp;
}

/* Note, the caller is responsible for initializing @sp. */
static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
				   struct kvm_mmu_page *sp, bool shared)
{
	const u64 huge_spte = iter->old_spte;
	const int level = iter->level;
	int ret, i;

	/*
	 * No need for atomics when writing to sp->spt since the page table has
	 * not been linked in yet and thus is not reachable from any other CPU.
	 */
	for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
		sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i);

	/*
	 * Replace the huge spte with a pointer to the populated lower level
	 * page table.  Since we are making this change without a TLB flush vCPUs
	 * will see a mix of the split mappings and the original huge mapping,
	 * depending on what's currently in their TLB.  This is fine from a
	 * correctness standpoint since the translation will be the same either
	 * way.
	 */
	ret = tdp_mmu_link_sp(kvm, iter, sp, shared);
	if (ret)
		goto out;

	/*
	 * tdp_mmu_link_sp() will handle subtracting the huge page we are
	 * overwriting from the page stats.  But we have to manually update
	 * the page stats with the new present child pages.
	 */
	kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE);

out:
	trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
	return ret;
}

static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
					 struct kvm_mmu_page *root,
					 gfn_t start, gfn_t end,
					 int target_level, bool shared)
{
	struct kvm_mmu_page *sp = NULL;
	struct tdp_iter iter;
	int ret = 0;

	rcu_read_lock();

	/*
	 * Traverse the page table splitting all huge pages above the target
	 * level into one lower level.  For example, if we encounter a 1GB page
	 * we split it into 512 2MB pages.
	 *
	 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
	 * to visit an SPTE before ever visiting its children, which means we
	 * will correctly recursively split huge pages that are more than one
	 * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
	 * and then splitting each of those to 512 4KB pages).
	 */
	for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
			continue;

		if (!sp) {
			sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
			if (!sp) {
				ret = -ENOMEM;
				trace_kvm_mmu_split_huge_page(iter.gfn,
							      iter.old_spte,
							      iter.level, ret);
				break;
			}

			if (iter.yielded)
				continue;
		}

		tdp_mmu_init_child_sp(sp, &iter);

		if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
			goto retry;

		sp = NULL;
	}

	rcu_read_unlock();

	/*
	 * It's possible to exit the loop having never used the last sp if, for
	 * example, a vCPU doing HugePage NX splitting wins the race and
	 * installs its own sp in place of the last sp we tried to split.
	 */
	if (sp)
		tdp_mmu_free_sp(sp);

	return ret;
}


/*
 * Try to split all huge pages mapped by the TDP MMU down to the target level.
 */
void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
				      const struct kvm_memory_slot *slot,
				      gfn_t start, gfn_t end,
				      int target_level, bool shared)
{
	struct kvm_mmu_page *root;
	int r = 0;

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
		r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
		if (r) {
			kvm_tdp_mmu_put_root(kvm, root, shared);
			break;
		}
	}
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot.  If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE.  Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t start, gfn_t end)
{
	u64 dbit = kvm_ad_enabled() ? shadow_dirty_mask : PT_WRITABLE_MASK;
	struct tdp_iter iter;
	bool spte_set = false;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		MMU_WARN_ON(kvm_ad_enabled() &&
			    spte_ad_need_write_protect(iter.old_spte));

		if (!(iter.old_spte & dbit))
			continue;

		if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit))
			goto retry;

		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot.  If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE.  Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
				  const struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_read(&kvm->mmu_lock);

	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
				slot->base_gfn + slot->npages);

	return spte_set;
}
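
/*
 * Illustrative sketch (not part of the kernel source): the mask<->gfn mapping
 * used by clear_dirty_pt_masked() below.  Bit N of @mask selects @gfn + N, so
 * a caller re-enabling dirty tracking for gfns 0x1000 and 0x1002 of a slot
 * would pass gfn = 0x1000 and mask = 0x5, roughly as
 * kvm_arch_mmu_enable_log_dirty_pt_masked() in mmu.c does.
 */
#if 0
	/* With mmu_lock held for write: */
	kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, 0x1000, 0x5, false);
#endif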

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn.  The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask.  If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t gfn, unsigned long mask, bool wrprot)
{
	u64 dbit = (wrprot || !kvm_ad_enabled()) ? PT_WRITABLE_MASK :
						   shadow_dirty_mask;
	struct tdp_iter iter;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
				   gfn + BITS_PER_LONG) {
		if (!mask)
			break;

		MMU_WARN_ON(kvm_ad_enabled() &&
			    spte_ad_need_write_protect(iter.old_spte));

		if (iter.level > PG_LEVEL_4K ||
		    !(mask & (1UL << (iter.gfn - gfn))))
			continue;

		mask &= ~(1UL << (iter.gfn - gfn));

		if (!(iter.old_spte & dbit))
			continue;

		iter.old_spte = tdp_mmu_clear_spte_bits(iter.sptep,
							iter.old_spte, dbit,
							iter.level);

		trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level,
					       iter.old_spte,
					       iter.old_spte & ~dbit);
		kvm_set_pfn_dirty(spte_to_pfn(iter.old_spte));
	}

	rcu_read_unlock();
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn.  The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask.  If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
				       struct kvm_memory_slot *slot,
				       gfn_t gfn, unsigned long mask,
				       bool wrprot)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root, slot->as_id)
		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
}

static void zap_collapsible_spte_range(struct kvm *kvm,
				       struct kvm_mmu_page *root,
				       const struct kvm_memory_slot *slot)
{
	gfn_t start = slot->base_gfn;
	gfn_t end = start + slot->npages;
	struct tdp_iter iter;
	int max_mapping_level;

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
		    !is_shadow_present_pte(iter.old_spte))
			continue;

		/*
		 * Don't zap leaf SPTEs, if a leaf SPTE could be replaced with
		 * a large page size, then its parent would have been zapped
		 * instead of stepping down.
		 */
		if (is_last_spte(iter.old_spte, iter.level))
			continue;

		/*
		 * If iter.gfn resides outside of the slot, i.e. the page for
		 * the current level overlaps but is not contained by the slot,
		 * then the SPTE can't be made huge.  More importantly, trying
		 * to query that info from slot->arch.lpage_info will cause an
		 * out-of-bounds access.
		 */
		if (iter.gfn < start || iter.gfn >= end)
			continue;

		max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
							      iter.gfn, PG_LEVEL_NUM);
		if (max_mapping_level < iter.level)
			continue;

		/* Note, a successful atomic zap also does a remote TLB flush. */
		if (tdp_mmu_zap_spte_atomic(kvm, &iter))
			goto retry;
	}

	rcu_read_unlock();
}

/*
 * Zap non-leaf SPTEs (and free their associated page tables) which could
 * be replaced by huge pages, for GFNs within the slot.
 */
void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
				       const struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_read(&kvm->mmu_lock);

	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
		zap_collapsible_spte_range(kvm, root, slot);
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t gfn, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		new_spte = iter.old_spte &
			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);

		if (new_spte == iter.old_spte)
			break;

		tdp_mmu_set_spte(kvm, &iter, new_spte);
		spte_set = true;
	}

	rcu_read_unlock();

	return spte_set;
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
				   struct kvm_memory_slot *slot, gfn_t gfn,
				   int min_level)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root, slot->as_id)
		spte_set |= write_protect_gfn(kvm, root, gfn, min_level);

	return spte_set;
}

/*
 * Return the level of the lowest level SPTE added to sptes.
 * That SPTE may be non-present.
 *
 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
 */
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
			 int *root_level)
{
	struct tdp_iter iter;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	gfn_t gfn = addr >> PAGE_SHIFT;
	int leaf = -1;

	*root_level = vcpu->arch.mmu->root_role.level;

	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		leaf = iter.level;
		sptes[leaf] = iter.old_spte;
	}

	return leaf;
}
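
/*
 * Illustrative sketch (not part of the kernel source): how a caller such as
 * get_mmio_spte() in mmu.c is expected to use the walker above, honoring the
 * "must be called between kvm_tdp_mmu_walk_lockless_{begin,end}" contract.
 * The examine() helper is an assumption used purely for illustration.
 */
#if 0
	u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
	int root_level, leaf;

	kvm_tdp_mmu_walk_lockless_begin();
	leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root_level);
	kvm_tdp_mmu_walk_lockless_end();

	if (leaf >= 0)
		examine(sptes[leaf]);	/* Lowest SPTE reached; may be !PRESENT. */
#endif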

/*
 * Returns the last level spte pointer of the shadow page walk for the given
 * gpa, and sets *spte to the spte value.  This spte may be non-present.  If no
 * walk could be performed, returns NULL and *spte does not contain valid data.
 *
 * Contract:
 *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
 *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
 *
 * WARNING: This function is only intended to be called during fast_page_fault.
 */
u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
					u64 *spte)
{
	struct tdp_iter iter;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	gfn_t gfn = addr >> PAGE_SHIFT;
	tdp_ptep_t sptep = NULL;

	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		*spte = iter.old_spte;
		sptep = iter.sptep;
	}

	/*
	 * Perform the rcu_dereference to get the raw spte pointer value since
	 * we are passing it up to fast_page_fault, which is shared with the
	 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
	 * annotation.
	 *
	 * This is safe since fast_page_fault obeys the contracts of this
	 * function as well as all TDP MMU contracts around modifying SPTEs
	 * outside of mmu_lock.
	 */
	return rcu_dereference(sptep);
}