1 // SPDX-License-Identifier: GPL-2.0 2 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 3 4 #include "mmu.h" 5 #include "mmu_internal.h" 6 #include "mmutrace.h" 7 #include "tdp_iter.h" 8 #include "tdp_mmu.h" 9 #include "spte.h" 10 11 #include <asm/cmpxchg.h> 12 #include <trace/events/kvm.h> 13 14 /* Initializes the TDP MMU for the VM, if enabled. */ 15 int kvm_mmu_init_tdp_mmu(struct kvm *kvm) 16 { 17 struct workqueue_struct *wq; 18 19 wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0); 20 if (!wq) 21 return -ENOMEM; 22 23 INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots); 24 spin_lock_init(&kvm->arch.tdp_mmu_pages_lock); 25 kvm->arch.tdp_mmu_zap_wq = wq; 26 return 1; 27 } 28 29 /* Arbitrarily returns true so that this may be used in if statements. */ 30 static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm, 31 bool shared) 32 { 33 if (shared) 34 lockdep_assert_held_read(&kvm->mmu_lock); 35 else 36 lockdep_assert_held_write(&kvm->mmu_lock); 37 38 return true; 39 } 40 41 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) 42 { 43 /* Also waits for any queued work items. */ 44 destroy_workqueue(kvm->arch.tdp_mmu_zap_wq); 45 46 WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages)); 47 WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); 48 49 /* 50 * Ensure that all the outstanding RCU callbacks to free shadow pages 51 * can run before the VM is torn down. Work items on tdp_mmu_zap_wq 52 * can call kvm_tdp_mmu_put_root and create new callbacks. 53 */ 54 rcu_barrier(); 55 } 56 57 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp) 58 { 59 free_page((unsigned long)sp->spt); 60 kmem_cache_free(mmu_page_header_cache, sp); 61 } 62 63 /* 64 * This is called through call_rcu in order to free TDP page table memory 65 * safely with respect to other kernel threads that may be operating on 66 * the memory. 67 * By only accessing TDP MMU page table memory in an RCU read critical 68 * section, and freeing it after a grace period, lockless access to that 69 * memory won't use it after it is freed. 70 */ 71 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head) 72 { 73 struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page, 74 rcu_head); 75 76 tdp_mmu_free_sp(sp); 77 } 78 79 static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root, 80 bool shared); 81 82 static void tdp_mmu_zap_root_work(struct work_struct *work) 83 { 84 struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page, 85 tdp_mmu_async_work); 86 struct kvm *kvm = root->tdp_mmu_async_data; 87 88 read_lock(&kvm->mmu_lock); 89 90 /* 91 * A TLB flush is not necessary as KVM performs a local TLB flush when 92 * allocating a new root (see kvm_mmu_load()), and when migrating vCPU 93 * to a different pCPU. Note, the local TLB flush on reuse also 94 * invalidates any paging-structure-cache entries, i.e. TLB entries for 95 * intermediate paging structures, that may be zapped, as such entries 96 * are associated with the ASID on both VMX and SVM. 97 */ 98 tdp_mmu_zap_root(kvm, root, true); 99 100 /* 101 * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for 102 * avoiding an infinite loop. By design, the root is reachable while 103 * it's being asynchronously zapped, thus a different task can put its 104 * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an 105 * asynchronously zapped root is unavoidable. 
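         * Either way, the put below never re-schedules the zap: the root was
         * already marked invalid before this worker was queued, so if this
         * happens to drop the last reference, kvm_tdp_mmu_put_root() goes
         * straight to freeing the root.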
106 */ 107 kvm_tdp_mmu_put_root(kvm, root, true); 108 109 read_unlock(&kvm->mmu_lock); 110 } 111 112 static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root) 113 { 114 root->tdp_mmu_async_data = kvm; 115 INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work); 116 queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work); 117 } 118 119 static inline bool kvm_tdp_root_mark_invalid(struct kvm_mmu_page *page) 120 { 121 union kvm_mmu_page_role role = page->role; 122 role.invalid = true; 123 124 /* No need to use cmpxchg, only the invalid bit can change. */ 125 role.word = xchg(&page->role.word, role.word); 126 return role.invalid; 127 } 128 129 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, 130 bool shared) 131 { 132 kvm_lockdep_assert_mmu_lock_held(kvm, shared); 133 134 if (!refcount_dec_and_test(&root->tdp_mmu_root_count)) 135 return; 136 137 WARN_ON(!is_tdp_mmu_page(root)); 138 139 /* 140 * The root now has refcount=0. It is valid, but readers already 141 * cannot acquire a reference to it because kvm_tdp_mmu_get_root() 142 * rejects it. This remains true for the rest of the execution 143 * of this function, because readers visit valid roots only 144 * (except for tdp_mmu_zap_root_work(), which however 145 * does not acquire any reference itself). 146 * 147 * Even though there are flows that need to visit all roots for 148 * correctness, they all take mmu_lock for write, so they cannot yet 149 * run concurrently. The same is true after kvm_tdp_root_mark_invalid, 150 * since the root still has refcount=0. 151 * 152 * However, tdp_mmu_zap_root can yield, and writers do not expect to 153 * see refcount=0 (see for example kvm_tdp_mmu_invalidate_all_roots()). 154 * So the root temporarily gets an extra reference, going to refcount=1 155 * while staying invalid. Readers still cannot acquire any reference; 156 * but writers are now allowed to run if tdp_mmu_zap_root yields and 157 * they might take an extra reference if they themselves yield. 158 * Therefore, when the reference is given back by the worker, 159 * there is no guarantee that the refcount is still 1. If not, whoever 160 * puts the last reference will free the page, but they will not have to 161 * zap the root because a root cannot go from invalid to valid. 162 */ 163 if (!kvm_tdp_root_mark_invalid(root)) { 164 refcount_set(&root->tdp_mmu_root_count, 1); 165 166 /* 167 * Zapping the root in a worker is not just "nice to have"; 168 * it is required because kvm_tdp_mmu_invalidate_all_roots() 169 * skips already-invalid roots. If kvm_tdp_mmu_put_root() did 170 * not add the root to the workqueue, kvm_tdp_mmu_zap_all_fast() 171 * might return with some roots not zapped yet. 172 */ 173 tdp_mmu_schedule_zap_root(kvm, root); 174 return; 175 } 176 177 spin_lock(&kvm->arch.tdp_mmu_pages_lock); 178 list_del_rcu(&root->link); 179 spin_unlock(&kvm->arch.tdp_mmu_pages_lock); 180 call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback); 181 } 182 183 /* 184 * Returns the next root after @prev_root (or the first root if @prev_root is 185 * NULL). A reference to the returned root is acquired, and the reference to 186 * @prev_root is released (the caller obviously must hold a reference to 187 * @prev_root if it's non-NULL). 188 * 189 * If @only_valid is true, invalid roots are skipped. 190 * 191 * Returns NULL if the end of tdp_mmu_roots was reached. 
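 *
 * The list walk itself is done under rcu_read_lock(): roots are removed from
 * tdp_mmu_roots with list_del_rcu() and freed via call_rcu(), so a candidate
 * root cannot be freed between being found in the list and the attempt to
 * acquire a reference via kvm_tdp_mmu_get_root().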
192 */ 193 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, 194 struct kvm_mmu_page *prev_root, 195 bool shared, bool only_valid) 196 { 197 struct kvm_mmu_page *next_root; 198 199 rcu_read_lock(); 200 201 if (prev_root) 202 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, 203 &prev_root->link, 204 typeof(*prev_root), link); 205 else 206 next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots, 207 typeof(*next_root), link); 208 209 while (next_root) { 210 if ((!only_valid || !next_root->role.invalid) && 211 kvm_tdp_mmu_get_root(next_root)) 212 break; 213 214 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, 215 &next_root->link, typeof(*next_root), link); 216 } 217 218 rcu_read_unlock(); 219 220 if (prev_root) 221 kvm_tdp_mmu_put_root(kvm, prev_root, shared); 222 223 return next_root; 224 } 225 226 /* 227 * Note: this iterator gets and puts references to the roots it iterates over. 228 * This makes it safe to release the MMU lock and yield within the loop, but 229 * if exiting the loop early, the caller must drop the reference to the most 230 * recent root. (Unless keeping a live reference is desirable.) 231 * 232 * If shared is set, this function is operating under the MMU lock in read 233 * mode. In the unlikely event that this thread must free a root, the lock 234 * will be temporarily dropped and reacquired in write mode. 235 */ 236 #define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\ 237 for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid); \ 238 _root; \ 239 _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid)) \ 240 if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) && \ 241 kvm_mmu_page_as_id(_root) != _as_id) { \ 242 } else 243 244 #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \ 245 __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true) 246 247 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \ 248 __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false) 249 250 /* 251 * Iterate over all TDP MMU roots. Requires that mmu_lock be held for write, 252 * the implication being that any flow that holds mmu_lock for read is 253 * inherently yield-friendly and should use the yield-safe variant above. 254 * Holding mmu_lock for write obviates the need for RCU protection as the list 255 * is guaranteed to be stable. 
256 */ 257 #define for_each_tdp_mmu_root(_kvm, _root, _as_id) \ 258 list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \ 259 if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \ 260 kvm_mmu_page_as_id(_root) != _as_id) { \ 261 } else 262 263 static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu) 264 { 265 struct kvm_mmu_page *sp; 266 267 sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); 268 sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache); 269 270 return sp; 271 } 272 273 static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep, 274 gfn_t gfn, union kvm_mmu_page_role role) 275 { 276 INIT_LIST_HEAD(&sp->possible_nx_huge_page_link); 277 278 set_page_private(virt_to_page(sp->spt), (unsigned long)sp); 279 280 sp->role = role; 281 sp->gfn = gfn; 282 sp->ptep = sptep; 283 sp->tdp_mmu_page = true; 284 285 trace_kvm_mmu_get_page(sp, true); 286 } 287 288 static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp, 289 struct tdp_iter *iter) 290 { 291 struct kvm_mmu_page *parent_sp; 292 union kvm_mmu_page_role role; 293 294 parent_sp = sptep_to_sp(rcu_dereference(iter->sptep)); 295 296 role = parent_sp->role; 297 role.level--; 298 299 tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role); 300 } 301 302 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) 303 { 304 union kvm_mmu_page_role role = vcpu->arch.mmu->root_role; 305 struct kvm *kvm = vcpu->kvm; 306 struct kvm_mmu_page *root; 307 308 lockdep_assert_held_write(&kvm->mmu_lock); 309 310 /* 311 * Check for an existing root before allocating a new one. Note, the 312 * role check prevents consuming an invalid root. 313 */ 314 for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) { 315 if (root->role.word == role.word && 316 kvm_tdp_mmu_get_root(root)) 317 goto out; 318 } 319 320 root = tdp_mmu_alloc_sp(vcpu); 321 tdp_mmu_init_sp(root, NULL, 0, role); 322 323 refcount_set(&root->tdp_mmu_root_count, 1); 324 325 spin_lock(&kvm->arch.tdp_mmu_pages_lock); 326 list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots); 327 spin_unlock(&kvm->arch.tdp_mmu_pages_lock); 328 329 out: 330 return __pa(root->spt); 331 } 332 333 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, 334 u64 old_spte, u64 new_spte, int level, 335 bool shared); 336 337 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level) 338 { 339 if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level)) 340 return; 341 342 if (is_accessed_spte(old_spte) && 343 (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) || 344 spte_to_pfn(old_spte) != spte_to_pfn(new_spte))) 345 kvm_set_pfn_accessed(spte_to_pfn(old_spte)); 346 } 347 348 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, 349 u64 old_spte, u64 new_spte, int level) 350 { 351 bool pfn_changed; 352 struct kvm_memory_slot *slot; 353 354 if (level > PG_LEVEL_4K) 355 return; 356 357 pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); 358 359 if ((!is_writable_pte(old_spte) || pfn_changed) && 360 is_writable_pte(new_spte)) { 361 slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn); 362 mark_page_dirty_in_slot(kvm, slot, gfn); 363 } 364 } 365 366 static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) 367 { 368 kvm_account_pgtable_pages((void *)sp->spt, +1); 369 atomic64_inc(&kvm->arch.tdp_mmu_pages); 370 } 371 372 static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) 373 { 374 
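        /* Inverse of tdp_account_mmu_page(). */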
kvm_account_pgtable_pages((void *)sp->spt, -1); 375 atomic64_dec(&kvm->arch.tdp_mmu_pages); 376 } 377 378 /** 379 * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages 380 * 381 * @kvm: kvm instance 382 * @sp: the page to be removed 383 * @shared: This operation may not be running under the exclusive use of 384 * the MMU lock and the operation must synchronize with other 385 * threads that might be adding or removing pages. 386 */ 387 static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp, 388 bool shared) 389 { 390 tdp_unaccount_mmu_page(kvm, sp); 391 392 if (!sp->nx_huge_page_disallowed) 393 return; 394 395 if (shared) 396 spin_lock(&kvm->arch.tdp_mmu_pages_lock); 397 else 398 lockdep_assert_held_write(&kvm->mmu_lock); 399 400 sp->nx_huge_page_disallowed = false; 401 untrack_possible_nx_huge_page(kvm, sp); 402 403 if (shared) 404 spin_unlock(&kvm->arch.tdp_mmu_pages_lock); 405 } 406 407 /** 408 * handle_removed_pt() - handle a page table removed from the TDP structure 409 * 410 * @kvm: kvm instance 411 * @pt: the page removed from the paging structure 412 * @shared: This operation may not be running under the exclusive use 413 * of the MMU lock and the operation must synchronize with other 414 * threads that might be modifying SPTEs. 415 * 416 * Given a page table that has been removed from the TDP paging structure, 417 * iterates through the page table to clear SPTEs and free child page tables. 418 * 419 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU 420 * protection. Since this thread removed it from the paging structure, 421 * this thread will be responsible for ensuring the page is freed. Hence the 422 * early rcu_dereferences in the function. 423 */ 424 static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) 425 { 426 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt)); 427 int level = sp->role.level; 428 gfn_t base_gfn = sp->gfn; 429 int i; 430 431 trace_kvm_mmu_prepare_zap_page(sp); 432 433 tdp_mmu_unlink_sp(kvm, sp, shared); 434 435 for (i = 0; i < SPTE_ENT_PER_PAGE; i++) { 436 tdp_ptep_t sptep = pt + i; 437 gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level); 438 u64 old_spte; 439 440 if (shared) { 441 /* 442 * Set the SPTE to a nonpresent value that other 443 * threads will not overwrite. If the SPTE was 444 * already marked as removed then another thread 445 * handling a page fault could overwrite it, so 446 * set the SPTE until it is set from some other 447 * value to the removed SPTE value. 448 */ 449 for (;;) { 450 old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE); 451 if (!is_removed_spte(old_spte)) 452 break; 453 cpu_relax(); 454 } 455 } else { 456 /* 457 * If the SPTE is not MMU-present, there is no backing 458 * page associated with the SPTE and so no side effects 459 * that need to be recorded, and exclusive ownership of 460 * mmu_lock ensures the SPTE can't be made present. 461 * Note, zapping MMIO SPTEs is also unnecessary as they 462 * are guarded by the memslots generation, not by being 463 * unreachable. 464 */ 465 old_spte = kvm_tdp_mmu_read_spte(sptep); 466 if (!is_shadow_present_pte(old_spte)) 467 continue; 468 469 /* 470 * Use the common helper instead of a raw WRITE_ONCE as 471 * the SPTE needs to be updated atomically if it can be 472 * modified by a different vCPU outside of mmu_lock. 
473 * Even though the parent SPTE is !PRESENT, the TLB 474 * hasn't yet been flushed, and both Intel and AMD 475 * document that A/D assists can use upper-level PxE 476 * entries that are cached in the TLB, i.e. the CPU can 477 * still access the page and mark it dirty. 478 * 479 * No retry is needed in the atomic update path as the 480 * sole concern is dropping a Dirty bit, i.e. no other 481 * task can zap/remove the SPTE as mmu_lock is held for 482 * write. Marking the SPTE as a removed SPTE is not 483 * strictly necessary for the same reason, but using 484 * the remove SPTE value keeps the shared/exclusive 485 * paths consistent and allows the handle_changed_spte() 486 * call below to hardcode the new value to REMOVED_SPTE. 487 * 488 * Note, even though dropping a Dirty bit is the only 489 * scenario where a non-atomic update could result in a 490 * functional bug, simply checking the Dirty bit isn't 491 * sufficient as a fast page fault could read the upper 492 * level SPTE before it is zapped, and then make this 493 * target SPTE writable, resume the guest, and set the 494 * Dirty bit between reading the SPTE above and writing 495 * it here. 496 */ 497 old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, 498 REMOVED_SPTE, level); 499 } 500 handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn, 501 old_spte, REMOVED_SPTE, level, shared); 502 } 503 504 call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback); 505 } 506 507 /** 508 * __handle_changed_spte - handle bookkeeping associated with an SPTE change 509 * @kvm: kvm instance 510 * @as_id: the address space of the paging structure the SPTE was a part of 511 * @gfn: the base GFN that was mapped by the SPTE 512 * @old_spte: The value of the SPTE before the change 513 * @new_spte: The value of the SPTE after the change 514 * @level: the level of the PT the SPTE is part of in the paging structure 515 * @shared: This operation may not be running under the exclusive use of 516 * the MMU lock and the operation must synchronize with other 517 * threads that might be modifying SPTEs. 518 * 519 * Handle bookkeeping that might result from the modification of a SPTE. 520 */ 521 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, 522 u64 old_spte, u64 new_spte, int level, 523 bool shared) 524 { 525 bool was_present = is_shadow_present_pte(old_spte); 526 bool is_present = is_shadow_present_pte(new_spte); 527 bool was_leaf = was_present && is_last_spte(old_spte, level); 528 bool is_leaf = is_present && is_last_spte(new_spte, level); 529 bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); 530 531 WARN_ON(level > PT64_ROOT_MAX_LEVEL); 532 WARN_ON(level < PG_LEVEL_4K); 533 WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1)); 534 535 /* 536 * If this warning were to trigger it would indicate that there was a 537 * missing MMU notifier or a race with some notifier handler. 538 * A present, leaf SPTE should never be directly replaced with another 539 * present leaf SPTE pointing to a different PFN. A notifier handler 540 * should be zapping the SPTE before the main MM's page table is 541 * changed, or the SPTE should be zeroed, and the TLBs flushed by the 542 * thread before replacement. 
543 */ 544 if (was_leaf && is_leaf && pfn_changed) { 545 pr_err("Invalid SPTE change: cannot replace a present leaf\n" 546 "SPTE with another present leaf SPTE mapping a\n" 547 "different PFN!\n" 548 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d", 549 as_id, gfn, old_spte, new_spte, level); 550 551 /* 552 * Crash the host to prevent error propagation and guest data 553 * corruption. 554 */ 555 BUG(); 556 } 557 558 if (old_spte == new_spte) 559 return; 560 561 trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte); 562 563 if (is_leaf) 564 check_spte_writable_invariants(new_spte); 565 566 /* 567 * The only times a SPTE should be changed from a non-present to 568 * non-present state is when an MMIO entry is installed/modified/ 569 * removed. In that case, there is nothing to do here. 570 */ 571 if (!was_present && !is_present) { 572 /* 573 * If this change does not involve a MMIO SPTE or removed SPTE, 574 * it is unexpected. Log the change, though it should not 575 * impact the guest since both the former and current SPTEs 576 * are nonpresent. 577 */ 578 if (WARN_ON(!is_mmio_spte(old_spte) && 579 !is_mmio_spte(new_spte) && 580 !is_removed_spte(new_spte))) 581 pr_err("Unexpected SPTE change! Nonpresent SPTEs\n" 582 "should not be replaced with another,\n" 583 "different nonpresent SPTE, unless one or both\n" 584 "are MMIO SPTEs, or the new SPTE is\n" 585 "a temporary removed SPTE.\n" 586 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d", 587 as_id, gfn, old_spte, new_spte, level); 588 return; 589 } 590 591 if (is_leaf != was_leaf) 592 kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1); 593 594 if (was_leaf && is_dirty_spte(old_spte) && 595 (!is_present || !is_dirty_spte(new_spte) || pfn_changed)) 596 kvm_set_pfn_dirty(spte_to_pfn(old_spte)); 597 598 /* 599 * Recursively handle child PTs if the change removed a subtree from 600 * the paging structure. Note the WARN on the PFN changing without the 601 * SPTE being converted to a hugepage (leaf) or being zapped. Shadow 602 * pages are kernel allocations and should never be migrated. 603 */ 604 if (was_present && !was_leaf && 605 (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed))) 606 handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared); 607 } 608 609 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, 610 u64 old_spte, u64 new_spte, int level, 611 bool shared) 612 { 613 __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, 614 shared); 615 handle_changed_spte_acc_track(old_spte, new_spte, level); 616 handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte, 617 new_spte, level); 618 } 619 620 /* 621 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically 622 * and handle the associated bookkeeping. Do not mark the page dirty 623 * in KVM's dirty bitmaps. 624 * 625 * If setting the SPTE fails because it has changed, iter->old_spte will be 626 * refreshed to the current value of the spte. 627 * 628 * @kvm: kvm instance 629 * @iter: a tdp_iter instance currently on the SPTE that should be set 630 * @new_spte: The value the SPTE should be set to 631 * Return: 632 * * 0 - If the SPTE was set. 633 * * -EBUSY - If the SPTE cannot be set. In this case this function will have 634 * no side-effects other than setting iter->old_spte to the last 635 * known value of the spte. 
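 *
 * Because iter->old_spte is refreshed on failure, callers typically just retry
 * the same iteration, e.g. (illustrative sketch, mirroring wrprot_gfn_range()
 * below):
 *
 *	retry:
 *		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
 *			goto retry;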
 */
static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
                                          struct tdp_iter *iter,
                                          u64 new_spte)
{
        u64 *sptep = rcu_dereference(iter->sptep);

        /*
         * The caller is responsible for ensuring the old SPTE is not a REMOVED
         * SPTE.  KVM should never attempt to zap or manipulate a REMOVED SPTE,
         * and pre-checking before inserting a new SPTE is advantageous as it
         * avoids unnecessary work.
         */
        WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));

        lockdep_assert_held_read(&kvm->mmu_lock);

        /*
         * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
         * does not hold the mmu_lock.
         */
        if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
                return -EBUSY;

        __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
                              new_spte, iter->level, true);
        handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);

        return 0;
}

static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
                                          struct tdp_iter *iter)
{
        int ret;

        /*
         * Freeze the SPTE by setting it to a special,
         * non-present value. This will stop other threads from
         * immediately installing a present entry in its place
         * before the TLBs are flushed.
         */
        ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
        if (ret)
                return ret;

        kvm_flush_remote_tlbs_gfn(kvm, iter->gfn, iter->level);

        /*
         * No other thread can overwrite the removed SPTE as they must either
         * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
         * overwrite the special removed SPTE value. No bookkeeping is needed
         * here since the SPTE is going from non-present to non-present.  Use
         * the raw write helper to avoid an unnecessary check on volatile bits.
         */
        __kvm_tdp_mmu_write_spte(iter->sptep, 0);

        return 0;
}


/*
 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
 * @kvm:              KVM instance
 * @as_id:            Address space ID, i.e. regular vs. SMM
 * @sptep:            Pointer to the SPTE
 * @old_spte:         The current value of the SPTE
 * @new_spte:         The new value that will be set for the SPTE
 * @gfn:              The base GFN that was (or will be) mapped by the SPTE
 * @level:            The level _containing_ the SPTE (its parent PT's level)
 * @record_acc_track: Notify the MM subsystem of changes to the accessed state
 *                    of the page. Should be set unless handling an MMU
 *                    notifier for access tracking. Leaving record_acc_track
 *                    unset in that case prevents page accesses from being
 *                    double counted.
 * @record_dirty_log: Record the page as dirty in the dirty bitmap if
 *                    appropriate for the change being made. Should be set
 *                    unless performing certain dirty logging operations.
 *                    Leaving record_dirty_log unset in that case prevents page
 *                    writes from being double counted.
 *
 * Returns the old SPTE value, which _may_ be different than @old_spte if the
 * SPTE had volatile bits.
 */
static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
                              u64 old_spte, u64 new_spte, gfn_t gfn, int level,
                              bool record_acc_track, bool record_dirty_log)
{
        lockdep_assert_held_write(&kvm->mmu_lock);

        /*
         * No thread should be using this function to set SPTEs to or from the
         * temporary removed SPTE value.
         * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
         * should be used.
If operating under the MMU lock in write mode, the 731 * use of the removed SPTE should not be necessary. 732 */ 733 WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte)); 734 735 old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level); 736 737 __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false); 738 739 if (record_acc_track) 740 handle_changed_spte_acc_track(old_spte, new_spte, level); 741 if (record_dirty_log) 742 handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte, 743 new_spte, level); 744 return old_spte; 745 } 746 747 static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, 748 u64 new_spte, bool record_acc_track, 749 bool record_dirty_log) 750 { 751 WARN_ON_ONCE(iter->yielded); 752 753 iter->old_spte = __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, 754 iter->old_spte, new_spte, 755 iter->gfn, iter->level, 756 record_acc_track, record_dirty_log); 757 } 758 759 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, 760 u64 new_spte) 761 { 762 _tdp_mmu_set_spte(kvm, iter, new_spte, true, true); 763 } 764 765 static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm, 766 struct tdp_iter *iter, 767 u64 new_spte) 768 { 769 _tdp_mmu_set_spte(kvm, iter, new_spte, false, true); 770 } 771 772 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \ 773 for_each_tdp_pte(_iter, _root, _start, _end) 774 775 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \ 776 tdp_root_for_each_pte(_iter, _root, _start, _end) \ 777 if (!is_shadow_present_pte(_iter.old_spte) || \ 778 !is_last_spte(_iter.old_spte, _iter.level)) \ 779 continue; \ 780 else 781 782 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \ 783 for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end) 784 785 /* 786 * Yield if the MMU lock is contended or this thread needs to return control 787 * to the scheduler. 788 * 789 * If this function should yield and flush is set, it will perform a remote 790 * TLB flush before yielding. 791 * 792 * If this function yields, iter->yielded is set and the caller must skip to 793 * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk 794 * over the paging structures to allow the iterator to continue its traversal 795 * from the paging structure root. 796 * 797 * Returns true if this function yielded. 798 */ 799 static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm, 800 struct tdp_iter *iter, 801 bool flush, bool shared) 802 { 803 WARN_ON(iter->yielded); 804 805 /* Ensure forward progress has been made before yielding. */ 806 if (iter->next_last_level_gfn == iter->yielded_gfn) 807 return false; 808 809 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { 810 if (flush) 811 kvm_flush_remote_tlbs(kvm); 812 813 rcu_read_unlock(); 814 815 if (shared) 816 cond_resched_rwlock_read(&kvm->mmu_lock); 817 else 818 cond_resched_rwlock_write(&kvm->mmu_lock); 819 820 rcu_read_lock(); 821 822 WARN_ON(iter->gfn > iter->next_last_level_gfn); 823 824 iter->yielded = true; 825 } 826 827 return iter->yielded; 828 } 829 830 static inline gfn_t tdp_mmu_max_gfn_exclusive(void) 831 { 832 /* 833 * Bound TDP MMU walks at host.MAXPHYADDR. KVM disallows memslots with 834 * a gpa range that would exceed the max gfn, and KVM does not create 835 * MMIO SPTEs for "impossible" gfns, instead sending such accesses down 836 * the slow emulation path every time. 
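         *
         * E.g. with a 46-bit host MAXPHYADDR, walks are bounded at GFN
         * 1 << 34, i.e. at 64TiB of guest physical address space.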
837 */ 838 return kvm_mmu_max_gfn() + 1; 839 } 840 841 static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root, 842 bool shared, int zap_level) 843 { 844 struct tdp_iter iter; 845 846 gfn_t end = tdp_mmu_max_gfn_exclusive(); 847 gfn_t start = 0; 848 849 for_each_tdp_pte_min_level(iter, root, zap_level, start, end) { 850 retry: 851 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared)) 852 continue; 853 854 if (!is_shadow_present_pte(iter.old_spte)) 855 continue; 856 857 if (iter.level > zap_level) 858 continue; 859 860 if (!shared) 861 tdp_mmu_set_spte(kvm, &iter, 0); 862 else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0)) 863 goto retry; 864 } 865 } 866 867 static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root, 868 bool shared) 869 { 870 871 /* 872 * The root must have an elevated refcount so that it's reachable via 873 * mmu_notifier callbacks, which allows this path to yield and drop 874 * mmu_lock. When handling an unmap/release mmu_notifier command, KVM 875 * must drop all references to relevant pages prior to completing the 876 * callback. Dropping mmu_lock with an unreachable root would result 877 * in zapping SPTEs after a relevant mmu_notifier callback completes 878 * and lead to use-after-free as zapping a SPTE triggers "writeback" of 879 * dirty accessed bits to the SPTE's associated struct page. 880 */ 881 WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count)); 882 883 kvm_lockdep_assert_mmu_lock_held(kvm, shared); 884 885 rcu_read_lock(); 886 887 /* 888 * To avoid RCU stalls due to recursively removing huge swaths of SPs, 889 * split the zap into two passes. On the first pass, zap at the 1gb 890 * level, and then zap top-level SPs on the second pass. "1gb" is not 891 * arbitrary, as KVM must be able to zap a 1gb shadow page without 892 * inducing a stall to allow in-place replacement with a 1gb hugepage. 893 * 894 * Because zapping a SP recurses on its children, stepping down to 895 * PG_LEVEL_4K in the iterator itself is unnecessary. 896 */ 897 __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G); 898 __tdp_mmu_zap_root(kvm, root, shared, root->role.level); 899 900 rcu_read_unlock(); 901 } 902 903 bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) 904 { 905 u64 old_spte; 906 907 /* 908 * This helper intentionally doesn't allow zapping a root shadow page, 909 * which doesn't have a parent page table and thus no associated entry. 910 */ 911 if (WARN_ON_ONCE(!sp->ptep)) 912 return false; 913 914 old_spte = kvm_tdp_mmu_read_spte(sp->ptep); 915 if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte))) 916 return false; 917 918 __tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0, 919 sp->gfn, sp->role.level + 1, true, true); 920 921 return true; 922 } 923 924 /* 925 * If can_yield is true, will release the MMU lock and reschedule if the 926 * scheduler needs the CPU or there is contention on the MMU lock. If this 927 * function cannot yield, it will not release the MMU lock or reschedule and 928 * the caller must ensure it does not supply too large a GFN range, or the 929 * operation can cause a soft lockup. 
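 *
 * E.g. kvm_tdp_mmu_unmap_gfn_range() forwards range->may_block as can_yield so
 * that mmu_notifier invalidations that cannot block never reschedule here.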
 */
static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
                              gfn_t start, gfn_t end, bool can_yield, bool flush)
{
        struct tdp_iter iter;

        end = min(end, tdp_mmu_max_gfn_exclusive());

        lockdep_assert_held_write(&kvm->mmu_lock);

        rcu_read_lock();

        for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
                if (can_yield &&
                    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
                        flush = false;
                        continue;
                }

                if (!is_shadow_present_pte(iter.old_spte) ||
                    !is_last_spte(iter.old_spte, iter.level))
                        continue;

                tdp_mmu_set_spte(kvm, &iter, 0);
                flush = true;
        }

        rcu_read_unlock();

        /*
         * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
         * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
         */
        return flush;
}

/*
 * Zap leaf SPTEs for the range of gfns, [start, end), for all roots. Returns
 * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or
 * more SPTEs were zapped since the MMU lock was last acquired.
 */
bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
                           bool can_yield, bool flush)
{
        struct kvm_mmu_page *root;

        for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
                flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush);

        return flush;
}

void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
        struct kvm_mmu_page *root;
        int i;

        /*
         * Zap all roots, including invalid roots, as all SPTEs must be dropped
         * before returning to the caller. Zap directly even if the root is
         * also being zapped by a worker. Walking zapped top-level SPTEs isn't
         * all that expensive and mmu_lock is already held, which means the
         * worker has yielded, i.e. flushing the work instead of zapping here
         * isn't guaranteed to be any faster.
         *
         * A TLB flush is unnecessary, KVM zaps everything if and only if the
         * VM is being destroyed or the userspace VMM has exited. In both
         * cases, KVM_RUN is unreachable, i.e. no vCPUs will ever service the
         * request.
         */
        for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
                for_each_tdp_mmu_root_yield_safe(kvm, root, i)
                        tdp_mmu_zap_root(kvm, root, false);
        }
}

/*
 * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
 * zap" completes.
 */
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
{
        flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
}

/*
 * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
 * is about to be zapped, e.g. in response to a memslots update. The actual
 * zapping is performed asynchronously, so a reference is taken on all roots.
 * Using a separate workqueue makes it easy to ensure that the destruction is
 * performed before the "fast zap" completes, without keeping a separate list
 * of invalidated roots; the list is effectively the list of work items in
 * the workqueue.
 *
 * Get a reference even if the root is already invalid, the asynchronous worker
 * assumes it was gifted a reference to the root it processes. Because mmu_lock
 * is held for write, it should be impossible to observe a root with zero refcount,
 * i.e. the list of roots cannot be stale.
1027 * 1028 * This has essentially the same effect for the TDP MMU 1029 * as updating mmu_valid_gen does for the shadow MMU. 1030 */ 1031 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm) 1032 { 1033 struct kvm_mmu_page *root; 1034 1035 lockdep_assert_held_write(&kvm->mmu_lock); 1036 list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) { 1037 if (!root->role.invalid && 1038 !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) { 1039 root->role.invalid = true; 1040 tdp_mmu_schedule_zap_root(kvm, root); 1041 } 1042 } 1043 } 1044 1045 /* 1046 * Installs a last-level SPTE to handle a TDP page fault. 1047 * (NPT/EPT violation/misconfiguration) 1048 */ 1049 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, 1050 struct kvm_page_fault *fault, 1051 struct tdp_iter *iter) 1052 { 1053 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep)); 1054 u64 new_spte; 1055 int ret = RET_PF_FIXED; 1056 bool wrprot = false; 1057 1058 if (WARN_ON_ONCE(sp->role.level != fault->goal_level)) 1059 return RET_PF_RETRY; 1060 1061 if (unlikely(!fault->slot)) 1062 new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); 1063 else 1064 wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn, 1065 fault->pfn, iter->old_spte, fault->prefetch, true, 1066 fault->map_writable, &new_spte); 1067 1068 if (new_spte == iter->old_spte) 1069 ret = RET_PF_SPURIOUS; 1070 else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte)) 1071 return RET_PF_RETRY; 1072 else if (is_shadow_present_pte(iter->old_spte) && 1073 !is_last_spte(iter->old_spte, iter->level)) 1074 kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level); 1075 1076 /* 1077 * If the page fault was caused by a write but the page is write 1078 * protected, emulation is needed. If the emulation was skipped, 1079 * the vCPU would have the same fault again. 1080 */ 1081 if (wrprot) { 1082 if (fault->write) 1083 ret = RET_PF_EMULATE; 1084 } 1085 1086 /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */ 1087 if (unlikely(is_mmio_spte(new_spte))) { 1088 vcpu->stat.pf_mmio_spte_created++; 1089 trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn, 1090 new_spte); 1091 ret = RET_PF_EMULATE; 1092 } else { 1093 trace_kvm_mmu_set_spte(iter->level, iter->gfn, 1094 rcu_dereference(iter->sptep)); 1095 } 1096 1097 return ret; 1098 } 1099 1100 /* 1101 * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the 1102 * provided page table. 1103 * 1104 * @kvm: kvm instance 1105 * @iter: a tdp_iter instance currently on the SPTE that should be set 1106 * @sp: The new TDP page table to install. 1107 * @shared: This operation is running under the MMU lock in read mode. 1108 * 1109 * Returns: 0 if the new page table was installed. Non-0 if the page table 1110 * could not be installed (e.g. the atomic compare-exchange failed). 
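 *          On success, the new page table is also added to the TDP MMU page
 *          accounting via tdp_account_mmu_page().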
1111 */ 1112 static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter, 1113 struct kvm_mmu_page *sp, bool shared) 1114 { 1115 u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled()); 1116 int ret = 0; 1117 1118 if (shared) { 1119 ret = tdp_mmu_set_spte_atomic(kvm, iter, spte); 1120 if (ret) 1121 return ret; 1122 } else { 1123 tdp_mmu_set_spte(kvm, iter, spte); 1124 } 1125 1126 tdp_account_mmu_page(kvm, sp); 1127 1128 return 0; 1129 } 1130 1131 static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, 1132 struct kvm_mmu_page *sp, bool shared); 1133 1134 /* 1135 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing 1136 * page tables and SPTEs to translate the faulting guest physical address. 1137 */ 1138 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) 1139 { 1140 struct kvm_mmu *mmu = vcpu->arch.mmu; 1141 struct kvm *kvm = vcpu->kvm; 1142 struct tdp_iter iter; 1143 struct kvm_mmu_page *sp; 1144 int ret = RET_PF_RETRY; 1145 1146 kvm_mmu_hugepage_adjust(vcpu, fault); 1147 1148 trace_kvm_mmu_spte_requested(fault); 1149 1150 rcu_read_lock(); 1151 1152 tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) { 1153 int r; 1154 1155 if (fault->nx_huge_page_workaround_enabled) 1156 disallowed_hugepage_adjust(fault, iter.old_spte, iter.level); 1157 1158 /* 1159 * If SPTE has been frozen by another thread, just give up and 1160 * retry, avoiding unnecessary page table allocation and free. 1161 */ 1162 if (is_removed_spte(iter.old_spte)) 1163 goto retry; 1164 1165 if (iter.level == fault->goal_level) 1166 goto map_target_level; 1167 1168 /* Step down into the lower level page table if it exists. */ 1169 if (is_shadow_present_pte(iter.old_spte) && 1170 !is_large_pte(iter.old_spte)) 1171 continue; 1172 1173 /* 1174 * The SPTE is either non-present or points to a huge page that 1175 * needs to be split. 1176 */ 1177 sp = tdp_mmu_alloc_sp(vcpu); 1178 tdp_mmu_init_child_sp(sp, &iter); 1179 1180 sp->nx_huge_page_disallowed = fault->huge_page_disallowed; 1181 1182 if (is_shadow_present_pte(iter.old_spte)) 1183 r = tdp_mmu_split_huge_page(kvm, &iter, sp, true); 1184 else 1185 r = tdp_mmu_link_sp(kvm, &iter, sp, true); 1186 1187 /* 1188 * Force the guest to retry if installing an upper level SPTE 1189 * failed, e.g. because a different task modified the SPTE. 1190 */ 1191 if (r) { 1192 tdp_mmu_free_sp(sp); 1193 goto retry; 1194 } 1195 1196 if (fault->huge_page_disallowed && 1197 fault->req_level >= iter.level) { 1198 spin_lock(&kvm->arch.tdp_mmu_pages_lock); 1199 if (sp->nx_huge_page_disallowed) 1200 track_possible_nx_huge_page(kvm, sp); 1201 spin_unlock(&kvm->arch.tdp_mmu_pages_lock); 1202 } 1203 } 1204 1205 /* 1206 * The walk aborted before reaching the target level, e.g. because the 1207 * iterator detected an upper level SPTE was frozen during traversal. 
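         * In that case no new SPTE was installed and ret is still RET_PF_RETRY,
         * so the vCPU will simply take the fault again.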
         */
        WARN_ON_ONCE(iter.level == fault->goal_level);
        goto retry;

map_target_level:
        ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);

retry:
        rcu_read_unlock();
        return ret;
}

bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
                                 bool flush)
{
        return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start,
                                     range->end, range->may_block, flush);
}

typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
                              struct kvm_gfn_range *range);

static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
                                                   struct kvm_gfn_range *range,
                                                   tdp_handler_t handler)
{
        struct kvm_mmu_page *root;
        struct tdp_iter iter;
        bool ret = false;

        /*
         * Don't support rescheduling, none of the MMU notifiers that funnel
         * into this helper allow blocking; it'd be dead, wasteful code.
         */
        for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
                rcu_read_lock();

                tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
                        ret |= handler(kvm, &iter, range);

                rcu_read_unlock();
        }

        return ret;
}

/*
 * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
 * if any of the GFNs in the range have been accessed.
 */
static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
                          struct kvm_gfn_range *range)
{
        u64 new_spte = 0;

        /* If we have a non-accessed entry we don't need to change the pte. */
        if (!is_accessed_spte(iter->old_spte))
                return false;

        new_spte = iter->old_spte;

        if (spte_ad_enabled(new_spte)) {
                new_spte &= ~shadow_accessed_mask;
        } else {
                /*
                 * Capture the dirty status of the page, so that it doesn't get
                 * lost when the SPTE is marked for access tracking.
                 */
                if (is_writable_pte(new_spte))
                        kvm_set_pfn_dirty(spte_to_pfn(new_spte));

                new_spte = mark_spte_for_access_track(new_spte);
        }

        tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);

        return true;
}

bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
        return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
}

static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
                         struct kvm_gfn_range *range)
{
        return is_accessed_spte(iter->old_spte);
}

bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
        return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
}

static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
                         struct kvm_gfn_range *range)
{
        u64 new_spte;

        /* Huge pages aren't expected to be modified without first being zapped. */
        WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);

        if (iter->level != PG_LEVEL_4K ||
            !is_shadow_present_pte(iter->old_spte))
                return false;

        /*
         * Note, when changing a read-only SPTE, it's not strictly necessary to
         * zero the SPTE before setting the new PFN, but doing so preserves the
         * invariant that the PFN of a present leaf SPTE can never change.
         * See __handle_changed_spte().
1320 */ 1321 tdp_mmu_set_spte(kvm, iter, 0); 1322 1323 if (!pte_write(range->pte)) { 1324 new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte, 1325 pte_pfn(range->pte)); 1326 1327 tdp_mmu_set_spte(kvm, iter, new_spte); 1328 } 1329 1330 return true; 1331 } 1332 1333 /* 1334 * Handle the changed_pte MMU notifier for the TDP MMU. 1335 * data is a pointer to the new pte_t mapping the HVA specified by the MMU 1336 * notifier. 1337 * Returns non-zero if a flush is needed before releasing the MMU lock. 1338 */ 1339 bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) 1340 { 1341 /* 1342 * No need to handle the remote TLB flush under RCU protection, the 1343 * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a 1344 * shadow page. See the WARN on pfn_changed in __handle_changed_spte(). 1345 */ 1346 return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn); 1347 } 1348 1349 /* 1350 * Remove write access from all SPTEs at or above min_level that map GFNs 1351 * [start, end). Returns true if an SPTE has been changed and the TLBs need to 1352 * be flushed. 1353 */ 1354 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, 1355 gfn_t start, gfn_t end, int min_level) 1356 { 1357 struct tdp_iter iter; 1358 u64 new_spte; 1359 bool spte_set = false; 1360 1361 rcu_read_lock(); 1362 1363 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); 1364 1365 for_each_tdp_pte_min_level(iter, root, min_level, start, end) { 1366 retry: 1367 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) 1368 continue; 1369 1370 if (!is_shadow_present_pte(iter.old_spte) || 1371 !is_last_spte(iter.old_spte, iter.level) || 1372 !(iter.old_spte & PT_WRITABLE_MASK)) 1373 continue; 1374 1375 new_spte = iter.old_spte & ~PT_WRITABLE_MASK; 1376 1377 if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) 1378 goto retry; 1379 1380 spte_set = true; 1381 } 1382 1383 rcu_read_unlock(); 1384 return spte_set; 1385 } 1386 1387 /* 1388 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will 1389 * only affect leaf SPTEs down to min_level. 1390 * Returns true if an SPTE has been changed and the TLBs need to be flushed. 1391 */ 1392 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, 1393 const struct kvm_memory_slot *slot, int min_level) 1394 { 1395 struct kvm_mmu_page *root; 1396 bool spte_set = false; 1397 1398 lockdep_assert_held_read(&kvm->mmu_lock); 1399 1400 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) 1401 spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn, 1402 slot->base_gfn + slot->npages, min_level); 1403 1404 return spte_set; 1405 } 1406 1407 static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp) 1408 { 1409 struct kvm_mmu_page *sp; 1410 1411 gfp |= __GFP_ZERO; 1412 1413 sp = kmem_cache_alloc(mmu_page_header_cache, gfp); 1414 if (!sp) 1415 return NULL; 1416 1417 sp->spt = (void *)__get_free_page(gfp); 1418 if (!sp->spt) { 1419 kmem_cache_free(mmu_page_header_cache, sp); 1420 return NULL; 1421 } 1422 1423 return sp; 1424 } 1425 1426 static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm, 1427 struct tdp_iter *iter, 1428 bool shared) 1429 { 1430 struct kvm_mmu_page *sp; 1431 1432 /* 1433 * Since we are allocating while under the MMU lock we have to be 1434 * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct 1435 * reclaim and to avoid making any filesystem callbacks (which can end 1436 * up invoking KVM MMU notifiers, resulting in a deadlock). 
1437 * 1438 * If this allocation fails we drop the lock and retry with reclaim 1439 * allowed. 1440 */ 1441 sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT); 1442 if (sp) 1443 return sp; 1444 1445 rcu_read_unlock(); 1446 1447 if (shared) 1448 read_unlock(&kvm->mmu_lock); 1449 else 1450 write_unlock(&kvm->mmu_lock); 1451 1452 iter->yielded = true; 1453 sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT); 1454 1455 if (shared) 1456 read_lock(&kvm->mmu_lock); 1457 else 1458 write_lock(&kvm->mmu_lock); 1459 1460 rcu_read_lock(); 1461 1462 return sp; 1463 } 1464 1465 /* Note, the caller is responsible for initializing @sp. */ 1466 static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, 1467 struct kvm_mmu_page *sp, bool shared) 1468 { 1469 const u64 huge_spte = iter->old_spte; 1470 const int level = iter->level; 1471 int ret, i; 1472 1473 /* 1474 * No need for atomics when writing to sp->spt since the page table has 1475 * not been linked in yet and thus is not reachable from any other CPU. 1476 */ 1477 for (i = 0; i < SPTE_ENT_PER_PAGE; i++) 1478 sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i); 1479 1480 /* 1481 * Replace the huge spte with a pointer to the populated lower level 1482 * page table. Since we are making this change without a TLB flush vCPUs 1483 * will see a mix of the split mappings and the original huge mapping, 1484 * depending on what's currently in their TLB. This is fine from a 1485 * correctness standpoint since the translation will be the same either 1486 * way. 1487 */ 1488 ret = tdp_mmu_link_sp(kvm, iter, sp, shared); 1489 if (ret) 1490 goto out; 1491 1492 /* 1493 * tdp_mmu_link_sp_atomic() will handle subtracting the huge page we 1494 * are overwriting from the page stats. But we have to manually update 1495 * the page stats with the new present child pages. 1496 */ 1497 kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE); 1498 1499 out: 1500 trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret); 1501 return ret; 1502 } 1503 1504 static int tdp_mmu_split_huge_pages_root(struct kvm *kvm, 1505 struct kvm_mmu_page *root, 1506 gfn_t start, gfn_t end, 1507 int target_level, bool shared) 1508 { 1509 struct kvm_mmu_page *sp = NULL; 1510 struct tdp_iter iter; 1511 int ret = 0; 1512 1513 rcu_read_lock(); 1514 1515 /* 1516 * Traverse the page table splitting all huge pages above the target 1517 * level into one lower level. For example, if we encounter a 1GB page 1518 * we split it into 512 2MB pages. 1519 * 1520 * Since the TDP iterator uses a pre-order traversal, we are guaranteed 1521 * to visit an SPTE before ever visiting its children, which means we 1522 * will correctly recursively split huge pages that are more than one 1523 * level above the target level (e.g. splitting a 1GB to 512 2MB pages, 1524 * and then splitting each of those to 512 4KB pages). 
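         *
         * Note, the walk is bounded below at target_level + 1, as SPTEs at
         * target_level or lower never need to be split.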
1525 */ 1526 for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) { 1527 retry: 1528 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared)) 1529 continue; 1530 1531 if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte)) 1532 continue; 1533 1534 if (!sp) { 1535 sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared); 1536 if (!sp) { 1537 ret = -ENOMEM; 1538 trace_kvm_mmu_split_huge_page(iter.gfn, 1539 iter.old_spte, 1540 iter.level, ret); 1541 break; 1542 } 1543 1544 if (iter.yielded) 1545 continue; 1546 } 1547 1548 tdp_mmu_init_child_sp(sp, &iter); 1549 1550 if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared)) 1551 goto retry; 1552 1553 sp = NULL; 1554 } 1555 1556 rcu_read_unlock(); 1557 1558 /* 1559 * It's possible to exit the loop having never used the last sp if, for 1560 * example, a vCPU doing HugePage NX splitting wins the race and 1561 * installs its own sp in place of the last sp we tried to split. 1562 */ 1563 if (sp) 1564 tdp_mmu_free_sp(sp); 1565 1566 return ret; 1567 } 1568 1569 1570 /* 1571 * Try to split all huge pages mapped by the TDP MMU down to the target level. 1572 */ 1573 void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm, 1574 const struct kvm_memory_slot *slot, 1575 gfn_t start, gfn_t end, 1576 int target_level, bool shared) 1577 { 1578 struct kvm_mmu_page *root; 1579 int r = 0; 1580 1581 kvm_lockdep_assert_mmu_lock_held(kvm, shared); 1582 1583 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) { 1584 r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared); 1585 if (r) { 1586 kvm_tdp_mmu_put_root(kvm, root, shared); 1587 break; 1588 } 1589 } 1590 } 1591 1592 /* 1593 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If 1594 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. 1595 * If AD bits are not enabled, this will require clearing the writable bit on 1596 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to 1597 * be flushed. 1598 */ 1599 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, 1600 gfn_t start, gfn_t end) 1601 { 1602 u64 dbit = kvm_ad_enabled() ? shadow_dirty_mask : PT_WRITABLE_MASK; 1603 struct tdp_iter iter; 1604 bool spte_set = false; 1605 1606 rcu_read_lock(); 1607 1608 tdp_root_for_each_leaf_pte(iter, root, start, end) { 1609 retry: 1610 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) 1611 continue; 1612 1613 if (!is_shadow_present_pte(iter.old_spte)) 1614 continue; 1615 1616 MMU_WARN_ON(kvm_ad_enabled() && 1617 spte_ad_need_write_protect(iter.old_spte)); 1618 1619 if (!(iter.old_spte & dbit)) 1620 continue; 1621 1622 if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit)) 1623 goto retry; 1624 1625 spte_set = true; 1626 } 1627 1628 rcu_read_unlock(); 1629 return spte_set; 1630 } 1631 1632 /* 1633 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If 1634 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. 1635 * If AD bits are not enabled, this will require clearing the writable bit on 1636 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to 1637 * be flushed. 
1638 */ 1639 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, 1640 const struct kvm_memory_slot *slot) 1641 { 1642 struct kvm_mmu_page *root; 1643 bool spte_set = false; 1644 1645 lockdep_assert_held_read(&kvm->mmu_lock); 1646 1647 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) 1648 spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn, 1649 slot->base_gfn + slot->npages); 1650 1651 return spte_set; 1652 } 1653 1654 /* 1655 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is 1656 * set in mask, starting at gfn. The given memslot is expected to contain all 1657 * the GFNs represented by set bits in the mask. If AD bits are enabled, 1658 * clearing the dirty status will involve clearing the dirty bit on each SPTE 1659 * or, if AD bits are not enabled, clearing the writable bit on each SPTE. 1660 */ 1661 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, 1662 gfn_t gfn, unsigned long mask, bool wrprot) 1663 { 1664 u64 dbit = (wrprot || !kvm_ad_enabled()) ? PT_WRITABLE_MASK : 1665 shadow_dirty_mask; 1666 struct tdp_iter iter; 1667 1668 rcu_read_lock(); 1669 1670 tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask), 1671 gfn + BITS_PER_LONG) { 1672 if (!mask) 1673 break; 1674 1675 MMU_WARN_ON(kvm_ad_enabled() && 1676 spte_ad_need_write_protect(iter.old_spte)); 1677 1678 if (iter.level > PG_LEVEL_4K || 1679 !(mask & (1UL << (iter.gfn - gfn)))) 1680 continue; 1681 1682 mask &= ~(1UL << (iter.gfn - gfn)); 1683 1684 if (!(iter.old_spte & dbit)) 1685 continue; 1686 1687 iter.old_spte = tdp_mmu_clear_spte_bits(iter.sptep, 1688 iter.old_spte, dbit, 1689 iter.level); 1690 1691 trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level, 1692 iter.old_spte, 1693 iter.old_spte & ~dbit); 1694 kvm_set_pfn_dirty(spte_to_pfn(iter.old_spte)); 1695 } 1696 1697 rcu_read_unlock(); 1698 } 1699 1700 /* 1701 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is 1702 * set in mask, starting at gfn. The given memslot is expected to contain all 1703 * the GFNs represented by set bits in the mask. If AD bits are enabled, 1704 * clearing the dirty status will involve clearing the dirty bit on each SPTE 1705 * or, if AD bits are not enabled, clearing the writable bit on each SPTE. 1706 */ 1707 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, 1708 struct kvm_memory_slot *slot, 1709 gfn_t gfn, unsigned long mask, 1710 bool wrprot) 1711 { 1712 struct kvm_mmu_page *root; 1713 1714 lockdep_assert_held_write(&kvm->mmu_lock); 1715 for_each_tdp_mmu_root(kvm, root, slot->as_id) 1716 clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot); 1717 } 1718 1719 static void zap_collapsible_spte_range(struct kvm *kvm, 1720 struct kvm_mmu_page *root, 1721 const struct kvm_memory_slot *slot) 1722 { 1723 gfn_t start = slot->base_gfn; 1724 gfn_t end = start + slot->npages; 1725 struct tdp_iter iter; 1726 int max_mapping_level; 1727 1728 rcu_read_lock(); 1729 1730 for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) { 1731 retry: 1732 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) 1733 continue; 1734 1735 if (iter.level > KVM_MAX_HUGEPAGE_LEVEL || 1736 !is_shadow_present_pte(iter.old_spte)) 1737 continue; 1738 1739 /* 1740 * Don't zap leaf SPTEs, if a leaf SPTE could be replaced with 1741 * a large page size, then its parent would have been zapped 1742 * instead of stepping down. 
1743 */ 1744 if (is_last_spte(iter.old_spte, iter.level)) 1745 continue; 1746 1747 /* 1748 * If iter.gfn resides outside of the slot, i.e. the page for 1749 * the current level overlaps but is not contained by the slot, 1750 * then the SPTE can't be made huge. More importantly, trying 1751 * to query that info from slot->arch.lpage_info will cause an 1752 * out-of-bounds access. 1753 */ 1754 if (iter.gfn < start || iter.gfn >= end) 1755 continue; 1756 1757 max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, 1758 iter.gfn, PG_LEVEL_NUM); 1759 if (max_mapping_level < iter.level) 1760 continue; 1761 1762 /* Note, a successful atomic zap also does a remote TLB flush. */ 1763 if (tdp_mmu_zap_spte_atomic(kvm, &iter)) 1764 goto retry; 1765 } 1766 1767 rcu_read_unlock(); 1768 } 1769 1770 /* 1771 * Zap non-leaf SPTEs (and free their associated page tables) which could 1772 * be replaced by huge pages, for GFNs within the slot. 1773 */ 1774 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, 1775 const struct kvm_memory_slot *slot) 1776 { 1777 struct kvm_mmu_page *root; 1778 1779 lockdep_assert_held_read(&kvm->mmu_lock); 1780 1781 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) 1782 zap_collapsible_spte_range(kvm, root, slot); 1783 } 1784 1785 /* 1786 * Removes write access on the last level SPTE mapping this GFN and unsets the 1787 * MMU-writable bit to ensure future writes continue to be intercepted. 1788 * Returns true if an SPTE was set and a TLB flush is needed. 1789 */ 1790 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, 1791 gfn_t gfn, int min_level) 1792 { 1793 struct tdp_iter iter; 1794 u64 new_spte; 1795 bool spte_set = false; 1796 1797 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); 1798 1799 rcu_read_lock(); 1800 1801 for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) { 1802 if (!is_shadow_present_pte(iter.old_spte) || 1803 !is_last_spte(iter.old_spte, iter.level)) 1804 continue; 1805 1806 new_spte = iter.old_spte & 1807 ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask); 1808 1809 if (new_spte == iter.old_spte) 1810 break; 1811 1812 tdp_mmu_set_spte(kvm, &iter, new_spte); 1813 spte_set = true; 1814 } 1815 1816 rcu_read_unlock(); 1817 1818 return spte_set; 1819 } 1820 1821 /* 1822 * Removes write access on the last level SPTE mapping this GFN and unsets the 1823 * MMU-writable bit to ensure future writes continue to be intercepted. 1824 * Returns true if an SPTE was set and a TLB flush is needed. 1825 */ 1826 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, 1827 struct kvm_memory_slot *slot, gfn_t gfn, 1828 int min_level) 1829 { 1830 struct kvm_mmu_page *root; 1831 bool spte_set = false; 1832 1833 lockdep_assert_held_write(&kvm->mmu_lock); 1834 for_each_tdp_mmu_root(kvm, root, slot->as_id) 1835 spte_set |= write_protect_gfn(kvm, root, gfn, min_level); 1836 1837 return spte_set; 1838 } 1839 1840 /* 1841 * Return the level of the lowest level SPTE added to sptes. 1842 * That SPTE may be non-present. 1843 * 1844 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. 
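 * The returned level also indexes @sptes directly, i.e. sptes[level] holds the
 * SPTE encountered at that level of the walk.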
 */
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
                         int *root_level)
{
        struct tdp_iter iter;
        struct kvm_mmu *mmu = vcpu->arch.mmu;
        gfn_t gfn = addr >> PAGE_SHIFT;
        int leaf = -1;

        *root_level = vcpu->arch.mmu->root_role.level;

        tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
                leaf = iter.level;
                sptes[leaf] = iter.old_spte;
        }

        return leaf;
}

/*
 * Returns the last level spte pointer of the shadow page walk for the given
 * gpa, and sets *spte to the spte value. This spte may be non-present. If no
 * walk could be performed, returns NULL and *spte does not contain valid data.
 *
 * Contract:
 *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
 *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
 *
 * WARNING: This function is only intended to be called during fast_page_fault.
 */
u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
                                        u64 *spte)
{
        struct tdp_iter iter;
        struct kvm_mmu *mmu = vcpu->arch.mmu;
        gfn_t gfn = addr >> PAGE_SHIFT;
        tdp_ptep_t sptep = NULL;

        tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
                *spte = iter.old_spte;
                sptep = iter.sptep;
        }

        /*
         * Perform the rcu_dereference to get the raw spte pointer value since
         * we are passing it up to fast_page_fault, which is shared with the
         * legacy MMU and thus does not retain the TDP MMU-specific __rcu
         * annotation.
         *
         * This is safe since fast_page_fault obeys the contracts of this
         * function as well as all TDP MMU contracts around modifying SPTEs
         * outside of mmu_lock.
         */
        return rcu_dereference(sptep);
}
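
/*
 * Illustrative sketch (approximate, the real logic lives in fast_page_fault()
 * in mmu.c) of how the lockless walk above is meant to be consumed:
 *
 *	u64 spte;
 *	u64 *sptep;
 *
 *	kvm_tdp_mmu_walk_lockless_begin();
 *	sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, addr, &spte);
 *	if (sptep && is_shadow_present_pte(spte))
 *		(attempt a lockless cmpxchg of a fixed-up value into *sptep)
 *	kvm_tdp_mmu_walk_lockless_end();
 *
 * The raw pointer must not be dereferenced after the walk ends, as the page
 * table it points into may be freed by an RCU callback once the read-side
 * critical section is left.
 */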