// SPDX-License-Identifier: GPL-2.0

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>

static bool __read_mostly tdp_mmu_enabled = true;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);

/* Initializes the TDP MMU for the VM, if enabled. */
int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
	struct workqueue_struct *wq;

	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
		return 0;

	wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);
	if (!wq)
		return -ENOMEM;

	/* This should not be changed for the lifetime of the VM. */
	kvm->arch.tdp_mmu_enabled = true;
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
	kvm->arch.tdp_mmu_zap_wq = wq;
	return 1;
}

/* Arbitrarily returns true so that this may be used in if statements. */
static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
							     bool shared)
{
	if (shared)
		lockdep_assert_held_read(&kvm->mmu_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	return true;
}

void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
	if (!kvm->arch.tdp_mmu_enabled)
		return;

	/* Also waits for any queued work items. */
	destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);

	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

	/*
	 * Ensure that all the outstanding RCU callbacks to free shadow pages
	 * can run before the VM is torn down. Work items on tdp_mmu_zap_wq
	 * can call kvm_tdp_mmu_put_root and create new callbacks.
	 */
	rcu_barrier();
}

static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
	free_page((unsigned long)sp->spt);
	kmem_cache_free(mmu_page_header_cache, sp);
}

/*
 * This is called through call_rcu in order to free TDP page table memory
 * safely with respect to other kernel threads that may be operating on
 * the memory.
 * By only accessing TDP MMU page table memory in an RCU read critical
 * section, and freeing it after a grace period, lockless access to that
 * memory won't use it after it is freed.
 */
static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
{
	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
					       rcu_head);

	tdp_mmu_free_sp(sp);
}

static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			     bool shared);

static void tdp_mmu_zap_root_work(struct work_struct *work)
{
	struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page,
						 tdp_mmu_async_work);
	struct kvm *kvm = root->tdp_mmu_async_data;

	read_lock(&kvm->mmu_lock);

	/*
	 * A TLB flush is not necessary as KVM performs a local TLB flush when
	 * allocating a new root (see kvm_mmu_load()), and when migrating a
	 * vCPU to a different pCPU. Note, the local TLB flush on reuse also
	 * invalidates any paging-structure-cache entries, i.e. TLB entries for
	 * intermediate paging structures, that may be zapped, as such entries
	 * are associated with the ASID on both VMX and SVM.
	 */
	tdp_mmu_zap_root(kvm, root, true);

	/*
	 * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for
	 * avoiding an infinite loop.
By design, the root is reachable while 114 * it's being asynchronously zapped, thus a different task can put its 115 * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an 116 * asynchronously zapped root is unavoidable. 117 */ 118 kvm_tdp_mmu_put_root(kvm, root, true); 119 120 read_unlock(&kvm->mmu_lock); 121 } 122 123 static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root) 124 { 125 root->tdp_mmu_async_data = kvm; 126 INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work); 127 queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work); 128 } 129 130 static inline bool kvm_tdp_root_mark_invalid(struct kvm_mmu_page *page) 131 { 132 union kvm_mmu_page_role role = page->role; 133 role.invalid = true; 134 135 /* No need to use cmpxchg, only the invalid bit can change. */ 136 role.word = xchg(&page->role.word, role.word); 137 return role.invalid; 138 } 139 140 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, 141 bool shared) 142 { 143 kvm_lockdep_assert_mmu_lock_held(kvm, shared); 144 145 if (!refcount_dec_and_test(&root->tdp_mmu_root_count)) 146 return; 147 148 WARN_ON(!root->tdp_mmu_page); 149 150 /* 151 * The root now has refcount=0. It is valid, but readers already 152 * cannot acquire a reference to it because kvm_tdp_mmu_get_root() 153 * rejects it. This remains true for the rest of the execution 154 * of this function, because readers visit valid roots only 155 * (except for tdp_mmu_zap_root_work(), which however 156 * does not acquire any reference itself). 157 * 158 * Even though there are flows that need to visit all roots for 159 * correctness, they all take mmu_lock for write, so they cannot yet 160 * run concurrently. The same is true after kvm_tdp_root_mark_invalid, 161 * since the root still has refcount=0. 162 * 163 * However, tdp_mmu_zap_root can yield, and writers do not expect to 164 * see refcount=0 (see for example kvm_tdp_mmu_invalidate_all_roots()). 165 * So the root temporarily gets an extra reference, going to refcount=1 166 * while staying invalid. Readers still cannot acquire any reference; 167 * but writers are now allowed to run if tdp_mmu_zap_root yields and 168 * they might take an extra reference if they themselves yield. 169 * Therefore, when the reference is given back by the worker, 170 * there is no guarantee that the refcount is still 1. If not, whoever 171 * puts the last reference will free the page, but they will not have to 172 * zap the root because a root cannot go from invalid to valid. 173 */ 174 if (!kvm_tdp_root_mark_invalid(root)) { 175 refcount_set(&root->tdp_mmu_root_count, 1); 176 177 /* 178 * Zapping the root in a worker is not just "nice to have"; 179 * it is required because kvm_tdp_mmu_invalidate_all_roots() 180 * skips already-invalid roots. If kvm_tdp_mmu_put_root() did 181 * not add the root to the workqueue, kvm_tdp_mmu_zap_all_fast() 182 * might return with some roots not zapped yet. 183 */ 184 tdp_mmu_schedule_zap_root(kvm, root); 185 return; 186 } 187 188 spin_lock(&kvm->arch.tdp_mmu_pages_lock); 189 list_del_rcu(&root->link); 190 spin_unlock(&kvm->arch.tdp_mmu_pages_lock); 191 call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback); 192 } 193 194 /* 195 * Returns the next root after @prev_root (or the first root if @prev_root is 196 * NULL). A reference to the returned root is acquired, and the reference to 197 * @prev_root is released (the caller obviously must hold a reference to 198 * @prev_root if it's non-NULL). 
199 * 200 * If @only_valid is true, invalid roots are skipped. 201 * 202 * Returns NULL if the end of tdp_mmu_roots was reached. 203 */ 204 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, 205 struct kvm_mmu_page *prev_root, 206 bool shared, bool only_valid) 207 { 208 struct kvm_mmu_page *next_root; 209 210 rcu_read_lock(); 211 212 if (prev_root) 213 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, 214 &prev_root->link, 215 typeof(*prev_root), link); 216 else 217 next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots, 218 typeof(*next_root), link); 219 220 while (next_root) { 221 if ((!only_valid || !next_root->role.invalid) && 222 kvm_tdp_mmu_get_root(next_root)) 223 break; 224 225 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, 226 &next_root->link, typeof(*next_root), link); 227 } 228 229 rcu_read_unlock(); 230 231 if (prev_root) 232 kvm_tdp_mmu_put_root(kvm, prev_root, shared); 233 234 return next_root; 235 } 236 237 /* 238 * Note: this iterator gets and puts references to the roots it iterates over. 239 * This makes it safe to release the MMU lock and yield within the loop, but 240 * if exiting the loop early, the caller must drop the reference to the most 241 * recent root. (Unless keeping a live reference is desirable.) 242 * 243 * If shared is set, this function is operating under the MMU lock in read 244 * mode. In the unlikely event that this thread must free a root, the lock 245 * will be temporarily dropped and reacquired in write mode. 246 */ 247 #define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\ 248 for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid); \ 249 _root; \ 250 _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid)) \ 251 if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) && \ 252 kvm_mmu_page_as_id(_root) != _as_id) { \ 253 } else 254 255 #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \ 256 __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true) 257 258 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \ 259 __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false) 260 261 /* 262 * Iterate over all TDP MMU roots. Requires that mmu_lock be held for write, 263 * the implication being that any flow that holds mmu_lock for read is 264 * inherently yield-friendly and should use the yield-safe variant above. 265 * Holding mmu_lock for write obviates the need for RCU protection as the list 266 * is guaranteed to be stable. 
267 */ 268 #define for_each_tdp_mmu_root(_kvm, _root, _as_id) \ 269 list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \ 270 if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \ 271 kvm_mmu_page_as_id(_root) != _as_id) { \ 272 } else 273 274 static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu) 275 { 276 struct kvm_mmu_page *sp; 277 278 sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); 279 sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache); 280 281 return sp; 282 } 283 284 static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep, 285 gfn_t gfn, union kvm_mmu_page_role role) 286 { 287 set_page_private(virt_to_page(sp->spt), (unsigned long)sp); 288 289 sp->role = role; 290 sp->gfn = gfn; 291 sp->ptep = sptep; 292 sp->tdp_mmu_page = true; 293 294 trace_kvm_mmu_get_page(sp, true); 295 } 296 297 static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp, 298 struct tdp_iter *iter) 299 { 300 struct kvm_mmu_page *parent_sp; 301 union kvm_mmu_page_role role; 302 303 parent_sp = sptep_to_sp(rcu_dereference(iter->sptep)); 304 305 role = parent_sp->role; 306 role.level--; 307 308 tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role); 309 } 310 311 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) 312 { 313 union kvm_mmu_page_role role = vcpu->arch.mmu->root_role; 314 struct kvm *kvm = vcpu->kvm; 315 struct kvm_mmu_page *root; 316 317 lockdep_assert_held_write(&kvm->mmu_lock); 318 319 /* 320 * Check for an existing root before allocating a new one. Note, the 321 * role check prevents consuming an invalid root. 322 */ 323 for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) { 324 if (root->role.word == role.word && 325 kvm_tdp_mmu_get_root(root)) 326 goto out; 327 } 328 329 root = tdp_mmu_alloc_sp(vcpu); 330 tdp_mmu_init_sp(root, NULL, 0, role); 331 332 refcount_set(&root->tdp_mmu_root_count, 1); 333 334 spin_lock(&kvm->arch.tdp_mmu_pages_lock); 335 list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots); 336 spin_unlock(&kvm->arch.tdp_mmu_pages_lock); 337 338 out: 339 return __pa(root->spt); 340 } 341 342 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, 343 u64 old_spte, u64 new_spte, int level, 344 bool shared); 345 346 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level) 347 { 348 if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level)) 349 return; 350 351 if (is_accessed_spte(old_spte) && 352 (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) || 353 spte_to_pfn(old_spte) != spte_to_pfn(new_spte))) 354 kvm_set_pfn_accessed(spte_to_pfn(old_spte)); 355 } 356 357 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, 358 u64 old_spte, u64 new_spte, int level) 359 { 360 bool pfn_changed; 361 struct kvm_memory_slot *slot; 362 363 if (level > PG_LEVEL_4K) 364 return; 365 366 pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); 367 368 if ((!is_writable_pte(old_spte) || pfn_changed) && 369 is_writable_pte(new_spte)) { 370 slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn); 371 mark_page_dirty_in_slot(kvm, slot, gfn); 372 } 373 } 374 375 /** 376 * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages 377 * 378 * @kvm: kvm instance 379 * @sp: the page to be removed 380 * @shared: This operation may not be running under the exclusive use of 381 * the MMU lock and the operation must synchronize with other 382 * threads that might be adding or removing pages. 
 */
static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
			      bool shared)
{
	if (shared)
		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	list_del(&sp->link);
	if (sp->lpage_disallowed)
		unaccount_huge_nx_page(kvm, sp);

	if (shared)
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

/**
 * handle_removed_pt() - handle a page table removed from the TDP structure
 *
 * @kvm: kvm instance
 * @pt: the page removed from the paging structure
 * @shared: This operation may not be running under the exclusive use
 *	    of the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Given a page table that has been removed from the TDP paging structure,
 * iterates through the page table to clear SPTEs and free child page tables.
 *
 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
 * protection. Since this thread removed it from the paging structure,
 * this thread will be responsible for ensuring the page is freed. Hence the
 * early rcu_dereferences in the function.
 */
static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
	int level = sp->role.level;
	gfn_t base_gfn = sp->gfn;
	int i;

	trace_kvm_mmu_prepare_zap_page(sp);

	tdp_mmu_unlink_sp(kvm, sp, shared);

	for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
		tdp_ptep_t sptep = pt + i;
		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
		u64 old_spte;

		if (shared) {
			/*
			 * Set the SPTE to a nonpresent value that other
			 * threads will not overwrite. If the SPTE was
			 * already marked as removed then another thread
			 * handling a page fault could overwrite it, so
			 * keep retrying the write until the SPTE is
			 * successfully changed from some other value to
			 * the removed SPTE value.
			 */
			for (;;) {
				old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
				if (!is_removed_spte(old_spte))
					break;
				cpu_relax();
			}
		} else {
			/*
			 * If the SPTE is not MMU-present, there is no backing
			 * page associated with the SPTE and so no side effects
			 * that need to be recorded, and exclusive ownership of
			 * mmu_lock ensures the SPTE can't be made present.
			 * Note, zapping MMIO SPTEs is also unnecessary as they
			 * are guarded by the memslots generation, not by being
			 * unreachable.
			 */
			old_spte = kvm_tdp_mmu_read_spte(sptep);
			if (!is_shadow_present_pte(old_spte))
				continue;

			/*
			 * Use the common helper instead of a raw WRITE_ONCE as
			 * the SPTE needs to be updated atomically if it can be
			 * modified by a different vCPU outside of mmu_lock.
			 * Even though the parent SPTE is !PRESENT, the TLB
			 * hasn't yet been flushed, and both Intel and AMD
			 * document that A/D assists can use upper-level PxE
			 * entries that are cached in the TLB, i.e. the CPU can
			 * still access the page and mark it dirty.
			 *
			 * No retry is needed in the atomic update path as the
			 * sole concern is dropping a Dirty bit, i.e. no other
			 * task can zap/remove the SPTE as mmu_lock is held for
			 * write. Marking the SPTE as a removed SPTE is not
			 * strictly necessary for the same reason, but using
			 * the removed SPTE value keeps the shared/exclusive
			 * paths consistent and allows the handle_changed_spte()
			 * call below to hardcode the new value to REMOVED_SPTE.
			 *
			 * Note, even though dropping a Dirty bit is the only
			 * scenario where a non-atomic update could result in a
			 * functional bug, simply checking the Dirty bit isn't
			 * sufficient as a fast page fault could read the upper
			 * level SPTE before it is zapped, and then make this
			 * target SPTE writable, resume the guest, and set the
			 * Dirty bit between reading the SPTE above and writing
			 * it here.
			 */
			old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
							  REMOVED_SPTE, level);
		}
		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
				    old_spte, REMOVED_SPTE, level, shared);
	}

	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/**
 * __handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Handle bookkeeping that might result from the modification of a SPTE.
 * This function must be called for all TDP SPTE modifications.
 */
static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				  u64 old_spte, u64 new_spte, int level,
				  bool shared)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);
	bool was_leaf = was_present && is_last_spte(old_spte, level);
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
	WARN_ON(level < PG_LEVEL_4K);
	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));

	/*
	 * If this warning were to trigger it would indicate that there was a
	 * missing MMU notifier or a race with some notifier handler.
	 * A present, leaf SPTE should never be directly replaced with another
	 * present leaf SPTE pointing to a different PFN. A notifier handler
	 * should be zapping the SPTE before the main MM's page table is
	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
	 * thread before replacement.
	 */
	if (was_leaf && is_leaf && pfn_changed) {
		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
		       "SPTE with another present leaf SPTE mapping a\n"
		       "different PFN!\n"
		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
		       as_id, gfn, old_spte, new_spte, level);

		/*
		 * Crash the host to prevent error propagation and guest data
		 * corruption.
548 */ 549 BUG(); 550 } 551 552 if (old_spte == new_spte) 553 return; 554 555 trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte); 556 557 if (is_leaf) 558 check_spte_writable_invariants(new_spte); 559 560 /* 561 * The only times a SPTE should be changed from a non-present to 562 * non-present state is when an MMIO entry is installed/modified/ 563 * removed. In that case, there is nothing to do here. 564 */ 565 if (!was_present && !is_present) { 566 /* 567 * If this change does not involve a MMIO SPTE or removed SPTE, 568 * it is unexpected. Log the change, though it should not 569 * impact the guest since both the former and current SPTEs 570 * are nonpresent. 571 */ 572 if (WARN_ON(!is_mmio_spte(old_spte) && 573 !is_mmio_spte(new_spte) && 574 !is_removed_spte(new_spte))) 575 pr_err("Unexpected SPTE change! Nonpresent SPTEs\n" 576 "should not be replaced with another,\n" 577 "different nonpresent SPTE, unless one or both\n" 578 "are MMIO SPTEs, or the new SPTE is\n" 579 "a temporary removed SPTE.\n" 580 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d", 581 as_id, gfn, old_spte, new_spte, level); 582 return; 583 } 584 585 if (is_leaf != was_leaf) 586 kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1); 587 588 if (was_leaf && is_dirty_spte(old_spte) && 589 (!is_present || !is_dirty_spte(new_spte) || pfn_changed)) 590 kvm_set_pfn_dirty(spte_to_pfn(old_spte)); 591 592 /* 593 * Recursively handle child PTs if the change removed a subtree from 594 * the paging structure. Note the WARN on the PFN changing without the 595 * SPTE being converted to a hugepage (leaf) or being zapped. Shadow 596 * pages are kernel allocations and should never be migrated. 597 */ 598 if (was_present && !was_leaf && 599 (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed))) 600 handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared); 601 } 602 603 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, 604 u64 old_spte, u64 new_spte, int level, 605 bool shared) 606 { 607 __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, 608 shared); 609 handle_changed_spte_acc_track(old_spte, new_spte, level); 610 handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte, 611 new_spte, level); 612 } 613 614 /* 615 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically 616 * and handle the associated bookkeeping. Do not mark the page dirty 617 * in KVM's dirty bitmaps. 618 * 619 * If setting the SPTE fails because it has changed, iter->old_spte will be 620 * refreshed to the current value of the spte. 621 * 622 * @kvm: kvm instance 623 * @iter: a tdp_iter instance currently on the SPTE that should be set 624 * @new_spte: The value the SPTE should be set to 625 * Return: 626 * * 0 - If the SPTE was set. 627 * * -EBUSY - If the SPTE cannot be set. In this case this function will have 628 * no side-effects other than setting iter->old_spte to the last 629 * known value of the spte. 630 */ 631 static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm, 632 struct tdp_iter *iter, 633 u64 new_spte) 634 { 635 u64 *sptep = rcu_dereference(iter->sptep); 636 637 /* 638 * The caller is responsible for ensuring the old SPTE is not a REMOVED 639 * SPTE. KVM should never attempt to zap or manipulate a REMOVED SPTE, 640 * and pre-checking before inserting a new SPTE is advantageous as it 641 * avoids unnecessary work. 
	 */
	WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));

	lockdep_assert_held_read(&kvm->mmu_lock);

	/*
	 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
	 * does not hold the mmu_lock.
	 */
	if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
		return -EBUSY;

	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
			      new_spte, iter->level, true);
	handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);

	return 0;
}

static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
					  struct tdp_iter *iter)
{
	int ret;

	/*
	 * Freeze the SPTE by setting it to a special,
	 * non-present value. This will stop other threads from
	 * immediately installing a present entry in its place
	 * before the TLBs are flushed.
	 */
	ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
	if (ret)
		return ret;

	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
					   KVM_PAGES_PER_HPAGE(iter->level));

	/*
	 * No other thread can overwrite the removed SPTE as they must either
	 * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
	 * overwrite the special removed SPTE value. No bookkeeping is needed
	 * here since the SPTE is going from non-present to non-present. Use
	 * the raw write helper to avoid an unnecessary check on volatile bits.
	 */
	__kvm_tdp_mmu_write_spte(iter->sptep, 0);

	return 0;
}


/*
 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
 * @kvm:	      KVM instance
 * @as_id:	      Address space ID, i.e. regular vs. SMM
 * @sptep:	      Pointer to the SPTE
 * @old_spte:	      The current value of the SPTE
 * @new_spte:	      The new value that will be set for the SPTE
 * @gfn:	      The base GFN that was (or will be) mapped by the SPTE
 * @level:	      The level _containing_ the SPTE (its parent PT's level)
 * @record_acc_track: Notify the MM subsystem of changes to the accessed state
 *		      of the page. Should be set unless handling an MMU
 *		      notifier for access tracking. Leaving record_acc_track
 *		      unset in that case prevents page accesses from being
 *		      double counted.
 * @record_dirty_log: Record the page as dirty in the dirty bitmap if
 *		      appropriate for the change being made. Should be set
 *		      unless performing certain dirty logging operations.
 *		      Leaving record_dirty_log unset in that case prevents page
 *		      writes from being double counted.
 *
 * Returns the old SPTE value, which _may_ be different than @old_spte if the
 * SPTE had volatile bits.
 */
static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
			      u64 old_spte, u64 new_spte, gfn_t gfn, int level,
			      bool record_acc_track, bool record_dirty_log)
{
	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * No thread should be using this function to set SPTEs to or from the
	 * temporary removed SPTE value.
	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
	 * should be used. If operating under the MMU lock in write mode, the
	 * use of the removed SPTE should not be necessary.
727 */ 728 WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte)); 729 730 old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level); 731 732 __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false); 733 734 if (record_acc_track) 735 handle_changed_spte_acc_track(old_spte, new_spte, level); 736 if (record_dirty_log) 737 handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte, 738 new_spte, level); 739 return old_spte; 740 } 741 742 static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, 743 u64 new_spte, bool record_acc_track, 744 bool record_dirty_log) 745 { 746 WARN_ON_ONCE(iter->yielded); 747 748 iter->old_spte = __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, 749 iter->old_spte, new_spte, 750 iter->gfn, iter->level, 751 record_acc_track, record_dirty_log); 752 } 753 754 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, 755 u64 new_spte) 756 { 757 _tdp_mmu_set_spte(kvm, iter, new_spte, true, true); 758 } 759 760 static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm, 761 struct tdp_iter *iter, 762 u64 new_spte) 763 { 764 _tdp_mmu_set_spte(kvm, iter, new_spte, false, true); 765 } 766 767 static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm, 768 struct tdp_iter *iter, 769 u64 new_spte) 770 { 771 _tdp_mmu_set_spte(kvm, iter, new_spte, true, false); 772 } 773 774 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \ 775 for_each_tdp_pte(_iter, _root, _start, _end) 776 777 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \ 778 tdp_root_for_each_pte(_iter, _root, _start, _end) \ 779 if (!is_shadow_present_pte(_iter.old_spte) || \ 780 !is_last_spte(_iter.old_spte, _iter.level)) \ 781 continue; \ 782 else 783 784 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \ 785 for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end) 786 787 /* 788 * Yield if the MMU lock is contended or this thread needs to return control 789 * to the scheduler. 790 * 791 * If this function should yield and flush is set, it will perform a remote 792 * TLB flush before yielding. 793 * 794 * If this function yields, iter->yielded is set and the caller must skip to 795 * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk 796 * over the paging structures to allow the iterator to continue its traversal 797 * from the paging structure root. 798 * 799 * Returns true if this function yielded. 800 */ 801 static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm, 802 struct tdp_iter *iter, 803 bool flush, bool shared) 804 { 805 WARN_ON(iter->yielded); 806 807 /* Ensure forward progress has been made before yielding. */ 808 if (iter->next_last_level_gfn == iter->yielded_gfn) 809 return false; 810 811 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { 812 if (flush) 813 kvm_flush_remote_tlbs(kvm); 814 815 rcu_read_unlock(); 816 817 if (shared) 818 cond_resched_rwlock_read(&kvm->mmu_lock); 819 else 820 cond_resched_rwlock_write(&kvm->mmu_lock); 821 822 rcu_read_lock(); 823 824 WARN_ON(iter->gfn > iter->next_last_level_gfn); 825 826 iter->yielded = true; 827 } 828 829 return iter->yielded; 830 } 831 832 static inline gfn_t tdp_mmu_max_gfn_exclusive(void) 833 { 834 /* 835 * Bound TDP MMU walks at host.MAXPHYADDR. KVM disallows memslots with 836 * a gpa range that would exceed the max gfn, and KVM does not create 837 * MMIO SPTEs for "impossible" gfns, instead sending such accesses down 838 * the slow emulation path every time. 
839 */ 840 return kvm_mmu_max_gfn() + 1; 841 } 842 843 static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root, 844 bool shared, int zap_level) 845 { 846 struct tdp_iter iter; 847 848 gfn_t end = tdp_mmu_max_gfn_exclusive(); 849 gfn_t start = 0; 850 851 for_each_tdp_pte_min_level(iter, root, zap_level, start, end) { 852 retry: 853 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared)) 854 continue; 855 856 if (!is_shadow_present_pte(iter.old_spte)) 857 continue; 858 859 if (iter.level > zap_level) 860 continue; 861 862 if (!shared) 863 tdp_mmu_set_spte(kvm, &iter, 0); 864 else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0)) 865 goto retry; 866 } 867 } 868 869 static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root, 870 bool shared) 871 { 872 873 /* 874 * The root must have an elevated refcount so that it's reachable via 875 * mmu_notifier callbacks, which allows this path to yield and drop 876 * mmu_lock. When handling an unmap/release mmu_notifier command, KVM 877 * must drop all references to relevant pages prior to completing the 878 * callback. Dropping mmu_lock with an unreachable root would result 879 * in zapping SPTEs after a relevant mmu_notifier callback completes 880 * and lead to use-after-free as zapping a SPTE triggers "writeback" of 881 * dirty accessed bits to the SPTE's associated struct page. 882 */ 883 WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count)); 884 885 kvm_lockdep_assert_mmu_lock_held(kvm, shared); 886 887 rcu_read_lock(); 888 889 /* 890 * To avoid RCU stalls due to recursively removing huge swaths of SPs, 891 * split the zap into two passes. On the first pass, zap at the 1gb 892 * level, and then zap top-level SPs on the second pass. "1gb" is not 893 * arbitrary, as KVM must be able to zap a 1gb shadow page without 894 * inducing a stall to allow in-place replacement with a 1gb hugepage. 895 * 896 * Because zapping a SP recurses on its children, stepping down to 897 * PG_LEVEL_4K in the iterator itself is unnecessary. 898 */ 899 __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G); 900 __tdp_mmu_zap_root(kvm, root, shared, root->role.level); 901 902 rcu_read_unlock(); 903 } 904 905 bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) 906 { 907 u64 old_spte; 908 909 /* 910 * This helper intentionally doesn't allow zapping a root shadow page, 911 * which doesn't have a parent page table and thus no associated entry. 912 */ 913 if (WARN_ON_ONCE(!sp->ptep)) 914 return false; 915 916 old_spte = kvm_tdp_mmu_read_spte(sp->ptep); 917 if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte))) 918 return false; 919 920 __tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0, 921 sp->gfn, sp->role.level + 1, true, true); 922 923 return true; 924 } 925 926 /* 927 * If can_yield is true, will release the MMU lock and reschedule if the 928 * scheduler needs the CPU or there is contention on the MMU lock. If this 929 * function cannot yield, it will not release the MMU lock or reschedule and 930 * the caller must ensure it does not supply too large a GFN range, or the 931 * operation can cause a soft lockup. 
 */
static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t start, gfn_t end, bool can_yield, bool flush)
{
	struct tdp_iter iter;

	end = min(end, tdp_mmu_max_gfn_exclusive());

	lockdep_assert_held_write(&kvm->mmu_lock);

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
		if (can_yield &&
		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
			flush = false;
			continue;
		}

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		tdp_mmu_set_spte(kvm, &iter, 0);
		flush = true;
	}

	rcu_read_unlock();

	/*
	 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
	 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
	 */
	return flush;
}

/*
 * Zap leaf SPTEs for the range of gfns, [start, end), for all roots. Returns
 * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or
 * more SPTEs were zapped since the MMU lock was last acquired.
 */
bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
			   bool can_yield, bool flush)
{
	struct kvm_mmu_page *root;

	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
		flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush);

	return flush;
}

void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
	struct kvm_mmu_page *root;
	int i;

	/*
	 * Zap all roots, including invalid roots, as all SPTEs must be dropped
	 * before returning to the caller. Zap directly even if the root is
	 * also being zapped by a worker. Walking zapped top-level SPTEs isn't
	 * all that expensive and mmu_lock is already held, which means the
	 * worker has yielded, i.e. flushing the work instead of zapping here
	 * isn't guaranteed to be any faster.
	 *
	 * A TLB flush is unnecessary, KVM zaps everything if and only if the
	 * VM is being destroyed or the userspace VMM has exited. In both
	 * cases, KVM_RUN is unreachable, i.e. no vCPUs will ever service the
	 * request.
	 */
	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
		for_each_tdp_mmu_root_yield_safe(kvm, root, i)
			tdp_mmu_zap_root(kvm, root, false);
	}
}

/*
 * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
 * zap" completes.
 */
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
{
	flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
}

/*
 * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
 * is about to be zapped, e.g. in response to a memslots update. The actual
 * zapping is performed asynchronously, so a reference is taken on all roots.
 * Using a separate workqueue makes it easy to ensure that the destruction is
 * performed before the "fast zap" completes, without keeping a separate list
 * of invalidated roots; the list is effectively the list of work items in
 * the workqueue.
 *
 * Get a reference even if the root is already invalid, the asynchronous worker
 * assumes it was gifted a reference to the root it processes. Because mmu_lock
 * is held for write, it should be impossible to observe a root with zero
 * refcount, i.e. the list of roots cannot be stale.
1029 * 1030 * This has essentially the same effect for the TDP MMU 1031 * as updating mmu_valid_gen does for the shadow MMU. 1032 */ 1033 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm) 1034 { 1035 struct kvm_mmu_page *root; 1036 1037 lockdep_assert_held_write(&kvm->mmu_lock); 1038 list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) { 1039 if (!root->role.invalid && 1040 !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) { 1041 root->role.invalid = true; 1042 tdp_mmu_schedule_zap_root(kvm, root); 1043 } 1044 } 1045 } 1046 1047 /* 1048 * Installs a last-level SPTE to handle a TDP page fault. 1049 * (NPT/EPT violation/misconfiguration) 1050 */ 1051 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, 1052 struct kvm_page_fault *fault, 1053 struct tdp_iter *iter) 1054 { 1055 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep)); 1056 u64 new_spte; 1057 int ret = RET_PF_FIXED; 1058 bool wrprot = false; 1059 1060 WARN_ON(sp->role.level != fault->goal_level); 1061 if (unlikely(!fault->slot)) 1062 new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); 1063 else 1064 wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn, 1065 fault->pfn, iter->old_spte, fault->prefetch, true, 1066 fault->map_writable, &new_spte); 1067 1068 if (new_spte == iter->old_spte) 1069 ret = RET_PF_SPURIOUS; 1070 else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte)) 1071 return RET_PF_RETRY; 1072 else if (is_shadow_present_pte(iter->old_spte) && 1073 !is_last_spte(iter->old_spte, iter->level)) 1074 kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, 1075 KVM_PAGES_PER_HPAGE(iter->level + 1)); 1076 1077 /* 1078 * If the page fault was caused by a write but the page is write 1079 * protected, emulation is needed. If the emulation was skipped, 1080 * the vCPU would have the same fault again. 1081 */ 1082 if (wrprot) { 1083 if (fault->write) 1084 ret = RET_PF_EMULATE; 1085 } 1086 1087 /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */ 1088 if (unlikely(is_mmio_spte(new_spte))) { 1089 vcpu->stat.pf_mmio_spte_created++; 1090 trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn, 1091 new_spte); 1092 ret = RET_PF_EMULATE; 1093 } else { 1094 trace_kvm_mmu_set_spte(iter->level, iter->gfn, 1095 rcu_dereference(iter->sptep)); 1096 } 1097 1098 return ret; 1099 } 1100 1101 /* 1102 * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the 1103 * provided page table. 1104 * 1105 * @kvm: kvm instance 1106 * @iter: a tdp_iter instance currently on the SPTE that should be set 1107 * @sp: The new TDP page table to install. 1108 * @account_nx: True if this page table is being installed to split a 1109 * non-executable huge page. 1110 * @shared: This operation is running under the MMU lock in read mode. 1111 * 1112 * Returns: 0 if the new page table was installed. Non-0 if the page table 1113 * could not be installed (e.g. the atomic compare-exchange failed). 
1114 */ 1115 static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter, 1116 struct kvm_mmu_page *sp, bool account_nx, 1117 bool shared) 1118 { 1119 u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled()); 1120 int ret = 0; 1121 1122 if (shared) { 1123 ret = tdp_mmu_set_spte_atomic(kvm, iter, spte); 1124 if (ret) 1125 return ret; 1126 } else { 1127 tdp_mmu_set_spte(kvm, iter, spte); 1128 } 1129 1130 spin_lock(&kvm->arch.tdp_mmu_pages_lock); 1131 list_add(&sp->link, &kvm->arch.tdp_mmu_pages); 1132 if (account_nx) 1133 account_huge_nx_page(kvm, sp); 1134 spin_unlock(&kvm->arch.tdp_mmu_pages_lock); 1135 1136 return 0; 1137 } 1138 1139 /* 1140 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing 1141 * page tables and SPTEs to translate the faulting guest physical address. 1142 */ 1143 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) 1144 { 1145 struct kvm_mmu *mmu = vcpu->arch.mmu; 1146 struct tdp_iter iter; 1147 struct kvm_mmu_page *sp; 1148 int ret; 1149 1150 kvm_mmu_hugepage_adjust(vcpu, fault); 1151 1152 trace_kvm_mmu_spte_requested(fault); 1153 1154 rcu_read_lock(); 1155 1156 tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) { 1157 if (fault->nx_huge_page_workaround_enabled) 1158 disallowed_hugepage_adjust(fault, iter.old_spte, iter.level); 1159 1160 if (iter.level == fault->goal_level) 1161 break; 1162 1163 /* 1164 * If there is an SPTE mapping a large page at a higher level 1165 * than the target, that SPTE must be cleared and replaced 1166 * with a non-leaf SPTE. 1167 */ 1168 if (is_shadow_present_pte(iter.old_spte) && 1169 is_large_pte(iter.old_spte)) { 1170 if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter)) 1171 break; 1172 1173 /* 1174 * The iter must explicitly re-read the spte here 1175 * because the new value informs the !present 1176 * path below. 1177 */ 1178 iter.old_spte = kvm_tdp_mmu_read_spte(iter.sptep); 1179 } 1180 1181 if (!is_shadow_present_pte(iter.old_spte)) { 1182 bool account_nx = fault->huge_page_disallowed && 1183 fault->req_level >= iter.level; 1184 1185 /* 1186 * If SPTE has been frozen by another thread, just 1187 * give up and retry, avoiding unnecessary page table 1188 * allocation and free. 1189 */ 1190 if (is_removed_spte(iter.old_spte)) 1191 break; 1192 1193 sp = tdp_mmu_alloc_sp(vcpu); 1194 tdp_mmu_init_child_sp(sp, &iter); 1195 1196 if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) { 1197 tdp_mmu_free_sp(sp); 1198 break; 1199 } 1200 } 1201 } 1202 1203 /* 1204 * Force the guest to retry the access if the upper level SPTEs aren't 1205 * in place, or if the target leaf SPTE is frozen by another CPU. 
	 */
	if (iter.level != fault->goal_level || is_removed_spte(iter.old_spte)) {
		rcu_read_unlock();
		return RET_PF_RETRY;
	}

	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
	rcu_read_unlock();

	return ret;
}

bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
				 bool flush)
{
	return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start,
				     range->end, range->may_block, flush);
}

typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
			      struct kvm_gfn_range *range);

static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
						   struct kvm_gfn_range *range,
						   tdp_handler_t handler)
{
	struct kvm_mmu_page *root;
	struct tdp_iter iter;
	bool ret = false;

	/*
	 * Don't support rescheduling, none of the MMU notifiers that funnel
	 * into this helper allow blocking; it'd be dead, wasteful code.
	 */
	for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
		rcu_read_lock();

		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
			ret |= handler(kvm, &iter, range);

		rcu_read_unlock();
	}

	return ret;
}

/*
 * Mark the SPTEs mapping the range of GFNs [start, end) unaccessed and return
 * non-zero if any of the GFNs in the range have been accessed.
 */
static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
			  struct kvm_gfn_range *range)
{
	u64 new_spte = 0;

	/* If we have a non-accessed entry we don't need to change the pte. */
	if (!is_accessed_spte(iter->old_spte))
		return false;

	new_spte = iter->old_spte;

	if (spte_ad_enabled(new_spte)) {
		new_spte &= ~shadow_accessed_mask;
	} else {
		/*
		 * Capture the dirty status of the page, so that it doesn't get
		 * lost when the SPTE is marked for access tracking.
		 */
		if (is_writable_pte(new_spte))
			kvm_set_pfn_dirty(spte_to_pfn(new_spte));

		new_spte = mark_spte_for_access_track(new_spte);
	}

	tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);

	return true;
}

bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
}

static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
			 struct kvm_gfn_range *range)
{
	return is_accessed_spte(iter->old_spte);
}

bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
}

static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
			 struct kvm_gfn_range *range)
{
	u64 new_spte;

	/* Huge pages aren't expected to be modified without first being zapped. */
	WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);

	if (iter->level != PG_LEVEL_4K ||
	    !is_shadow_present_pte(iter->old_spte))
		return false;

	/*
	 * Note, when changing a read-only SPTE, it's not strictly necessary to
	 * zero the SPTE before setting the new PFN, but doing so preserves the
	 * invariant that the PFN of a present leaf SPTE can never change.
	 * See __handle_changed_spte().
1318 */ 1319 tdp_mmu_set_spte(kvm, iter, 0); 1320 1321 if (!pte_write(range->pte)) { 1322 new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte, 1323 pte_pfn(range->pte)); 1324 1325 tdp_mmu_set_spte(kvm, iter, new_spte); 1326 } 1327 1328 return true; 1329 } 1330 1331 /* 1332 * Handle the changed_pte MMU notifier for the TDP MMU. 1333 * data is a pointer to the new pte_t mapping the HVA specified by the MMU 1334 * notifier. 1335 * Returns non-zero if a flush is needed before releasing the MMU lock. 1336 */ 1337 bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) 1338 { 1339 /* 1340 * No need to handle the remote TLB flush under RCU protection, the 1341 * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a 1342 * shadow page. See the WARN on pfn_changed in __handle_changed_spte(). 1343 */ 1344 return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn); 1345 } 1346 1347 /* 1348 * Remove write access from all SPTEs at or above min_level that map GFNs 1349 * [start, end). Returns true if an SPTE has been changed and the TLBs need to 1350 * be flushed. 1351 */ 1352 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, 1353 gfn_t start, gfn_t end, int min_level) 1354 { 1355 struct tdp_iter iter; 1356 u64 new_spte; 1357 bool spte_set = false; 1358 1359 rcu_read_lock(); 1360 1361 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); 1362 1363 for_each_tdp_pte_min_level(iter, root, min_level, start, end) { 1364 retry: 1365 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) 1366 continue; 1367 1368 if (!is_shadow_present_pte(iter.old_spte) || 1369 !is_last_spte(iter.old_spte, iter.level) || 1370 !(iter.old_spte & PT_WRITABLE_MASK)) 1371 continue; 1372 1373 new_spte = iter.old_spte & ~PT_WRITABLE_MASK; 1374 1375 if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) 1376 goto retry; 1377 1378 spte_set = true; 1379 } 1380 1381 rcu_read_unlock(); 1382 return spte_set; 1383 } 1384 1385 /* 1386 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will 1387 * only affect leaf SPTEs down to min_level. 1388 * Returns true if an SPTE has been changed and the TLBs need to be flushed. 1389 */ 1390 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, 1391 const struct kvm_memory_slot *slot, int min_level) 1392 { 1393 struct kvm_mmu_page *root; 1394 bool spte_set = false; 1395 1396 lockdep_assert_held_read(&kvm->mmu_lock); 1397 1398 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) 1399 spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn, 1400 slot->base_gfn + slot->npages, min_level); 1401 1402 return spte_set; 1403 } 1404 1405 static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp) 1406 { 1407 struct kvm_mmu_page *sp; 1408 1409 gfp |= __GFP_ZERO; 1410 1411 sp = kmem_cache_alloc(mmu_page_header_cache, gfp); 1412 if (!sp) 1413 return NULL; 1414 1415 sp->spt = (void *)__get_free_page(gfp); 1416 if (!sp->spt) { 1417 kmem_cache_free(mmu_page_header_cache, sp); 1418 return NULL; 1419 } 1420 1421 return sp; 1422 } 1423 1424 static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm, 1425 struct tdp_iter *iter, 1426 bool shared) 1427 { 1428 struct kvm_mmu_page *sp; 1429 1430 /* 1431 * Since we are allocating while under the MMU lock we have to be 1432 * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct 1433 * reclaim and to avoid making any filesystem callbacks (which can end 1434 * up invoking KVM MMU notifiers, resulting in a deadlock). 
1435 * 1436 * If this allocation fails we drop the lock and retry with reclaim 1437 * allowed. 1438 */ 1439 sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT); 1440 if (sp) 1441 return sp; 1442 1443 rcu_read_unlock(); 1444 1445 if (shared) 1446 read_unlock(&kvm->mmu_lock); 1447 else 1448 write_unlock(&kvm->mmu_lock); 1449 1450 iter->yielded = true; 1451 sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT); 1452 1453 if (shared) 1454 read_lock(&kvm->mmu_lock); 1455 else 1456 write_lock(&kvm->mmu_lock); 1457 1458 rcu_read_lock(); 1459 1460 return sp; 1461 } 1462 1463 static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, 1464 struct kvm_mmu_page *sp, bool shared) 1465 { 1466 const u64 huge_spte = iter->old_spte; 1467 const int level = iter->level; 1468 int ret, i; 1469 1470 tdp_mmu_init_child_sp(sp, iter); 1471 1472 /* 1473 * No need for atomics when writing to sp->spt since the page table has 1474 * not been linked in yet and thus is not reachable from any other CPU. 1475 */ 1476 for (i = 0; i < SPTE_ENT_PER_PAGE; i++) 1477 sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i); 1478 1479 /* 1480 * Replace the huge spte with a pointer to the populated lower level 1481 * page table. Since we are making this change without a TLB flush vCPUs 1482 * will see a mix of the split mappings and the original huge mapping, 1483 * depending on what's currently in their TLB. This is fine from a 1484 * correctness standpoint since the translation will be the same either 1485 * way. 1486 */ 1487 ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared); 1488 if (ret) 1489 goto out; 1490 1491 /* 1492 * tdp_mmu_link_sp_atomic() will handle subtracting the huge page we 1493 * are overwriting from the page stats. But we have to manually update 1494 * the page stats with the new present child pages. 1495 */ 1496 kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE); 1497 1498 out: 1499 trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret); 1500 return ret; 1501 } 1502 1503 static int tdp_mmu_split_huge_pages_root(struct kvm *kvm, 1504 struct kvm_mmu_page *root, 1505 gfn_t start, gfn_t end, 1506 int target_level, bool shared) 1507 { 1508 struct kvm_mmu_page *sp = NULL; 1509 struct tdp_iter iter; 1510 int ret = 0; 1511 1512 rcu_read_lock(); 1513 1514 /* 1515 * Traverse the page table splitting all huge pages above the target 1516 * level into one lower level. For example, if we encounter a 1GB page 1517 * we split it into 512 2MB pages. 1518 * 1519 * Since the TDP iterator uses a pre-order traversal, we are guaranteed 1520 * to visit an SPTE before ever visiting its children, which means we 1521 * will correctly recursively split huge pages that are more than one 1522 * level above the target level (e.g. splitting a 1GB to 512 2MB pages, 1523 * and then splitting each of those to 512 4KB pages). 
1524 */ 1525 for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) { 1526 retry: 1527 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared)) 1528 continue; 1529 1530 if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte)) 1531 continue; 1532 1533 if (!sp) { 1534 sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared); 1535 if (!sp) { 1536 ret = -ENOMEM; 1537 trace_kvm_mmu_split_huge_page(iter.gfn, 1538 iter.old_spte, 1539 iter.level, ret); 1540 break; 1541 } 1542 1543 if (iter.yielded) 1544 continue; 1545 } 1546 1547 if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared)) 1548 goto retry; 1549 1550 sp = NULL; 1551 } 1552 1553 rcu_read_unlock(); 1554 1555 /* 1556 * It's possible to exit the loop having never used the last sp if, for 1557 * example, a vCPU doing HugePage NX splitting wins the race and 1558 * installs its own sp in place of the last sp we tried to split. 1559 */ 1560 if (sp) 1561 tdp_mmu_free_sp(sp); 1562 1563 return ret; 1564 } 1565 1566 1567 /* 1568 * Try to split all huge pages mapped by the TDP MMU down to the target level. 1569 */ 1570 void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm, 1571 const struct kvm_memory_slot *slot, 1572 gfn_t start, gfn_t end, 1573 int target_level, bool shared) 1574 { 1575 struct kvm_mmu_page *root; 1576 int r = 0; 1577 1578 kvm_lockdep_assert_mmu_lock_held(kvm, shared); 1579 1580 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) { 1581 r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared); 1582 if (r) { 1583 kvm_tdp_mmu_put_root(kvm, root, shared); 1584 break; 1585 } 1586 } 1587 } 1588 1589 /* 1590 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If 1591 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. 1592 * If AD bits are not enabled, this will require clearing the writable bit on 1593 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to 1594 * be flushed. 1595 */ 1596 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, 1597 gfn_t start, gfn_t end) 1598 { 1599 struct tdp_iter iter; 1600 u64 new_spte; 1601 bool spte_set = false; 1602 1603 rcu_read_lock(); 1604 1605 tdp_root_for_each_leaf_pte(iter, root, start, end) { 1606 retry: 1607 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) 1608 continue; 1609 1610 if (!is_shadow_present_pte(iter.old_spte)) 1611 continue; 1612 1613 if (spte_ad_need_write_protect(iter.old_spte)) { 1614 if (is_writable_pte(iter.old_spte)) 1615 new_spte = iter.old_spte & ~PT_WRITABLE_MASK; 1616 else 1617 continue; 1618 } else { 1619 if (iter.old_spte & shadow_dirty_mask) 1620 new_spte = iter.old_spte & ~shadow_dirty_mask; 1621 else 1622 continue; 1623 } 1624 1625 if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) 1626 goto retry; 1627 1628 spte_set = true; 1629 } 1630 1631 rcu_read_unlock(); 1632 return spte_set; 1633 } 1634 1635 /* 1636 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If 1637 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. 1638 * If AD bits are not enabled, this will require clearing the writable bit on 1639 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to 1640 * be flushed. 
1641 */ 1642 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, 1643 const struct kvm_memory_slot *slot) 1644 { 1645 struct kvm_mmu_page *root; 1646 bool spte_set = false; 1647 1648 lockdep_assert_held_read(&kvm->mmu_lock); 1649 1650 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) 1651 spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn, 1652 slot->base_gfn + slot->npages); 1653 1654 return spte_set; 1655 } 1656 1657 /* 1658 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is 1659 * set in mask, starting at gfn. The given memslot is expected to contain all 1660 * the GFNs represented by set bits in the mask. If AD bits are enabled, 1661 * clearing the dirty status will involve clearing the dirty bit on each SPTE 1662 * or, if AD bits are not enabled, clearing the writable bit on each SPTE. 1663 */ 1664 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, 1665 gfn_t gfn, unsigned long mask, bool wrprot) 1666 { 1667 struct tdp_iter iter; 1668 u64 new_spte; 1669 1670 rcu_read_lock(); 1671 1672 tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask), 1673 gfn + BITS_PER_LONG) { 1674 if (!mask) 1675 break; 1676 1677 if (iter.level > PG_LEVEL_4K || 1678 !(mask & (1UL << (iter.gfn - gfn)))) 1679 continue; 1680 1681 mask &= ~(1UL << (iter.gfn - gfn)); 1682 1683 if (wrprot || spte_ad_need_write_protect(iter.old_spte)) { 1684 if (is_writable_pte(iter.old_spte)) 1685 new_spte = iter.old_spte & ~PT_WRITABLE_MASK; 1686 else 1687 continue; 1688 } else { 1689 if (iter.old_spte & shadow_dirty_mask) 1690 new_spte = iter.old_spte & ~shadow_dirty_mask; 1691 else 1692 continue; 1693 } 1694 1695 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); 1696 } 1697 1698 rcu_read_unlock(); 1699 } 1700 1701 /* 1702 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is 1703 * set in mask, starting at gfn. The given memslot is expected to contain all 1704 * the GFNs represented by set bits in the mask. If AD bits are enabled, 1705 * clearing the dirty status will involve clearing the dirty bit on each SPTE 1706 * or, if AD bits are not enabled, clearing the writable bit on each SPTE. 1707 */ 1708 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, 1709 struct kvm_memory_slot *slot, 1710 gfn_t gfn, unsigned long mask, 1711 bool wrprot) 1712 { 1713 struct kvm_mmu_page *root; 1714 1715 lockdep_assert_held_write(&kvm->mmu_lock); 1716 for_each_tdp_mmu_root(kvm, root, slot->as_id) 1717 clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot); 1718 } 1719 1720 static void zap_collapsible_spte_range(struct kvm *kvm, 1721 struct kvm_mmu_page *root, 1722 const struct kvm_memory_slot *slot) 1723 { 1724 gfn_t start = slot->base_gfn; 1725 gfn_t end = start + slot->npages; 1726 struct tdp_iter iter; 1727 int max_mapping_level; 1728 1729 rcu_read_lock(); 1730 1731 for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) { 1732 retry: 1733 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) 1734 continue; 1735 1736 if (iter.level > KVM_MAX_HUGEPAGE_LEVEL || 1737 !is_shadow_present_pte(iter.old_spte)) 1738 continue; 1739 1740 /* 1741 * Don't zap leaf SPTEs, if a leaf SPTE could be replaced with 1742 * a large page size, then its parent would have been zapped 1743 * instead of stepping down. 1744 */ 1745 if (is_last_spte(iter.old_spte, iter.level)) 1746 continue; 1747 1748 /* 1749 * If iter.gfn resides outside of the slot, i.e. 
the page for 1750 * the current level overlaps but is not contained by the slot, 1751 * then the SPTE can't be made huge. More importantly, trying 1752 * to query that info from slot->arch.lpage_info will cause an 1753 * out-of-bounds access. 1754 */ 1755 if (iter.gfn < start || iter.gfn >= end) 1756 continue; 1757 1758 max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, 1759 iter.gfn, PG_LEVEL_NUM); 1760 if (max_mapping_level < iter.level) 1761 continue; 1762 1763 /* Note, a successful atomic zap also does a remote TLB flush. */ 1764 if (tdp_mmu_zap_spte_atomic(kvm, &iter)) 1765 goto retry; 1766 } 1767 1768 rcu_read_unlock(); 1769 } 1770 1771 /* 1772 * Zap non-leaf SPTEs (and free their associated page tables) which could 1773 * be replaced by huge pages, for GFNs within the slot. 1774 */ 1775 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, 1776 const struct kvm_memory_slot *slot) 1777 { 1778 struct kvm_mmu_page *root; 1779 1780 lockdep_assert_held_read(&kvm->mmu_lock); 1781 1782 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) 1783 zap_collapsible_spte_range(kvm, root, slot); 1784 } 1785 1786 /* 1787 * Removes write access on the last level SPTE mapping this GFN and unsets the 1788 * MMU-writable bit to ensure future writes continue to be intercepted. 1789 * Returns true if an SPTE was set and a TLB flush is needed. 1790 */ 1791 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, 1792 gfn_t gfn, int min_level) 1793 { 1794 struct tdp_iter iter; 1795 u64 new_spte; 1796 bool spte_set = false; 1797 1798 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); 1799 1800 rcu_read_lock(); 1801 1802 for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) { 1803 if (!is_shadow_present_pte(iter.old_spte) || 1804 !is_last_spte(iter.old_spte, iter.level)) 1805 continue; 1806 1807 new_spte = iter.old_spte & 1808 ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask); 1809 1810 if (new_spte == iter.old_spte) 1811 break; 1812 1813 tdp_mmu_set_spte(kvm, &iter, new_spte); 1814 spte_set = true; 1815 } 1816 1817 rcu_read_unlock(); 1818 1819 return spte_set; 1820 } 1821 1822 /* 1823 * Removes write access on the last level SPTE mapping this GFN and unsets the 1824 * MMU-writable bit to ensure future writes continue to be intercepted. 1825 * Returns true if an SPTE was set and a TLB flush is needed. 1826 */ 1827 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, 1828 struct kvm_memory_slot *slot, gfn_t gfn, 1829 int min_level) 1830 { 1831 struct kvm_mmu_page *root; 1832 bool spte_set = false; 1833 1834 lockdep_assert_held_write(&kvm->mmu_lock); 1835 for_each_tdp_mmu_root(kvm, root, slot->as_id) 1836 spte_set |= write_protect_gfn(kvm, root, gfn, min_level); 1837 1838 return spte_set; 1839 } 1840 1841 /* 1842 * Return the level of the lowest level SPTE added to sptes. 1843 * That SPTE may be non-present. 1844 * 1845 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. 1846 */ 1847 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, 1848 int *root_level) 1849 { 1850 struct tdp_iter iter; 1851 struct kvm_mmu *mmu = vcpu->arch.mmu; 1852 gfn_t gfn = addr >> PAGE_SHIFT; 1853 int leaf = -1; 1854 1855 *root_level = vcpu->arch.mmu->root_role.level; 1856 1857 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { 1858 leaf = iter.level; 1859 sptes[leaf] = iter.old_spte; 1860 } 1861 1862 return leaf; 1863 } 1864 1865 /* 1866 * Returns the last level spte pointer of the shadow page walk for the given 1867 * gpa, and sets *spte to the spte value. 
 * This spte may be non-present. If no
 * walk could be performed, returns NULL and *spte does not contain valid data.
 *
 * Contract:
 *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
 *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
 *
 * WARNING: This function is only intended to be called during fast_page_fault.
 */
u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
					u64 *spte)
{
	struct tdp_iter iter;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	gfn_t gfn = addr >> PAGE_SHIFT;
	tdp_ptep_t sptep = NULL;

	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		*spte = iter.old_spte;
		sptep = iter.sptep;
	}

	/*
	 * Perform the rcu_dereference to get the raw spte pointer value since
	 * we are passing it up to fast_page_fault, which is shared with the
	 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
	 * annotation.
	 *
	 * This is safe since fast_page_fault obeys the contracts of this
	 * function as well as all TDP MMU contracts around modifying SPTEs
	 * outside of mmu_lock.
	 */
	return rcu_dereference(sptep);
}