1 // SPDX-License-Identifier: GPL-2.0 2 3 #include "mmu.h" 4 #include "mmu_internal.h" 5 #include "mmutrace.h" 6 #include "tdp_iter.h" 7 #include "tdp_mmu.h" 8 #include "spte.h" 9 10 #include <asm/cmpxchg.h> 11 #include <trace/events/kvm.h> 12 13 static bool __read_mostly tdp_mmu_enabled = true; 14 module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644); 15 16 /* Initializes the TDP MMU for the VM, if enabled. */ 17 bool kvm_mmu_init_tdp_mmu(struct kvm *kvm) 18 { 19 if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled)) 20 return false; 21 22 /* This should not be changed for the lifetime of the VM. */ 23 kvm->arch.tdp_mmu_enabled = true; 24 25 INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots); 26 spin_lock_init(&kvm->arch.tdp_mmu_pages_lock); 27 INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages); 28 29 return true; 30 } 31 32 static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm, 33 bool shared) 34 { 35 if (shared) 36 lockdep_assert_held_read(&kvm->mmu_lock); 37 else 38 lockdep_assert_held_write(&kvm->mmu_lock); 39 } 40 41 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) 42 { 43 if (!kvm->arch.tdp_mmu_enabled) 44 return; 45 46 WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages)); 47 WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); 48 49 /* 50 * Ensure that all the outstanding RCU callbacks to free shadow pages 51 * can run before the VM is torn down. 52 */ 53 rcu_barrier(); 54 } 55 56 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, 57 gfn_t start, gfn_t end, bool can_yield, bool flush, 58 bool shared); 59 60 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp) 61 { 62 free_page((unsigned long)sp->spt); 63 kmem_cache_free(mmu_page_header_cache, sp); 64 } 65 66 /* 67 * This is called through call_rcu in order to free TDP page table memory 68 * safely with respect to other kernel threads that may be operating on 69 * the memory. 70 * By only accessing TDP MMU page table memory in an RCU read critical 71 * section, and freeing it after a grace period, lockless access to that 72 * memory won't use it after it is freed. 73 */ 74 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head) 75 { 76 struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page, 77 rcu_head); 78 79 tdp_mmu_free_sp(sp); 80 } 81 82 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, 83 bool shared) 84 { 85 kvm_lockdep_assert_mmu_lock_held(kvm, shared); 86 87 if (!refcount_dec_and_test(&root->tdp_mmu_root_count)) 88 return; 89 90 WARN_ON(!root->tdp_mmu_page); 91 92 spin_lock(&kvm->arch.tdp_mmu_pages_lock); 93 list_del_rcu(&root->link); 94 spin_unlock(&kvm->arch.tdp_mmu_pages_lock); 95 96 zap_gfn_range(kvm, root, 0, -1ull, false, false, shared); 97 98 call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback); 99 } 100 101 /* 102 * Finds the next valid root after root (or the first valid root if root 103 * is NULL), takes a reference on it, and returns that next root. If root 104 * is not NULL, this thread should have already taken a reference on it, and 105 * that reference will be dropped. If no valid root is found, this 106 * function will return NULL. 
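 *
 * Illustrative caller pattern (a minimal sketch; real callers should use the
 * for_each_tdp_mmu_root_yield_safe() macro defined below, which wraps exactly
 * this loop):
 *
 *	struct kvm_mmu_page *root;
 *
 *	for (root = tdp_mmu_next_root(kvm, NULL, shared);
 *	     root;
 *	     root = tdp_mmu_next_root(kvm, root, shared)) {
 *		... operate on root; it is safe to yield here because the
 *		    loop holds a reference on root ...
 *	}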
107 */ 108 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, 109 struct kvm_mmu_page *prev_root, 110 bool shared) 111 { 112 struct kvm_mmu_page *next_root; 113 114 rcu_read_lock(); 115 116 if (prev_root) 117 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, 118 &prev_root->link, 119 typeof(*prev_root), link); 120 else 121 next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots, 122 typeof(*next_root), link); 123 124 while (next_root && !kvm_tdp_mmu_get_root(kvm, next_root)) 125 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, 126 &next_root->link, typeof(*next_root), link); 127 128 rcu_read_unlock(); 129 130 if (prev_root) 131 kvm_tdp_mmu_put_root(kvm, prev_root, shared); 132 133 return next_root; 134 } 135 136 /* 137 * Note: this iterator gets and puts references to the roots it iterates over. 138 * This makes it safe to release the MMU lock and yield within the loop, but 139 * if exiting the loop early, the caller must drop the reference to the most 140 * recent root. (Unless keeping a live reference is desirable.) 141 * 142 * If shared is set, this function is operating under the MMU lock in read 143 * mode. In the unlikely event that this thread must free a root, the lock 144 * will be temporarily dropped and reacquired in write mode. 145 */ 146 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \ 147 for (_root = tdp_mmu_next_root(_kvm, NULL, _shared); \ 148 _root; \ 149 _root = tdp_mmu_next_root(_kvm, _root, _shared)) \ 150 if (kvm_mmu_page_as_id(_root) != _as_id) { \ 151 } else 152 153 #define for_each_tdp_mmu_root(_kvm, _root, _as_id) \ 154 list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link, \ 155 lockdep_is_held_type(&kvm->mmu_lock, 0) || \ 156 lockdep_is_held(&kvm->arch.tdp_mmu_pages_lock)) \ 157 if (kvm_mmu_page_as_id(_root) != _as_id) { \ 158 } else 159 160 static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu, 161 int level) 162 { 163 union kvm_mmu_page_role role; 164 165 role = vcpu->arch.mmu->mmu_role.base; 166 role.level = level; 167 role.direct = true; 168 role.gpte_is_8_bytes = true; 169 role.access = ACC_ALL; 170 role.ad_disabled = !shadow_accessed_mask; 171 172 return role; 173 } 174 175 static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn, 176 int level) 177 { 178 struct kvm_mmu_page *sp; 179 180 sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); 181 sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache); 182 set_page_private(virt_to_page(sp->spt), (unsigned long)sp); 183 184 sp->role.word = page_role_for_level(vcpu, level).word; 185 sp->gfn = gfn; 186 sp->tdp_mmu_page = true; 187 188 trace_kvm_mmu_get_page(sp, true); 189 190 return sp; 191 } 192 193 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) 194 { 195 union kvm_mmu_page_role role; 196 struct kvm *kvm = vcpu->kvm; 197 struct kvm_mmu_page *root; 198 199 lockdep_assert_held_write(&kvm->mmu_lock); 200 201 role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level); 202 203 /* Check for an existing root before allocating a new one. 
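 *
 * Reuse requires an exact role match: comparing role.word below checks the
 * base mmu_role along with the level, direct, gpte_is_8_bytes, access and
 * ad_disabled bits set in page_role_for_level() in a single compare, and
 * kvm_tdp_mmu_get_root() only succeeds if the root still has a non-zero
 * refcount, i.e. it is not already being torn down.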
*/ 204 for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) { 205 if (root->role.word == role.word && 206 kvm_tdp_mmu_get_root(kvm, root)) 207 goto out; 208 } 209 210 root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level); 211 refcount_set(&root->tdp_mmu_root_count, 1); 212 213 spin_lock(&kvm->arch.tdp_mmu_pages_lock); 214 list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots); 215 spin_unlock(&kvm->arch.tdp_mmu_pages_lock); 216 217 out: 218 return __pa(root->spt); 219 } 220 221 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, 222 u64 old_spte, u64 new_spte, int level, 223 bool shared); 224 225 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level) 226 { 227 if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level)) 228 return; 229 230 if (is_accessed_spte(old_spte) && 231 (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) || 232 spte_to_pfn(old_spte) != spte_to_pfn(new_spte))) 233 kvm_set_pfn_accessed(spte_to_pfn(old_spte)); 234 } 235 236 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, 237 u64 old_spte, u64 new_spte, int level) 238 { 239 bool pfn_changed; 240 struct kvm_memory_slot *slot; 241 242 if (level > PG_LEVEL_4K) 243 return; 244 245 pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); 246 247 if ((!is_writable_pte(old_spte) || pfn_changed) && 248 is_writable_pte(new_spte)) { 249 slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn); 250 mark_page_dirty_in_slot(kvm, slot, gfn); 251 } 252 } 253 254 /** 255 * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU 256 * 257 * @kvm: kvm instance 258 * @sp: the new page 259 * @account_nx: This page replaces a NX large page and should be marked for 260 * eventual reclaim. 261 */ 262 static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp, 263 bool account_nx) 264 { 265 spin_lock(&kvm->arch.tdp_mmu_pages_lock); 266 list_add(&sp->link, &kvm->arch.tdp_mmu_pages); 267 if (account_nx) 268 account_huge_nx_page(kvm, sp); 269 spin_unlock(&kvm->arch.tdp_mmu_pages_lock); 270 } 271 272 /** 273 * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU 274 * 275 * @kvm: kvm instance 276 * @sp: the page to be removed 277 * @shared: This operation may not be running under the exclusive use of 278 * the MMU lock and the operation must synchronize with other 279 * threads that might be adding or removing pages. 280 */ 281 static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp, 282 bool shared) 283 { 284 if (shared) 285 spin_lock(&kvm->arch.tdp_mmu_pages_lock); 286 else 287 lockdep_assert_held_write(&kvm->mmu_lock); 288 289 list_del(&sp->link); 290 if (sp->lpage_disallowed) 291 unaccount_huge_nx_page(kvm, sp); 292 293 if (shared) 294 spin_unlock(&kvm->arch.tdp_mmu_pages_lock); 295 } 296 297 /** 298 * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure 299 * 300 * @kvm: kvm instance 301 * @pt: the page removed from the paging structure 302 * @shared: This operation may not be running under the exclusive use 303 * of the MMU lock and the operation must synchronize with other 304 * threads that might be modifying SPTEs. 305 * 306 * Given a page table that has been removed from the TDP paging structure, 307 * iterates through the page table to clear SPTEs and free child page tables. 308 * 309 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU 310 * protection. 
Since this thread removed it from the paging structure, 311 * this thread will be responsible for ensuring the page is freed. Hence the 312 * early rcu_dereferences in the function. 313 */ 314 static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt, 315 bool shared) 316 { 317 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt)); 318 int level = sp->role.level; 319 gfn_t base_gfn = sp->gfn; 320 u64 old_child_spte; 321 u64 *sptep; 322 gfn_t gfn; 323 int i; 324 325 trace_kvm_mmu_prepare_zap_page(sp); 326 327 tdp_mmu_unlink_page(kvm, sp, shared); 328 329 for (i = 0; i < PT64_ENT_PER_PAGE; i++) { 330 sptep = rcu_dereference(pt) + i; 331 gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level); 332 333 if (shared) { 334 /* 335 * Set the SPTE to a nonpresent value that other 336 * threads will not overwrite. If the SPTE was 337 * already marked as removed then another thread 338 * handling a page fault could overwrite it, so 339 * set the SPTE until it is set from some other 340 * value to the removed SPTE value. 341 */ 342 for (;;) { 343 old_child_spte = xchg(sptep, REMOVED_SPTE); 344 if (!is_removed_spte(old_child_spte)) 345 break; 346 cpu_relax(); 347 } 348 } else { 349 /* 350 * If the SPTE is not MMU-present, there is no backing 351 * page associated with the SPTE and so no side effects 352 * that need to be recorded, and exclusive ownership of 353 * mmu_lock ensures the SPTE can't be made present. 354 * Note, zapping MMIO SPTEs is also unnecessary as they 355 * are guarded by the memslots generation, not by being 356 * unreachable. 357 */ 358 old_child_spte = READ_ONCE(*sptep); 359 if (!is_shadow_present_pte(old_child_spte)) 360 continue; 361 362 /* 363 * Marking the SPTE as a removed SPTE is not 364 * strictly necessary here as the MMU lock will 365 * stop other threads from concurrently modifying 366 * this SPTE. Using the removed SPTE value keeps 367 * the two branches consistent and simplifies 368 * the function. 369 */ 370 WRITE_ONCE(*sptep, REMOVED_SPTE); 371 } 372 handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn, 373 old_child_spte, REMOVED_SPTE, level, 374 shared); 375 } 376 377 kvm_flush_remote_tlbs_with_address(kvm, gfn, 378 KVM_PAGES_PER_HPAGE(level + 1)); 379 380 call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback); 381 } 382 383 /** 384 * __handle_changed_spte - handle bookkeeping associated with an SPTE change 385 * @kvm: kvm instance 386 * @as_id: the address space of the paging structure the SPTE was a part of 387 * @gfn: the base GFN that was mapped by the SPTE 388 * @old_spte: The value of the SPTE before the change 389 * @new_spte: The value of the SPTE after the change 390 * @level: the level of the PT the SPTE is part of in the paging structure 391 * @shared: This operation may not be running under the exclusive use of 392 * the MMU lock and the operation must synchronize with other 393 * threads that might be modifying SPTEs. 394 * 395 * Handle bookkeeping that might result from the modification of a SPTE. 396 * This function must be called for all TDP SPTE modifications. 
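 *
 * For example (illustrative only), the write-locked zap path ends up calling:
 *
 *	__handle_changed_spte(kvm, as_id, gfn, old_spte, 0, level, false);
 *
 * which updates the page stats if a leaf went away, marks the old PFN dirty
 * if the zapped SPTE was dirty, and, if old_spte pointed to a page table,
 * hands that orphaned table to handle_removed_tdp_mmu_page().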
397 */ 398 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, 399 u64 old_spte, u64 new_spte, int level, 400 bool shared) 401 { 402 bool was_present = is_shadow_present_pte(old_spte); 403 bool is_present = is_shadow_present_pte(new_spte); 404 bool was_leaf = was_present && is_last_spte(old_spte, level); 405 bool is_leaf = is_present && is_last_spte(new_spte, level); 406 bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); 407 408 WARN_ON(level > PT64_ROOT_MAX_LEVEL); 409 WARN_ON(level < PG_LEVEL_4K); 410 WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1)); 411 412 /* 413 * If this warning were to trigger it would indicate that there was a 414 * missing MMU notifier or a race with some notifier handler. 415 * A present, leaf SPTE should never be directly replaced with another 416 * present leaf SPTE pointing to a different PFN. A notifier handler 417 * should be zapping the SPTE before the main MM's page table is 418 * changed, or the SPTE should be zeroed, and the TLBs flushed by the 419 * thread before replacement. 420 */ 421 if (was_leaf && is_leaf && pfn_changed) { 422 pr_err("Invalid SPTE change: cannot replace a present leaf\n" 423 "SPTE with another present leaf SPTE mapping a\n" 424 "different PFN!\n" 425 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d", 426 as_id, gfn, old_spte, new_spte, level); 427 428 /* 429 * Crash the host to prevent error propagation and guest data 430 * corruption. 431 */ 432 BUG(); 433 } 434 435 if (old_spte == new_spte) 436 return; 437 438 trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte); 439 440 /* 441 * The only times a SPTE should be changed from a non-present to 442 * non-present state is when an MMIO entry is installed/modified/ 443 * removed. In that case, there is nothing to do here. 444 */ 445 if (!was_present && !is_present) { 446 /* 447 * If this change does not involve a MMIO SPTE or removed SPTE, 448 * it is unexpected. Log the change, though it should not 449 * impact the guest since both the former and current SPTEs 450 * are nonpresent. 451 */ 452 if (WARN_ON(!is_mmio_spte(old_spte) && 453 !is_mmio_spte(new_spte) && 454 !is_removed_spte(new_spte))) 455 pr_err("Unexpected SPTE change! Nonpresent SPTEs\n" 456 "should not be replaced with another,\n" 457 "different nonpresent SPTE, unless one or both\n" 458 "are MMIO SPTEs, or the new SPTE is\n" 459 "a temporary removed SPTE.\n" 460 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d", 461 as_id, gfn, old_spte, new_spte, level); 462 return; 463 } 464 465 if (is_leaf != was_leaf) 466 kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1); 467 468 if (was_leaf && is_dirty_spte(old_spte) && 469 (!is_present || !is_dirty_spte(new_spte) || pfn_changed)) 470 kvm_set_pfn_dirty(spte_to_pfn(old_spte)); 471 472 /* 473 * Recursively handle child PTs if the change removed a subtree from 474 * the paging structure. 
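 * In other words, the old SPTE pointed to a child page table and either the
 * entry was zapped outright or it now points at a different PFN; either way
 * the old child table is unreachable, so spte_to_child_pt() recovers it and
 * handle_removed_tdp_mmu_page() zaps its SPTEs and frees it.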
475 */ 476 if (was_present && !was_leaf && (pfn_changed || !is_present)) 477 handle_removed_tdp_mmu_page(kvm, 478 spte_to_child_pt(old_spte, level), shared); 479 } 480 481 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, 482 u64 old_spte, u64 new_spte, int level, 483 bool shared) 484 { 485 __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, 486 shared); 487 handle_changed_spte_acc_track(old_spte, new_spte, level); 488 handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte, 489 new_spte, level); 490 } 491 492 /* 493 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically 494 * and handle the associated bookkeeping. Do not mark the page dirty 495 * in KVM's dirty bitmaps. 496 * 497 * @kvm: kvm instance 498 * @iter: a tdp_iter instance currently on the SPTE that should be set 499 * @new_spte: The value the SPTE should be set to 500 * Returns: true if the SPTE was set, false if it was not. If false is returned, 501 * this function will have no side-effects. 502 */ 503 static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm, 504 struct tdp_iter *iter, 505 u64 new_spte) 506 { 507 lockdep_assert_held_read(&kvm->mmu_lock); 508 509 /* 510 * Do not change removed SPTEs. Only the thread that froze the SPTE 511 * may modify it. 512 */ 513 if (is_removed_spte(iter->old_spte)) 514 return false; 515 516 /* 517 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and 518 * does not hold the mmu_lock. 519 */ 520 if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte, 521 new_spte) != iter->old_spte) 522 return false; 523 524 __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, 525 new_spte, iter->level, true); 526 handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level); 527 528 return true; 529 } 530 531 static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm, 532 struct tdp_iter *iter) 533 { 534 /* 535 * Freeze the SPTE by setting it to a special, 536 * non-present value. This will stop other threads from 537 * immediately installing a present entry in its place 538 * before the TLBs are flushed. 539 */ 540 if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE)) 541 return false; 542 543 kvm_flush_remote_tlbs_with_address(kvm, iter->gfn, 544 KVM_PAGES_PER_HPAGE(iter->level)); 545 546 /* 547 * No other thread can overwrite the removed SPTE as they 548 * must either wait on the MMU lock or use 549 * tdp_mmu_set_spte_atomic which will not overwrite the 550 * special removed SPTE value. No bookkeeping is needed 551 * here since the SPTE is going from non-present 552 * to non-present. 553 */ 554 WRITE_ONCE(*rcu_dereference(iter->sptep), 0); 555 556 return true; 557 } 558 559 560 /* 561 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping 562 * @kvm: kvm instance 563 * @iter: a tdp_iter instance currently on the SPTE that should be set 564 * @new_spte: The value the SPTE should be set to 565 * @record_acc_track: Notify the MM subsystem of changes to the accessed state 566 * of the page. Should be set unless handling an MMU 567 * notifier for access tracking. Leaving record_acc_track 568 * unset in that case prevents page accesses from being 569 * double counted. 570 * @record_dirty_log: Record the page as dirty in the dirty bitmap if 571 * appropriate for the change being made. Should be set 572 * unless performing certain dirty logging operations. 573 * Leaving record_dirty_log unset in that case prevents page 574 * writes from being double counted. 
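 *
 * Most callers go through one of the thin wrappers defined below rather than
 * calling this directly, roughly:
 *
 *	tdp_mmu_set_spte()              - record both access and dirty state
 *	tdp_mmu_set_spte_no_acc_track() - access-tracking/aging notifier paths
 *	tdp_mmu_set_spte_no_dirty_log() - dirty-logging paths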
575 */ 576 static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, 577 u64 new_spte, bool record_acc_track, 578 bool record_dirty_log) 579 { 580 lockdep_assert_held_write(&kvm->mmu_lock); 581 582 /* 583 * No thread should be using this function to set SPTEs to the 584 * temporary removed SPTE value. 585 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic 586 * should be used. If operating under the MMU lock in write mode, the 587 * use of the removed SPTE should not be necessary. 588 */ 589 WARN_ON(is_removed_spte(iter->old_spte)); 590 591 WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte); 592 593 __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, 594 new_spte, iter->level, false); 595 if (record_acc_track) 596 handle_changed_spte_acc_track(iter->old_spte, new_spte, 597 iter->level); 598 if (record_dirty_log) 599 handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn, 600 iter->old_spte, new_spte, 601 iter->level); 602 } 603 604 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, 605 u64 new_spte) 606 { 607 __tdp_mmu_set_spte(kvm, iter, new_spte, true, true); 608 } 609 610 static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm, 611 struct tdp_iter *iter, 612 u64 new_spte) 613 { 614 __tdp_mmu_set_spte(kvm, iter, new_spte, false, true); 615 } 616 617 static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm, 618 struct tdp_iter *iter, 619 u64 new_spte) 620 { 621 __tdp_mmu_set_spte(kvm, iter, new_spte, true, false); 622 } 623 624 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \ 625 for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end) 626 627 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \ 628 tdp_root_for_each_pte(_iter, _root, _start, _end) \ 629 if (!is_shadow_present_pte(_iter.old_spte) || \ 630 !is_last_spte(_iter.old_spte, _iter.level)) \ 631 continue; \ 632 else 633 634 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \ 635 for_each_tdp_pte(_iter, __va(_mmu->root_hpa), \ 636 _mmu->shadow_root_level, _start, _end) 637 638 /* 639 * Yield if the MMU lock is contended or this thread needs to return control 640 * to the scheduler. 641 * 642 * If this function should yield and flush is set, it will perform a remote 643 * TLB flush before yielding. 644 * 645 * If this function yields, it will also reset the tdp_iter's walk over the 646 * paging structure and the calling function should skip to the next 647 * iteration to allow the iterator to continue its traversal from the 648 * paging structure root. 649 * 650 * Return true if this function yielded and the iterator's traversal was reset. 651 * Return false if a yield was not needed. 652 */ 653 static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm, 654 struct tdp_iter *iter, bool flush, 655 bool shared) 656 { 657 /* Ensure forward progress has been made before yielding. 
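 * The iterator records the last-level GFN at which it last (re)started in
 * yielded_gfn; refusing to yield until next_last_level_gfn has advanced past
 * it guarantees the walk makes forward progress even under heavy mmu_lock
 * contention.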
*/ 658 if (iter->next_last_level_gfn == iter->yielded_gfn) 659 return false; 660 661 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { 662 rcu_read_unlock(); 663 664 if (flush) 665 kvm_flush_remote_tlbs(kvm); 666 667 if (shared) 668 cond_resched_rwlock_read(&kvm->mmu_lock); 669 else 670 cond_resched_rwlock_write(&kvm->mmu_lock); 671 672 rcu_read_lock(); 673 674 WARN_ON(iter->gfn > iter->next_last_level_gfn); 675 676 tdp_iter_restart(iter); 677 678 return true; 679 } 680 681 return false; 682 } 683 684 /* 685 * Tears down the mappings for the range of gfns, [start, end), and frees the 686 * non-root pages mapping GFNs strictly within that range. Returns true if 687 * SPTEs have been cleared and a TLB flush is needed before releasing the 688 * MMU lock. 689 * 690 * If can_yield is true, will release the MMU lock and reschedule if the 691 * scheduler needs the CPU or there is contention on the MMU lock. If this 692 * function cannot yield, it will not release the MMU lock or reschedule and 693 * the caller must ensure it does not supply too large a GFN range, or the 694 * operation can cause a soft lockup. 695 * 696 * If shared is true, this thread holds the MMU lock in read mode and must 697 * account for the possibility that other threads are modifying the paging 698 * structures concurrently. If shared is false, this thread should hold the 699 * MMU lock in write mode. 700 */ 701 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, 702 gfn_t start, gfn_t end, bool can_yield, bool flush, 703 bool shared) 704 { 705 gfn_t max_gfn_host = 1ULL << (shadow_phys_bits - PAGE_SHIFT); 706 bool zap_all = (start == 0 && end >= max_gfn_host); 707 struct tdp_iter iter; 708 709 /* 710 * No need to try to step down in the iterator when zapping all SPTEs, 711 * zapping the top-level non-leaf SPTEs will recurse on their children. 712 */ 713 int min_level = zap_all ? root->role.level : PG_LEVEL_4K; 714 715 /* 716 * Bound the walk at host.MAXPHYADDR, guest accesses beyond that will 717 * hit a #PF(RSVD) and never get to an EPT Violation/Misconfig / #NPF, 718 * and so KVM will never install a SPTE for such addresses. 719 */ 720 end = min(end, max_gfn_host); 721 722 kvm_lockdep_assert_mmu_lock_held(kvm, shared); 723 724 rcu_read_lock(); 725 726 for_each_tdp_pte_min_level(iter, root->spt, root->role.level, 727 min_level, start, end) { 728 retry: 729 if (can_yield && 730 tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) { 731 flush = false; 732 continue; 733 } 734 735 if (!is_shadow_present_pte(iter.old_spte)) 736 continue; 737 738 /* 739 * If this is a non-last-level SPTE that covers a larger range 740 * than should be zapped, continue, and zap the mappings at a 741 * lower level, except when zapping all SPTEs. 742 */ 743 if (!zap_all && 744 (iter.gfn < start || 745 iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) && 746 !is_last_spte(iter.old_spte, iter.level)) 747 continue; 748 749 if (!shared) { 750 tdp_mmu_set_spte(kvm, &iter, 0); 751 flush = true; 752 } else if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) { 753 /* 754 * The iter must explicitly re-read the SPTE because 755 * the atomic cmpxchg failed. 756 */ 757 iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); 758 goto retry; 759 } 760 } 761 762 rcu_read_unlock(); 763 return flush; 764 } 765 766 /* 767 * Tears down the mappings for the range of gfns, [start, end), and frees the 768 * non-root pages mapping GFNs strictly within that range. 
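 * Unlike zap_gfn_range(), which operates on a single root, this walks every
 * root in the given address space via for_each_tdp_mmu_root_yield_safe() and
 * accumulates the flush requirement across them.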
Returns true if 769 * SPTEs have been cleared and a TLB flush is needed before releasing the 770 * MMU lock. 771 */ 772 bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start, 773 gfn_t end, bool can_yield, bool flush) 774 { 775 struct kvm_mmu_page *root; 776 777 for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, false) 778 flush = zap_gfn_range(kvm, root, start, end, can_yield, flush, 779 false); 780 781 return flush; 782 } 783 784 void kvm_tdp_mmu_zap_all(struct kvm *kvm) 785 { 786 bool flush = false; 787 int i; 788 789 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) 790 flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull, flush); 791 792 if (flush) 793 kvm_flush_remote_tlbs(kvm); 794 } 795 796 static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm, 797 struct kvm_mmu_page *prev_root) 798 { 799 struct kvm_mmu_page *next_root; 800 801 if (prev_root) 802 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, 803 &prev_root->link, 804 typeof(*prev_root), link); 805 else 806 next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots, 807 typeof(*next_root), link); 808 809 while (next_root && !(next_root->role.invalid && 810 refcount_read(&next_root->tdp_mmu_root_count))) 811 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, 812 &next_root->link, 813 typeof(*next_root), link); 814 815 return next_root; 816 } 817 818 /* 819 * Since kvm_tdp_mmu_zap_all_fast has acquired a reference to each 820 * invalidated root, they will not be freed until this function drops the 821 * reference. Before dropping that reference, tear down the paging 822 * structure so that whichever thread does drop the last reference 823 * only has to do a trivial amount of work. Since the roots are invalid, 824 * no new SPTEs should be created under them. 825 */ 826 void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) 827 { 828 struct kvm_mmu_page *next_root; 829 struct kvm_mmu_page *root; 830 bool flush = false; 831 832 lockdep_assert_held_read(&kvm->mmu_lock); 833 834 rcu_read_lock(); 835 836 root = next_invalidated_root(kvm, NULL); 837 838 while (root) { 839 next_root = next_invalidated_root(kvm, root); 840 841 rcu_read_unlock(); 842 843 flush = zap_gfn_range(kvm, root, 0, -1ull, true, flush, true); 844 845 /* 846 * Put the reference acquired in 847 * kvm_tdp_mmu_invalidate_roots 848 */ 849 kvm_tdp_mmu_put_root(kvm, root, true); 850 851 root = next_root; 852 853 rcu_read_lock(); 854 } 855 856 rcu_read_unlock(); 857 858 if (flush) 859 kvm_flush_remote_tlbs(kvm); 860 } 861 862 /* 863 * Mark each TDP MMU root as invalid so that other threads 864 * will drop their references and allow the root count to 865 * go to 0. 866 * 867 * Also take a reference on all roots so that this thread 868 * can do the bulk of the work required to free the roots 869 * once they are invalidated. Without this reference, a 870 * vCPU thread might drop the last reference to a root and 871 * get stuck with tearing down the entire paging structure. 872 * 873 * Roots which have a zero refcount should be skipped as 874 * they're already being torn down. 875 * Already invalid roots should be referenced again so that 876 * they aren't freed before kvm_tdp_mmu_zap_all_fast is 877 * done with them. 878 * 879 * This has essentially the same effect for the TDP MMU 880 * as updating mmu_valid_gen does for the shadow MMU. 
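 *
 * The intended sequence, driven from kvm_mmu_zap_all_fast() (sketch, not an
 * exact quote of that code):
 *
 *	write_lock(&kvm->mmu_lock);
 *	kvm_tdp_mmu_invalidate_all_roots(kvm);
 *	write_unlock(&kvm->mmu_lock);
 *
 *	read_lock(&kvm->mmu_lock);
 *	kvm_tdp_mmu_zap_invalidated_roots(kvm);
 *	read_unlock(&kvm->mmu_lock);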
881 */ 882 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm) 883 { 884 struct kvm_mmu_page *root; 885 886 lockdep_assert_held_write(&kvm->mmu_lock); 887 list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) 888 if (refcount_inc_not_zero(&root->tdp_mmu_root_count)) 889 root->role.invalid = true; 890 } 891 892 /* 893 * Installs a last-level SPTE to handle a TDP page fault. 894 * (NPT/EPT violation/misconfiguration) 895 */ 896 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, 897 struct kvm_page_fault *fault, 898 struct tdp_iter *iter) 899 { 900 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep)); 901 u64 new_spte; 902 int ret = RET_PF_FIXED; 903 bool wrprot = false; 904 905 WARN_ON(sp->role.level != fault->goal_level); 906 if (unlikely(!fault->slot)) 907 new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); 908 else 909 wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn, 910 fault->pfn, iter->old_spte, fault->prefetch, true, 911 fault->map_writable, &new_spte); 912 913 if (new_spte == iter->old_spte) 914 ret = RET_PF_SPURIOUS; 915 else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte)) 916 return RET_PF_RETRY; 917 918 /* 919 * If the page fault was caused by a write but the page is write 920 * protected, emulation is needed. If the emulation was skipped, 921 * the vCPU would have the same fault again. 922 */ 923 if (wrprot) { 924 if (fault->write) 925 ret = RET_PF_EMULATE; 926 } 927 928 /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */ 929 if (unlikely(is_mmio_spte(new_spte))) { 930 trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn, 931 new_spte); 932 ret = RET_PF_EMULATE; 933 } else { 934 trace_kvm_mmu_set_spte(iter->level, iter->gfn, 935 rcu_dereference(iter->sptep)); 936 } 937 938 /* 939 * Increase pf_fixed in both RET_PF_EMULATE and RET_PF_FIXED to be 940 * consistent with legacy MMU behavior. 941 */ 942 if (ret != RET_PF_SPURIOUS) 943 vcpu->stat.pf_fixed++; 944 945 return ret; 946 } 947 948 /* 949 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing 950 * page tables and SPTEs to translate the faulting guest physical address. 951 */ 952 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) 953 { 954 struct kvm_mmu *mmu = vcpu->arch.mmu; 955 struct tdp_iter iter; 956 struct kvm_mmu_page *sp; 957 u64 *child_pt; 958 u64 new_spte; 959 int ret; 960 961 kvm_mmu_hugepage_adjust(vcpu, fault); 962 963 trace_kvm_mmu_spte_requested(fault); 964 965 rcu_read_lock(); 966 967 tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) { 968 if (fault->nx_huge_page_workaround_enabled) 969 disallowed_hugepage_adjust(fault, iter.old_spte, iter.level); 970 971 if (iter.level == fault->goal_level) 972 break; 973 974 /* 975 * If there is an SPTE mapping a large page at a higher level 976 * than the target, that SPTE must be cleared and replaced 977 * with a non-leaf SPTE. 978 */ 979 if (is_shadow_present_pte(iter.old_spte) && 980 is_large_pte(iter.old_spte)) { 981 if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter)) 982 break; 983 984 /* 985 * The iter must explicitly re-read the spte here 986 * because the new value informs the !present 987 * path below. 988 */ 989 iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); 990 } 991 992 if (!is_shadow_present_pte(iter.old_spte)) { 993 /* 994 * If SPTE has been frozen by another thread, just 995 * give up and retry, avoiding unnecessary page table 996 * allocation and free. 
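 * A removed SPTE means another thread is in the middle of zapping or
 * replacing this subtree. Breaking out leaves iter.level above
 * fault->goal_level, so the check after the walk returns RET_PF_RETRY and
 * the fault is simply replayed.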
997 */ 998 if (is_removed_spte(iter.old_spte)) 999 break; 1000 1001 sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level - 1); 1002 child_pt = sp->spt; 1003 1004 new_spte = make_nonleaf_spte(child_pt, 1005 !shadow_accessed_mask); 1006 1007 if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, new_spte)) { 1008 tdp_mmu_link_page(vcpu->kvm, sp, 1009 fault->huge_page_disallowed && 1010 fault->req_level >= iter.level); 1011 1012 trace_kvm_mmu_get_page(sp, true); 1013 } else { 1014 tdp_mmu_free_sp(sp); 1015 break; 1016 } 1017 } 1018 } 1019 1020 if (iter.level != fault->goal_level) { 1021 rcu_read_unlock(); 1022 return RET_PF_RETRY; 1023 } 1024 1025 ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter); 1026 rcu_read_unlock(); 1027 1028 return ret; 1029 } 1030 1031 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, 1032 bool flush) 1033 { 1034 struct kvm_mmu_page *root; 1035 1036 for_each_tdp_mmu_root(kvm, root, range->slot->as_id) 1037 flush |= zap_gfn_range(kvm, root, range->start, range->end, 1038 range->may_block, flush, false); 1039 1040 return flush; 1041 } 1042 1043 typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter, 1044 struct kvm_gfn_range *range); 1045 1046 static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm, 1047 struct kvm_gfn_range *range, 1048 tdp_handler_t handler) 1049 { 1050 struct kvm_mmu_page *root; 1051 struct tdp_iter iter; 1052 bool ret = false; 1053 1054 rcu_read_lock(); 1055 1056 /* 1057 * Don't support rescheduling, none of the MMU notifiers that funnel 1058 * into this helper allow blocking; it'd be dead, wasteful code. 1059 */ 1060 for_each_tdp_mmu_root(kvm, root, range->slot->as_id) { 1061 tdp_root_for_each_leaf_pte(iter, root, range->start, range->end) 1062 ret |= handler(kvm, &iter, range); 1063 } 1064 1065 rcu_read_unlock(); 1066 1067 return ret; 1068 } 1069 1070 /* 1071 * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero 1072 * if any of the GFNs in the range have been accessed. 1073 */ 1074 static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter, 1075 struct kvm_gfn_range *range) 1076 { 1077 u64 new_spte = 0; 1078 1079 /* If we have a non-accessed entry we don't need to change the pte. */ 1080 if (!is_accessed_spte(iter->old_spte)) 1081 return false; 1082 1083 new_spte = iter->old_spte; 1084 1085 if (spte_ad_enabled(new_spte)) { 1086 new_spte &= ~shadow_accessed_mask; 1087 } else { 1088 /* 1089 * Capture the dirty status of the page, so that it doesn't get 1090 * lost when the SPTE is marked for access tracking. 1091 */ 1092 if (is_writable_pte(new_spte)) 1093 kvm_set_pfn_dirty(spte_to_pfn(new_spte)); 1094 1095 new_spte = mark_spte_for_access_track(new_spte); 1096 } 1097 1098 tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte); 1099 1100 return true; 1101 } 1102 1103 bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) 1104 { 1105 return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range); 1106 } 1107 1108 static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter, 1109 struct kvm_gfn_range *range) 1110 { 1111 return is_accessed_spte(iter->old_spte); 1112 } 1113 1114 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) 1115 { 1116 return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn); 1117 } 1118 1119 static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter, 1120 struct kvm_gfn_range *range) 1121 { 1122 u64 new_spte; 1123 1124 /* Huge pages aren't expected to be modified without first being zapped. 
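 * The change_pte notifier operates on a single host PTE, so the range is
 * expected to cover exactly one 4K GFN and range->pte should never map a
 * huge page; both assumptions are sanity-checked below.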
*/ 1125 WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end); 1126 1127 if (iter->level != PG_LEVEL_4K || 1128 !is_shadow_present_pte(iter->old_spte)) 1129 return false; 1130 1131 /* 1132 * Note, when changing a read-only SPTE, it's not strictly necessary to 1133 * zero the SPTE before setting the new PFN, but doing so preserves the 1134 * invariant that the PFN of a present * leaf SPTE can never change. 1135 * See __handle_changed_spte(). 1136 */ 1137 tdp_mmu_set_spte(kvm, iter, 0); 1138 1139 if (!pte_write(range->pte)) { 1140 new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte, 1141 pte_pfn(range->pte)); 1142 1143 tdp_mmu_set_spte(kvm, iter, new_spte); 1144 } 1145 1146 return true; 1147 } 1148 1149 /* 1150 * Handle the changed_pte MMU notifier for the TDP MMU. 1151 * data is a pointer to the new pte_t mapping the HVA specified by the MMU 1152 * notifier. 1153 * Returns non-zero if a flush is needed before releasing the MMU lock. 1154 */ 1155 bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) 1156 { 1157 bool flush = kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn); 1158 1159 /* FIXME: return 'flush' instead of flushing here. */ 1160 if (flush) 1161 kvm_flush_remote_tlbs_with_address(kvm, range->start, 1); 1162 1163 return false; 1164 } 1165 1166 /* 1167 * Remove write access from all SPTEs at or above min_level that map GFNs 1168 * [start, end). Returns true if an SPTE has been changed and the TLBs need to 1169 * be flushed. 1170 */ 1171 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, 1172 gfn_t start, gfn_t end, int min_level) 1173 { 1174 struct tdp_iter iter; 1175 u64 new_spte; 1176 bool spte_set = false; 1177 1178 rcu_read_lock(); 1179 1180 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); 1181 1182 for_each_tdp_pte_min_level(iter, root->spt, root->role.level, 1183 min_level, start, end) { 1184 retry: 1185 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) 1186 continue; 1187 1188 if (!is_shadow_present_pte(iter.old_spte) || 1189 !is_last_spte(iter.old_spte, iter.level) || 1190 !(iter.old_spte & PT_WRITABLE_MASK)) 1191 continue; 1192 1193 new_spte = iter.old_spte & ~PT_WRITABLE_MASK; 1194 1195 if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) { 1196 /* 1197 * The iter must explicitly re-read the SPTE because 1198 * the atomic cmpxchg failed. 1199 */ 1200 iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); 1201 goto retry; 1202 } 1203 spte_set = true; 1204 } 1205 1206 rcu_read_unlock(); 1207 return spte_set; 1208 } 1209 1210 /* 1211 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will 1212 * only affect leaf SPTEs down to min_level. 1213 * Returns true if an SPTE has been changed and the TLBs need to be flushed. 1214 */ 1215 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, 1216 const struct kvm_memory_slot *slot, int min_level) 1217 { 1218 struct kvm_mmu_page *root; 1219 bool spte_set = false; 1220 1221 lockdep_assert_held_read(&kvm->mmu_lock); 1222 1223 for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) 1224 spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn, 1225 slot->base_gfn + slot->npages, min_level); 1226 1227 return spte_set; 1228 } 1229 1230 /* 1231 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If 1232 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. 1233 * If AD bits are not enabled, this will require clearing the writable bit on 1234 * each SPTE. 
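 * This runs with the MMU lock held for read (see kvm_tdp_mmu_clear_dirty_slot()
 * below), so SPTEs are updated with tdp_mmu_set_spte_atomic() and the walk may
 * yield via tdp_mmu_iter_cond_resched().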
Returns true if an SPTE has been changed and the TLBs need to 1235 * be flushed. 1236 */ 1237 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, 1238 gfn_t start, gfn_t end) 1239 { 1240 struct tdp_iter iter; 1241 u64 new_spte; 1242 bool spte_set = false; 1243 1244 rcu_read_lock(); 1245 1246 tdp_root_for_each_leaf_pte(iter, root, start, end) { 1247 retry: 1248 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) 1249 continue; 1250 1251 if (spte_ad_need_write_protect(iter.old_spte)) { 1252 if (is_writable_pte(iter.old_spte)) 1253 new_spte = iter.old_spte & ~PT_WRITABLE_MASK; 1254 else 1255 continue; 1256 } else { 1257 if (iter.old_spte & shadow_dirty_mask) 1258 new_spte = iter.old_spte & ~shadow_dirty_mask; 1259 else 1260 continue; 1261 } 1262 1263 if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) { 1264 /* 1265 * The iter must explicitly re-read the SPTE because 1266 * the atomic cmpxchg failed. 1267 */ 1268 iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); 1269 goto retry; 1270 } 1271 spte_set = true; 1272 } 1273 1274 rcu_read_unlock(); 1275 return spte_set; 1276 } 1277 1278 /* 1279 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If 1280 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. 1281 * If AD bits are not enabled, this will require clearing the writable bit on 1282 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to 1283 * be flushed. 1284 */ 1285 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, 1286 const struct kvm_memory_slot *slot) 1287 { 1288 struct kvm_mmu_page *root; 1289 bool spte_set = false; 1290 1291 lockdep_assert_held_read(&kvm->mmu_lock); 1292 1293 for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) 1294 spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn, 1295 slot->base_gfn + slot->npages); 1296 1297 return spte_set; 1298 } 1299 1300 /* 1301 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is 1302 * set in mask, starting at gfn. The given memslot is expected to contain all 1303 * the GFNs represented by set bits in the mask. If AD bits are enabled, 1304 * clearing the dirty status will involve clearing the dirty bit on each SPTE 1305 * or, if AD bits are not enabled, clearing the writable bit on each SPTE. 1306 */ 1307 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, 1308 gfn_t gfn, unsigned long mask, bool wrprot) 1309 { 1310 struct tdp_iter iter; 1311 u64 new_spte; 1312 1313 rcu_read_lock(); 1314 1315 tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask), 1316 gfn + BITS_PER_LONG) { 1317 if (!mask) 1318 break; 1319 1320 if (iter.level > PG_LEVEL_4K || 1321 !(mask & (1UL << (iter.gfn - gfn)))) 1322 continue; 1323 1324 mask &= ~(1UL << (iter.gfn - gfn)); 1325 1326 if (wrprot || spte_ad_need_write_protect(iter.old_spte)) { 1327 if (is_writable_pte(iter.old_spte)) 1328 new_spte = iter.old_spte & ~PT_WRITABLE_MASK; 1329 else 1330 continue; 1331 } else { 1332 if (iter.old_spte & shadow_dirty_mask) 1333 new_spte = iter.old_spte & ~shadow_dirty_mask; 1334 else 1335 continue; 1336 } 1337 1338 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); 1339 } 1340 1341 rcu_read_unlock(); 1342 } 1343 1344 /* 1345 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is 1346 * set in mask, starting at gfn. The given memslot is expected to contain all 1347 * the GFNs represented by set bits in the mask. 
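 * The MMU lock must be held for write; each root in the slot's address space
 * is handed to clear_dirty_pt_masked() above.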
If AD bits are enabled, 1348 * clearing the dirty status will involve clearing the dirty bit on each SPTE 1349 * or, if AD bits are not enabled, clearing the writable bit on each SPTE. 1350 */ 1351 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, 1352 struct kvm_memory_slot *slot, 1353 gfn_t gfn, unsigned long mask, 1354 bool wrprot) 1355 { 1356 struct kvm_mmu_page *root; 1357 1358 lockdep_assert_held_write(&kvm->mmu_lock); 1359 for_each_tdp_mmu_root(kvm, root, slot->as_id) 1360 clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot); 1361 } 1362 1363 /* 1364 * Clear leaf entries which could be replaced by large mappings, for 1365 * GFNs within the slot. 1366 */ 1367 static bool zap_collapsible_spte_range(struct kvm *kvm, 1368 struct kvm_mmu_page *root, 1369 const struct kvm_memory_slot *slot, 1370 bool flush) 1371 { 1372 gfn_t start = slot->base_gfn; 1373 gfn_t end = start + slot->npages; 1374 struct tdp_iter iter; 1375 kvm_pfn_t pfn; 1376 1377 rcu_read_lock(); 1378 1379 tdp_root_for_each_pte(iter, root, start, end) { 1380 retry: 1381 if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) { 1382 flush = false; 1383 continue; 1384 } 1385 1386 if (!is_shadow_present_pte(iter.old_spte) || 1387 !is_last_spte(iter.old_spte, iter.level)) 1388 continue; 1389 1390 pfn = spte_to_pfn(iter.old_spte); 1391 if (kvm_is_reserved_pfn(pfn) || 1392 iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn, 1393 pfn, PG_LEVEL_NUM)) 1394 continue; 1395 1396 if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) { 1397 /* 1398 * The iter must explicitly re-read the SPTE because 1399 * the atomic cmpxchg failed. 1400 */ 1401 iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); 1402 goto retry; 1403 } 1404 flush = true; 1405 } 1406 1407 rcu_read_unlock(); 1408 1409 return flush; 1410 } 1411 1412 /* 1413 * Clear non-leaf entries (and free associated page tables) which could 1414 * be replaced by large mappings, for GFNs within the slot. 1415 */ 1416 bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, 1417 const struct kvm_memory_slot *slot, 1418 bool flush) 1419 { 1420 struct kvm_mmu_page *root; 1421 1422 lockdep_assert_held_read(&kvm->mmu_lock); 1423 1424 for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) 1425 flush = zap_collapsible_spte_range(kvm, root, slot, flush); 1426 1427 return flush; 1428 } 1429 1430 /* 1431 * Removes write access on the last level SPTE mapping this GFN and unsets the 1432 * MMU-writable bit to ensure future writes continue to be intercepted. 1433 * Returns true if an SPTE was set and a TLB flush is needed. 1434 */ 1435 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, 1436 gfn_t gfn, int min_level) 1437 { 1438 struct tdp_iter iter; 1439 u64 new_spte; 1440 bool spte_set = false; 1441 1442 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); 1443 1444 rcu_read_lock(); 1445 1446 for_each_tdp_pte_min_level(iter, root->spt, root->role.level, 1447 min_level, gfn, gfn + 1) { 1448 if (!is_shadow_present_pte(iter.old_spte) || 1449 !is_last_spte(iter.old_spte, iter.level)) 1450 continue; 1451 1452 if (!is_writable_pte(iter.old_spte)) 1453 break; 1454 1455 new_spte = iter.old_spte & 1456 ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask); 1457 1458 tdp_mmu_set_spte(kvm, &iter, new_spte); 1459 spte_set = true; 1460 } 1461 1462 rcu_read_unlock(); 1463 1464 return spte_set; 1465 } 1466 1467 /* 1468 * Removes write access on the last level SPTE mapping this GFN and unsets the 1469 * MMU-writable bit to ensure future writes continue to be intercepted. 
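 * Clearing shadow_mmu_writable_mask in addition to PT_WRITABLE_MASK ensures
 * the fast page fault path cannot locklessly restore write access.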
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
				   struct kvm_memory_slot *slot, gfn_t gfn,
				   int min_level)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root, slot->as_id)
		spte_set |= write_protect_gfn(kvm, root, gfn, min_level);

	return spte_set;
}

/*
 * Return the level of the lowest level SPTE added to sptes.
 * That SPTE may be non-present.
 *
 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
 */
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
			 int *root_level)
{
	struct tdp_iter iter;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	gfn_t gfn = addr >> PAGE_SHIFT;
	int leaf = -1;

	*root_level = vcpu->arch.mmu->shadow_root_level;

	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		leaf = iter.level;
		sptes[leaf] = iter.old_spte;
	}

	return leaf;
}

/*
 * Returns the last level spte pointer of the shadow page walk for the given
 * gpa, and sets *spte to the spte value. This spte may be non-present. If no
 * walk could be performed, returns NULL and *spte does not contain valid data.
 *
 * Contract:
 *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
 *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
 *
 * WARNING: This function is only intended to be called during fast_page_fault.
 */
u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
					u64 *spte)
{
	struct tdp_iter iter;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	gfn_t gfn = addr >> PAGE_SHIFT;
	tdp_ptep_t sptep = NULL;

	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		*spte = iter.old_spte;
		sptep = iter.sptep;
	}

	/*
	 * Perform the rcu_dereference to get the raw spte pointer value since
	 * we are passing it up to fast_page_fault, which is shared with the
	 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
	 * annotation.
	 *
	 * This is safe since fast_page_fault obeys the contracts of this
	 * function as well as all TDP MMU contracts around modifying SPTEs
	 * outside of mmu_lock.
	 */
	return rcu_dereference(sptep);
}
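
/*
 * Usage example (illustrative only): callers of the lockless walkers above
 * are expected to bracket the walk with kvm_tdp_mmu_walk_lockless_begin()
 * and kvm_tdp_mmu_walk_lockless_end(), e.g.:
 *
 *	u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
 *	int root_level, leaf;
 *
 *	kvm_tdp_mmu_walk_lockless_begin();
 *	leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root_level);
 *	kvm_tdp_mmu_walk_lockless_end();
 *
 * The returned SPTE values are only a snapshot and may already be stale when
 * inspected, which is why fast_page_fault() re-verifies the SPTE with a
 * cmpxchg before acting on it.
 */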