1 // SPDX-License-Identifier: GPL-2.0 2 3 #include "mmu.h" 4 #include "mmu_internal.h" 5 #include "mmutrace.h" 6 #include "tdp_iter.h" 7 #include "tdp_mmu.h" 8 #include "spte.h" 9 10 #include <asm/cmpxchg.h> 11 #include <trace/events/kvm.h> 12 13 static bool __read_mostly tdp_mmu_enabled = false; 14 module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644); 15 16 /* Initializes the TDP MMU for the VM, if enabled. */ 17 bool kvm_mmu_init_tdp_mmu(struct kvm *kvm) 18 { 19 if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled)) 20 return false; 21 22 /* This should not be changed for the lifetime of the VM. */ 23 kvm->arch.tdp_mmu_enabled = true; 24 25 INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots); 26 spin_lock_init(&kvm->arch.tdp_mmu_pages_lock); 27 INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages); 28 29 return true; 30 } 31 32 static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm, 33 bool shared) 34 { 35 if (shared) 36 lockdep_assert_held_read(&kvm->mmu_lock); 37 else 38 lockdep_assert_held_write(&kvm->mmu_lock); 39 } 40 41 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) 42 { 43 if (!kvm->arch.tdp_mmu_enabled) 44 return; 45 46 WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages)); 47 WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); 48 49 /* 50 * Ensure that all the outstanding RCU callbacks to free shadow pages 51 * can run before the VM is torn down. 52 */ 53 rcu_barrier(); 54 } 55 56 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, 57 gfn_t start, gfn_t end, bool can_yield, bool flush, 58 bool shared); 59 60 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp) 61 { 62 free_page((unsigned long)sp->spt); 63 kmem_cache_free(mmu_page_header_cache, sp); 64 } 65 66 /* 67 * This is called through call_rcu in order to free TDP page table memory 68 * safely with respect to other kernel threads that may be operating on 69 * the memory. 70 * By only accessing TDP MMU page table memory in an RCU read critical 71 * section, and freeing it after a grace period, lockless access to that 72 * memory won't use it after it is freed. 73 */ 74 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head) 75 { 76 struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page, 77 rcu_head); 78 79 tdp_mmu_free_sp(sp); 80 } 81 82 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, 83 bool shared) 84 { 85 kvm_lockdep_assert_mmu_lock_held(kvm, shared); 86 87 if (!refcount_dec_and_test(&root->tdp_mmu_root_count)) 88 return; 89 90 WARN_ON(!root->tdp_mmu_page); 91 92 spin_lock(&kvm->arch.tdp_mmu_pages_lock); 93 list_del_rcu(&root->link); 94 spin_unlock(&kvm->arch.tdp_mmu_pages_lock); 95 96 zap_gfn_range(kvm, root, 0, -1ull, false, false, shared); 97 98 call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback); 99 } 100 101 /* 102 * Finds the next valid root after root (or the first valid root if root 103 * is NULL), takes a reference on it, and returns that next root. If root 104 * is not NULL, this thread should have already taken a reference on it, and 105 * that reference will be dropped. If no valid root is found, this 106 * function will return NULL. 107 */ 108 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, 109 struct kvm_mmu_page *prev_root, 110 bool shared) 111 { 112 struct kvm_mmu_page *next_root; 113 114 rcu_read_lock(); 115 116 if (prev_root) 117 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, 118 &prev_root->link, 119 typeof(*prev_root), link); 120 else 121 next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots, 122 typeof(*next_root), link); 123 124 while (next_root && !kvm_tdp_mmu_get_root(kvm, next_root)) 125 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, 126 &next_root->link, typeof(*next_root), link); 127 128 rcu_read_unlock(); 129 130 if (prev_root) 131 kvm_tdp_mmu_put_root(kvm, prev_root, shared); 132 133 return next_root; 134 } 135 136 /* 137 * Note: this iterator gets and puts references to the roots it iterates over. 138 * This makes it safe to release the MMU lock and yield within the loop, but 139 * if exiting the loop early, the caller must drop the reference to the most 140 * recent root. (Unless keeping a live reference is desirable.) 141 * 142 * If shared is set, this function is operating under the MMU lock in read 143 * mode. In the unlikely event that this thread must free a root, the lock 144 * will be temporarily dropped and reacquired in write mode. 145 */ 146 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \ 147 for (_root = tdp_mmu_next_root(_kvm, NULL, _shared); \ 148 _root; \ 149 _root = tdp_mmu_next_root(_kvm, _root, _shared)) \ 150 if (kvm_mmu_page_as_id(_root) != _as_id) { \ 151 } else 152 153 #define for_each_tdp_mmu_root(_kvm, _root, _as_id) \ 154 list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link, \ 155 lockdep_is_held_type(&kvm->mmu_lock, 0) || \ 156 lockdep_is_held(&kvm->arch.tdp_mmu_pages_lock)) \ 157 if (kvm_mmu_page_as_id(_root) != _as_id) { \ 158 } else 159 160 static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu, 161 int level) 162 { 163 union kvm_mmu_page_role role; 164 165 role = vcpu->arch.mmu->mmu_role.base; 166 role.level = level; 167 role.direct = true; 168 role.gpte_is_8_bytes = true; 169 role.access = ACC_ALL; 170 171 return role; 172 } 173 174 static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn, 175 int level) 176 { 177 struct kvm_mmu_page *sp; 178 179 sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); 180 sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache); 181 set_page_private(virt_to_page(sp->spt), (unsigned long)sp); 182 183 sp->role.word = page_role_for_level(vcpu, level).word; 184 sp->gfn = gfn; 185 sp->tdp_mmu_page = true; 186 187 trace_kvm_mmu_get_page(sp, true); 188 189 return sp; 190 } 191 192 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) 193 { 194 union kvm_mmu_page_role role; 195 struct kvm *kvm = vcpu->kvm; 196 struct kvm_mmu_page *root; 197 198 lockdep_assert_held_write(&kvm->mmu_lock); 199 200 role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level); 201 202 /* Check for an existing root before allocating a new one. */ 203 for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) { 204 if (root->role.word == role.word && 205 kvm_tdp_mmu_get_root(kvm, root)) 206 goto out; 207 } 208 209 root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level); 210 refcount_set(&root->tdp_mmu_root_count, 1); 211 212 spin_lock(&kvm->arch.tdp_mmu_pages_lock); 213 list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots); 214 spin_unlock(&kvm->arch.tdp_mmu_pages_lock); 215 216 out: 217 return __pa(root->spt); 218 } 219 220 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, 221 u64 old_spte, u64 new_spte, int level, 222 bool shared); 223 224 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level) 225 { 226 if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level)) 227 return; 228 229 if (is_accessed_spte(old_spte) && 230 (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) || 231 spte_to_pfn(old_spte) != spte_to_pfn(new_spte))) 232 kvm_set_pfn_accessed(spte_to_pfn(old_spte)); 233 } 234 235 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, 236 u64 old_spte, u64 new_spte, int level) 237 { 238 bool pfn_changed; 239 struct kvm_memory_slot *slot; 240 241 if (level > PG_LEVEL_4K) 242 return; 243 244 pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); 245 246 if ((!is_writable_pte(old_spte) || pfn_changed) && 247 is_writable_pte(new_spte)) { 248 slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn); 249 mark_page_dirty_in_slot(kvm, slot, gfn); 250 } 251 } 252 253 /** 254 * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU 255 * 256 * @kvm: kvm instance 257 * @sp: the new page 258 * @shared: This operation may not be running under the exclusive use of 259 * the MMU lock and the operation must synchronize with other 260 * threads that might be adding or removing pages. 261 * @account_nx: This page replaces a NX large page and should be marked for 262 * eventual reclaim. 263 */ 264 static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp, 265 bool shared, bool account_nx) 266 { 267 if (shared) 268 spin_lock(&kvm->arch.tdp_mmu_pages_lock); 269 else 270 lockdep_assert_held_write(&kvm->mmu_lock); 271 272 list_add(&sp->link, &kvm->arch.tdp_mmu_pages); 273 if (account_nx) 274 account_huge_nx_page(kvm, sp); 275 276 if (shared) 277 spin_unlock(&kvm->arch.tdp_mmu_pages_lock); 278 } 279 280 /** 281 * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU 282 * 283 * @kvm: kvm instance 284 * @sp: the page to be removed 285 * @shared: This operation may not be running under the exclusive use of 286 * the MMU lock and the operation must synchronize with other 287 * threads that might be adding or removing pages. 288 */ 289 static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp, 290 bool shared) 291 { 292 if (shared) 293 spin_lock(&kvm->arch.tdp_mmu_pages_lock); 294 else 295 lockdep_assert_held_write(&kvm->mmu_lock); 296 297 list_del(&sp->link); 298 if (sp->lpage_disallowed) 299 unaccount_huge_nx_page(kvm, sp); 300 301 if (shared) 302 spin_unlock(&kvm->arch.tdp_mmu_pages_lock); 303 } 304 305 /** 306 * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure 307 * 308 * @kvm: kvm instance 309 * @pt: the page removed from the paging structure 310 * @shared: This operation may not be running under the exclusive use 311 * of the MMU lock and the operation must synchronize with other 312 * threads that might be modifying SPTEs. 313 * 314 * Given a page table that has been removed from the TDP paging structure, 315 * iterates through the page table to clear SPTEs and free child page tables. 316 * 317 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU 318 * protection. Since this thread removed it from the paging structure, 319 * this thread will be responsible for ensuring the page is freed. Hence the 320 * early rcu_dereferences in the function. 321 */ 322 static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt, 323 bool shared) 324 { 325 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt)); 326 int level = sp->role.level; 327 gfn_t base_gfn = sp->gfn; 328 u64 old_child_spte; 329 u64 *sptep; 330 gfn_t gfn; 331 int i; 332 333 trace_kvm_mmu_prepare_zap_page(sp); 334 335 tdp_mmu_unlink_page(kvm, sp, shared); 336 337 for (i = 0; i < PT64_ENT_PER_PAGE; i++) { 338 sptep = rcu_dereference(pt) + i; 339 gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level); 340 341 if (shared) { 342 /* 343 * Set the SPTE to a nonpresent value that other 344 * threads will not overwrite. If the SPTE was 345 * already marked as removed then another thread 346 * handling a page fault could overwrite it, so 347 * set the SPTE until it is set from some other 348 * value to the removed SPTE value. 349 */ 350 for (;;) { 351 old_child_spte = xchg(sptep, REMOVED_SPTE); 352 if (!is_removed_spte(old_child_spte)) 353 break; 354 cpu_relax(); 355 } 356 } else { 357 /* 358 * If the SPTE is not MMU-present, there is no backing 359 * page associated with the SPTE and so no side effects 360 * that need to be recorded, and exclusive ownership of 361 * mmu_lock ensures the SPTE can't be made present. 362 * Note, zapping MMIO SPTEs is also unnecessary as they 363 * are guarded by the memslots generation, not by being 364 * unreachable. 365 */ 366 old_child_spte = READ_ONCE(*sptep); 367 if (!is_shadow_present_pte(old_child_spte)) 368 continue; 369 370 /* 371 * Marking the SPTE as a removed SPTE is not 372 * strictly necessary here as the MMU lock will 373 * stop other threads from concurrently modifying 374 * this SPTE. Using the removed SPTE value keeps 375 * the two branches consistent and simplifies 376 * the function. 377 */ 378 WRITE_ONCE(*sptep, REMOVED_SPTE); 379 } 380 handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn, 381 old_child_spte, REMOVED_SPTE, level, 382 shared); 383 } 384 385 kvm_flush_remote_tlbs_with_address(kvm, gfn, 386 KVM_PAGES_PER_HPAGE(level + 1)); 387 388 call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback); 389 } 390 391 /** 392 * __handle_changed_spte - handle bookkeeping associated with an SPTE change 393 * @kvm: kvm instance 394 * @as_id: the address space of the paging structure the SPTE was a part of 395 * @gfn: the base GFN that was mapped by the SPTE 396 * @old_spte: The value of the SPTE before the change 397 * @new_spte: The value of the SPTE after the change 398 * @level: the level of the PT the SPTE is part of in the paging structure 399 * @shared: This operation may not be running under the exclusive use of 400 * the MMU lock and the operation must synchronize with other 401 * threads that might be modifying SPTEs. 402 * 403 * Handle bookkeeping that might result from the modification of a SPTE. 404 * This function must be called for all TDP SPTE modifications. 405 */ 406 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, 407 u64 old_spte, u64 new_spte, int level, 408 bool shared) 409 { 410 bool was_present = is_shadow_present_pte(old_spte); 411 bool is_present = is_shadow_present_pte(new_spte); 412 bool was_leaf = was_present && is_last_spte(old_spte, level); 413 bool is_leaf = is_present && is_last_spte(new_spte, level); 414 bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); 415 416 WARN_ON(level > PT64_ROOT_MAX_LEVEL); 417 WARN_ON(level < PG_LEVEL_4K); 418 WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1)); 419 420 /* 421 * If this warning were to trigger it would indicate that there was a 422 * missing MMU notifier or a race with some notifier handler. 423 * A present, leaf SPTE should never be directly replaced with another 424 * present leaf SPTE pointing to a different PFN. A notifier handler 425 * should be zapping the SPTE before the main MM's page table is 426 * changed, or the SPTE should be zeroed, and the TLBs flushed by the 427 * thread before replacement. 428 */ 429 if (was_leaf && is_leaf && pfn_changed) { 430 pr_err("Invalid SPTE change: cannot replace a present leaf\n" 431 "SPTE with another present leaf SPTE mapping a\n" 432 "different PFN!\n" 433 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d", 434 as_id, gfn, old_spte, new_spte, level); 435 436 /* 437 * Crash the host to prevent error propagation and guest data 438 * corruption. 439 */ 440 BUG(); 441 } 442 443 if (old_spte == new_spte) 444 return; 445 446 trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte); 447 448 if (is_large_pte(old_spte) != is_large_pte(new_spte)) { 449 if (is_large_pte(old_spte)) 450 atomic64_sub(1, (atomic64_t*)&kvm->stat.lpages); 451 else 452 atomic64_add(1, (atomic64_t*)&kvm->stat.lpages); 453 } 454 455 /* 456 * The only times a SPTE should be changed from a non-present to 457 * non-present state is when an MMIO entry is installed/modified/ 458 * removed. In that case, there is nothing to do here. 459 */ 460 if (!was_present && !is_present) { 461 /* 462 * If this change does not involve a MMIO SPTE or removed SPTE, 463 * it is unexpected. Log the change, though it should not 464 * impact the guest since both the former and current SPTEs 465 * are nonpresent. 466 */ 467 if (WARN_ON(!is_mmio_spte(old_spte) && 468 !is_mmio_spte(new_spte) && 469 !is_removed_spte(new_spte))) 470 pr_err("Unexpected SPTE change! Nonpresent SPTEs\n" 471 "should not be replaced with another,\n" 472 "different nonpresent SPTE, unless one or both\n" 473 "are MMIO SPTEs, or the new SPTE is\n" 474 "a temporary removed SPTE.\n" 475 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d", 476 as_id, gfn, old_spte, new_spte, level); 477 return; 478 } 479 480 481 if (was_leaf && is_dirty_spte(old_spte) && 482 (!is_present || !is_dirty_spte(new_spte) || pfn_changed)) 483 kvm_set_pfn_dirty(spte_to_pfn(old_spte)); 484 485 /* 486 * Recursively handle child PTs if the change removed a subtree from 487 * the paging structure. 488 */ 489 if (was_present && !was_leaf && (pfn_changed || !is_present)) 490 handle_removed_tdp_mmu_page(kvm, 491 spte_to_child_pt(old_spte, level), shared); 492 } 493 494 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, 495 u64 old_spte, u64 new_spte, int level, 496 bool shared) 497 { 498 __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, 499 shared); 500 handle_changed_spte_acc_track(old_spte, new_spte, level); 501 handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte, 502 new_spte, level); 503 } 504 505 /* 506 * tdp_mmu_set_spte_atomic_no_dirty_log - Set a TDP MMU SPTE atomically 507 * and handle the associated bookkeeping, but do not mark the page dirty 508 * in KVM's dirty bitmaps. 509 * 510 * @kvm: kvm instance 511 * @iter: a tdp_iter instance currently on the SPTE that should be set 512 * @new_spte: The value the SPTE should be set to 513 * Returns: true if the SPTE was set, false if it was not. If false is returned, 514 * this function will have no side-effects. 515 */ 516 static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm, 517 struct tdp_iter *iter, 518 u64 new_spte) 519 { 520 lockdep_assert_held_read(&kvm->mmu_lock); 521 522 /* 523 * Do not change removed SPTEs. Only the thread that froze the SPTE 524 * may modify it. 525 */ 526 if (is_removed_spte(iter->old_spte)) 527 return false; 528 529 if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte, 530 new_spte) != iter->old_spte) 531 return false; 532 533 __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, 534 new_spte, iter->level, true); 535 handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level); 536 537 return true; 538 } 539 540 static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm, 541 struct tdp_iter *iter, 542 u64 new_spte) 543 { 544 if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, new_spte)) 545 return false; 546 547 handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn, 548 iter->old_spte, new_spte, iter->level); 549 return true; 550 } 551 552 static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm, 553 struct tdp_iter *iter) 554 { 555 /* 556 * Freeze the SPTE by setting it to a special, 557 * non-present value. This will stop other threads from 558 * immediately installing a present entry in its place 559 * before the TLBs are flushed. 560 */ 561 if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE)) 562 return false; 563 564 kvm_flush_remote_tlbs_with_address(kvm, iter->gfn, 565 KVM_PAGES_PER_HPAGE(iter->level)); 566 567 /* 568 * No other thread can overwrite the removed SPTE as they 569 * must either wait on the MMU lock or use 570 * tdp_mmu_set_spte_atomic which will not overwrite the 571 * special removed SPTE value. No bookkeeping is needed 572 * here since the SPTE is going from non-present 573 * to non-present. 574 */ 575 WRITE_ONCE(*rcu_dereference(iter->sptep), 0); 576 577 return true; 578 } 579 580 581 /* 582 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping 583 * @kvm: kvm instance 584 * @iter: a tdp_iter instance currently on the SPTE that should be set 585 * @new_spte: The value the SPTE should be set to 586 * @record_acc_track: Notify the MM subsystem of changes to the accessed state 587 * of the page. Should be set unless handling an MMU 588 * notifier for access tracking. Leaving record_acc_track 589 * unset in that case prevents page accesses from being 590 * double counted. 591 * @record_dirty_log: Record the page as dirty in the dirty bitmap if 592 * appropriate for the change being made. Should be set 593 * unless performing certain dirty logging operations. 594 * Leaving record_dirty_log unset in that case prevents page 595 * writes from being double counted. 596 */ 597 static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, 598 u64 new_spte, bool record_acc_track, 599 bool record_dirty_log) 600 { 601 lockdep_assert_held_write(&kvm->mmu_lock); 602 603 /* 604 * No thread should be using this function to set SPTEs to the 605 * temporary removed SPTE value. 606 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic 607 * should be used. If operating under the MMU lock in write mode, the 608 * use of the removed SPTE should not be necessary. 609 */ 610 WARN_ON(is_removed_spte(iter->old_spte)); 611 612 WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte); 613 614 __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, 615 new_spte, iter->level, false); 616 if (record_acc_track) 617 handle_changed_spte_acc_track(iter->old_spte, new_spte, 618 iter->level); 619 if (record_dirty_log) 620 handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn, 621 iter->old_spte, new_spte, 622 iter->level); 623 } 624 625 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, 626 u64 new_spte) 627 { 628 __tdp_mmu_set_spte(kvm, iter, new_spte, true, true); 629 } 630 631 static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm, 632 struct tdp_iter *iter, 633 u64 new_spte) 634 { 635 __tdp_mmu_set_spte(kvm, iter, new_spte, false, true); 636 } 637 638 static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm, 639 struct tdp_iter *iter, 640 u64 new_spte) 641 { 642 __tdp_mmu_set_spte(kvm, iter, new_spte, true, false); 643 } 644 645 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \ 646 for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end) 647 648 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \ 649 tdp_root_for_each_pte(_iter, _root, _start, _end) \ 650 if (!is_shadow_present_pte(_iter.old_spte) || \ 651 !is_last_spte(_iter.old_spte, _iter.level)) \ 652 continue; \ 653 else 654 655 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \ 656 for_each_tdp_pte(_iter, __va(_mmu->root_hpa), \ 657 _mmu->shadow_root_level, _start, _end) 658 659 /* 660 * Yield if the MMU lock is contended or this thread needs to return control 661 * to the scheduler. 662 * 663 * If this function should yield and flush is set, it will perform a remote 664 * TLB flush before yielding. 665 * 666 * If this function yields, it will also reset the tdp_iter's walk over the 667 * paging structure and the calling function should skip to the next 668 * iteration to allow the iterator to continue its traversal from the 669 * paging structure root. 670 * 671 * Return true if this function yielded and the iterator's traversal was reset. 672 * Return false if a yield was not needed. 673 */ 674 static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm, 675 struct tdp_iter *iter, bool flush, 676 bool shared) 677 { 678 /* Ensure forward progress has been made before yielding. */ 679 if (iter->next_last_level_gfn == iter->yielded_gfn) 680 return false; 681 682 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { 683 rcu_read_unlock(); 684 685 if (flush) 686 kvm_flush_remote_tlbs(kvm); 687 688 if (shared) 689 cond_resched_rwlock_read(&kvm->mmu_lock); 690 else 691 cond_resched_rwlock_write(&kvm->mmu_lock); 692 693 rcu_read_lock(); 694 695 WARN_ON(iter->gfn > iter->next_last_level_gfn); 696 697 tdp_iter_restart(iter); 698 699 return true; 700 } 701 702 return false; 703 } 704 705 /* 706 * Tears down the mappings for the range of gfns, [start, end), and frees the 707 * non-root pages mapping GFNs strictly within that range. Returns true if 708 * SPTEs have been cleared and a TLB flush is needed before releasing the 709 * MMU lock. 710 * 711 * If can_yield is true, will release the MMU lock and reschedule if the 712 * scheduler needs the CPU or there is contention on the MMU lock. If this 713 * function cannot yield, it will not release the MMU lock or reschedule and 714 * the caller must ensure it does not supply too large a GFN range, or the 715 * operation can cause a soft lockup. 716 * 717 * If shared is true, this thread holds the MMU lock in read mode and must 718 * account for the possibility that other threads are modifying the paging 719 * structures concurrently. If shared is false, this thread should hold the 720 * MMU lock in write mode. 721 */ 722 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, 723 gfn_t start, gfn_t end, bool can_yield, bool flush, 724 bool shared) 725 { 726 gfn_t max_gfn_host = 1ULL << (shadow_phys_bits - PAGE_SHIFT); 727 bool zap_all = (start == 0 && end >= max_gfn_host); 728 struct tdp_iter iter; 729 730 /* 731 * No need to try to step down in the iterator when zapping all SPTEs, 732 * zapping the top-level non-leaf SPTEs will recurse on their children. 733 */ 734 int min_level = zap_all ? root->role.level : PG_LEVEL_4K; 735 736 /* 737 * Bound the walk at host.MAXPHYADDR, guest accesses beyond that will 738 * hit a #PF(RSVD) and never get to an EPT Violation/Misconfig / #NPF, 739 * and so KVM will never install a SPTE for such addresses. 740 */ 741 end = min(end, max_gfn_host); 742 743 kvm_lockdep_assert_mmu_lock_held(kvm, shared); 744 745 rcu_read_lock(); 746 747 for_each_tdp_pte_min_level(iter, root->spt, root->role.level, 748 min_level, start, end) { 749 retry: 750 if (can_yield && 751 tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) { 752 flush = false; 753 continue; 754 } 755 756 if (!is_shadow_present_pte(iter.old_spte)) 757 continue; 758 759 /* 760 * If this is a non-last-level SPTE that covers a larger range 761 * than should be zapped, continue, and zap the mappings at a 762 * lower level, except when zapping all SPTEs. 763 */ 764 if (!zap_all && 765 (iter.gfn < start || 766 iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) && 767 !is_last_spte(iter.old_spte, iter.level)) 768 continue; 769 770 if (!shared) { 771 tdp_mmu_set_spte(kvm, &iter, 0); 772 flush = true; 773 } else if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) { 774 /* 775 * The iter must explicitly re-read the SPTE because 776 * the atomic cmpxchg failed. 777 */ 778 iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); 779 goto retry; 780 } 781 } 782 783 rcu_read_unlock(); 784 return flush; 785 } 786 787 /* 788 * Tears down the mappings for the range of gfns, [start, end), and frees the 789 * non-root pages mapping GFNs strictly within that range. Returns true if 790 * SPTEs have been cleared and a TLB flush is needed before releasing the 791 * MMU lock. 792 * 793 * If shared is true, this thread holds the MMU lock in read mode and must 794 * account for the possibility that other threads are modifying the paging 795 * structures concurrently. If shared is false, this thread should hold the 796 * MMU in write mode. 797 */ 798 bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start, 799 gfn_t end, bool can_yield, bool flush, 800 bool shared) 801 { 802 struct kvm_mmu_page *root; 803 804 for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, shared) 805 flush = zap_gfn_range(kvm, root, start, end, can_yield, flush, 806 shared); 807 808 return flush; 809 } 810 811 void kvm_tdp_mmu_zap_all(struct kvm *kvm) 812 { 813 bool flush = false; 814 int i; 815 816 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) 817 flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull, 818 flush, false); 819 820 if (flush) 821 kvm_flush_remote_tlbs(kvm); 822 } 823 824 static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm, 825 struct kvm_mmu_page *prev_root) 826 { 827 struct kvm_mmu_page *next_root; 828 829 if (prev_root) 830 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, 831 &prev_root->link, 832 typeof(*prev_root), link); 833 else 834 next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots, 835 typeof(*next_root), link); 836 837 while (next_root && !(next_root->role.invalid && 838 refcount_read(&next_root->tdp_mmu_root_count))) 839 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, 840 &next_root->link, 841 typeof(*next_root), link); 842 843 return next_root; 844 } 845 846 /* 847 * Since kvm_tdp_mmu_zap_all_fast has acquired a reference to each 848 * invalidated root, they will not be freed until this function drops the 849 * reference. Before dropping that reference, tear down the paging 850 * structure so that whichever thread does drop the last reference 851 * only has to do a trivial amount of work. Since the roots are invalid, 852 * no new SPTEs should be created under them. 853 */ 854 void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) 855 { 856 struct kvm_mmu_page *next_root; 857 struct kvm_mmu_page *root; 858 bool flush = false; 859 860 lockdep_assert_held_read(&kvm->mmu_lock); 861 862 rcu_read_lock(); 863 864 root = next_invalidated_root(kvm, NULL); 865 866 while (root) { 867 next_root = next_invalidated_root(kvm, root); 868 869 rcu_read_unlock(); 870 871 flush = zap_gfn_range(kvm, root, 0, -1ull, true, flush, true); 872 873 /* 874 * Put the reference acquired in 875 * kvm_tdp_mmu_invalidate_roots 876 */ 877 kvm_tdp_mmu_put_root(kvm, root, true); 878 879 root = next_root; 880 881 rcu_read_lock(); 882 } 883 884 rcu_read_unlock(); 885 886 if (flush) 887 kvm_flush_remote_tlbs(kvm); 888 } 889 890 /* 891 * Mark each TDP MMU root as invalid so that other threads 892 * will drop their references and allow the root count to 893 * go to 0. 894 * 895 * Also take a reference on all roots so that this thread 896 * can do the bulk of the work required to free the roots 897 * once they are invalidated. Without this reference, a 898 * vCPU thread might drop the last reference to a root and 899 * get stuck with tearing down the entire paging structure. 900 * 901 * Roots which have a zero refcount should be skipped as 902 * they're already being torn down. 903 * Already invalid roots should be referenced again so that 904 * they aren't freed before kvm_tdp_mmu_zap_all_fast is 905 * done with them. 906 * 907 * This has essentially the same effect for the TDP MMU 908 * as updating mmu_valid_gen does for the shadow MMU. 909 */ 910 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm) 911 { 912 struct kvm_mmu_page *root; 913 914 lockdep_assert_held_write(&kvm->mmu_lock); 915 list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) 916 if (refcount_inc_not_zero(&root->tdp_mmu_root_count)) 917 root->role.invalid = true; 918 } 919 920 /* 921 * Installs a last-level SPTE to handle a TDP page fault. 922 * (NPT/EPT violation/misconfiguration) 923 */ 924 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, 925 int map_writable, 926 struct tdp_iter *iter, 927 kvm_pfn_t pfn, bool prefault) 928 { 929 u64 new_spte; 930 int ret = RET_PF_FIXED; 931 int make_spte_ret = 0; 932 933 if (unlikely(is_noslot_pfn(pfn))) 934 new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); 935 else 936 make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn, 937 pfn, iter->old_spte, prefault, true, 938 map_writable, !shadow_accessed_mask, 939 &new_spte); 940 941 if (new_spte == iter->old_spte) 942 ret = RET_PF_SPURIOUS; 943 else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte)) 944 return RET_PF_RETRY; 945 946 /* 947 * If the page fault was caused by a write but the page is write 948 * protected, emulation is needed. If the emulation was skipped, 949 * the vCPU would have the same fault again. 950 */ 951 if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) { 952 if (write) 953 ret = RET_PF_EMULATE; 954 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); 955 } 956 957 /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */ 958 if (unlikely(is_mmio_spte(new_spte))) { 959 trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn, 960 new_spte); 961 ret = RET_PF_EMULATE; 962 } else { 963 trace_kvm_mmu_set_spte(iter->level, iter->gfn, 964 rcu_dereference(iter->sptep)); 965 } 966 967 /* 968 * Increase pf_fixed in both RET_PF_EMULATE and RET_PF_FIXED to be 969 * consistent with legacy MMU behavior. 970 */ 971 if (ret != RET_PF_SPURIOUS) 972 vcpu->stat.pf_fixed++; 973 974 return ret; 975 } 976 977 /* 978 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing 979 * page tables and SPTEs to translate the faulting guest physical address. 980 */ 981 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, 982 int map_writable, int max_level, kvm_pfn_t pfn, 983 bool prefault) 984 { 985 bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(); 986 bool write = error_code & PFERR_WRITE_MASK; 987 bool exec = error_code & PFERR_FETCH_MASK; 988 bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled; 989 struct kvm_mmu *mmu = vcpu->arch.mmu; 990 struct tdp_iter iter; 991 struct kvm_mmu_page *sp; 992 u64 *child_pt; 993 u64 new_spte; 994 int ret; 995 gfn_t gfn = gpa >> PAGE_SHIFT; 996 int level; 997 int req_level; 998 999 level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn, 1000 huge_page_disallowed, &req_level); 1001 1002 trace_kvm_mmu_spte_requested(gpa, level, pfn); 1003 1004 rcu_read_lock(); 1005 1006 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { 1007 if (nx_huge_page_workaround_enabled) 1008 disallowed_hugepage_adjust(iter.old_spte, gfn, 1009 iter.level, &pfn, &level); 1010 1011 if (iter.level == level) 1012 break; 1013 1014 /* 1015 * If there is an SPTE mapping a large page at a higher level 1016 * than the target, that SPTE must be cleared and replaced 1017 * with a non-leaf SPTE. 1018 */ 1019 if (is_shadow_present_pte(iter.old_spte) && 1020 is_large_pte(iter.old_spte)) { 1021 if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter)) 1022 break; 1023 1024 /* 1025 * The iter must explicitly re-read the spte here 1026 * because the new value informs the !present 1027 * path below. 1028 */ 1029 iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); 1030 } 1031 1032 if (!is_shadow_present_pte(iter.old_spte)) { 1033 /* 1034 * If SPTE has been frozen by another thread, just 1035 * give up and retry, avoiding unnecessary page table 1036 * allocation and free. 1037 */ 1038 if (is_removed_spte(iter.old_spte)) 1039 break; 1040 1041 sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level - 1); 1042 child_pt = sp->spt; 1043 1044 new_spte = make_nonleaf_spte(child_pt, 1045 !shadow_accessed_mask); 1046 1047 if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, 1048 new_spte)) { 1049 tdp_mmu_link_page(vcpu->kvm, sp, true, 1050 huge_page_disallowed && 1051 req_level >= iter.level); 1052 1053 trace_kvm_mmu_get_page(sp, true); 1054 } else { 1055 tdp_mmu_free_sp(sp); 1056 break; 1057 } 1058 } 1059 } 1060 1061 if (iter.level != level) { 1062 rcu_read_unlock(); 1063 return RET_PF_RETRY; 1064 } 1065 1066 ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter, 1067 pfn, prefault); 1068 rcu_read_unlock(); 1069 1070 return ret; 1071 } 1072 1073 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, 1074 bool flush) 1075 { 1076 struct kvm_mmu_page *root; 1077 1078 for_each_tdp_mmu_root(kvm, root, range->slot->as_id) 1079 flush |= zap_gfn_range(kvm, root, range->start, range->end, 1080 range->may_block, flush, false); 1081 1082 return flush; 1083 } 1084 1085 typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter, 1086 struct kvm_gfn_range *range); 1087 1088 static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm, 1089 struct kvm_gfn_range *range, 1090 tdp_handler_t handler) 1091 { 1092 struct kvm_mmu_page *root; 1093 struct tdp_iter iter; 1094 bool ret = false; 1095 1096 rcu_read_lock(); 1097 1098 /* 1099 * Don't support rescheduling, none of the MMU notifiers that funnel 1100 * into this helper allow blocking; it'd be dead, wasteful code. 1101 */ 1102 for_each_tdp_mmu_root(kvm, root, range->slot->as_id) { 1103 tdp_root_for_each_leaf_pte(iter, root, range->start, range->end) 1104 ret |= handler(kvm, &iter, range); 1105 } 1106 1107 rcu_read_unlock(); 1108 1109 return ret; 1110 } 1111 1112 /* 1113 * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero 1114 * if any of the GFNs in the range have been accessed. 1115 */ 1116 static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter, 1117 struct kvm_gfn_range *range) 1118 { 1119 u64 new_spte = 0; 1120 1121 /* If we have a non-accessed entry we don't need to change the pte. */ 1122 if (!is_accessed_spte(iter->old_spte)) 1123 return false; 1124 1125 new_spte = iter->old_spte; 1126 1127 if (spte_ad_enabled(new_spte)) { 1128 new_spte &= ~shadow_accessed_mask; 1129 } else { 1130 /* 1131 * Capture the dirty status of the page, so that it doesn't get 1132 * lost when the SPTE is marked for access tracking. 1133 */ 1134 if (is_writable_pte(new_spte)) 1135 kvm_set_pfn_dirty(spte_to_pfn(new_spte)); 1136 1137 new_spte = mark_spte_for_access_track(new_spte); 1138 } 1139 1140 tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte); 1141 1142 return true; 1143 } 1144 1145 bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) 1146 { 1147 return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range); 1148 } 1149 1150 static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter, 1151 struct kvm_gfn_range *range) 1152 { 1153 return is_accessed_spte(iter->old_spte); 1154 } 1155 1156 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) 1157 { 1158 return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn); 1159 } 1160 1161 static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter, 1162 struct kvm_gfn_range *range) 1163 { 1164 u64 new_spte; 1165 1166 /* Huge pages aren't expected to be modified without first being zapped. */ 1167 WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end); 1168 1169 if (iter->level != PG_LEVEL_4K || 1170 !is_shadow_present_pte(iter->old_spte)) 1171 return false; 1172 1173 /* 1174 * Note, when changing a read-only SPTE, it's not strictly necessary to 1175 * zero the SPTE before setting the new PFN, but doing so preserves the 1176 * invariant that the PFN of a present * leaf SPTE can never change. 1177 * See __handle_changed_spte(). 1178 */ 1179 tdp_mmu_set_spte(kvm, iter, 0); 1180 1181 if (!pte_write(range->pte)) { 1182 new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte, 1183 pte_pfn(range->pte)); 1184 1185 tdp_mmu_set_spte(kvm, iter, new_spte); 1186 } 1187 1188 return true; 1189 } 1190 1191 /* 1192 * Handle the changed_pte MMU notifier for the TDP MMU. 1193 * data is a pointer to the new pte_t mapping the HVA specified by the MMU 1194 * notifier. 1195 * Returns non-zero if a flush is needed before releasing the MMU lock. 1196 */ 1197 bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) 1198 { 1199 bool flush = kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn); 1200 1201 /* FIXME: return 'flush' instead of flushing here. */ 1202 if (flush) 1203 kvm_flush_remote_tlbs_with_address(kvm, range->start, 1); 1204 1205 return false; 1206 } 1207 1208 /* 1209 * Remove write access from all SPTEs at or above min_level that map GFNs 1210 * [start, end). Returns true if an SPTE has been changed and the TLBs need to 1211 * be flushed. 1212 */ 1213 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, 1214 gfn_t start, gfn_t end, int min_level) 1215 { 1216 struct tdp_iter iter; 1217 u64 new_spte; 1218 bool spte_set = false; 1219 1220 rcu_read_lock(); 1221 1222 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); 1223 1224 for_each_tdp_pte_min_level(iter, root->spt, root->role.level, 1225 min_level, start, end) { 1226 retry: 1227 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) 1228 continue; 1229 1230 if (!is_shadow_present_pte(iter.old_spte) || 1231 !is_last_spte(iter.old_spte, iter.level) || 1232 !(iter.old_spte & PT_WRITABLE_MASK)) 1233 continue; 1234 1235 new_spte = iter.old_spte & ~PT_WRITABLE_MASK; 1236 1237 if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter, 1238 new_spte)) { 1239 /* 1240 * The iter must explicitly re-read the SPTE because 1241 * the atomic cmpxchg failed. 1242 */ 1243 iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); 1244 goto retry; 1245 } 1246 spte_set = true; 1247 } 1248 1249 rcu_read_unlock(); 1250 return spte_set; 1251 } 1252 1253 /* 1254 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will 1255 * only affect leaf SPTEs down to min_level. 1256 * Returns true if an SPTE has been changed and the TLBs need to be flushed. 1257 */ 1258 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot, 1259 int min_level) 1260 { 1261 struct kvm_mmu_page *root; 1262 bool spte_set = false; 1263 1264 lockdep_assert_held_read(&kvm->mmu_lock); 1265 1266 for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) 1267 spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn, 1268 slot->base_gfn + slot->npages, min_level); 1269 1270 return spte_set; 1271 } 1272 1273 /* 1274 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If 1275 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. 1276 * If AD bits are not enabled, this will require clearing the writable bit on 1277 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to 1278 * be flushed. 1279 */ 1280 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, 1281 gfn_t start, gfn_t end) 1282 { 1283 struct tdp_iter iter; 1284 u64 new_spte; 1285 bool spte_set = false; 1286 1287 rcu_read_lock(); 1288 1289 tdp_root_for_each_leaf_pte(iter, root, start, end) { 1290 retry: 1291 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) 1292 continue; 1293 1294 if (spte_ad_need_write_protect(iter.old_spte)) { 1295 if (is_writable_pte(iter.old_spte)) 1296 new_spte = iter.old_spte & ~PT_WRITABLE_MASK; 1297 else 1298 continue; 1299 } else { 1300 if (iter.old_spte & shadow_dirty_mask) 1301 new_spte = iter.old_spte & ~shadow_dirty_mask; 1302 else 1303 continue; 1304 } 1305 1306 if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter, 1307 new_spte)) { 1308 /* 1309 * The iter must explicitly re-read the SPTE because 1310 * the atomic cmpxchg failed. 1311 */ 1312 iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); 1313 goto retry; 1314 } 1315 spte_set = true; 1316 } 1317 1318 rcu_read_unlock(); 1319 return spte_set; 1320 } 1321 1322 /* 1323 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If 1324 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. 1325 * If AD bits are not enabled, this will require clearing the writable bit on 1326 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to 1327 * be flushed. 1328 */ 1329 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot) 1330 { 1331 struct kvm_mmu_page *root; 1332 bool spte_set = false; 1333 1334 lockdep_assert_held_read(&kvm->mmu_lock); 1335 1336 for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) 1337 spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn, 1338 slot->base_gfn + slot->npages); 1339 1340 return spte_set; 1341 } 1342 1343 /* 1344 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is 1345 * set in mask, starting at gfn. The given memslot is expected to contain all 1346 * the GFNs represented by set bits in the mask. If AD bits are enabled, 1347 * clearing the dirty status will involve clearing the dirty bit on each SPTE 1348 * or, if AD bits are not enabled, clearing the writable bit on each SPTE. 1349 */ 1350 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, 1351 gfn_t gfn, unsigned long mask, bool wrprot) 1352 { 1353 struct tdp_iter iter; 1354 u64 new_spte; 1355 1356 rcu_read_lock(); 1357 1358 tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask), 1359 gfn + BITS_PER_LONG) { 1360 if (!mask) 1361 break; 1362 1363 if (iter.level > PG_LEVEL_4K || 1364 !(mask & (1UL << (iter.gfn - gfn)))) 1365 continue; 1366 1367 mask &= ~(1UL << (iter.gfn - gfn)); 1368 1369 if (wrprot || spte_ad_need_write_protect(iter.old_spte)) { 1370 if (is_writable_pte(iter.old_spte)) 1371 new_spte = iter.old_spte & ~PT_WRITABLE_MASK; 1372 else 1373 continue; 1374 } else { 1375 if (iter.old_spte & shadow_dirty_mask) 1376 new_spte = iter.old_spte & ~shadow_dirty_mask; 1377 else 1378 continue; 1379 } 1380 1381 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); 1382 } 1383 1384 rcu_read_unlock(); 1385 } 1386 1387 /* 1388 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is 1389 * set in mask, starting at gfn. The given memslot is expected to contain all 1390 * the GFNs represented by set bits in the mask. If AD bits are enabled, 1391 * clearing the dirty status will involve clearing the dirty bit on each SPTE 1392 * or, if AD bits are not enabled, clearing the writable bit on each SPTE. 1393 */ 1394 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, 1395 struct kvm_memory_slot *slot, 1396 gfn_t gfn, unsigned long mask, 1397 bool wrprot) 1398 { 1399 struct kvm_mmu_page *root; 1400 1401 lockdep_assert_held_write(&kvm->mmu_lock); 1402 for_each_tdp_mmu_root(kvm, root, slot->as_id) 1403 clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot); 1404 } 1405 1406 /* 1407 * Clear leaf entries which could be replaced by large mappings, for 1408 * GFNs within the slot. 1409 */ 1410 static bool zap_collapsible_spte_range(struct kvm *kvm, 1411 struct kvm_mmu_page *root, 1412 const struct kvm_memory_slot *slot, 1413 bool flush) 1414 { 1415 gfn_t start = slot->base_gfn; 1416 gfn_t end = start + slot->npages; 1417 struct tdp_iter iter; 1418 kvm_pfn_t pfn; 1419 1420 rcu_read_lock(); 1421 1422 tdp_root_for_each_pte(iter, root, start, end) { 1423 retry: 1424 if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) { 1425 flush = false; 1426 continue; 1427 } 1428 1429 if (!is_shadow_present_pte(iter.old_spte) || 1430 !is_last_spte(iter.old_spte, iter.level)) 1431 continue; 1432 1433 pfn = spte_to_pfn(iter.old_spte); 1434 if (kvm_is_reserved_pfn(pfn) || 1435 iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn, 1436 pfn, PG_LEVEL_NUM)) 1437 continue; 1438 1439 if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) { 1440 /* 1441 * The iter must explicitly re-read the SPTE because 1442 * the atomic cmpxchg failed. 1443 */ 1444 iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); 1445 goto retry; 1446 } 1447 flush = true; 1448 } 1449 1450 rcu_read_unlock(); 1451 1452 return flush; 1453 } 1454 1455 /* 1456 * Clear non-leaf entries (and free associated page tables) which could 1457 * be replaced by large mappings, for GFNs within the slot. 1458 */ 1459 bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, 1460 const struct kvm_memory_slot *slot, 1461 bool flush) 1462 { 1463 struct kvm_mmu_page *root; 1464 1465 lockdep_assert_held_read(&kvm->mmu_lock); 1466 1467 for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) 1468 flush = zap_collapsible_spte_range(kvm, root, slot, flush); 1469 1470 return flush; 1471 } 1472 1473 /* 1474 * Removes write access on the last level SPTE mapping this GFN and unsets the 1475 * MMU-writable bit to ensure future writes continue to be intercepted. 1476 * Returns true if an SPTE was set and a TLB flush is needed. 1477 */ 1478 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, 1479 gfn_t gfn, int min_level) 1480 { 1481 struct tdp_iter iter; 1482 u64 new_spte; 1483 bool spte_set = false; 1484 1485 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); 1486 1487 rcu_read_lock(); 1488 1489 for_each_tdp_pte_min_level(iter, root->spt, root->role.level, 1490 min_level, gfn, gfn + 1) { 1491 if (!is_shadow_present_pte(iter.old_spte) || 1492 !is_last_spte(iter.old_spte, iter.level)) 1493 continue; 1494 1495 if (!is_writable_pte(iter.old_spte)) 1496 break; 1497 1498 new_spte = iter.old_spte & 1499 ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask); 1500 1501 tdp_mmu_set_spte(kvm, &iter, new_spte); 1502 spte_set = true; 1503 } 1504 1505 rcu_read_unlock(); 1506 1507 return spte_set; 1508 } 1509 1510 /* 1511 * Removes write access on the last level SPTE mapping this GFN and unsets the 1512 * MMU-writable bit to ensure future writes continue to be intercepted. 1513 * Returns true if an SPTE was set and a TLB flush is needed. 1514 */ 1515 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, 1516 struct kvm_memory_slot *slot, gfn_t gfn, 1517 int min_level) 1518 { 1519 struct kvm_mmu_page *root; 1520 bool spte_set = false; 1521 1522 lockdep_assert_held_write(&kvm->mmu_lock); 1523 for_each_tdp_mmu_root(kvm, root, slot->as_id) 1524 spte_set |= write_protect_gfn(kvm, root, gfn, min_level); 1525 1526 return spte_set; 1527 } 1528 1529 /* 1530 * Return the level of the lowest level SPTE added to sptes. 1531 * That SPTE may be non-present. 1532 */ 1533 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, 1534 int *root_level) 1535 { 1536 struct tdp_iter iter; 1537 struct kvm_mmu *mmu = vcpu->arch.mmu; 1538 gfn_t gfn = addr >> PAGE_SHIFT; 1539 int leaf = -1; 1540 1541 *root_level = vcpu->arch.mmu->shadow_root_level; 1542 1543 rcu_read_lock(); 1544 1545 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { 1546 leaf = iter.level; 1547 sptes[leaf] = iter.old_spte; 1548 } 1549 1550 rcu_read_unlock(); 1551 1552 return leaf; 1553 } 1554