// SPDX-License-Identifier: GPL-2.0

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>

static bool __read_mostly tdp_mmu_enabled = false;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);

/* Initializes the TDP MMU for the VM, if enabled. */
void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
		return;

	/* This should not be changed for the lifetime of the VM. */
	kvm->arch.tdp_mmu_enabled = true;

	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
}

void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
	if (!kvm->arch.tdp_mmu_enabled)
		return;

	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

	/*
	 * Ensure that all the outstanding RCU callbacks to free shadow pages
	 * can run before the VM is torn down.
	 */
	rcu_barrier();
}

static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
	if (kvm_mmu_put_root(kvm, root))
		kvm_tdp_mmu_free_root(kvm, root);
}

static inline bool tdp_mmu_next_root_valid(struct kvm *kvm,
					   struct kvm_mmu_page *root)
{
	lockdep_assert_held_write(&kvm->mmu_lock);

	if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link))
		return false;

	kvm_mmu_get_root(kvm, root);
	return true;
}

static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
						     struct kvm_mmu_page *root)
{
	struct kvm_mmu_page *next_root;

	next_root = list_next_entry(root, link);
	tdp_mmu_put_root(kvm, root);
	return next_root;
}

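/*
 * Example (illustration only): the helpers above exist to back the
 * for_each_tdp_mmu_root_yield_safe() iterator defined below, which takes a
 * reference on each root it visits. A caller that exits the loop early is
 * responsible for dropping the reference it still holds, e.g. with
 * "wanted_root" as a hypothetical predicate:
 *
 *	for_each_tdp_mmu_root_yield_safe(kvm, root) {
 *		if (wanted_root(root)) {
 *			tdp_mmu_put_root(kvm, root);
 *			break;
 *		}
 *	}
 */
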
/*
 * Note: this iterator gets and puts references to the roots it iterates over.
 * This makes it safe to release the MMU lock and yield within the loop, but
 * if exiting the loop early, the caller must drop the reference to the most
 * recent root. (Unless keeping a live reference is desirable.)
 */
#define for_each_tdp_mmu_root_yield_safe(_kvm, _root)			\
	for (_root = list_first_entry(&_kvm->arch.tdp_mmu_roots,	\
				      typeof(*_root), link);		\
	     tdp_mmu_next_root_valid(_kvm, _root);			\
	     _root = tdp_mmu_next_root(_kvm, _root))

#define for_each_tdp_mmu_root(_kvm, _root)				\
	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)

static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end, bool can_yield);

void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);

	lockdep_assert_held_write(&kvm->mmu_lock);

	WARN_ON(root->root_count);
	WARN_ON(!root->tdp_mmu_page);

	list_del(&root->link);

	zap_gfn_range(kvm, root, 0, max_gfn, false);

	free_page((unsigned long)root->spt);
	kmem_cache_free(mmu_page_header_cache, root);
}

static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
						   int level)
{
	union kvm_mmu_page_role role;

	role = vcpu->arch.mmu->mmu_role.base;
	role.level = level;
	role.direct = true;
	role.gpte_is_8_bytes = true;
	role.access = ACC_ALL;

	return role;
}

static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
					       int level)
{
	struct kvm_mmu_page *sp;

	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

	sp->role.word = page_role_for_level(vcpu, level).word;
	sp->gfn = gfn;
	sp->tdp_mmu_page = true;

	trace_kvm_mmu_get_page(sp, true);

	return sp;
}

static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
{
	union kvm_mmu_page_role role;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_page *root;

	role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);

	write_lock(&kvm->mmu_lock);

	/* Check for an existing root before allocating a new one. */
	for_each_tdp_mmu_root(kvm, root) {
		if (root->role.word == role.word) {
			kvm_mmu_get_root(kvm, root);
			write_unlock(&kvm->mmu_lock);
			return root;
		}
	}

	root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
	root->root_count = 1;

	list_add(&root->link, &kvm->arch.tdp_mmu_roots);

	write_unlock(&kvm->mmu_lock);

	return root;
}

hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_page *root;

	root = get_tdp_mmu_vcpu_root(vcpu);
	if (!root)
		return INVALID_PAGE;

	return __pa(root->spt);
}

static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
	free_page((unsigned long)sp->spt);
	kmem_cache_free(mmu_page_header_cache, sp);
}

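/*
 * Example (illustration only, assumed caller): the HPA returned by
 * kvm_tdp_mmu_get_vcpu_root_hpa() is what a hypothetical root-loading path
 * would install for the vCPU:
 *
 *	hpa_t root_hpa = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
 *
 *	if (!VALID_PAGE(root_hpa))
 *		return -ENOMEM;
 *	vcpu->arch.mmu->root_hpa = root_hpa;
 *
 * Because get_tdp_mmu_vcpu_root() reuses an existing root with a matching
 * role, vCPUs running in the same mode share a single paging structure.
 */
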
/*
 * This is called through call_rcu in order to free TDP page table memory
 * safely with respect to other kernel threads that may be operating on
 * the memory.
 * By only accessing TDP MMU page table memory in an RCU read critical
 * section, and freeing it after a grace period, lockless access to that
 * memory won't use it after it is freed.
 */
static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
{
	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
					       rcu_head);

	tdp_mmu_free_sp(sp);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared);

static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
{
	return sp->role.smm ? 1 : 0;
}

static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
{
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
		return;

	if (is_accessed_spte(old_spte) &&
	    (!is_accessed_spte(new_spte) || pfn_changed))
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}

static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
					  u64 old_spte, u64 new_spte, int level)
{
	bool pfn_changed;
	struct kvm_memory_slot *slot;

	if (level > PG_LEVEL_4K)
		return;

	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	if ((!is_writable_pte(old_spte) || pfn_changed) &&
	    is_writable_pte(new_spte)) {
		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
		mark_page_dirty_in_slot(kvm, slot, gfn);
	}
}

/**
 * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
 *
 * @kvm: kvm instance
 * @sp: the new page
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be adding or removing pages.
 * @account_nx: This page replaces a NX large page and should be marked for
 *		eventual reclaim.
 */
static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
			      bool shared, bool account_nx)
{
	if (shared)
		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
	if (account_nx)
		account_huge_nx_page(kvm, sp);

	if (shared)
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

/**
 * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
 *
 * @kvm: kvm instance
 * @sp: the page to be removed
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be adding or removing pages.
 */
static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
				bool shared)
{
	if (shared)
		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	list_del(&sp->link);
	if (sp->lpage_disallowed)
		unaccount_huge_nx_page(kvm, sp);

	if (shared)
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

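/*
 * Example (illustration only): the "shared" argument to the helpers above
 * says whether mmu_lock is held for read (the page fault path, which must
 * serialize on tdp_mmu_pages_lock itself) or for write (zap paths, which
 * already have exclusive access). A hypothetical caller under the read lock
 * would do:
 *
 *	read_lock(&kvm->mmu_lock);
 *	...
 *	tdp_mmu_link_page(kvm, sp, true, account_nx);
 *	...
 *	read_unlock(&kvm->mmu_lock);
 *
 * while a caller holding mmu_lock for write passes shared == false and is
 * checked by lockdep_assert_held_write().
 */
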
/**
 * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
 *
 * @kvm: kvm instance
 * @pt: the page removed from the paging structure
 * @shared: This operation may not be running under the exclusive use
 *	    of the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Given a page table that has been removed from the TDP paging structure,
 * iterates through the page table to clear SPTEs and free child page tables.
 */
static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt,
					bool shared)
{
	struct kvm_mmu_page *sp = sptep_to_sp(pt);
	int level = sp->role.level;
	gfn_t base_gfn = sp->gfn;
	u64 old_child_spte;
	u64 *sptep;
	gfn_t gfn;
	int i;

	trace_kvm_mmu_prepare_zap_page(sp);

	tdp_mmu_unlink_page(kvm, sp, shared);

	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
		sptep = pt + i;
		gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1));

		if (shared) {
			/*
			 * Set the SPTE to a nonpresent value that other
			 * threads will not overwrite. If the SPTE was
			 * already marked as removed then another thread
			 * handling a page fault could overwrite it, so
			 * set the SPTE until it is set from some other
			 * value to the removed SPTE value.
			 */
			for (;;) {
				old_child_spte = xchg(sptep, REMOVED_SPTE);
				if (!is_removed_spte(old_child_spte))
					break;
				cpu_relax();
			}
		} else {
			/*
			 * If the SPTE is not MMU-present, there is no backing
			 * page associated with the SPTE and so no side effects
			 * that need to be recorded, and exclusive ownership of
			 * mmu_lock ensures the SPTE can't be made present.
			 * Note, zapping MMIO SPTEs is also unnecessary as they
			 * are guarded by the memslots generation, not by being
			 * unreachable.
			 */
			old_child_spte = READ_ONCE(*sptep);
			if (!is_shadow_present_pte(old_child_spte))
				continue;

			/*
			 * Marking the SPTE as a removed SPTE is not
			 * strictly necessary here as the MMU lock will
			 * stop other threads from concurrently modifying
			 * this SPTE. Using the removed SPTE value keeps
			 * the two branches consistent and simplifies
			 * the function.
			 */
			WRITE_ONCE(*sptep, REMOVED_SPTE);
		}
		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
				    old_child_spte, REMOVED_SPTE, level - 1,
				    shared);
	}

	kvm_flush_remote_tlbs_with_address(kvm, gfn,
					   KVM_PAGES_PER_HPAGE(level));

	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

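/*
 * Example (illustration only): REMOVED_SPTE acts as a "frozen" marker. A
 * page-fault handler racing with the teardown above sees the marker, fails
 * the cmpxchg in tdp_mmu_set_spte_atomic() (defined further down), and simply
 * retries the fault:
 *
 *	if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
 *		return RET_PF_RETRY;
 *
 * Only the thread that installed REMOVED_SPTE may transition the entry to
 * its final value.
 */
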
/**
 * handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Handle bookkeeping that might result from the modification of a SPTE.
 * This function must be called for all TDP SPTE modifications.
 */
static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				  u64 old_spte, u64 new_spte, int level,
				  bool shared)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);
	bool was_leaf = was_present && is_last_spte(old_spte, level);
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
	WARN_ON(level < PG_LEVEL_4K);
	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));

	/*
	 * If this warning were to trigger it would indicate that there was a
	 * missing MMU notifier or a race with some notifier handler.
	 * A present, leaf SPTE should never be directly replaced with another
	 * present leaf SPTE pointing to a different PFN. A notifier handler
	 * should be zapping the SPTE before the main MM's page table is
	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
	 * thread before replacement.
	 */
	if (was_leaf && is_leaf && pfn_changed) {
		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
		       "SPTE with another present leaf SPTE mapping a\n"
		       "different PFN!\n"
		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
		       as_id, gfn, old_spte, new_spte, level);

		/*
		 * Crash the host to prevent error propagation and guest data
		 * corruption.
		 */
		BUG();
	}

	if (old_spte == new_spte)
		return;

	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);

	/*
	 * The only times a SPTE should be changed from a non-present to
	 * non-present state is when an MMIO entry is installed/modified/
	 * removed. In that case, there is nothing to do here.
	 */
	if (!was_present && !is_present) {
		/*
		 * If this change does not involve a MMIO SPTE or removed SPTE,
		 * it is unexpected. Log the change, though it should not
		 * impact the guest since both the former and current SPTEs
		 * are nonpresent.
		 */
		if (WARN_ON(!is_mmio_spte(old_spte) &&
			    !is_mmio_spte(new_spte) &&
			    !is_removed_spte(new_spte)))
			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
			       "should not be replaced with another,\n"
			       "different nonpresent SPTE, unless one or both\n"
			       "are MMIO SPTEs, or the new SPTE is\n"
			       "a temporary removed SPTE.\n"
			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
			       as_id, gfn, old_spte, new_spte, level);
		return;
	}

	if (was_leaf && is_dirty_spte(old_spte) &&
	    (!is_dirty_spte(new_spte) || pfn_changed))
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));

	/*
	 * Recursively handle child PTs if the change removed a subtree from
	 * the paging structure.
	 */
	if (was_present && !was_leaf && (pfn_changed || !is_present))
		handle_removed_tdp_mmu_page(kvm,
				spte_to_child_pt(old_spte, level), shared);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared)
{
	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
			      shared);
	handle_changed_spte_acc_track(old_spte, new_spte, level);
	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
				      new_spte, level);
}

/*
 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the
 * associated bookkeeping
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * Returns: true if the SPTE was set, false if it was not. If false is returned,
 *	    this function will have no side-effects.
 */
static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
					   struct tdp_iter *iter,
					   u64 new_spte)
{
	tdp_ptep_t root_pt = tdp_iter_root_pt(iter);
	struct kvm_mmu_page *root = sptep_to_sp(root_pt);
	int as_id = kvm_mmu_page_as_id(root);

	lockdep_assert_held_read(&kvm->mmu_lock);

	/*
	 * Do not change removed SPTEs. Only the thread that froze the SPTE
	 * may modify it.
	 */
	if (iter->old_spte == REMOVED_SPTE)
		return false;

	if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
		      new_spte) != iter->old_spte)
		return false;

	handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
			    iter->level, true);

	return true;
}

static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
					   struct tdp_iter *iter)
{
	/*
	 * Freeze the SPTE by setting it to a special,
	 * non-present value. This will stop other threads from
	 * immediately installing a present entry in its place
	 * before the TLBs are flushed.
	 */
	if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
		return false;

	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
					   KVM_PAGES_PER_HPAGE(iter->level));

	/*
	 * No other thread can overwrite the removed SPTE as they
	 * must either wait on the MMU lock or use
	 * tdp_mmu_set_spte_atomic which will not overwrite the
	 * special removed SPTE value. No bookkeeping is needed
	 * here since the SPTE is going from non-present
	 * to non-present.
	 */
	WRITE_ONCE(*rcu_dereference(iter->sptep), 0);

	return true;
}

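/*
 * Example (illustration only): under mmu_lock held for read, every SPTE
 * update goes through tdp_mmu_set_spte_atomic() and must be prepared to lose
 * the race. A typical pattern, with "compute_new_spte" as a hypothetical
 * helper, looks like:
 *
 *	rcu_read_lock();
 *	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
 *		new_spte = compute_new_spte(iter.old_spte);
 *		if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
 *			break;
 *	}
 *	rcu_read_unlock();
 *
 * On failure the caller typically bails out and retries the whole operation,
 * as the page fault path below does with RET_PF_RETRY.
 */
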
/*
 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * @record_acc_track: Notify the MM subsystem of changes to the accessed state
 *		      of the page. Should be set unless handling an MMU
 *		      notifier for access tracking. Leaving record_acc_track
 *		      unset in that case prevents page accesses from being
 *		      double counted.
 * @record_dirty_log: Record the page as dirty in the dirty bitmap if
 *		      appropriate for the change being made. Should be set
 *		      unless performing certain dirty logging operations.
 *		      Leaving record_dirty_log unset in that case prevents page
 *		      writes from being double counted.
 */
static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				      u64 new_spte, bool record_acc_track,
				      bool record_dirty_log)
{
	tdp_ptep_t root_pt = tdp_iter_root_pt(iter);
	struct kvm_mmu_page *root = sptep_to_sp(root_pt);
	int as_id = kvm_mmu_page_as_id(root);

	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * No thread should be using this function to set SPTEs to the
	 * temporary removed SPTE value.
	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
	 * should be used. If operating under the MMU lock in write mode, the
	 * use of the removed SPTE should not be necessary.
	 */
	WARN_ON(iter->old_spte == REMOVED_SPTE);

	WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);

	__handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
			      iter->level, false);
	if (record_acc_track)
		handle_changed_spte_acc_track(iter->old_spte, new_spte,
					      iter->level);
	if (record_dirty_log)
		handle_changed_spte_dirty_log(kvm, as_id, iter->gfn,
					      iter->old_spte, new_spte,
					      iter->level);
}

static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				    u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
}

static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
}

static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
}

#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
	for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)

#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
	tdp_root_for_each_pte(_iter, _root, _start, _end)	\
		if (!is_shadow_present_pte(_iter.old_spte) ||	\
		    !is_last_spte(_iter.old_spte, _iter.level))	\
			continue;				\
		else

#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
	for_each_tdp_pte(_iter, __va(_mmu->root_hpa),		\
			 _mmu->shadow_root_level, _start, _end)

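/*
 * Example (illustration only): the iteration macros above are used inside an
 * RCU read-side critical section, mirroring the rest of this file. A sketch
 * that merely counts present leaf SPTEs in a range ("nr_leaves" is
 * hypothetical) would look like:
 *
 *	struct tdp_iter iter;
 *	int nr_leaves = 0;
 *
 *	rcu_read_lock();
 *	tdp_root_for_each_leaf_pte(iter, root, start, end)
 *		nr_leaves++;
 *	rcu_read_unlock();
 */
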
/*
 * Yield if the MMU lock is contended or this thread needs to return control
 * to the scheduler.
 *
 * If this function should yield and flush is set, it will perform a remote
 * TLB flush before yielding.
 *
 * If this function yields, it will also reset the tdp_iter's walk over the
 * paging structure and the calling function should skip to the next
 * iteration to allow the iterator to continue its traversal from the
 * paging structure root.
 *
 * Return true if this function yielded and the iterator's traversal was reset.
 * Return false if a yield was not needed.
 */
static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
					     struct tdp_iter *iter, bool flush)
{
	/* Ensure forward progress has been made before yielding. */
	if (iter->next_last_level_gfn == iter->yielded_gfn)
		return false;

	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
		rcu_read_unlock();

		if (flush)
			kvm_flush_remote_tlbs(kvm);

		cond_resched_rwlock_write(&kvm->mmu_lock);
		rcu_read_lock();

		WARN_ON(iter->gfn > iter->next_last_level_gfn);

		tdp_iter_start(iter, iter->pt_path[iter->root_level - 1],
			       iter->root_level, iter->min_level,
			       iter->next_last_level_gfn);

		return true;
	}

	return false;
}

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock. If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
 * operation can cause a soft lockup.
 */
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end, bool can_yield)
{
	struct tdp_iter iter;
	bool flush_needed = false;

	rcu_read_lock();

	tdp_root_for_each_pte(iter, root, start, end) {
		if (can_yield &&
		    tdp_mmu_iter_cond_resched(kvm, &iter, flush_needed)) {
			flush_needed = false;
			continue;
		}

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		/*
		 * If this is a non-last-level SPTE that covers a larger range
		 * than should be zapped, continue, and zap the mappings at a
		 * lower level.
		 */
		if ((iter.gfn < start ||
		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		tdp_mmu_set_spte(kvm, &iter, 0);
		flush_needed = true;
	}

	rcu_read_unlock();
	return flush_needed;
}

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 */
bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end)
{
	struct kvm_mmu_page *root;
	bool flush = false;

	for_each_tdp_mmu_root_yield_safe(kvm, root)
		flush |= zap_gfn_range(kvm, root, start, end, true);

	return flush;
}

void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
	bool flush;

	flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn);
	if (flush)
		kvm_flush_remote_tlbs(kvm);
}

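/*
 * Example (illustration only): callers that cannot tolerate dropping
 * mmu_lock, such as kvm_tdp_mmu_free_root() above or the MMU notifier
 * wrappers below, pass can_yield == false and must keep the range bounded,
 * e.g. a single memslot:
 *
 *	flush = zap_gfn_range(kvm, root, slot->base_gfn,
 *			      slot->base_gfn + slot->npages, false);
 *	if (flush)
 *		kvm_flush_remote_tlbs(kvm);
 */
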
/*
 * Installs a last-level SPTE to handle a TDP page fault.
 * (NPT/EPT violation/misconfiguration)
 */
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
					   int map_writable,
					   struct tdp_iter *iter,
					   kvm_pfn_t pfn, bool prefault)
{
	u64 new_spte;
	int ret = 0;
	int make_spte_ret = 0;

	if (unlikely(is_noslot_pfn(pfn)))
		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
	else
		make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
					  pfn, iter->old_spte, prefault, true,
					  map_writable, !shadow_accessed_mask,
					  &new_spte);

	if (new_spte == iter->old_spte)
		ret = RET_PF_SPURIOUS;
	else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
		return RET_PF_RETRY;

	/*
	 * If the page fault was caused by a write but the page is write
	 * protected, emulation is needed. If the emulation was skipped,
	 * the vCPU would have the same fault again.
	 */
	if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
		if (write)
			ret = RET_PF_EMULATE;
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
	}

	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
	if (unlikely(is_mmio_spte(new_spte))) {
		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
				     new_spte);
		ret = RET_PF_EMULATE;
	} else
		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
				       rcu_dereference(iter->sptep));

	if (!prefault)
		vcpu->stat.pf_fixed++;

	return ret;
}

/*
 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
 * page tables and SPTEs to translate the faulting guest physical address.
 */
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
		    int map_writable, int max_level, kvm_pfn_t pfn,
		    bool prefault)
{
	bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
	bool write = error_code & PFERR_WRITE_MASK;
	bool exec = error_code & PFERR_FETCH_MASK;
	bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	struct tdp_iter iter;
	struct kvm_mmu_page *sp;
	u64 *child_pt;
	u64 new_spte;
	int ret;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int level;
	int req_level;

	if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
		return RET_PF_RETRY;
	if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
		return RET_PF_RETRY;

	level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
					huge_page_disallowed, &req_level);

	trace_kvm_mmu_spte_requested(gpa, level, pfn);

	rcu_read_lock();

	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		if (nx_huge_page_workaround_enabled)
			disallowed_hugepage_adjust(iter.old_spte, gfn,
						   iter.level, &pfn, &level);

		if (iter.level == level)
			break;

		/*
		 * If there is an SPTE mapping a large page at a higher level
		 * than the target, that SPTE must be cleared and replaced
		 * with a non-leaf SPTE.
		 */
		if (is_shadow_present_pte(iter.old_spte) &&
		    is_large_pte(iter.old_spte)) {
			if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
				break;

			/*
			 * The iter must explicitly re-read the spte here
			 * because the new value informs the !present
			 * path below.
			 */
			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
		}

		if (!is_shadow_present_pte(iter.old_spte)) {
			sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
			child_pt = sp->spt;

			new_spte = make_nonleaf_spte(child_pt,
						     !shadow_accessed_mask);

			if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
						    new_spte)) {
				tdp_mmu_link_page(vcpu->kvm, sp, true,
						  huge_page_disallowed &&
						  req_level >= iter.level);

				trace_kvm_mmu_get_page(sp, true);
			} else {
				tdp_mmu_free_sp(sp);
				break;
			}
		}
	}

	if (iter.level != level) {
		rcu_read_unlock();
		return RET_PF_RETRY;
	}

	ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
					      pfn, prefault);
	rcu_read_unlock();

	return ret;
}

static __always_inline int
kvm_tdp_mmu_handle_hva_range(struct kvm *kvm,
			     unsigned long start,
			     unsigned long end,
			     unsigned long data,
			     int (*handler)(struct kvm *kvm,
					    struct kvm_memory_slot *slot,
					    struct kvm_mmu_page *root,
					    gfn_t start,
					    gfn_t end,
					    unsigned long data))
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	struct kvm_mmu_page *root;
	int ret = 0;
	int as_id;

	for_each_tdp_mmu_root_yield_safe(kvm, root) {
		as_id = kvm_mmu_page_as_id(root);
		slots = __kvm_memslots(kvm, as_id);
		kvm_for_each_memslot(memslot, slots) {
			unsigned long hva_start, hva_end;
			gfn_t gfn_start, gfn_end;

			hva_start = max(start, memslot->userspace_addr);
			hva_end = min(end, memslot->userspace_addr +
				      (memslot->npages << PAGE_SHIFT));
			if (hva_start >= hva_end)
				continue;
			/*
			 * {gfn(page) | page intersects with [hva_start, hva_end)} =
			 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
			 */
			gfn_start = hva_to_gfn_memslot(hva_start, memslot);
			gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);

			ret |= handler(kvm, memslot, root, gfn_start,
				       gfn_end, data);
		}
	}

	return ret;
}

static int zap_gfn_range_hva_wrapper(struct kvm *kvm,
				     struct kvm_memory_slot *slot,
				     struct kvm_mmu_page *root, gfn_t start,
				     gfn_t end, unsigned long unused)
{
	return zap_gfn_range(kvm, root, start, end, false);
}

int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
			      unsigned long end)
{
	return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
					    zap_gfn_range_hva_wrapper);
}

/*
 * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
 * if any of the GFNs in the range have been accessed.
 */
static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
			 struct kvm_mmu_page *root, gfn_t start, gfn_t end,
			 unsigned long unused)
{
	struct tdp_iter iter;
	int young = 0;
	u64 new_spte = 0;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, start, end) {
		/*
		 * If we have a non-accessed entry we don't need to change the
		 * pte.
		 */
		if (!is_accessed_spte(iter.old_spte))
			continue;

		new_spte = iter.old_spte;

		if (spte_ad_enabled(new_spte)) {
			clear_bit((ffs(shadow_accessed_mask) - 1),
				  (unsigned long *)&new_spte);
		} else {
			/*
			 * Capture the dirty status of the page, so that it doesn't get
			 * lost when the SPTE is marked for access tracking.
			 */
			if (is_writable_pte(new_spte))
				kvm_set_pfn_dirty(spte_to_pfn(new_spte));

			new_spte = mark_spte_for_access_track(new_spte);
		}
		new_spte &= ~shadow_dirty_mask;

		tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte);
		young = 1;

		trace_kvm_age_page(iter.gfn, iter.level, slot, young);
	}

	rcu_read_unlock();

	return young;
}

int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
			      unsigned long end)
{
	return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
					    age_gfn_range);
}

static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
			struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
			unsigned long unused2)
{
	struct tdp_iter iter;

	tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1)
		if (is_accessed_spte(iter.old_spte))
			return 1;

	return 0;
}

int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva)
{
	return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0,
					    test_age_gfn);
}

/*
 * Handle the changed_pte MMU notifier for the TDP MMU.
 * data is a pointer to the new pte_t mapping the HVA specified by the MMU
 * notifier.
 * Returns non-zero if a flush is needed before releasing the MMU lock.
 */
static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
			struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
			unsigned long data)
{
	struct tdp_iter iter;
	pte_t *ptep = (pte_t *)data;
	kvm_pfn_t new_pfn;
	u64 new_spte;
	int need_flush = 0;

	rcu_read_lock();

	WARN_ON(pte_huge(*ptep));

	new_pfn = pte_pfn(*ptep);

	tdp_root_for_each_pte(iter, root, gfn, gfn + 1) {
		if (iter.level != PG_LEVEL_4K)
			continue;

		if (!is_shadow_present_pte(iter.old_spte))
			break;

		tdp_mmu_set_spte(kvm, &iter, 0);

		kvm_flush_remote_tlbs_with_address(kvm, iter.gfn, 1);

		if (!pte_write(*ptep)) {
			new_spte = kvm_mmu_changed_pte_notifier_make_spte(
					iter.old_spte, new_pfn);

			tdp_mmu_set_spte(kvm, &iter, new_spte);
		}

		need_flush = 1;
	}

	if (need_flush)
		kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);

	rcu_read_unlock();

	return 0;
}

int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
			     pte_t *host_ptep)
{
	return kvm_tdp_mmu_handle_hva_range(kvm, address, address + 1,
					    (unsigned long)host_ptep,
					    set_tdp_spte);
}

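/*
 * Example (illustration only): the HVA-based entry points above all share
 * the same shape: a per-root, per-memslot callback dispatched by
 * kvm_tdp_mmu_handle_hva_range(). A hypothetical handler that merely reports
 * whether any GFN in the range is mapped could be wired up as:
 *
 *	static int range_is_mapped(struct kvm *kvm, struct kvm_memory_slot *slot,
 *				   struct kvm_mmu_page *root, gfn_t start,
 *				   gfn_t end, unsigned long unused)
 *	{
 *		struct tdp_iter iter;
 *
 *		tdp_root_for_each_leaf_pte(iter, root, start, end)
 *			return 1;
 *		return 0;
 *	}
 *
 *	...
 *	return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0,
 *					    range_is_mapped);
 */
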
/*
 * Remove write access from all SPTEs at or above min_level that map GFNs
 * [start, end); SPTEs below min_level are left untouched.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			     gfn_t start, gfn_t end, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
				   min_level, start, end) {
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level) ||
		    !(iter.old_spte & PT_WRITABLE_MASK))
			continue;

		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;

		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}

/*
 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
 * only affect leaf SPTEs down to min_level.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
			     int min_level)
{
	struct kvm_mmu_page *root;
	int root_as_id;
	bool spte_set = false;

	for_each_tdp_mmu_root_yield_safe(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
			     slot->base_gfn + slot->npages, min_level);
	}

	return spte_set;
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t start, gfn_t end)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, start, end) {
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
			continue;

		if (spte_ad_need_write_protect(iter.old_spte)) {
			if (is_writable_pte(iter.old_spte))
				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
			else
				continue;
		} else {
			if (iter.old_spte & shadow_dirty_mask)
				new_spte = iter.old_spte & ~shadow_dirty_mask;
			else
				continue;
		}

		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}

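/*
 * Example (illustration only, simplified relative to the real call sites):
 * when dirty logging is enabled for a memslot, the arch code typically
 * write-protects the slot first and later harvests dirty state, pairing the
 * helpers in this file (kvm_tdp_mmu_clear_dirty_slot() is defined just below)
 * with a TLB flush:
 *
 *	if (kvm_tdp_mmu_wrprot_slot(kvm, slot, PG_LEVEL_4K))
 *		kvm_flush_remote_tlbs(kvm);
 *	...
 *	if (kvm_tdp_mmu_clear_dirty_slot(kvm, slot))
 *		kvm_flush_remote_tlbs(kvm);
 */
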
/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;
	int root_as_id;
	bool spte_set = false;

	for_each_tdp_mmu_root_yield_safe(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
				slot->base_gfn + slot->npages);
	}

	return spte_set;
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t gfn, unsigned long mask, bool wrprot)
{
	struct tdp_iter iter;
	u64 new_spte;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
				   gfn + BITS_PER_LONG) {
		if (!mask)
			break;

		if (iter.level > PG_LEVEL_4K ||
		    !(mask & (1UL << (iter.gfn - gfn))))
			continue;

		mask &= ~(1UL << (iter.gfn - gfn));

		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
			if (is_writable_pte(iter.old_spte))
				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
			else
				continue;
		} else {
			if (iter.old_spte & shadow_dirty_mask)
				new_spte = iter.old_spte & ~shadow_dirty_mask;
			else
				continue;
		}

		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
	}

	rcu_read_unlock();
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
				       struct kvm_memory_slot *slot,
				       gfn_t gfn, unsigned long mask,
				       bool wrprot)
{
	struct kvm_mmu_page *root;
	int root_as_id;

	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
	}
}

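/*
 * Example (illustration only): the mask is relative to the base gfn, so bit i
 * covers (gfn + i). With gfn == 0x1000 and mask == 0x5 (bits 0 and 2 set),
 * only the SPTEs mapping GFNs 0x1000 and 0x1002 have their dirty (or
 * writable, if wrprot is true) status cleared:
 *
 *	kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, 0x1000, 0x5, false);
 */
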
/*
 * Clear leaf entries which could be replaced by large mappings, for
 * GFNs within the slot.
 */
static void zap_collapsible_spte_range(struct kvm *kvm,
				       struct kvm_mmu_page *root,
				       struct kvm_memory_slot *slot)
{
	gfn_t start = slot->base_gfn;
	gfn_t end = start + slot->npages;
	struct tdp_iter iter;
	kvm_pfn_t pfn;
	bool spte_set = false;

	rcu_read_lock();

	tdp_root_for_each_pte(iter, root, start, end) {
		if (tdp_mmu_iter_cond_resched(kvm, &iter, spte_set)) {
			spte_set = false;
			continue;
		}

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		pfn = spte_to_pfn(iter.old_spte);
		if (kvm_is_reserved_pfn(pfn) ||
		    iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
							    pfn, PG_LEVEL_NUM))
			continue;

		tdp_mmu_set_spte(kvm, &iter, 0);

		spte_set = true;
	}

	rcu_read_unlock();
	if (spte_set)
		kvm_flush_remote_tlbs(kvm);
}

/*
 * Clear non-leaf entries (and free associated page tables) which could
 * be replaced by large mappings, for GFNs within the slot.
 */
void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
				       struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;
	int root_as_id;

	for_each_tdp_mmu_root_yield_safe(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		zap_collapsible_spte_range(kvm, root, slot);
	}
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t gfn)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) {
		if (!is_writable_pte(iter.old_spte))
			break;

		new_spte = iter.old_spte &
			~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);

		tdp_mmu_set_spte(kvm, &iter, new_spte);
		spte_set = true;
	}

	rcu_read_unlock();

	return spte_set;
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
				   struct kvm_memory_slot *slot, gfn_t gfn)
{
	struct kvm_mmu_page *root;
	int root_as_id;
	bool spte_set = false;

	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		spte_set |= write_protect_gfn(kvm, root, gfn);
	}
	return spte_set;
}

/*
 * Return the level of the lowest level SPTE added to sptes.
 * That SPTE may be non-present.
 */
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
			 int *root_level)
{
	struct tdp_iter iter;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	gfn_t gfn = addr >> PAGE_SHIFT;
	int leaf = -1;

	*root_level = vcpu->arch.mmu->shadow_root_level;

	rcu_read_lock();

	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		leaf = iter.level;
		sptes[leaf] = iter.old_spte;
	}

	rcu_read_unlock();

	return leaf;
}
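
/*
 * Example (illustration only): a caller interested in the final translation
 * walks the returned entries from the leaf level up to *root_level; indices
 * outside that range are not written by the walk. The array size below is an
 * assumption about the caller, not something this function enforces.
 *
 *	u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
 *	int root_level, level, leaf;
 *
 *	leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root_level);
 *	for (level = root_level; leaf > 0 && level >= leaf; level--)
 *		pr_debug("level %d spte 0x%llx\n", level, sptes[level]);
 */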