// SPDX-License-Identifier: GPL-2.0

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>

static bool __read_mostly tdp_mmu_enabled = false;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);

/* Initializes the TDP MMU for the VM, if enabled. */
void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
        if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
                return;

        /* This should not be changed for the lifetime of the VM. */
        kvm->arch.tdp_mmu_enabled = true;

        INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
        spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
        INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
}

void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
        if (!kvm->arch.tdp_mmu_enabled)
                return;

        WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

        /*
         * Ensure that all the outstanding RCU callbacks to free shadow pages
         * can run before the VM is torn down.
         */
        rcu_barrier();
}

static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
        if (kvm_mmu_put_root(kvm, root))
                kvm_tdp_mmu_free_root(kvm, root);
}

static inline bool tdp_mmu_next_root_valid(struct kvm *kvm,
                                           struct kvm_mmu_page *root)
{
        lockdep_assert_held_write(&kvm->mmu_lock);

        if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link))
                return false;

        kvm_mmu_get_root(kvm, root);
        return true;
}

static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
                                                     struct kvm_mmu_page *root)
{
        struct kvm_mmu_page *next_root;

        next_root = list_next_entry(root, link);
        tdp_mmu_put_root(kvm, root);
        return next_root;
}

/*
 * Note: this iterator gets and puts references to the roots it iterates over.
 * This makes it safe to release the MMU lock and yield within the loop, but
 * if exiting the loop early, the caller must drop the reference to the most
 * recent root. (Unless keeping a live reference is desirable.)
 */
#define for_each_tdp_mmu_root_yield_safe(_kvm, _root)                           \
        for (_root = list_first_entry(&_kvm->arch.tdp_mmu_roots,               \
                                      typeof(*_root), link);                   \
             tdp_mmu_next_root_valid(_kvm, _root);                             \
             _root = tdp_mmu_next_root(_kvm, _root))

#define for_each_tdp_mmu_root(_kvm, _root)                              \
        list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
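/*
 * Illustrative usage (editorial sketch, mirroring callers later in this file):
 * the yield-safe variant is what lets a walk over every root drop the MMU lock
 * mid-iteration, e.g.
 *
 *	for_each_tdp_mmu_root_yield_safe(kvm, root)
 *		flush |= zap_gfn_range(kvm, root, start, end, true);
 */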
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
                          gfn_t start, gfn_t end, bool can_yield);

void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
        gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);

        lockdep_assert_held_write(&kvm->mmu_lock);

        WARN_ON(root->root_count);
        WARN_ON(!root->tdp_mmu_page);

        list_del(&root->link);

        zap_gfn_range(kvm, root, 0, max_gfn, false);

        free_page((unsigned long)root->spt);
        kmem_cache_free(mmu_page_header_cache, root);
}

static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
                                                   int level)
{
        union kvm_mmu_page_role role;

        role = vcpu->arch.mmu->mmu_role.base;
        role.level = level;
        role.direct = true;
        role.gpte_is_8_bytes = true;
        role.access = ACC_ALL;

        return role;
}

static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
                                               int level)
{
        struct kvm_mmu_page *sp;

        sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
        sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
        set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

        sp->role.word = page_role_for_level(vcpu, level).word;
        sp->gfn = gfn;
        sp->tdp_mmu_page = true;

        trace_kvm_mmu_get_page(sp, true);

        return sp;
}

static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
{
        union kvm_mmu_page_role role;
        struct kvm *kvm = vcpu->kvm;
        struct kvm_mmu_page *root;

        role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);

        write_lock(&kvm->mmu_lock);

        /* Check for an existing root before allocating a new one. */
        for_each_tdp_mmu_root(kvm, root) {
                if (root->role.word == role.word) {
                        kvm_mmu_get_root(kvm, root);
                        write_unlock(&kvm->mmu_lock);
                        return root;
                }
        }

        root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
        root->root_count = 1;

        list_add(&root->link, &kvm->arch.tdp_mmu_roots);

        write_unlock(&kvm->mmu_lock);

        return root;
}

hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
        struct kvm_mmu_page *root;

        root = get_tdp_mmu_vcpu_root(vcpu);
        if (!root)
                return INVALID_PAGE;

        return __pa(root->spt);
}
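/*
 * Editorial sketch (caller assumed, not part of this file): the physical
 * address returned above is what the common MMU code loads as the root of the
 * hardware paging structure, roughly:
 *
 *	vcpu->arch.mmu->root_hpa = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
 */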
static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
        free_page((unsigned long)sp->spt);
        kmem_cache_free(mmu_page_header_cache, sp);
}

/*
 * This is called through call_rcu in order to free TDP page table memory
 * safely with respect to other kernel threads that may be operating on
 * the memory.
 * By only accessing TDP MMU page table memory in an RCU read critical
 * section, and freeing it after a grace period, lockless access to that
 * memory won't use it after it is freed.
 */
static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
{
        struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
                                               rcu_head);

        tdp_mmu_free_sp(sp);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
                                u64 old_spte, u64 new_spte, int level,
                                bool shared);

static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
{
        return sp->role.smm ? 1 : 0;
}

static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
{
        bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

        if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
                return;

        if (is_accessed_spte(old_spte) &&
            (!is_accessed_spte(new_spte) || pfn_changed))
                kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}

static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
                                          u64 old_spte, u64 new_spte, int level)
{
        bool pfn_changed;
        struct kvm_memory_slot *slot;

        if (level > PG_LEVEL_4K)
                return;

        pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

        if ((!is_writable_pte(old_spte) || pfn_changed) &&
            is_writable_pte(new_spte)) {
                slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
                mark_page_dirty_in_slot(kvm, slot, gfn);
        }
}

/**
 * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
 *
 * @kvm: kvm instance
 * @sp: the new page
 * @shared: This operation may not be running under the exclusive use of
 *          the MMU lock and the operation must synchronize with other
 *          threads that might be adding or removing pages.
 * @account_nx: This page replaces a NX large page and should be marked for
 *              eventual reclaim.
 */
static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
                              bool shared, bool account_nx)
{
        if (shared)
                spin_lock(&kvm->arch.tdp_mmu_pages_lock);
        else
                lockdep_assert_held_write(&kvm->mmu_lock);

        list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
        if (account_nx)
                account_huge_nx_page(kvm, sp);

        if (shared)
                spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

/**
 * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
 *
 * @kvm: kvm instance
 * @sp: the page to be removed
 * @shared: This operation may not be running under the exclusive use of
 *          the MMU lock and the operation must synchronize with other
 *          threads that might be adding or removing pages.
 */
static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
                                bool shared)
{
        if (shared)
                spin_lock(&kvm->arch.tdp_mmu_pages_lock);
        else
                lockdep_assert_held_write(&kvm->mmu_lock);

        list_del(&sp->link);
        if (sp->lpage_disallowed)
                unaccount_huge_nx_page(kvm, sp);

        if (shared)
                spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}
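/*
 * Editorial note: both helpers above follow the same locking pattern -- take
 * tdp_mmu_pages_lock when running with the MMU lock held only for read
 * (shared == true), otherwise just assert that the MMU lock is held for
 * write. A page-fault handler mapping under the read lock therefore links
 * new pages with shared set, e.g. (a sketch mirroring kvm_tdp_mmu_map()
 * below, with account_nx standing in for the NX-reclaim condition):
 *
 *	tdp_mmu_link_page(vcpu->kvm, sp, true, account_nx);
 */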
/**
 * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
 *
 * @kvm: kvm instance
 * @pt: the page removed from the paging structure
 * @shared: This operation may not be running under the exclusive use
 *          of the MMU lock and the operation must synchronize with other
 *          threads that might be modifying SPTEs.
 *
 * Given a page table that has been removed from the TDP paging structure,
 * iterates through the page table to clear SPTEs and free child page tables.
 */
static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt,
                                        bool shared)
{
        struct kvm_mmu_page *sp = sptep_to_sp(pt);
        int level = sp->role.level;
        gfn_t base_gfn = sp->gfn;
        u64 old_child_spte;
        u64 *sptep;
        gfn_t gfn;
        int i;

        trace_kvm_mmu_prepare_zap_page(sp);

        tdp_mmu_unlink_page(kvm, sp, shared);

        for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
                sptep = pt + i;
                gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1));

                if (shared) {
                        /*
                         * Set the SPTE to a nonpresent value that other
                         * threads will not overwrite. If the SPTE was
                         * already marked as removed then another thread
                         * handling a page fault could overwrite it, so
                         * keep retrying until this thread is the one that
                         * transitions the SPTE from some other value to
                         * the removed SPTE value.
                         */
                        for (;;) {
                                old_child_spte = xchg(sptep, REMOVED_SPTE);
                                if (!is_removed_spte(old_child_spte))
                                        break;
                                cpu_relax();
                        }
                } else {
                        old_child_spte = READ_ONCE(*sptep);

                        /*
                         * Marking the SPTE as a removed SPTE is not
                         * strictly necessary here as the MMU lock will
                         * stop other threads from concurrently modifying
                         * this SPTE. Using the removed SPTE value keeps
                         * the two branches consistent and simplifies
                         * the function.
                         */
                        WRITE_ONCE(*sptep, REMOVED_SPTE);
                }
                handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
                                    old_child_spte, REMOVED_SPTE, level - 1,
                                    shared);
        }

        kvm_flush_remote_tlbs_with_address(kvm, gfn,
                                           KVM_PAGES_PER_HPAGE(level));

        call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/**
 * __handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 * @shared: This operation may not be running under the exclusive use of
 *          the MMU lock and the operation must synchronize with other
 *          threads that might be modifying SPTEs.
 *
 * Handle bookkeeping that might result from the modification of a SPTE.
 * This function must be called for all TDP SPTE modifications.
 */
static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
                                  u64 old_spte, u64 new_spte, int level,
                                  bool shared)
{
        bool was_present = is_shadow_present_pte(old_spte);
        bool is_present = is_shadow_present_pte(new_spte);
        bool was_leaf = was_present && is_last_spte(old_spte, level);
        bool is_leaf = is_present && is_last_spte(new_spte, level);
        bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

        WARN_ON(level > PT64_ROOT_MAX_LEVEL);
        WARN_ON(level < PG_LEVEL_4K);
        WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));

        /*
         * If this warning were to trigger it would indicate that there was a
         * missing MMU notifier or a race with some notifier handler.
         * A present, leaf SPTE should never be directly replaced with another
         * present leaf SPTE pointing to a different PFN. A notifier handler
         * should be zapping the SPTE before the main MM's page table is
         * changed, or the SPTE should be zeroed, and the TLBs flushed by the
         * thread before replacement.
         */
        if (was_leaf && is_leaf && pfn_changed) {
                pr_err("Invalid SPTE change: cannot replace a present leaf\n"
                       "SPTE with another present leaf SPTE mapping a\n"
                       "different PFN!\n"
                       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
                       as_id, gfn, old_spte, new_spte, level);

                /*
                 * Crash the host to prevent error propagation and guest data
                 * corruption.
                 */
                BUG();
        }

        if (old_spte == new_spte)
                return;

        trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);

        /*
         * The only times a SPTE should be changed from a non-present to
         * non-present state is when an MMIO entry is installed/modified/
         * removed. In that case, there is nothing to do here.
         */
        if (!was_present && !is_present) {
                /*
                 * If this change does not involve a MMIO SPTE or removed SPTE,
                 * it is unexpected. Log the change, though it should not
                 * impact the guest since both the former and current SPTEs
                 * are nonpresent.
                 */
                if (WARN_ON(!is_mmio_spte(old_spte) &&
                            !is_mmio_spte(new_spte) &&
                            !is_removed_spte(new_spte)))
                        pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
                               "should not be replaced with another,\n"
                               "different nonpresent SPTE, unless one or both\n"
                               "are MMIO SPTEs, or the new SPTE is\n"
                               "a temporary removed SPTE.\n"
                               "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
                               as_id, gfn, old_spte, new_spte, level);
                return;
        }

        if (was_leaf && is_dirty_spte(old_spte) &&
            (!is_dirty_spte(new_spte) || pfn_changed))
                kvm_set_pfn_dirty(spte_to_pfn(old_spte));

        /*
         * Recursively handle child PTs if the change removed a subtree from
         * the paging structure.
         */
        if (was_present && !was_leaf && (pfn_changed || !is_present))
                handle_removed_tdp_mmu_page(kvm,
                                spte_to_child_pt(old_spte, level), shared);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
                                u64 old_spte, u64 new_spte, int level,
                                bool shared)
{
        __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
                              shared);
        handle_changed_spte_acc_track(old_spte, new_spte, level);
        handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
                                      new_spte, level);
}
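/*
 * Editorial note: handle_changed_spte() is the full-bookkeeping path. The
 * __tdp_mmu_set_spte() variants further down call the three helpers
 * individually so that access-tracking or dirty-logging updates can be
 * suppressed when the caller is the MMU notifier or dirty-logging code that
 * is already accounting for them.
 */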
/*
 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the
 * associated bookkeeping
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * Returns: true if the SPTE was set, false if it was not. If false is returned,
 *          this function will have no side-effects.
 */
static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
                                           struct tdp_iter *iter,
                                           u64 new_spte)
{
        u64 *root_pt = tdp_iter_root_pt(iter);
        struct kvm_mmu_page *root = sptep_to_sp(root_pt);
        int as_id = kvm_mmu_page_as_id(root);

        lockdep_assert_held_read(&kvm->mmu_lock);

        /*
         * Do not change removed SPTEs. Only the thread that froze the SPTE
         * may modify it.
         */
        if (iter->old_spte == REMOVED_SPTE)
                return false;

        if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
                      new_spte) != iter->old_spte)
                return false;

        handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
                            iter->level, true);

        return true;
}

static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
                                           struct tdp_iter *iter)
{
        /*
         * Freeze the SPTE by setting it to a special,
         * non-present value. This will stop other threads from
         * immediately installing a present entry in its place
         * before the TLBs are flushed.
         */
        if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
                return false;

        kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
                                           KVM_PAGES_PER_HPAGE(iter->level));

        /*
         * No other thread can overwrite the removed SPTE as they
         * must either wait on the MMU lock or use
         * tdp_mmu_set_spte_atomic which will not overwrite the
         * special removed SPTE value. No bookkeeping is needed
         * here since the SPTE is going from non-present
         * to non-present.
         */
        WRITE_ONCE(*rcu_dereference(iter->sptep), 0);

        return true;
}
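/*
 * Illustrative usage (editorial, mirroring kvm_tdp_mmu_map() below): callers
 * running under the MMU lock in read mode treat a false return as "lost the
 * race with another thread" and either retry the fault or abandon the walk:
 *
 *	if (!tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, new_spte)) {
 *		tdp_mmu_free_sp(sp);
 *		break;
 *	}
 */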
/*
 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * @record_acc_track: Notify the MM subsystem of changes to the accessed state
 *                    of the page. Should be set unless handling an MMU
 *                    notifier for access tracking. Leaving record_acc_track
 *                    unset in that case prevents page accesses from being
 *                    double counted.
 * @record_dirty_log: Record the page as dirty in the dirty bitmap if
 *                    appropriate for the change being made. Should be set
 *                    unless performing certain dirty logging operations.
 *                    Leaving record_dirty_log unset in that case prevents page
 *                    writes from being double counted.
 */
static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
                                      u64 new_spte, bool record_acc_track,
                                      bool record_dirty_log)
{
        tdp_ptep_t root_pt = tdp_iter_root_pt(iter);
        struct kvm_mmu_page *root = sptep_to_sp(root_pt);
        int as_id = kvm_mmu_page_as_id(root);

        lockdep_assert_held_write(&kvm->mmu_lock);

        /*
         * No thread should be using this function to set SPTEs to the
         * temporary removed SPTE value.
         * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
         * should be used. If operating under the MMU lock in write mode, the
         * use of the removed SPTE should not be necessary.
         */
        WARN_ON(iter->old_spte == REMOVED_SPTE);

        WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);

        __handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
                              iter->level, false);
        if (record_acc_track)
                handle_changed_spte_acc_track(iter->old_spte, new_spte,
                                              iter->level);
        if (record_dirty_log)
                handle_changed_spte_dirty_log(kvm, as_id, iter->gfn,
                                              iter->old_spte, new_spte,
                                              iter->level);
}

static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
                                    u64 new_spte)
{
        __tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
}

static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
                                                 struct tdp_iter *iter,
                                                 u64 new_spte)
{
        __tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
}

static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
                                                 struct tdp_iter *iter,
                                                 u64 new_spte)
{
        __tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
}

#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
        for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)

#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)         \
        tdp_root_for_each_pte(_iter, _root, _start, _end)              \
                if (!is_shadow_present_pte(_iter.old_spte) ||          \
                    !is_last_spte(_iter.old_spte, _iter.level))        \
                        continue;                                      \
                else

#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)         \
        for_each_tdp_pte(_iter, __va(_mmu->root_hpa),           \
                         _mmu->shadow_root_level, _start, _end)

/*
 * Yield if the MMU lock is contended or this thread needs to return control
 * to the scheduler.
 *
 * If this function should yield and flush is set, it will perform a remote
 * TLB flush before yielding.
 *
 * If this function yields, it will also reset the tdp_iter's walk over the
 * paging structure and the calling function should skip to the next
 * iteration to allow the iterator to continue its traversal from the
 * paging structure root.
 *
 * Return true if this function yielded and the iterator's traversal was reset.
 * Return false if a yield was not needed.
 */
static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
                                             struct tdp_iter *iter, bool flush)
{
        /* Ensure forward progress has been made before yielding. */
        if (iter->next_last_level_gfn == iter->yielded_gfn)
                return false;

        if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
                rcu_read_unlock();

                if (flush)
                        kvm_flush_remote_tlbs(kvm);

                cond_resched_rwlock_write(&kvm->mmu_lock);
                rcu_read_lock();

                WARN_ON(iter->gfn > iter->next_last_level_gfn);

                tdp_iter_start(iter, iter->pt_path[iter->root_level - 1],
                               iter->root_level, iter->min_level,
                               iter->next_last_level_gfn);

                return true;
        }

        return false;
}
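/*
 * Illustrative usage (editorial, mirroring zap_gfn_range() below): long walks
 * call this at the top of every loop iteration and restart cleanly when it
 * yields:
 *
 *	tdp_root_for_each_pte(iter, root, start, end) {
 *		if (tdp_mmu_iter_cond_resched(kvm, &iter, flush_needed)) {
 *			flush_needed = false;
 *			continue;
 *		}
 *		...
 *	}
 */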
/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock. If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
 * operation can cause a soft lockup.
 */
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
                          gfn_t start, gfn_t end, bool can_yield)
{
        struct tdp_iter iter;
        bool flush_needed = false;

        rcu_read_lock();

        tdp_root_for_each_pte(iter, root, start, end) {
                if (can_yield &&
                    tdp_mmu_iter_cond_resched(kvm, &iter, flush_needed)) {
                        flush_needed = false;
                        continue;
                }

                if (!is_shadow_present_pte(iter.old_spte))
                        continue;

                /*
                 * If this is a non-last-level SPTE that covers a larger range
                 * than should be zapped, continue, and zap the mappings at a
                 * lower level.
                 */
                if ((iter.gfn < start ||
                     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
                    !is_last_spte(iter.old_spte, iter.level))
                        continue;

                tdp_mmu_set_spte(kvm, &iter, 0);
                flush_needed = true;
        }

        rcu_read_unlock();
        return flush_needed;
}

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 */
bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end)
{
        struct kvm_mmu_page *root;
        bool flush = false;

        for_each_tdp_mmu_root_yield_safe(kvm, root)
                flush |= zap_gfn_range(kvm, root, start, end, true);

        return flush;
}

void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
        gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
        bool flush;

        flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn);
        if (flush)
                kvm_flush_remote_tlbs(kvm);
}

/*
 * Installs a last-level SPTE to handle a TDP page fault.
 * (NPT/EPT violation/misconfiguration)
 */
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
                                           int map_writable,
                                           struct tdp_iter *iter,
                                           kvm_pfn_t pfn, bool prefault)
{
        u64 new_spte;
        int ret = 0;
        int make_spte_ret = 0;

        if (unlikely(is_noslot_pfn(pfn)))
                new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
        else
                make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
                                          pfn, iter->old_spte, prefault, true,
                                          map_writable, !shadow_accessed_mask,
                                          &new_spte);

        if (new_spte == iter->old_spte)
                ret = RET_PF_SPURIOUS;
        else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
                return RET_PF_RETRY;

        /*
         * If the page fault was caused by a write but the page is write
         * protected, emulation is needed. If the emulation was skipped,
         * the vCPU would have the same fault again.
         */
        if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
                if (write)
                        ret = RET_PF_EMULATE;
                kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
        }

        /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
        if (unlikely(is_mmio_spte(new_spte))) {
                trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
                                     new_spte);
                ret = RET_PF_EMULATE;
        } else
                trace_kvm_mmu_set_spte(iter->level, iter->gfn,
                                       rcu_dereference(iter->sptep));

        if (!prefault)
                vcpu->stat.pf_fixed++;

        return ret;
}
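/*
 * Editorial note: RET_PF_RETRY above means the cmpxchg in
 * tdp_mmu_set_spte_atomic() lost a race with another thread updating the same
 * SPTE under the read lock; the vCPU re-enters the guest and, if the
 * translation is still missing, simply takes the fault again.
 */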
/*
 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
 * page tables and SPTEs to translate the faulting guest physical address.
 */
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
                    int map_writable, int max_level, kvm_pfn_t pfn,
                    bool prefault)
{
        bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
        bool write = error_code & PFERR_WRITE_MASK;
        bool exec = error_code & PFERR_FETCH_MASK;
        bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
        struct kvm_mmu *mmu = vcpu->arch.mmu;
        struct tdp_iter iter;
        struct kvm_mmu_page *sp;
        u64 *child_pt;
        u64 new_spte;
        int ret;
        gfn_t gfn = gpa >> PAGE_SHIFT;
        int level;
        int req_level;

        if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
                return RET_PF_RETRY;
        if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
                return RET_PF_RETRY;

        level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
                                        huge_page_disallowed, &req_level);

        trace_kvm_mmu_spte_requested(gpa, level, pfn);

        rcu_read_lock();

        tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
                if (nx_huge_page_workaround_enabled)
                        disallowed_hugepage_adjust(iter.old_spte, gfn,
                                                   iter.level, &pfn, &level);

                if (iter.level == level)
                        break;

                /*
                 * If there is an SPTE mapping a large page at a higher level
                 * than the target, that SPTE must be cleared and replaced
                 * with a non-leaf SPTE.
                 */
                if (is_shadow_present_pte(iter.old_spte) &&
                    is_large_pte(iter.old_spte)) {
                        if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
                                break;

                        /*
                         * The iter must explicitly re-read the spte here
                         * because the new value informs the !present
                         * path below.
                         */
                        iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
                }

                if (!is_shadow_present_pte(iter.old_spte)) {
                        sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
                        child_pt = sp->spt;

                        new_spte = make_nonleaf_spte(child_pt,
                                                     !shadow_accessed_mask);

                        if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
                                                    new_spte)) {
                                tdp_mmu_link_page(vcpu->kvm, sp, true,
                                                  huge_page_disallowed &&
                                                  req_level >= iter.level);

                                trace_kvm_mmu_get_page(sp, true);
                        } else {
                                tdp_mmu_free_sp(sp);
                                break;
                        }
                }
        }

        if (iter.level != level) {
                rcu_read_unlock();
                return RET_PF_RETRY;
        }

        ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
                                              pfn, prefault);
        rcu_read_unlock();

        return ret;
}

static __always_inline int
kvm_tdp_mmu_handle_hva_range(struct kvm *kvm,
                             unsigned long start,
                             unsigned long end,
                             unsigned long data,
                             int (*handler)(struct kvm *kvm,
                                            struct kvm_memory_slot *slot,
                                            struct kvm_mmu_page *root,
                                            gfn_t start,
                                            gfn_t end,
                                            unsigned long data))
{
        struct kvm_memslots *slots;
        struct kvm_memory_slot *memslot;
        struct kvm_mmu_page *root;
        int ret = 0;
        int as_id;

        for_each_tdp_mmu_root_yield_safe(kvm, root) {
                as_id = kvm_mmu_page_as_id(root);
                slots = __kvm_memslots(kvm, as_id);
                kvm_for_each_memslot(memslot, slots) {
                        unsigned long hva_start, hva_end;
                        gfn_t gfn_start, gfn_end;

                        hva_start = max(start, memslot->userspace_addr);
                        hva_end = min(end, memslot->userspace_addr +
                                      (memslot->npages << PAGE_SHIFT));
                        if (hva_start >= hva_end)
                                continue;
                        /*
                         * {gfn(page) | page intersects with [hva_start, hva_end)} =
                         * {gfn_start, gfn_start+1, ..., gfn_end-1}.
                         */
                        gfn_start = hva_to_gfn_memslot(hva_start, memslot);
                        gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
                        ret |= handler(kvm, memslot, root, gfn_start,
                                       gfn_end, data);
                }
        }

        return ret;
}

static int zap_gfn_range_hva_wrapper(struct kvm *kvm,
                                     struct kvm_memory_slot *slot,
                                     struct kvm_mmu_page *root, gfn_t start,
                                     gfn_t end, unsigned long unused)
{
        return zap_gfn_range(kvm, root, start, end, false);
}

int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
                              unsigned long end)
{
        return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
                                            zap_gfn_range_hva_wrapper);
}

/*
 * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
 * if any of the GFNs in the range have been accessed.
 */
static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
                         struct kvm_mmu_page *root, gfn_t start, gfn_t end,
                         unsigned long unused)
{
        struct tdp_iter iter;
        int young = 0;
        u64 new_spte = 0;

        rcu_read_lock();

        tdp_root_for_each_leaf_pte(iter, root, start, end) {
                /*
                 * If we have a non-accessed entry we don't need to change the
                 * pte.
                 */
                if (!is_accessed_spte(iter.old_spte))
                        continue;

                new_spte = iter.old_spte;

                if (spte_ad_enabled(new_spte)) {
                        clear_bit((ffs(shadow_accessed_mask) - 1),
                                  (unsigned long *)&new_spte);
                } else {
                        /*
                         * Capture the dirty status of the page, so that it
                         * doesn't get lost when the SPTE is marked for access
                         * tracking.
                         */
                        if (is_writable_pte(new_spte))
                                kvm_set_pfn_dirty(spte_to_pfn(new_spte));

                        new_spte = mark_spte_for_access_track(new_spte);
                }
                new_spte &= ~shadow_dirty_mask;

                tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte);
                young = 1;

                trace_kvm_age_page(iter.gfn, iter.level, slot, young);
        }

        rcu_read_unlock();

        return young;
}

int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
                              unsigned long end)
{
        return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
                                            age_gfn_range);
}

static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
                        struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
                        unsigned long unused2)
{
        struct tdp_iter iter;

        tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1)
                if (is_accessed_spte(iter.old_spte))
                        return 1;

        return 0;
}

int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva)
{
        return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0,
                                            test_age_gfn);
}
/*
 * Handle the changed_pte MMU notifier for the TDP MMU.
 * data is a pointer to the new pte_t mapping the HVA specified by the MMU
 * notifier.
 * Returns non-zero if a flush is needed before releasing the MMU lock.
 */
static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
                        struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
                        unsigned long data)
{
        struct tdp_iter iter;
        pte_t *ptep = (pte_t *)data;
        kvm_pfn_t new_pfn;
        u64 new_spte;
        int need_flush = 0;

        rcu_read_lock();

        WARN_ON(pte_huge(*ptep));

        new_pfn = pte_pfn(*ptep);

        tdp_root_for_each_pte(iter, root, gfn, gfn + 1) {
                if (iter.level != PG_LEVEL_4K)
                        continue;

                if (!is_shadow_present_pte(iter.old_spte))
                        break;

                tdp_mmu_set_spte(kvm, &iter, 0);

                kvm_flush_remote_tlbs_with_address(kvm, iter.gfn, 1);

                if (!pte_write(*ptep)) {
                        new_spte = kvm_mmu_changed_pte_notifier_make_spte(
                                        iter.old_spte, new_pfn);

                        tdp_mmu_set_spte(kvm, &iter, new_spte);
                }

                need_flush = 1;
        }

        if (need_flush)
                kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);

        rcu_read_unlock();

        return 0;
}

int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
                             pte_t *host_ptep)
{
        return kvm_tdp_mmu_handle_hva_range(kvm, address, address + 1,
                                            (unsigned long)host_ptep,
                                            set_tdp_spte);
}

/*
 * Remove write access from all the SPTEs mapping GFNs [start, end). Will
 * only affect leaf SPTEs down to min_level.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
                             gfn_t start, gfn_t end, int min_level)
{
        struct tdp_iter iter;
        u64 new_spte;
        bool spte_set = false;

        rcu_read_lock();

        BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

        for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
                                   min_level, start, end) {
                if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
                        continue;

                if (!is_shadow_present_pte(iter.old_spte) ||
                    !is_last_spte(iter.old_spte, iter.level) ||
                    !(iter.old_spte & PT_WRITABLE_MASK))
                        continue;

                new_spte = iter.old_spte & ~PT_WRITABLE_MASK;

                tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
                spte_set = true;
        }

        rcu_read_unlock();
        return spte_set;
}

/*
 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
 * only affect leaf SPTEs down to min_level.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
                             int min_level)
{
        struct kvm_mmu_page *root;
        int root_as_id;
        bool spte_set = false;

        for_each_tdp_mmu_root_yield_safe(kvm, root) {
                root_as_id = kvm_mmu_page_as_id(root);
                if (root_as_id != slot->as_id)
                        continue;

                spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
                             slot->base_gfn + slot->npages, min_level);
        }

        return spte_set;
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
                                  gfn_t start, gfn_t end)
{
        struct tdp_iter iter;
        u64 new_spte;
        bool spte_set = false;

        rcu_read_lock();

        tdp_root_for_each_leaf_pte(iter, root, start, end) {
                if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
                        continue;

                if (spte_ad_need_write_protect(iter.old_spte)) {
                        if (is_writable_pte(iter.old_spte))
                                new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
                        else
                                continue;
                } else {
                        if (iter.old_spte & shadow_dirty_mask)
                                new_spte = iter.old_spte & ~shadow_dirty_mask;
                        else
                                continue;
                }

                tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
                spte_set = true;
        }

        rcu_read_unlock();
        return spte_set;
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
        struct kvm_mmu_page *root;
        int root_as_id;
        bool spte_set = false;

        for_each_tdp_mmu_root_yield_safe(kvm, root) {
                root_as_id = kvm_mmu_page_as_id(root);
                if (root_as_id != slot->as_id)
                        continue;

                spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
                                  slot->base_gfn + slot->npages);
        }

        return spte_set;
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
                                  gfn_t gfn, unsigned long mask, bool wrprot)
{
        struct tdp_iter iter;
        u64 new_spte;

        rcu_read_lock();

        tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
                                   gfn + BITS_PER_LONG) {
                if (!mask)
                        break;

                if (iter.level > PG_LEVEL_4K ||
                    !(mask & (1UL << (iter.gfn - gfn))))
                        continue;

                mask &= ~(1UL << (iter.gfn - gfn));

                if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
                        if (is_writable_pte(iter.old_spte))
                                new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
                        else
                                continue;
                } else {
                        if (iter.old_spte & shadow_dirty_mask)
                                new_spte = iter.old_spte & ~shadow_dirty_mask;
                        else
                                continue;
                }

                tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
        }

        rcu_read_unlock();
}
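/*
 * Worked example (editorial): with gfn == 0x1000 and mask == 0x5, the walk
 * above starts at gfn + __ffs(mask) == 0x1000 and only touches the SPTEs for
 * gfns 0x1000 and 0x1002; each visited bit is cleared from mask, so the loop
 * terminates as soon as mask reaches zero.
 */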
/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
                                       struct kvm_memory_slot *slot,
                                       gfn_t gfn, unsigned long mask,
                                       bool wrprot)
{
        struct kvm_mmu_page *root;
        int root_as_id;

        lockdep_assert_held_write(&kvm->mmu_lock);
        for_each_tdp_mmu_root(kvm, root) {
                root_as_id = kvm_mmu_page_as_id(root);
                if (root_as_id != slot->as_id)
                        continue;

                clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
        }
}

/*
 * Clear leaf entries which could be replaced by large mappings, for
 * GFNs within the slot.
 */
static void zap_collapsible_spte_range(struct kvm *kvm,
                                       struct kvm_mmu_page *root,
                                       struct kvm_memory_slot *slot)
{
        gfn_t start = slot->base_gfn;
        gfn_t end = start + slot->npages;
        struct tdp_iter iter;
        kvm_pfn_t pfn;
        bool spte_set = false;

        rcu_read_lock();

        tdp_root_for_each_pte(iter, root, start, end) {
                if (tdp_mmu_iter_cond_resched(kvm, &iter, spte_set)) {
                        spte_set = false;
                        continue;
                }

                if (!is_shadow_present_pte(iter.old_spte) ||
                    !is_last_spte(iter.old_spte, iter.level))
                        continue;

                pfn = spte_to_pfn(iter.old_spte);
                if (kvm_is_reserved_pfn(pfn) ||
                    iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
                                                            pfn, PG_LEVEL_NUM))
                        continue;

                tdp_mmu_set_spte(kvm, &iter, 0);

                spte_set = true;
        }

        rcu_read_unlock();
        if (spte_set)
                kvm_flush_remote_tlbs(kvm);
}

/*
 * Clear non-leaf entries (and free associated page tables) which could
 * be replaced by large mappings, for GFNs within the slot.
 */
void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
                                       struct kvm_memory_slot *slot)
{
        struct kvm_mmu_page *root;
        int root_as_id;

        for_each_tdp_mmu_root_yield_safe(kvm, root) {
                root_as_id = kvm_mmu_page_as_id(root);
                if (root_as_id != slot->as_id)
                        continue;

                zap_collapsible_spte_range(kvm, root, slot);
        }
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
                              gfn_t gfn)
{
        struct tdp_iter iter;
        u64 new_spte;
        bool spte_set = false;

        rcu_read_lock();

        tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) {
                if (!is_writable_pte(iter.old_spte))
                        break;

                new_spte = iter.old_spte &
                        ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);

                tdp_mmu_set_spte(kvm, &iter, new_spte);
                spte_set = true;
        }

        rcu_read_unlock();

        return spte_set;
}
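/*
 * Editorial sketch (caller assumed, not part of this file): the exported
 * wrapper below is what the common MMU uses when a single gfn must be
 * write-protected, e.g. for shadow-page synchronization:
 *
 *	if (kvm->arch.tdp_mmu_enabled)
 *		write_protected |= kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn);
 */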
/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
                                   struct kvm_memory_slot *slot, gfn_t gfn)
{
        struct kvm_mmu_page *root;
        int root_as_id;
        bool spte_set = false;

        lockdep_assert_held_write(&kvm->mmu_lock);
        for_each_tdp_mmu_root(kvm, root) {
                root_as_id = kvm_mmu_page_as_id(root);
                if (root_as_id != slot->as_id)
                        continue;

                spte_set |= write_protect_gfn(kvm, root, gfn);
        }
        return spte_set;
}

/*
 * Return the level of the lowest level SPTE added to sptes.
 * That SPTE may be non-present.
 */
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
                         int *root_level)
{
        struct tdp_iter iter;
        struct kvm_mmu *mmu = vcpu->arch.mmu;
        gfn_t gfn = addr >> PAGE_SHIFT;
        int leaf = -1;

        *root_level = vcpu->arch.mmu->shadow_root_level;

        rcu_read_lock();

        tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
                leaf = iter.level;
                sptes[leaf] = iter.old_spte;
        }

        rcu_read_unlock();

        return leaf;
}