// SPDX-License-Identifier: GPL-2.0

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#include <trace/events/kvm.h>

#ifdef CONFIG_X86_64
static bool __read_mostly tdp_mmu_enabled = false;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
#endif

static bool is_tdp_mmu_enabled(void)
{
#ifdef CONFIG_X86_64
	return tdp_enabled && READ_ONCE(tdp_mmu_enabled);
#else
	return false;
#endif /* CONFIG_X86_64 */
}

/* Initializes the TDP MMU for the VM, if enabled. */
void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
	if (!is_tdp_mmu_enabled())
		return;

	/* This should not be changed for the lifetime of the VM. */
	kvm->arch.tdp_mmu_enabled = true;

	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
}

void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
	if (!kvm->arch.tdp_mmu_enabled)
		return;

	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
}

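/*
 * Drop a reference to the given root. Dropping the last reference removes the
 * root from the VM's root list, tears down its paging structure and frees it.
 */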
static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
	if (kvm_mmu_put_root(kvm, root))
		kvm_tdp_mmu_free_root(kvm, root);
}

static inline bool tdp_mmu_next_root_valid(struct kvm *kvm,
					   struct kvm_mmu_page *root)
{
	lockdep_assert_held(&kvm->mmu_lock);

	if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link))
		return false;

	kvm_mmu_get_root(kvm, root);
	return true;
}

static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
						     struct kvm_mmu_page *root)
{
	struct kvm_mmu_page *next_root;

	next_root = list_next_entry(root, link);
	tdp_mmu_put_root(kvm, root);
	return next_root;
}

/*
 * Note: this iterator gets and puts references to the roots it iterates over.
 * This makes it safe to release the MMU lock and yield within the loop, but
 * if exiting the loop early, the caller must drop the reference to the most
 * recent root. (Unless keeping a live reference is desirable.)
 */
#define for_each_tdp_mmu_root_yield_safe(_kvm, _root)				\
	for (_root = list_first_entry(&_kvm->arch.tdp_mmu_roots,		\
				      typeof(*_root), link);			\
	     tdp_mmu_next_root_valid(_kvm, _root);				\
	     _root = tdp_mmu_next_root(_kvm, _root))

#define for_each_tdp_mmu_root(_kvm, _root)				\
	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)

bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa)
{
	struct kvm_mmu_page *sp;

	if (!kvm->arch.tdp_mmu_enabled)
		return false;
	if (WARN_ON(!VALID_PAGE(hpa)))
		return false;

	sp = to_shadow_page(hpa);
	if (WARN_ON(!sp))
		return false;

	return sp->tdp_mmu_page && sp->root_count;
}

static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end, bool can_yield);

void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);

	lockdep_assert_held(&kvm->mmu_lock);

	WARN_ON(root->root_count);
	WARN_ON(!root->tdp_mmu_page);

	list_del(&root->link);

	zap_gfn_range(kvm, root, 0, max_gfn, false);

	free_page((unsigned long)root->spt);
	kmem_cache_free(mmu_page_header_cache, root);
}

static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
						   int level)
{
	union kvm_mmu_page_role role;

	role = vcpu->arch.mmu->mmu_role.base;
	role.level = level;
	role.direct = true;
	role.gpte_is_8_bytes = true;
	role.access = ACC_ALL;

	return role;
}

static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
					       int level)
{
	struct kvm_mmu_page *sp;

	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

	sp->role.word = page_role_for_level(vcpu, level).word;
	sp->gfn = gfn;
	sp->tdp_mmu_page = true;

	trace_kvm_mmu_get_page(sp, true);

	return sp;
}

static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
{
	union kvm_mmu_page_role role;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_page *root;

	role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);

	spin_lock(&kvm->mmu_lock);

	/* Check for an existing root before allocating a new one. */
	for_each_tdp_mmu_root(kvm, root) {
		if (root->role.word == role.word) {
			kvm_mmu_get_root(kvm, root);
			spin_unlock(&kvm->mmu_lock);
			return root;
		}
	}

	root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
	root->root_count = 1;

	list_add(&root->link, &kvm->arch.tdp_mmu_roots);

	spin_unlock(&kvm->mmu_lock);

	return root;
}

hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_page *root;

	root = get_tdp_mmu_vcpu_root(vcpu);
	if (!root)
		return INVALID_PAGE;

	return __pa(root->spt);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level);

static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
{
	return sp->role.smm ? 1 : 0;
}

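/*
 * Propagate the accessed state of a changed SPTE to the primary MM: if the
 * old SPTE was a leaf marked accessed and the new one is not (or maps a
 * different PFN), mark the backing page as accessed.
 */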
static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
{
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
		return;

	if (is_accessed_spte(old_spte) &&
	    (!is_accessed_spte(new_spte) || pfn_changed))
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}

static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
					  u64 old_spte, u64 new_spte, int level)
{
	bool pfn_changed;
	struct kvm_memory_slot *slot;

	if (level > PG_LEVEL_4K)
		return;

	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	if ((!is_writable_pte(old_spte) || pfn_changed) &&
	    is_writable_pte(new_spte)) {
		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
		mark_page_dirty_in_slot(kvm, slot, gfn);
	}
}

/**
 * __handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 *
 * Handle bookkeeping that might result from the modification of a SPTE.
 * This function must be called for all TDP SPTE modifications.
 */
static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				  u64 old_spte, u64 new_spte, int level)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);
	bool was_leaf = was_present && is_last_spte(old_spte, level);
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
	u64 *pt;
	struct kvm_mmu_page *sp;
	u64 old_child_spte;
	int i;

	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
	WARN_ON(level < PG_LEVEL_4K);
	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));

	/*
	 * If this warning were to trigger it would indicate that there was a
	 * missing MMU notifier or a race with some notifier handler.
	 * A present, leaf SPTE should never be directly replaced with another
	 * present leaf SPTE pointing to a different PFN. A notifier handler
	 * should be zapping the SPTE before the main MM's page table is
	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
	 * thread before replacement.
	 */
	if (was_leaf && is_leaf && pfn_changed) {
		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
		       "SPTE with another present leaf SPTE mapping a\n"
		       "different PFN!\n"
		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
		       as_id, gfn, old_spte, new_spte, level);

		/*
		 * Crash the host to prevent error propagation and guest data
		 * corruption.
		 */
		BUG();
	}

	if (old_spte == new_spte)
		return;

	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);

	/*
	 * The only times a SPTE should be changed from a non-present to
	 * non-present state is when an MMIO entry is installed/modified/
	 * removed. In that case, there is nothing to do here.
	 */
	if (!was_present && !is_present) {
		/*
		 * If this change does not involve a MMIO SPTE, it is
		 * unexpected. Log the change, though it should not impact the
		 * guest since both the former and current SPTEs are nonpresent.
		 */
		if (WARN_ON(!is_mmio_spte(old_spte) && !is_mmio_spte(new_spte)))
			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
			       "should not be replaced with another,\n"
			       "different nonpresent SPTE, unless one or both\n"
			       "are MMIO SPTEs.\n"
			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
			       as_id, gfn, old_spte, new_spte, level);
		return;
	}

	if (was_leaf && is_dirty_spte(old_spte) &&
	    (!is_dirty_spte(new_spte) || pfn_changed))
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));

	/*
	 * Recursively handle child PTs if the change removed a subtree from
	 * the paging structure.
	 */
	if (was_present && !was_leaf && (pfn_changed || !is_present)) {
		pt = spte_to_child_pt(old_spte, level);
		sp = sptep_to_sp(pt);

		trace_kvm_mmu_prepare_zap_page(sp);

		list_del(&sp->link);

		if (sp->lpage_disallowed)
			unaccount_huge_nx_page(kvm, sp);

		for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
			old_child_spte = READ_ONCE(*(pt + i));
			WRITE_ONCE(*(pt + i), 0);
			handle_changed_spte(kvm, as_id,
				gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)),
				old_child_spte, 0, level - 1);
		}

		kvm_flush_remote_tlbs_with_address(kvm, gfn,
						   KVM_PAGES_PER_HPAGE(level));

		free_page((unsigned long)pt);
		kmem_cache_free(mmu_page_header_cache, sp);
	}
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level)
{
	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level);
	handle_changed_spte_acc_track(old_spte, new_spte, level);
	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
				      new_spte, level);
}

/*
 * tdp_mmu_set_spte and its variants write the new SPTE value and then handle
 * the bookkeeping for the change. The _no_acc_track and _no_dirty_log
 * variants are for callers which do their own accessed/dirty bookkeeping.
 */
static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				      u64 new_spte, bool record_acc_track,
				      bool record_dirty_log)
{
	u64 *root_pt = tdp_iter_root_pt(iter);
	struct kvm_mmu_page *root = sptep_to_sp(root_pt);
	int as_id = kvm_mmu_page_as_id(root);

	WRITE_ONCE(*iter->sptep, new_spte);

	__handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
			      iter->level);
	if (record_acc_track)
		handle_changed_spte_acc_track(iter->old_spte, new_spte,
					      iter->level);
	if (record_dirty_log)
		handle_changed_spte_dirty_log(kvm, as_id, iter->gfn,
					      iter->old_spte, new_spte,
					      iter->level);
}

static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				    u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
}

static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
}

static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
}

#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
	for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)

#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)		\
	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
		if (!is_shadow_present_pte(_iter.old_spte) ||		\
		    !is_last_spte(_iter.old_spte, _iter.level))		\
			continue;					\
		else

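/*
 * Iterate over all the SPTEs, at any level, which translate GFNs in
 * [_start, _end) under the vCPU's current TDP MMU root.
 */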
#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)			\
	for_each_tdp_pte(_iter, __va(_mmu->root_hpa),			\
			 _mmu->shadow_root_level, _start, _end)

/*
 * Flush the TLBs and yield if this thread needs to drop kvm->mmu_lock.
 * Returns whether the caller still needs to flush the TLBs for the SPTEs it
 * has zapped.
 */
static bool tdp_mmu_iter_flush_cond_resched(struct kvm *kvm, struct tdp_iter *iter)
{
	if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
		kvm_flush_remote_tlbs(kvm);
		cond_resched_lock(&kvm->mmu_lock);
		tdp_iter_refresh_walk(iter);
		return false;
	} else {
		return true;
	}
}

/* Yield if this thread needs to drop kvm->mmu_lock, without flushing the TLBs. */
static void tdp_mmu_iter_cond_resched(struct kvm *kvm, struct tdp_iter *iter)
{
	if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
		cond_resched_lock(&kvm->mmu_lock);
		tdp_iter_refresh_walk(iter);
	}
}

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock. If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
 * operation can cause a soft lockup.
 */
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end, bool can_yield)
{
	struct tdp_iter iter;
	bool flush_needed = false;

	tdp_root_for_each_pte(iter, root, start, end) {
		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		/*
		 * If this is a non-last-level SPTE that covers a larger range
		 * than should be zapped, continue, and zap the mappings at a
		 * lower level.
		 */
		if ((iter.gfn < start ||
		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		tdp_mmu_set_spte(kvm, &iter, 0);

		if (can_yield)
			flush_needed = tdp_mmu_iter_flush_cond_resched(kvm, &iter);
		else
			flush_needed = true;
	}
	return flush_needed;
}

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 */
bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end)
{
	struct kvm_mmu_page *root;
	bool flush = false;

	for_each_tdp_mmu_root_yield_safe(kvm, root)
		flush |= zap_gfn_range(kvm, root, start, end, true);

	return flush;
}

void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
	bool flush;

	flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn);
	if (flush)
		kvm_flush_remote_tlbs(kvm);
}

/*
 * Installs a last-level SPTE to handle a TDP page fault.
 * (NPT/EPT violation/misconfiguration)
 */
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
					   int map_writable,
					   struct tdp_iter *iter,
					   kvm_pfn_t pfn, bool prefault)
{
	u64 new_spte;
	int ret = 0;
	int make_spte_ret = 0;

	if (unlikely(is_noslot_pfn(pfn))) {
		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
		trace_mark_mmio_spte(iter->sptep, iter->gfn, new_spte);
	} else {
		make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
					  pfn, iter->old_spte, prefault, true,
					  map_writable, !shadow_accessed_mask,
					  &new_spte);
		trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep);
	}

	if (new_spte == iter->old_spte)
		ret = RET_PF_SPURIOUS;
	else
		tdp_mmu_set_spte(vcpu->kvm, iter, new_spte);

	/*
	 * If the page fault was caused by a write but the page is write
	 * protected, emulation is needed. If the emulation was skipped,
	 * the vCPU would have the same fault again.
	 */
	if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
		if (write)
			ret = RET_PF_EMULATE;
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
	}

	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
	if (unlikely(is_mmio_spte(new_spte)))
		ret = RET_PF_EMULATE;

	trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep);
	if (!prefault)
		vcpu->stat.pf_fixed++;

	return ret;
}

/*
 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
 * page tables and SPTEs to translate the faulting guest physical address.
 */
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
		    int map_writable, int max_level, kvm_pfn_t pfn,
		    bool prefault)
{
	bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
	bool write = error_code & PFERR_WRITE_MASK;
	bool exec = error_code & PFERR_FETCH_MASK;
	bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	struct tdp_iter iter;
	struct kvm_mmu_page *sp;
	u64 *child_pt;
	u64 new_spte;
	int ret;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int level;
	int req_level;

	if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
		return RET_PF_RETRY;
	if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
		return RET_PF_RETRY;

	level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
					huge_page_disallowed, &req_level);

	trace_kvm_mmu_spte_requested(gpa, level, pfn);
	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		if (nx_huge_page_workaround_enabled)
			disallowed_hugepage_adjust(iter.old_spte, gfn,
						   iter.level, &pfn, &level);

		if (iter.level == level)
			break;

		/*
		 * If there is an SPTE mapping a large page at a higher level
		 * than the target, that SPTE must be cleared and replaced
		 * with a non-leaf SPTE.
		 */
		if (is_shadow_present_pte(iter.old_spte) &&
		    is_large_pte(iter.old_spte)) {
			tdp_mmu_set_spte(vcpu->kvm, &iter, 0);

			kvm_flush_remote_tlbs_with_address(vcpu->kvm, iter.gfn,
					KVM_PAGES_PER_HPAGE(iter.level));

			/*
			 * The iter must explicitly re-read the spte here
			 * because the new value informs the !present
			 * path below.
			 */
			iter.old_spte = READ_ONCE(*iter.sptep);
		}

		if (!is_shadow_present_pte(iter.old_spte)) {
			sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
			list_add(&sp->link, &vcpu->kvm->arch.tdp_mmu_pages);
			child_pt = sp->spt;
			clear_page(child_pt);
			new_spte = make_nonleaf_spte(child_pt,
						     !shadow_accessed_mask);

			trace_kvm_mmu_get_page(sp, true);
			if (huge_page_disallowed && req_level >= iter.level)
				account_huge_nx_page(vcpu->kvm, sp);

			tdp_mmu_set_spte(vcpu->kvm, &iter, new_spte);
		}
	}

	if (WARN_ON(iter.level != level))
		return RET_PF_RETRY;

	ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
					      pfn, prefault);

	return ret;
}

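/*
 * Call the given handler on each TDP MMU root, for every memslot GFN range
 * that overlaps the HVA range [start, end). The handlers' return values are
 * OR'd together. This backs the TDP MMU side of the MMU notifier callbacks.
 */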
static int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, unsigned long start,
		unsigned long end, unsigned long data,
		int (*handler)(struct kvm *kvm, struct kvm_memory_slot *slot,
			       struct kvm_mmu_page *root, gfn_t start,
			       gfn_t end, unsigned long data))
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	struct kvm_mmu_page *root;
	int ret = 0;
	int as_id;

	for_each_tdp_mmu_root_yield_safe(kvm, root) {
		as_id = kvm_mmu_page_as_id(root);
		slots = __kvm_memslots(kvm, as_id);
		kvm_for_each_memslot(memslot, slots) {
			unsigned long hva_start, hva_end;
			gfn_t gfn_start, gfn_end;

			hva_start = max(start, memslot->userspace_addr);
			hva_end = min(end, memslot->userspace_addr +
				      (memslot->npages << PAGE_SHIFT));
			if (hva_start >= hva_end)
				continue;
			/*
			 * {gfn(page) | page intersects with [hva_start, hva_end)} =
			 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
			 */
			gfn_start = hva_to_gfn_memslot(hva_start, memslot);
			gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);

			ret |= handler(kvm, memslot, root, gfn_start,
				       gfn_end, data);
		}
	}

	return ret;
}

static int zap_gfn_range_hva_wrapper(struct kvm *kvm,
				     struct kvm_memory_slot *slot,
				     struct kvm_mmu_page *root, gfn_t start,
				     gfn_t end, unsigned long unused)
{
	return zap_gfn_range(kvm, root, start, end, false);
}

int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
			      unsigned long end)
{
	return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
					    zap_gfn_range_hva_wrapper);
}

/*
 * Mark the SPTEs mapping GFNs in the range [start, end) as unaccessed and
 * return non-zero if any of those GFNs had been accessed.
 */
static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
			 struct kvm_mmu_page *root, gfn_t start, gfn_t end,
			 unsigned long unused)
{
	struct tdp_iter iter;
	int young = 0;
	u64 new_spte = 0;

	tdp_root_for_each_leaf_pte(iter, root, start, end) {
		/*
		 * If we have a non-accessed entry we don't need to change the
		 * pte.
		 */
		if (!is_accessed_spte(iter.old_spte))
			continue;

		new_spte = iter.old_spte;

		if (spte_ad_enabled(new_spte)) {
			clear_bit((ffs(shadow_accessed_mask) - 1),
				  (unsigned long *)&new_spte);
		} else {
			/*
			 * Capture the dirty status of the page, so that it
			 * doesn't get lost when the SPTE is marked for access
			 * tracking.
			 */
			if (is_writable_pte(new_spte))
				kvm_set_pfn_dirty(spte_to_pfn(new_spte));

			new_spte = mark_spte_for_access_track(new_spte);
		}
		new_spte &= ~shadow_dirty_mask;

		tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte);
		young = 1;

		trace_kvm_age_page(iter.gfn, iter.level, slot, young);
	}

	return young;
}

int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
			      unsigned long end)
{
	return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
					    age_gfn_range);
}

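/*
 * Report whether the GFN is mapped by a leaf SPTE that is marked accessed,
 * without clearing the accessed state.
 */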
static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
			struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
			unsigned long unused2)
{
	struct tdp_iter iter;

	tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1)
		if (is_accessed_spte(iter.old_spte))
			return 1;

	return 0;
}

int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva)
{
	return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0,
					    test_age_gfn);
}

/*
 * Handle the changed_pte MMU notifier for the TDP MMU.
 * data is a pointer to the new pte_t mapping the HVA specified by the MMU
 * notifier.
 * Returns 0; any TLB flush that is needed is performed here, before the MMU
 * lock is released.
 */
static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
			struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
			unsigned long data)
{
	struct tdp_iter iter;
	pte_t *ptep = (pte_t *)data;
	kvm_pfn_t new_pfn;
	u64 new_spte;
	int need_flush = 0;

	WARN_ON(pte_huge(*ptep));

	new_pfn = pte_pfn(*ptep);

	tdp_root_for_each_pte(iter, root, gfn, gfn + 1) {
		if (iter.level != PG_LEVEL_4K)
			continue;

		if (!is_shadow_present_pte(iter.old_spte))
			break;

		tdp_mmu_set_spte(kvm, &iter, 0);

		kvm_flush_remote_tlbs_with_address(kvm, iter.gfn, 1);

		if (!pte_write(*ptep)) {
			new_spte = kvm_mmu_changed_pte_notifier_make_spte(
					iter.old_spte, new_pfn);

			tdp_mmu_set_spte(kvm, &iter, new_spte);
		}

		need_flush = 1;
	}

	if (need_flush)
		kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);

	return 0;
}

int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
			     pte_t *host_ptep)
{
	return kvm_tdp_mmu_handle_hva_range(kvm, address, address + 1,
					    (unsigned long)host_ptep,
					    set_tdp_spte);
}

/*
 * Remove write access from all leaf SPTEs at or above min_level that map GFNs
 * [start, end). Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			     gfn_t start, gfn_t end, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
				   min_level, start, end) {
		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;

		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
		spte_set = true;

		tdp_mmu_iter_cond_resched(kvm, &iter);
	}
	return spte_set;
}

/*
 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
 * only affect leaf SPTEs down to min_level.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
			     int min_level)
{
	struct kvm_mmu_page *root;
	int root_as_id;
	bool spte_set = false;

	for_each_tdp_mmu_root_yield_safe(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
			     slot->base_gfn + slot->npages, min_level);
	}

	return spte_set;
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t start, gfn_t end)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	tdp_root_for_each_leaf_pte(iter, root, start, end) {
		if (spte_ad_need_write_protect(iter.old_spte)) {
			if (is_writable_pte(iter.old_spte))
				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
			else
				continue;
		} else {
			if (iter.old_spte & shadow_dirty_mask)
				new_spte = iter.old_spte & ~shadow_dirty_mask;
			else
				continue;
		}

		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
		spte_set = true;

		tdp_mmu_iter_cond_resched(kvm, &iter);
	}
	return spte_set;
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;
	int root_as_id;
	bool spte_set = false;

	for_each_tdp_mmu_root_yield_safe(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
				slot->base_gfn + slot->npages);
	}

	return spte_set;
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t gfn, unsigned long mask, bool wrprot)
{
	struct tdp_iter iter;
	u64 new_spte;

	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
				   gfn + BITS_PER_LONG) {
		if (!mask)
			break;

		if (iter.level > PG_LEVEL_4K ||
		    !(mask & (1UL << (iter.gfn - gfn))))
			continue;

		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
			if (is_writable_pte(iter.old_spte))
				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
			else
				continue;
		} else {
			if (iter.old_spte & shadow_dirty_mask)
				new_spte = iter.old_spte & ~shadow_dirty_mask;
			else
				continue;
		}

		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);

		mask &= ~(1UL << (iter.gfn - gfn));
	}
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
				       struct kvm_memory_slot *slot,
				       gfn_t gfn, unsigned long mask,
				       bool wrprot)
{
	struct kvm_mmu_page *root;
	int root_as_id;

	lockdep_assert_held(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
	}
}

/*
 * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is
 * only used for PML, and so will involve setting the dirty bit on each SPTE.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
				gfn_t start, gfn_t end)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	tdp_root_for_each_pte(iter, root, start, end) {
		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		new_spte = iter.old_spte | shadow_dirty_mask;

		tdp_mmu_set_spte(kvm, &iter, new_spte);
		spte_set = true;

		tdp_mmu_iter_cond_resched(kvm, &iter);
	}

	return spte_set;
}

/*
 * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is
 * only used for PML, and so will involve setting the dirty bit on each SPTE.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;
	int root_as_id;
	bool spte_set = false;

	for_each_tdp_mmu_root_yield_safe(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		spte_set |= set_dirty_gfn_range(kvm, root, slot->base_gfn,
				slot->base_gfn + slot->npages);
	}
	return spte_set;
}

/*
 * Clear leaf entries which could be replaced by large mappings, for
 * GFNs within the slot.
 */
static void zap_collapsible_spte_range(struct kvm *kvm,
				       struct kvm_mmu_page *root,
				       gfn_t start, gfn_t end)
{
	struct tdp_iter iter;
	kvm_pfn_t pfn;
	bool spte_set = false;

	tdp_root_for_each_pte(iter, root, start, end) {
		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		pfn = spte_to_pfn(iter.old_spte);
		if (kvm_is_reserved_pfn(pfn) ||
		    !PageTransCompoundMap(pfn_to_page(pfn)))
			continue;

		tdp_mmu_set_spte(kvm, &iter, 0);

		spte_set = tdp_mmu_iter_flush_cond_resched(kvm, &iter);
	}

	if (spte_set)
		kvm_flush_remote_tlbs(kvm);
}

/*
 * Clear non-leaf entries (and free associated page tables) which could
 * be replaced by large mappings, for GFNs within the slot.
 */
void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
				       const struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;
	int root_as_id;

	for_each_tdp_mmu_root_yield_safe(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		zap_collapsible_spte_range(kvm, root, slot->base_gfn,
					   slot->base_gfn + slot->npages);
	}
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * SPTE_MMU_WRITEABLE bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t gfn)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) {
		if (!is_writable_pte(iter.old_spte))
			break;

		new_spte = iter.old_spte &
			~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);

		tdp_mmu_set_spte(kvm, &iter, new_spte);
		spte_set = true;
	}

	return spte_set;
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * SPTE_MMU_WRITEABLE bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
				   struct kvm_memory_slot *slot, gfn_t gfn)
{
	struct kvm_mmu_page *root;
	int root_as_id;
	bool spte_set = false;

	lockdep_assert_held(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		spte_set |= write_protect_gfn(kvm, root, gfn);
	}
	return spte_set;
}

/*
 * Return the level of the lowest level SPTE added to sptes.
 * That SPTE may be non-present.
 */
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
			 int *root_level)
{
	struct tdp_iter iter;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	gfn_t gfn = addr >> PAGE_SHIFT;
	int leaf = -1;

	*root_level = vcpu->arch.mmu->shadow_root_level;

	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		leaf = iter.level;
		sptes[leaf] = iter.old_spte;
	}

	return leaf;
}