// SPDX-License-Identifier: GPL-2.0

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#ifdef CONFIG_X86_64
static bool __read_mostly tdp_mmu_enabled = false;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
#endif

static bool is_tdp_mmu_enabled(void)
{
#ifdef CONFIG_X86_64
	return tdp_enabled && READ_ONCE(tdp_mmu_enabled);
#else
	return false;
#endif /* CONFIG_X86_64 */
}

/* Initializes the TDP MMU for the VM, if enabled. */
void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
	if (!is_tdp_mmu_enabled())
		return;

	/* This should not be changed for the lifetime of the VM. */
	kvm->arch.tdp_mmu_enabled = true;

	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
}

void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
	if (!kvm->arch.tdp_mmu_enabled)
		return;

	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
}

#define for_each_tdp_mmu_root(_kvm, _root)				\
	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)

bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa)
{
	struct kvm_mmu_page *sp;

	sp = to_shadow_page(hpa);

	return sp->tdp_mmu_page && sp->root_count;
}

static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end, bool can_yield);

void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
	gfn_t max_gfn = 1ULL << (boot_cpu_data.x86_phys_bits - PAGE_SHIFT);

	lockdep_assert_held(&kvm->mmu_lock);

	WARN_ON(root->root_count);
	WARN_ON(!root->tdp_mmu_page);

	list_del(&root->link);

	zap_gfn_range(kvm, root, 0, max_gfn, false);

	free_page((unsigned long)root->spt);
	kmem_cache_free(mmu_page_header_cache, root);
}

static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
						   int level)
{
	union kvm_mmu_page_role role;

	role = vcpu->arch.mmu->mmu_role.base;
	role.level = level;
	role.direct = true;
	role.gpte_is_8_bytes = true;
	role.access = ACC_ALL;

	return role;
}

static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
					       int level)
{
	struct kvm_mmu_page *sp;

	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

	sp->role.word = page_role_for_level(vcpu, level).word;
	sp->gfn = gfn;
	sp->tdp_mmu_page = true;

	return sp;
}
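
/*
 * Get a TDP MMU root for the vCPU: reuse an existing root whose role matches
 * the vCPU's current MMU role (taking a reference on it), or allocate a new
 * one and add it to the VM's list of TDP MMU roots, all under the MMU lock.
 */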
static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
{
	union kvm_mmu_page_role role;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_page *root;

	role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);

	spin_lock(&kvm->mmu_lock);

	/* Check for an existing root before allocating a new one. */
	for_each_tdp_mmu_root(kvm, root) {
		if (root->role.word == role.word) {
			kvm_mmu_get_root(kvm, root);
			spin_unlock(&kvm->mmu_lock);
			return root;
		}
	}

	root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
	root->root_count = 1;

	list_add(&root->link, &kvm->arch.tdp_mmu_roots);

	spin_unlock(&kvm->mmu_lock);

	return root;
}

hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_page *root;

	root = get_tdp_mmu_vcpu_root(vcpu);
	if (!root)
		return INVALID_PAGE;

	return __pa(root->spt);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level);

static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
{
	return sp->role.smm ? 1 : 0;
}
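
/*
 * If a previously accessed, present leaf SPTE is being removed or loses its
 * accessed state, propagate that information to the primary MM by marking
 * the underlying page as accessed.
 */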
static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
{
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
		return;

	if (is_accessed_spte(old_spte) &&
	    (!is_accessed_spte(new_spte) || pfn_changed))
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}

static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
					  u64 old_spte, u64 new_spte, int level)
{
	bool pfn_changed;
	struct kvm_memory_slot *slot;

	if (level > PG_LEVEL_4K)
		return;

	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	if ((!is_writable_pte(old_spte) || pfn_changed) &&
	    is_writable_pte(new_spte)) {
		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
		mark_page_dirty_in_slot(slot, gfn);
	}
}

/**
 * handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 *
 * Handle bookkeeping that might result from the modification of a SPTE.
 * This function must be called for all TDP SPTE modifications.
 */
static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				  u64 old_spte, u64 new_spte, int level)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);
	bool was_leaf = was_present && is_last_spte(old_spte, level);
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
	u64 *pt;
	struct kvm_mmu_page *sp;
	u64 old_child_spte;
	int i;

	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
	WARN_ON(level < PG_LEVEL_4K);
	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));

	/*
	 * If this warning were to trigger it would indicate that there was a
	 * missing MMU notifier or a race with some notifier handler.
	 * A present, leaf SPTE should never be directly replaced with another
	 * present leaf SPTE pointing to a different PFN. A notifier handler
	 * should be zapping the SPTE before the main MM's page table is
	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
	 * thread before replacement.
	 */
	if (was_leaf && is_leaf && pfn_changed) {
		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
		       "SPTE with another present leaf SPTE mapping a\n"
		       "different PFN!\n"
		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
		       as_id, gfn, old_spte, new_spte, level);

		/*
		 * Crash the host to prevent error propagation and guest data
		 * corruption.
		 */
		BUG();
	}

	if (old_spte == new_spte)
		return;

	/*
	 * The only times a SPTE should be changed from a non-present to
	 * non-present state is when an MMIO entry is installed/modified/
	 * removed. In that case, there is nothing to do here.
	 */
	if (!was_present && !is_present) {
		/*
		 * If this change does not involve a MMIO SPTE, it is
		 * unexpected. Log the change, though it should not impact the
		 * guest since both the former and current SPTEs are nonpresent.
		 */
		if (WARN_ON(!is_mmio_spte(old_spte) && !is_mmio_spte(new_spte)))
			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
			       "should not be replaced with another,\n"
			       "different nonpresent SPTE, unless one or both\n"
			       "are MMIO SPTEs.\n"
			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
			       as_id, gfn, old_spte, new_spte, level);
		return;
	}

	if (was_leaf && is_dirty_spte(old_spte) &&
	    (!is_dirty_spte(new_spte) || pfn_changed))
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));

	/*
	 * Recursively handle child PTs if the change removed a subtree from
	 * the paging structure.
	 */
	if (was_present && !was_leaf && (pfn_changed || !is_present)) {
		pt = spte_to_child_pt(old_spte, level);
		sp = sptep_to_sp(pt);

		list_del(&sp->link);

		if (sp->lpage_disallowed)
			unaccount_huge_nx_page(kvm, sp);

		for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
			old_child_spte = READ_ONCE(*(pt + i));
			WRITE_ONCE(*(pt + i), 0);
			handle_changed_spte(kvm, as_id,
				gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)),
				old_child_spte, 0, level - 1);
		}

		kvm_flush_remote_tlbs_with_address(kvm, gfn,
						   KVM_PAGES_PER_HPAGE(level));

		free_page((unsigned long)pt);
		kmem_cache_free(mmu_page_header_cache, sp);
	}
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level)
{
	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level);
	handle_changed_spte_acc_track(old_spte, new_spte, level);
	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
				      new_spte, level);
}
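
/*
 * __tdp_mmu_set_spte - install @new_spte at the iterator's current position
 * and handle the bookkeeping required by the change. Accessed-state and
 * dirty-log updates can be suppressed for callers (e.g. the aging and
 * dirty-logging paths below) that manage that state themselves.
 */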
static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				      u64 new_spte, bool record_acc_track,
				      bool record_dirty_log)
{
	u64 *root_pt = tdp_iter_root_pt(iter);
	struct kvm_mmu_page *root = sptep_to_sp(root_pt);
	int as_id = kvm_mmu_page_as_id(root);

	WRITE_ONCE(*iter->sptep, new_spte);

	__handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
			      iter->level);
	if (record_acc_track)
		handle_changed_spte_acc_track(iter->old_spte, new_spte,
					      iter->level);
	if (record_dirty_log)
		handle_changed_spte_dirty_log(kvm, as_id, iter->gfn,
					      iter->old_spte, new_spte,
					      iter->level);
}

static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				    u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
}

static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
}

static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
}
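
/*
 * Iteration macros: walk the SPTEs of the paging structure rooted at @_root
 * (or at the vCPU's current MMU root for tdp_mmu_for_each_pte), visiting
 * entries that map GFNs in [_start, _end). The leaf-only variant skips
 * entries that are not present, last-level SPTEs.
 */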
#define tdp_root_for_each_pte(_iter, _root, _start, _end)		\
	for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)

#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)		\
	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
		if (!is_shadow_present_pte(_iter.old_spte) ||		\
		    !is_last_spte(_iter.old_spte, _iter.level))		\
			continue;					\
		else

#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)			\
	for_each_tdp_pte(_iter, __va(_mmu->root_hpa),			\
			 _mmu->shadow_root_level, _start, _end)

/*
 * Flush the TLB if the process should drop kvm->mmu_lock.
 * Return whether the caller still needs to flush the TLB.
 */
static bool tdp_mmu_iter_flush_cond_resched(struct kvm *kvm, struct tdp_iter *iter)
{
	if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
		kvm_flush_remote_tlbs(kvm);
		cond_resched_lock(&kvm->mmu_lock);
		tdp_iter_refresh_walk(iter);
		return false;
	} else {
		return true;
	}
}

/*
 * Yield if the scheduler needs the CPU or there is contention on the MMU
 * lock, then refresh the iterator so the walk continues with up-to-date
 * SPTEs. Unlike the variant above, no TLB flush is performed.
 */
static void tdp_mmu_iter_cond_resched(struct kvm *kvm, struct tdp_iter *iter)
{
	if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
		cond_resched_lock(&kvm->mmu_lock);
		tdp_iter_refresh_walk(iter);
	}
}

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock. If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
 * operation can cause a soft lockup.
 */
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end, bool can_yield)
{
	struct tdp_iter iter;
	bool flush_needed = false;

	tdp_root_for_each_pte(iter, root, start, end) {
		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		/*
		 * If this is a non-last-level SPTE that covers a larger range
		 * than should be zapped, continue, and zap the mappings at a
		 * lower level.
		 */
		if ((iter.gfn < start ||
		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		tdp_mmu_set_spte(kvm, &iter, 0);

		if (can_yield)
			flush_needed = tdp_mmu_iter_flush_cond_resched(kvm, &iter);
		else
			flush_needed = true;
	}
	return flush_needed;
}

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 */
bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end)
{
	struct kvm_mmu_page *root;
	bool flush = false;

	for_each_tdp_mmu_root(kvm, root) {
		/*
		 * Take a reference on the root so that it cannot be freed if
		 * this thread releases the MMU lock and yields in this loop.
		 */
		kvm_mmu_get_root(kvm, root);

		flush |= zap_gfn_range(kvm, root, start, end, true);

		kvm_mmu_put_root(kvm, root);
	}

	return flush;
}
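
/*
 * Zap all SPTEs under every TDP MMU root and flush the TLBs if any SPTEs
 * were cleared.
 */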
void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
	gfn_t max_gfn = 1ULL << (boot_cpu_data.x86_phys_bits - PAGE_SHIFT);
	bool flush;

	flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn);
	if (flush)
		kvm_flush_remote_tlbs(kvm);
}

/*
 * Installs a last-level SPTE to handle a TDP page fault.
 * (NPT/EPT violation/misconfiguration)
 */
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
					   int map_writable,
					   struct tdp_iter *iter,
					   kvm_pfn_t pfn, bool prefault)
{
	u64 new_spte;
	int ret = 0;
	int make_spte_ret = 0;

	if (unlikely(is_noslot_pfn(pfn))) {
		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
		trace_mark_mmio_spte(iter->sptep, iter->gfn, new_spte);
	} else
		make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
					  pfn, iter->old_spte, prefault, true,
					  map_writable, !shadow_accessed_mask,
					  &new_spte);

	if (new_spte == iter->old_spte)
		ret = RET_PF_SPURIOUS;
	else
		tdp_mmu_set_spte(vcpu->kvm, iter, new_spte);

	/*
	 * If the page fault was caused by a write but the page is write
	 * protected, emulation is needed. If the emulation was skipped,
	 * the vCPU would have the same fault again.
	 */
	if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
		if (write)
			ret = RET_PF_EMULATE;
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
	}

	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
	if (unlikely(is_mmio_spte(new_spte)))
		ret = RET_PF_EMULATE;

	trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep);
	if (!prefault)
		vcpu->stat.pf_fixed++;

	return ret;
}

/*
 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
 * page tables and SPTEs to translate the faulting guest physical address.
 */
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
		    int map_writable, int max_level, kvm_pfn_t pfn,
		    bool prefault)
{
	bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
	bool write = error_code & PFERR_WRITE_MASK;
	bool exec = error_code & PFERR_FETCH_MASK;
	bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	struct tdp_iter iter;
	struct kvm_mmu_page *sp;
	u64 *child_pt;
	u64 new_spte;
	int ret;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int level;
	int req_level;

	if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
		return RET_PF_RETRY;
	if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
		return RET_PF_RETRY;

	level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
					huge_page_disallowed, &req_level);

	trace_kvm_mmu_spte_requested(gpa, level, pfn);
	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		if (nx_huge_page_workaround_enabled)
			disallowed_hugepage_adjust(iter.old_spte, gfn,
						   iter.level, &pfn, &level);

		if (iter.level == level)
			break;

		/*
		 * If there is an SPTE mapping a large page at a higher level
		 * than the target, that SPTE must be cleared and replaced
		 * with a non-leaf SPTE.
		 */
		if (is_shadow_present_pte(iter.old_spte) &&
		    is_large_pte(iter.old_spte)) {
			tdp_mmu_set_spte(vcpu->kvm, &iter, 0);

			kvm_flush_remote_tlbs_with_address(vcpu->kvm, iter.gfn,
					KVM_PAGES_PER_HPAGE(iter.level));

			/*
			 * The iter must explicitly re-read the spte here
			 * because the new value informs the !present
			 * path below.
			 */
			iter.old_spte = READ_ONCE(*iter.sptep);
		}

		if (!is_shadow_present_pte(iter.old_spte)) {
			sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
			list_add(&sp->link, &vcpu->kvm->arch.tdp_mmu_pages);
			child_pt = sp->spt;
			clear_page(child_pt);
			new_spte = make_nonleaf_spte(child_pt,
						     !shadow_accessed_mask);

			trace_kvm_mmu_get_page(sp, true);
			if (huge_page_disallowed && req_level >= iter.level)
				account_huge_nx_page(vcpu->kvm, sp);

			tdp_mmu_set_spte(vcpu->kvm, &iter, new_spte);
		}
	}

	if (WARN_ON(iter.level != level))
		return RET_PF_RETRY;

	ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
					      pfn, prefault);

	return ret;
}
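
/*
 * Iterate over every TDP MMU root and every memslot that overlaps the HVA
 * range [start, end), calling @handler on the corresponding GFN range within
 * each slot. The handlers' return values are OR'd together and returned.
 */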
static int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, unsigned long start,
		unsigned long end, unsigned long data,
		int (*handler)(struct kvm *kvm, struct kvm_memory_slot *slot,
			       struct kvm_mmu_page *root, gfn_t start,
			       gfn_t end, unsigned long data))
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	struct kvm_mmu_page *root;
	int ret = 0;
	int as_id;

	for_each_tdp_mmu_root(kvm, root) {
		/*
		 * Take a reference on the root so that it cannot be freed if
		 * this thread releases the MMU lock and yields in this loop.
		 */
		kvm_mmu_get_root(kvm, root);

		as_id = kvm_mmu_page_as_id(root);
		slots = __kvm_memslots(kvm, as_id);
		kvm_for_each_memslot(memslot, slots) {
			unsigned long hva_start, hva_end;
			gfn_t gfn_start, gfn_end;

			hva_start = max(start, memslot->userspace_addr);
			hva_end = min(end, memslot->userspace_addr +
				      (memslot->npages << PAGE_SHIFT));
			if (hva_start >= hva_end)
				continue;
			/*
			 * {gfn(page) | page intersects with [hva_start, hva_end)} =
			 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
			 */
			gfn_start = hva_to_gfn_memslot(hva_start, memslot);
			gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);

			ret |= handler(kvm, memslot, root, gfn_start,
				       gfn_end, data);
		}

		kvm_mmu_put_root(kvm, root);
	}

	return ret;
}

static int zap_gfn_range_hva_wrapper(struct kvm *kvm,
				     struct kvm_memory_slot *slot,
				     struct kvm_mmu_page *root, gfn_t start,
				     gfn_t end, unsigned long unused)
{
	return zap_gfn_range(kvm, root, start, end, false);
}

int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
			      unsigned long end)
{
	return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
					    zap_gfn_range_hva_wrapper);
}

/*
 * Mark the SPTEs mapping GFNs in the range [start, end) unaccessed and
 * return non-zero if any of the GFNs in the range have been accessed.
 */
static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
			 struct kvm_mmu_page *root, gfn_t start, gfn_t end,
			 unsigned long unused)
{
	struct tdp_iter iter;
	int young = 0;
	u64 new_spte = 0;

	tdp_root_for_each_leaf_pte(iter, root, start, end) {
		/*
		 * If we have a non-accessed entry we don't need to change the
		 * pte.
		 */
		if (!is_accessed_spte(iter.old_spte))
			continue;

		new_spte = iter.old_spte;

		if (spte_ad_enabled(new_spte)) {
			clear_bit((ffs(shadow_accessed_mask) - 1),
				  (unsigned long *)&new_spte);
		} else {
			/*
			 * Capture the dirty status of the page, so that it
			 * doesn't get lost when the SPTE is marked for access
			 * tracking.
			 */
			if (is_writable_pte(new_spte))
				kvm_set_pfn_dirty(spte_to_pfn(new_spte));

			new_spte = mark_spte_for_access_track(new_spte);
		}
		new_spte &= ~shadow_dirty_mask;

		tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte);
		young = 1;
	}

	return young;
}

int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
			      unsigned long end)
{
	return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
					    age_gfn_range);
}
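
/*
 * Report whether any present leaf SPTE mapping the GFN is marked accessed,
 * without clearing the accessed state.
 */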
static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
			struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
			unsigned long unused2)
{
	struct tdp_iter iter;

	tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1)
		if (is_accessed_spte(iter.old_spte))
			return 1;

	return 0;
}

int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva)
{
	return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0,
					    test_age_gfn);
}

/*
 * Handle the changed_pte MMU notifier for the TDP MMU.
 * data is a pointer to the new pte_t mapping the HVA specified by the MMU
 * notifier.
 * Returns non-zero if a flush is needed before releasing the MMU lock.
 */
static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
			struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
			unsigned long data)
{
	struct tdp_iter iter;
	pte_t *ptep = (pte_t *)data;
	kvm_pfn_t new_pfn;
	u64 new_spte;
	int need_flush = 0;

	WARN_ON(pte_huge(*ptep));

	new_pfn = pte_pfn(*ptep);

	tdp_root_for_each_pte(iter, root, gfn, gfn + 1) {
		if (iter.level != PG_LEVEL_4K)
			continue;

		if (!is_shadow_present_pte(iter.old_spte))
			break;

		tdp_mmu_set_spte(kvm, &iter, 0);

		kvm_flush_remote_tlbs_with_address(kvm, iter.gfn, 1);

		if (!pte_write(*ptep)) {
			new_spte = kvm_mmu_changed_pte_notifier_make_spte(
					iter.old_spte, new_pfn);

			tdp_mmu_set_spte(kvm, &iter, new_spte);
		}

		need_flush = 1;
	}

	if (need_flush)
		kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);

	return 0;
}

int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
			     pte_t *host_ptep)
{
	return kvm_tdp_mmu_handle_hva_range(kvm, address, address + 1,
					    (unsigned long)host_ptep,
					    set_tdp_spte);
}

/*
 * Remove write access from all the leaf SPTEs, down to min_level, mapping
 * GFNs [start, end).
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			     gfn_t start, gfn_t end, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
				   min_level, start, end) {
		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;

		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
		spte_set = true;

		tdp_mmu_iter_cond_resched(kvm, &iter);
	}
	return spte_set;
}

/*
 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
 * only affect leaf SPTEs down to min_level.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
			     int min_level)
{
	struct kvm_mmu_page *root;
	int root_as_id;
	bool spte_set = false;

	for_each_tdp_mmu_root(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		/*
		 * Take a reference on the root so that it cannot be freed if
		 * this thread releases the MMU lock and yields in this loop.
		 */
		kvm_mmu_get_root(kvm, root);

		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
			     slot->base_gfn + slot->npages, min_level);

		kvm_mmu_put_root(kvm, root);
	}

	return spte_set;
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t start, gfn_t end)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	tdp_root_for_each_leaf_pte(iter, root, start, end) {
		if (spte_ad_need_write_protect(iter.old_spte)) {
			if (is_writable_pte(iter.old_spte))
				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
			else
				continue;
		} else {
			if (iter.old_spte & shadow_dirty_mask)
				new_spte = iter.old_spte & ~shadow_dirty_mask;
			else
				continue;
		}

		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
		spte_set = true;

		tdp_mmu_iter_cond_resched(kvm, &iter);
	}
	return spte_set;
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;
	int root_as_id;
	bool spte_set = false;

	for_each_tdp_mmu_root(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		/*
		 * Take a reference on the root so that it cannot be freed if
		 * this thread releases the MMU lock and yields in this loop.
		 */
		kvm_mmu_get_root(kvm, root);

		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
				slot->base_gfn + slot->npages);

		kvm_mmu_put_root(kvm, root);
	}

	return spte_set;
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t gfn, unsigned long mask, bool wrprot)
{
	struct tdp_iter iter;
	u64 new_spte;

	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
				   gfn + BITS_PER_LONG) {
		if (!mask)
			break;

		if (iter.level > PG_LEVEL_4K ||
		    !(mask & (1UL << (iter.gfn - gfn))))
			continue;

		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
			if (is_writable_pte(iter.old_spte))
				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
			else
				continue;
		} else {
			if (iter.old_spte & shadow_dirty_mask)
				new_spte = iter.old_spte & ~shadow_dirty_mask;
			else
				continue;
		}

		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);

		mask &= ~(1UL << (iter.gfn - gfn));
	}
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
				       struct kvm_memory_slot *slot,
				       gfn_t gfn, unsigned long mask,
				       bool wrprot)
{
	struct kvm_mmu_page *root;
	int root_as_id;

	lockdep_assert_held(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
	}
}

/*
 * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is
 * only used for PML, and so will involve setting the dirty bit on each SPTE.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
				gfn_t start, gfn_t end)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	tdp_root_for_each_pte(iter, root, start, end) {
		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		new_spte = iter.old_spte | shadow_dirty_mask;

		tdp_mmu_set_spte(kvm, &iter, new_spte);
		spte_set = true;

		tdp_mmu_iter_cond_resched(kvm, &iter);
	}

	return spte_set;
}

/*
 * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is
 * only used for PML, and so will involve setting the dirty bit on each SPTE.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;
	int root_as_id;
	bool spte_set = false;

	for_each_tdp_mmu_root(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		/*
		 * Take a reference on the root so that it cannot be freed if
		 * this thread releases the MMU lock and yields in this loop.
		 */
		kvm_mmu_get_root(kvm, root);

		spte_set |= set_dirty_gfn_range(kvm, root, slot->base_gfn,
				slot->base_gfn + slot->npages);

		kvm_mmu_put_root(kvm, root);
	}
	return spte_set;
}

/*
 * Clear non-leaf entries (and free associated page tables) which could
 * be replaced by large mappings, for GFNs within the slot.
 */
static void zap_collapsible_spte_range(struct kvm *kvm,
				       struct kvm_mmu_page *root,
				       gfn_t start, gfn_t end)
{
	struct tdp_iter iter;
	kvm_pfn_t pfn;
	bool spte_set = false;

	tdp_root_for_each_pte(iter, root, start, end) {
		if (!is_shadow_present_pte(iter.old_spte) ||
		    is_last_spte(iter.old_spte, iter.level))
			continue;

		pfn = spte_to_pfn(iter.old_spte);
		if (kvm_is_reserved_pfn(pfn) ||
		    !PageTransCompoundMap(pfn_to_page(pfn)))
			continue;

		tdp_mmu_set_spte(kvm, &iter, 0);

		spte_set = tdp_mmu_iter_flush_cond_resched(kvm, &iter);
	}

	if (spte_set)
		kvm_flush_remote_tlbs(kvm);
}

/*
 * Clear non-leaf entries (and free associated page tables) which could
 * be replaced by large mappings, for GFNs within the slot.
 */
void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
				       const struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;
	int root_as_id;

	for_each_tdp_mmu_root(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		/*
		 * Take a reference on the root so that it cannot be freed if
		 * this thread releases the MMU lock and yields in this loop.
		 */
		kvm_mmu_get_root(kvm, root);

		zap_collapsible_spte_range(kvm, root, slot->base_gfn,
					   slot->base_gfn + slot->npages);

		kvm_mmu_put_root(kvm, root);
	}
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * SPTE_MMU_WRITEABLE bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t gfn)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) {
		if (!is_writable_pte(iter.old_spte))
			break;

		new_spte = iter.old_spte &
			~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);

		tdp_mmu_set_spte(kvm, &iter, new_spte);
		spte_set = true;
	}

	return spte_set;
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * SPTE_MMU_WRITEABLE bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
				   struct kvm_memory_slot *slot, gfn_t gfn)
{
	struct kvm_mmu_page *root;
	int root_as_id;
	bool spte_set = false;

	lockdep_assert_held(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		spte_set |= write_protect_gfn(kvm, root, gfn);
	}
	return spte_set;
}

/*
 * Return the level of the lowest level SPTE added to sptes.
 * That SPTE may be non-present.
 */
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes)
{
	struct tdp_iter iter;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	int leaf = vcpu->arch.mmu->shadow_root_level;
	gfn_t gfn = addr >> PAGE_SHIFT;

	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		leaf = iter.level;
		sptes[leaf - 1] = iter.old_spte;
	}

	return leaf;
}