// SPDX-License-Identifier: GPL-2.0

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#ifdef CONFIG_X86_64
static bool __read_mostly tdp_mmu_enabled = false;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
#endif

static bool is_tdp_mmu_enabled(void)
{
#ifdef CONFIG_X86_64
	return tdp_enabled && READ_ONCE(tdp_mmu_enabled);
#else
	return false;
#endif /* CONFIG_X86_64 */
}

/* Initializes the TDP MMU for the VM, if enabled. */
void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
	if (!is_tdp_mmu_enabled())
		return;

	/* This should not be changed for the lifetime of the VM. */
	kvm->arch.tdp_mmu_enabled = true;

	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
}

void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
	if (!kvm->arch.tdp_mmu_enabled)
		return;

	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
}

#define for_each_tdp_mmu_root(_kvm, _root)				\
	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)

bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa)
{
	struct kvm_mmu_page *sp;

	if (!kvm->arch.tdp_mmu_enabled)
		return false;
	if (WARN_ON(!VALID_PAGE(hpa)))
		return false;

	sp = to_shadow_page(hpa);
	if (WARN_ON(!sp))
		return false;

	return sp->tdp_mmu_page && sp->root_count;
}

static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end, bool can_yield);

void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
	gfn_t max_gfn = 1ULL << (boot_cpu_data.x86_phys_bits - PAGE_SHIFT);

	lockdep_assert_held(&kvm->mmu_lock);

	WARN_ON(root->root_count);
	WARN_ON(!root->tdp_mmu_page);

	list_del(&root->link);

	zap_gfn_range(kvm, root, 0, max_gfn, false);

	free_page((unsigned long)root->spt);
	kmem_cache_free(mmu_page_header_cache, root);
}

static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
						   int level)
{
	union kvm_mmu_page_role role;

	role = vcpu->arch.mmu->mmu_role.base;
	role.level = level;
	role.direct = true;
	role.gpte_is_8_bytes = true;
	role.access = ACC_ALL;

	return role;
}

static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
					       int level)
{
	struct kvm_mmu_page *sp;

	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

	sp->role.word = page_role_for_level(vcpu, level).word;
	sp->gfn = gfn;
	sp->tdp_mmu_page = true;

	return sp;
}

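/*
 * Return the TDP MMU root for the vCPU's current MMU role: reuse an existing
 * root (taking a reference on it) if one with a matching role is already on
 * the tdp_mmu_roots list, otherwise allocate a new root and publish it on the
 * list under the MMU lock.
 */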
static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
{
	union kvm_mmu_page_role role;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_page *root;

	role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);

	spin_lock(&kvm->mmu_lock);

	/* Check for an existing root before allocating a new one. */
	for_each_tdp_mmu_root(kvm, root) {
		if (root->role.word == role.word) {
			kvm_mmu_get_root(kvm, root);
			spin_unlock(&kvm->mmu_lock);
			return root;
		}
	}

	root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
	root->root_count = 1;

	list_add(&root->link, &kvm->arch.tdp_mmu_roots);

	spin_unlock(&kvm->mmu_lock);

	return root;
}

hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_page *root;

	root = get_tdp_mmu_vcpu_root(vcpu);
	if (!root)
		return INVALID_PAGE;

	return __pa(root->spt);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level);

static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
{
	return sp->role.smm ? 1 : 0;
}

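/*
 * Bookkeeping helpers for SPTE changes: the first propagates the accessed bit
 * to the primary MM when an accessed leaf SPTE is removed or repointed; the
 * second marks a GFN dirty in the memslot's dirty bitmap when a 4K SPTE
 * becomes writable, since the guest may then write to the page without
 * faulting.
 */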
static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
{
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
		return;

	if (is_accessed_spte(old_spte) &&
	    (!is_accessed_spte(new_spte) || pfn_changed))
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}

static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
					  u64 old_spte, u64 new_spte, int level)
{
	bool pfn_changed;
	struct kvm_memory_slot *slot;

	if (level > PG_LEVEL_4K)
		return;

	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	if ((!is_writable_pte(old_spte) || pfn_changed) &&
	    is_writable_pte(new_spte)) {
		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
		mark_page_dirty_in_slot(slot, gfn);
	}
}

/**
 * handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 *
 * Handle bookkeeping that might result from the modification of a SPTE.
 * This function must be called for all TDP SPTE modifications.
 */
static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				  u64 old_spte, u64 new_spte, int level)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);
	bool was_leaf = was_present && is_last_spte(old_spte, level);
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
	u64 *pt;
	struct kvm_mmu_page *sp;
	u64 old_child_spte;
	int i;

	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
	WARN_ON(level < PG_LEVEL_4K);
	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));

	/*
	 * If this warning were to trigger it would indicate that there was a
	 * missing MMU notifier or a race with some notifier handler.
	 * A present, leaf SPTE should never be directly replaced with another
	 * present leaf SPTE pointing to a different PFN. A notifier handler
	 * should be zapping the SPTE before the main MM's page table is
	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
	 * thread before replacement.
	 */
	if (was_leaf && is_leaf && pfn_changed) {
		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
		       "SPTE with another present leaf SPTE mapping a\n"
		       "different PFN!\n"
		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
		       as_id, gfn, old_spte, new_spte, level);

		/*
		 * Crash the host to prevent error propagation and guest data
		 * corruption.
		 */
		BUG();
	}

	if (old_spte == new_spte)
		return;

	/*
	 * The only times a SPTE should be changed from a non-present to
	 * non-present state is when an MMIO entry is installed/modified/
	 * removed. In that case, there is nothing to do here.
	 */
	if (!was_present && !is_present) {
		/*
		 * If this change does not involve a MMIO SPTE, it is
		 * unexpected. Log the change, though it should not impact the
		 * guest since both the former and current SPTEs are nonpresent.
		 */
		if (WARN_ON(!is_mmio_spte(old_spte) && !is_mmio_spte(new_spte)))
			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
			       "should not be replaced with another,\n"
			       "different nonpresent SPTE, unless one or both\n"
			       "are MMIO SPTEs.\n"
			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
			       as_id, gfn, old_spte, new_spte, level);
		return;
	}

	if (was_leaf && is_dirty_spte(old_spte) &&
	    (!is_dirty_spte(new_spte) || pfn_changed))
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));

	/*
	 * Recursively handle child PTs if the change removed a subtree from
	 * the paging structure.
	 */
	if (was_present && !was_leaf && (pfn_changed || !is_present)) {
		pt = spte_to_child_pt(old_spte, level);
		sp = sptep_to_sp(pt);

		list_del(&sp->link);

		if (sp->lpage_disallowed)
			unaccount_huge_nx_page(kvm, sp);

		for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
			old_child_spte = READ_ONCE(*(pt + i));
			WRITE_ONCE(*(pt + i), 0);
			handle_changed_spte(kvm, as_id,
				gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)),
				old_child_spte, 0, level - 1);
		}

		kvm_flush_remote_tlbs_with_address(kvm, gfn,
						   KVM_PAGES_PER_HPAGE(level));

		free_page((unsigned long)pt);
		kmem_cache_free(mmu_page_header_cache, sp);
	}
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level)
{
	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level);
	handle_changed_spte_acc_track(old_spte, new_spte, level);
	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
				      new_spte, level);
}

static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				      u64 new_spte, bool record_acc_track,
				      bool record_dirty_log)
{
	u64 *root_pt = tdp_iter_root_pt(iter);
	struct kvm_mmu_page *root = sptep_to_sp(root_pt);
	int as_id = kvm_mmu_page_as_id(root);

	WRITE_ONCE(*iter->sptep, new_spte);

	__handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
			      iter->level);
	if (record_acc_track)
		handle_changed_spte_acc_track(iter->old_spte, new_spte,
					      iter->level);
	if (record_dirty_log)
		handle_changed_spte_dirty_log(kvm, as_id, iter->gfn,
					      iter->old_spte, new_spte,
					      iter->level);
}

static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				    u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
}

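/*
 * Variants of tdp_mmu_set_spte() that skip the accessed or dirty-log
 * bookkeeping. They are meant for callers that are themselves clearing the
 * accessed or writable/dirty bits (the aging and dirty-logging paths below),
 * where the update should not be recorded as a guest access or write.
 */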
static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
}

static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
}

#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
	for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)

#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)		\
	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
		if (!is_shadow_present_pte(_iter.old_spte) ||		\
		    !is_last_spte(_iter.old_spte, _iter.level))		\
			continue;					\
		else

#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
	for_each_tdp_pte(_iter, __va(_mmu->root_hpa),		\
			 _mmu->shadow_root_level, _start, _end)

/*
 * Flush the TLB if the process should drop kvm->mmu_lock.
 * Return whether the caller still needs to flush the tlb.
 */
static bool tdp_mmu_iter_flush_cond_resched(struct kvm *kvm, struct tdp_iter *iter)
{
	if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
		kvm_flush_remote_tlbs(kvm);
		cond_resched_lock(&kvm->mmu_lock);
		tdp_iter_refresh_walk(iter);
		return false;
	} else {
		return true;
	}
}

static void tdp_mmu_iter_cond_resched(struct kvm *kvm, struct tdp_iter *iter)
{
	if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
		cond_resched_lock(&kvm->mmu_lock);
		tdp_iter_refresh_walk(iter);
	}
}

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock. If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
 * operation can cause a soft lockup.
 */
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end, bool can_yield)
{
	struct tdp_iter iter;
	bool flush_needed = false;

	tdp_root_for_each_pte(iter, root, start, end) {
		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		/*
		 * If this is a non-last-level SPTE that covers a larger range
		 * than should be zapped, continue, and zap the mappings at a
		 * lower level.
		 */
		if ((iter.gfn < start ||
		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		tdp_mmu_set_spte(kvm, &iter, 0);

		if (can_yield)
			flush_needed = tdp_mmu_iter_flush_cond_resched(kvm, &iter);
		else
			flush_needed = true;
	}
	return flush_needed;
}

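/*
 * Illustrative caller pattern (a sketch, not a definition of the API): the
 * return value of kvm_tdp_mmu_zap_gfn_range() below tells the caller whether
 * a TLB flush is still pending, so callers are expected to do roughly
 *
 *	if (kvm_tdp_mmu_zap_gfn_range(kvm, start, end))
 *		kvm_flush_remote_tlbs(kvm);
 *
 * as kvm_tdp_mmu_zap_all() does.
 */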
/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 */
bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end)
{
	struct kvm_mmu_page *root;
	bool flush = false;

	for_each_tdp_mmu_root(kvm, root) {
		/*
		 * Take a reference on the root so that it cannot be freed if
		 * this thread releases the MMU lock and yields in this loop.
		 */
		kvm_mmu_get_root(kvm, root);

		flush |= zap_gfn_range(kvm, root, start, end, true);

		kvm_mmu_put_root(kvm, root);
	}

	return flush;
}

void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
	gfn_t max_gfn = 1ULL << (boot_cpu_data.x86_phys_bits - PAGE_SHIFT);
	bool flush;

	flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn);
	if (flush)
		kvm_flush_remote_tlbs(kvm);
}

/*
 * Installs a last-level SPTE to handle a TDP page fault.
 * (NPT/EPT violation/misconfiguration)
 */
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
					   int map_writable,
					   struct tdp_iter *iter,
					   kvm_pfn_t pfn, bool prefault)
{
	u64 new_spte;
	int ret = 0;
	int make_spte_ret = 0;

	if (unlikely(is_noslot_pfn(pfn))) {
		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
		trace_mark_mmio_spte(iter->sptep, iter->gfn, new_spte);
	} else
		make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
					  pfn, iter->old_spte, prefault, true,
					  map_writable, !shadow_accessed_mask,
					  &new_spte);

	if (new_spte == iter->old_spte)
		ret = RET_PF_SPURIOUS;
	else
		tdp_mmu_set_spte(vcpu->kvm, iter, new_spte);

	/*
	 * If the page fault was caused by a write but the page is write
	 * protected, emulation is needed. If the emulation was skipped,
	 * the vCPU would have the same fault again.
	 */
	if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
		if (write)
			ret = RET_PF_EMULATE;
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
	}

	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
	if (unlikely(is_mmio_spte(new_spte)))
		ret = RET_PF_EMULATE;

	trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep);
	if (!prefault)
		vcpu->stat.pf_fixed++;

	return ret;
}

/*
 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
 * page tables and SPTEs to translate the faulting guest physical address.
 */
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
		    int map_writable, int max_level, kvm_pfn_t pfn,
		    bool prefault)
{
	bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
	bool write = error_code & PFERR_WRITE_MASK;
	bool exec = error_code & PFERR_FETCH_MASK;
	bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	struct tdp_iter iter;
	struct kvm_mmu_page *sp;
	u64 *child_pt;
	u64 new_spte;
	int ret;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int level;
	int req_level;

	if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
		return RET_PF_RETRY;
	if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
		return RET_PF_RETRY;

	level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
					huge_page_disallowed, &req_level);

	trace_kvm_mmu_spte_requested(gpa, level, pfn);
	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		if (nx_huge_page_workaround_enabled)
			disallowed_hugepage_adjust(iter.old_spte, gfn,
						   iter.level, &pfn, &level);

		if (iter.level == level)
			break;

		/*
		 * If there is an SPTE mapping a large page at a higher level
		 * than the target, that SPTE must be cleared and replaced
		 * with a non-leaf SPTE.
		 */
		if (is_shadow_present_pte(iter.old_spte) &&
		    is_large_pte(iter.old_spte)) {
			tdp_mmu_set_spte(vcpu->kvm, &iter, 0);

			kvm_flush_remote_tlbs_with_address(vcpu->kvm, iter.gfn,
					KVM_PAGES_PER_HPAGE(iter.level));

			/*
			 * The iter must explicitly re-read the spte here
			 * because the new value informs the !present
			 * path below.
			 */
			iter.old_spte = READ_ONCE(*iter.sptep);
		}

		if (!is_shadow_present_pte(iter.old_spte)) {
			sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
			list_add(&sp->link, &vcpu->kvm->arch.tdp_mmu_pages);
			child_pt = sp->spt;
			clear_page(child_pt);
			new_spte = make_nonleaf_spte(child_pt,
						     !shadow_accessed_mask);

			trace_kvm_mmu_get_page(sp, true);
			if (huge_page_disallowed && req_level >= iter.level)
				account_huge_nx_page(vcpu->kvm, sp);

			tdp_mmu_set_spte(vcpu->kvm, &iter, new_spte);
		}
	}

	if (WARN_ON(iter.level != level))
		return RET_PF_RETRY;

	ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
					      pfn, prefault);

	return ret;
}

static int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, unsigned long start,
		unsigned long end, unsigned long data,
		int (*handler)(struct kvm *kvm, struct kvm_memory_slot *slot,
			       struct kvm_mmu_page *root, gfn_t start,
			       gfn_t end, unsigned long data))
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	struct kvm_mmu_page *root;
	int ret = 0;
	int as_id;

	for_each_tdp_mmu_root(kvm, root) {
		/*
		 * Take a reference on the root so that it cannot be freed if
		 * this thread releases the MMU lock and yields in this loop.
		 */
		kvm_mmu_get_root(kvm, root);

		as_id = kvm_mmu_page_as_id(root);
		slots = __kvm_memslots(kvm, as_id);
		kvm_for_each_memslot(memslot, slots) {
			unsigned long hva_start, hva_end;
			gfn_t gfn_start, gfn_end;

			hva_start = max(start, memslot->userspace_addr);
			hva_end = min(end, memslot->userspace_addr +
				      (memslot->npages << PAGE_SHIFT));
			if (hva_start >= hva_end)
				continue;
			/*
			 * {gfn(page) | page intersects with [hva_start, hva_end)} =
			 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
			 */
			gfn_start = hva_to_gfn_memslot(hva_start, memslot);
			gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);

			ret |= handler(kvm, memslot, root, gfn_start,
				       gfn_end, data);
		}

		kvm_mmu_put_root(kvm, root);
	}

	return ret;
}

static int zap_gfn_range_hva_wrapper(struct kvm *kvm,
				     struct kvm_memory_slot *slot,
				     struct kvm_mmu_page *root, gfn_t start,
				     gfn_t end, unsigned long unused)
{
	return zap_gfn_range(kvm, root, start, end, false);
}

int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
			      unsigned long end)
{
	return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
					    zap_gfn_range_hva_wrapper);
}

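/*
 * The handlers below back the KVM MMU notifier callbacks (aging, test-young
 * and changed-pte) for the TDP MMU; each is invoked per root and memslot via
 * kvm_tdp_mmu_handle_hva_range() above.
 */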
/*
 * Mark the SPTEs mapping GFNs in the range [start, end) unaccessed and
 * return non-zero if any of the GFNs in the range have been accessed.
 */
static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
			 struct kvm_mmu_page *root, gfn_t start, gfn_t end,
			 unsigned long unused)
{
	struct tdp_iter iter;
	int young = 0;
	u64 new_spte = 0;

	tdp_root_for_each_leaf_pte(iter, root, start, end) {
		/*
		 * If we have a non-accessed entry we don't need to change the
		 * pte.
		 */
		if (!is_accessed_spte(iter.old_spte))
			continue;

		new_spte = iter.old_spte;

		if (spte_ad_enabled(new_spte)) {
			clear_bit((ffs(shadow_accessed_mask) - 1),
				  (unsigned long *)&new_spte);
		} else {
			/*
			 * Capture the dirty status of the page, so that it
			 * doesn't get lost when the SPTE is marked for access
			 * tracking.
			 */
			if (is_writable_pte(new_spte))
				kvm_set_pfn_dirty(spte_to_pfn(new_spte));

			new_spte = mark_spte_for_access_track(new_spte);
		}
		new_spte &= ~shadow_dirty_mask;

		tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte);
		young = 1;
	}

	return young;
}

int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
			      unsigned long end)
{
	return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
					    age_gfn_range);
}

static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
			struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
			unsigned long unused2)
{
	struct tdp_iter iter;

	tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1)
		if (is_accessed_spte(iter.old_spte))
			return 1;

	return 0;
}

int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva)
{
	return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0,
					    test_age_gfn);
}

/*
 * Handle the changed_pte MMU notifier for the TDP MMU.
 * data is a pointer to the new pte_t mapping the HVA specified by the MMU
 * notifier.
 * Returns non-zero if a flush is needed before releasing the MMU lock.
 */
static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
			struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
			unsigned long data)
{
	struct tdp_iter iter;
	pte_t *ptep = (pte_t *)data;
	kvm_pfn_t new_pfn;
	u64 new_spte;
	int need_flush = 0;

	WARN_ON(pte_huge(*ptep));

	new_pfn = pte_pfn(*ptep);

	tdp_root_for_each_pte(iter, root, gfn, gfn + 1) {
		if (iter.level != PG_LEVEL_4K)
			continue;

		if (!is_shadow_present_pte(iter.old_spte))
			break;

		tdp_mmu_set_spte(kvm, &iter, 0);

		kvm_flush_remote_tlbs_with_address(kvm, iter.gfn, 1);

		if (!pte_write(*ptep)) {
			new_spte = kvm_mmu_changed_pte_notifier_make_spte(
					iter.old_spte, new_pfn);

			tdp_mmu_set_spte(kvm, &iter, new_spte);
		}

		need_flush = 1;
	}

	if (need_flush)
		kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);

	return 0;
}

int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
			     pte_t *host_ptep)
{
	return kvm_tdp_mmu_handle_hva_range(kvm, address, address + 1,
					    (unsigned long)host_ptep,
					    set_tdp_spte);
}

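/*
 * Write protection for dirty logging. As an illustrative sketch only (the
 * real call sites live outside this file), enabling dirty logging on a slot
 * is expected to look roughly like
 *
 *	if (kvm_tdp_mmu_wrprot_slot(kvm, slot, PG_LEVEL_4K))
 *		kvm_flush_remote_tlbs(kvm);
 *
 * after which guest writes fault, and the helpers below clear or set the
 * per-GFN dirty state as the dirty log is harvested.
 */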
/*
 * Remove write access from all the last-level SPTEs mapping GFNs
 * [start, end), at or above min_level.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			     gfn_t start, gfn_t end, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
				   min_level, start, end) {
		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;

		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
		spte_set = true;

		tdp_mmu_iter_cond_resched(kvm, &iter);
	}
	return spte_set;
}

/*
 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
 * only affect leaf SPTEs down to min_level.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
			     int min_level)
{
	struct kvm_mmu_page *root;
	int root_as_id;
	bool spte_set = false;

	for_each_tdp_mmu_root(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		/*
		 * Take a reference on the root so that it cannot be freed if
		 * this thread releases the MMU lock and yields in this loop.
		 */
		kvm_mmu_get_root(kvm, root);

		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
			     slot->base_gfn + slot->npages, min_level);

		kvm_mmu_put_root(kvm, root);
	}

	return spte_set;
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t start, gfn_t end)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	tdp_root_for_each_leaf_pte(iter, root, start, end) {
		if (spte_ad_need_write_protect(iter.old_spte)) {
			if (is_writable_pte(iter.old_spte))
				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
			else
				continue;
		} else {
			if (iter.old_spte & shadow_dirty_mask)
				new_spte = iter.old_spte & ~shadow_dirty_mask;
			else
				continue;
		}

		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
		spte_set = true;

		tdp_mmu_iter_cond_resched(kvm, &iter);
	}
	return spte_set;
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;
	int root_as_id;
	bool spte_set = false;

	for_each_tdp_mmu_root(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		/*
		 * Take a reference on the root so that it cannot be freed if
		 * this thread releases the MMU lock and yields in this loop.
		 */
		kvm_mmu_get_root(kvm, root);

		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
				slot->base_gfn + slot->npages);

		kvm_mmu_put_root(kvm, root);
	}

	return spte_set;
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t gfn, unsigned long mask, bool wrprot)
{
	struct tdp_iter iter;
	u64 new_spte;

	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
				   gfn + BITS_PER_LONG) {
		if (!mask)
			break;

		if (iter.level > PG_LEVEL_4K ||
		    !(mask & (1UL << (iter.gfn - gfn))))
			continue;

		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
			if (is_writable_pte(iter.old_spte))
				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
			else
				continue;
		} else {
			if (iter.old_spte & shadow_dirty_mask)
				new_spte = iter.old_spte & ~shadow_dirty_mask;
			else
				continue;
		}

		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);

		mask &= ~(1UL << (iter.gfn - gfn));
	}
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
				       struct kvm_memory_slot *slot,
				       gfn_t gfn, unsigned long mask,
				       bool wrprot)
{
	struct kvm_mmu_page *root;
	int root_as_id;

	lockdep_assert_held(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
	}
}

/*
 * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is
 * only used for PML, and so will involve setting the dirty bit on each SPTE.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
				gfn_t start, gfn_t end)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	tdp_root_for_each_pte(iter, root, start, end) {
		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		new_spte = iter.old_spte | shadow_dirty_mask;

		tdp_mmu_set_spte(kvm, &iter, new_spte);
		spte_set = true;

		tdp_mmu_iter_cond_resched(kvm, &iter);
	}

	return spte_set;
}

/*
 * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is
 * only used for PML, and so will involve setting the dirty bit on each SPTE.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;
	int root_as_id;
	bool spte_set = false;

	for_each_tdp_mmu_root(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		/*
		 * Take a reference on the root so that it cannot be freed if
		 * this thread releases the MMU lock and yields in this loop.
		 */
		kvm_mmu_get_root(kvm, root);

		spte_set |= set_dirty_gfn_range(kvm, root, slot->base_gfn,
				slot->base_gfn + slot->npages);

		kvm_mmu_put_root(kvm, root);
	}
	return spte_set;
}

/*
 * Clear non-leaf entries (and free associated page tables) which could
 * be replaced by large mappings, for GFNs within the slot.
 */
static void zap_collapsible_spte_range(struct kvm *kvm,
				       struct kvm_mmu_page *root,
				       gfn_t start, gfn_t end)
{
	struct tdp_iter iter;
	kvm_pfn_t pfn;
	bool spte_set = false;

	tdp_root_for_each_pte(iter, root, start, end) {
		if (!is_shadow_present_pte(iter.old_spte) ||
		    is_last_spte(iter.old_spte, iter.level))
			continue;

		pfn = spte_to_pfn(iter.old_spte);
		if (kvm_is_reserved_pfn(pfn) ||
		    !PageTransCompoundMap(pfn_to_page(pfn)))
			continue;

		tdp_mmu_set_spte(kvm, &iter, 0);

		spte_set = tdp_mmu_iter_flush_cond_resched(kvm, &iter);
	}

	if (spte_set)
		kvm_flush_remote_tlbs(kvm);
}

/*
 * Clear non-leaf entries (and free associated page tables) which could
 * be replaced by large mappings, for GFNs within the slot.
 */
void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
				       const struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;
	int root_as_id;

	for_each_tdp_mmu_root(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		/*
		 * Take a reference on the root so that it cannot be freed if
		 * this thread releases the MMU lock and yields in this loop.
		 */
		kvm_mmu_get_root(kvm, root);

		zap_collapsible_spte_range(kvm, root, slot->base_gfn,
					   slot->base_gfn + slot->npages);

		kvm_mmu_put_root(kvm, root);
	}
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * SPTE_MMU_WRITEABLE bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t gfn)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) {
		if (!is_writable_pte(iter.old_spte))
			break;

		new_spte = iter.old_spte &
			~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);

		tdp_mmu_set_spte(kvm, &iter, new_spte);
		spte_set = true;
	}

	return spte_set;
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * SPTE_MMU_WRITEABLE bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
				   struct kvm_memory_slot *slot, gfn_t gfn)
{
	struct kvm_mmu_page *root;
	int root_as_id;
	bool spte_set = false;

	lockdep_assert_held(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		spte_set |= write_protect_gfn(kvm, root, gfn);
	}
	return spte_set;
}

/*
 * Return the level of the lowest level SPTE added to sptes.
 * That SPTE may be non-present.
 */
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes)
{
	struct tdp_iter iter;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	int leaf = vcpu->arch.mmu->shadow_root_level;
	gfn_t gfn = addr >> PAGE_SHIFT;

	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		leaf = iter.level;
		sptes[leaf - 1] = iter.old_spte;
	}

	return leaf;
}
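
/*
 * Illustrative use of kvm_tdp_mmu_get_walk() (a sketch of a possible caller,
 * not code from this file): the walk fills sptes[level - 1] for each level
 * visited, so a caller would typically do
 *
 *	u64 sptes[PT64_ROOT_MAX_LEVEL];
 *	int leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes);
 *
 * and then inspect sptes[leaf - 1] for the lowest-level translation reached.
 */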