// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * MMU support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 */

#include "irq.h"
#include "ioapic.h"
#include "mmu.h"
#include "mmu_internal.h"
#include "tdp_mmu.h"
#include "x86.h"
#include "kvm_cache_regs.h"
#include "kvm_emulate.h"
#include "cpuid.h"
#include "spte.h"

#include <linux/kvm_host.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/moduleparam.h>
#include <linux/export.h>
#include <linux/swap.h>
#include <linux/hugetlb.h>
#include <linux/compiler.h>
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
#include <linux/uaccess.h>
#include <linux/hash.h>
#include <linux/kern_levels.h>
#include <linux/kthread.h>

#include <asm/page.h>
#include <asm/memtype.h>
#include <asm/cmpxchg.h>
#include <asm/io.h>
#include <asm/set_memory.h>
#include <asm/vmx.h>
#include <asm/kvm_page_track.h>
#include "trace.h"

extern bool itlb_multihit_kvm_mitigation;

int __read_mostly nx_huge_pages = -1;
#ifdef CONFIG_PREEMPT_RT
/* Recovery can cause latency spikes, disable it for PREEMPT_RT. */
static uint __read_mostly nx_huge_pages_recovery_ratio = 0;
#else
static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
#endif

static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp);

static const struct kernel_param_ops nx_huge_pages_ops = {
	.set = set_nx_huge_pages,
	.get = param_get_bool,
};

static const struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = {
	.set = set_nx_huge_pages_recovery_ratio,
	.get = param_get_uint,
};

module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
__MODULE_PARM_TYPE(nx_huge_pages, "bool");
module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops,
		&nx_huge_pages_recovery_ratio, 0644);
__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");

static bool __read_mostly force_flush_and_sync_on_reuse;
module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);

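/*
 * These knobs are ordinary module parameters registered with 0644
 * permissions, so they can typically be inspected and toggled at runtime
 * via sysfs (e.g. /sys/module/kvm/parameters/nx_huge_pages) in addition to
 * being set on the kernel command line (e.g. kvm.nx_huge_pages=...).
 */
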
/*
 * When set to true, this enables Two-Dimensional Paging (TDP), where the
 * hardware walks two page tables:
 * 1. the guest-virtual to guest-physical translation
 * 2. while doing 1., it also walks the guest-physical to host-physical
 *    translation
 * If the hardware supports that, we don't need to do shadow paging.
 */
bool tdp_enabled = false;

static int max_huge_page_level __read_mostly;
static int max_tdp_level __read_mostly;

enum {
	AUDIT_PRE_PAGE_FAULT,
	AUDIT_POST_PAGE_FAULT,
	AUDIT_PRE_PTE_WRITE,
	AUDIT_POST_PTE_WRITE,
	AUDIT_PRE_SYNC,
	AUDIT_POST_SYNC
};

#ifdef MMU_DEBUG
bool dbg = 0;
module_param(dbg, bool, 0644);
#endif

#define PTE_PREFETCH_NUM 8

#define PT32_LEVEL_BITS 10

#define PT32_LEVEL_SHIFT(level) \
		(PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)

#define PT32_LVL_OFFSET_MASK(level) \
	(PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
						* PT32_LEVEL_BITS))) - 1))

#define PT32_INDEX(address, level)\
	(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))


#define PT32_BASE_ADDR_MASK PAGE_MASK
#define PT32_DIR_BASE_ADDR_MASK \
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
#define PT32_LVL_ADDR_MASK(level) \
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
					    * PT32_LEVEL_BITS))) - 1))

#include <trace/events/kvm.h>

/* make pte_list_desc fit well in cache line */
#define PTE_LIST_EXT 3

struct pte_list_desc {
	u64 *sptes[PTE_LIST_EXT];
	struct pte_list_desc *more;
};

struct kvm_shadow_walk_iterator {
	u64 addr;
	hpa_t shadow_addr;
	u64 *sptep;
	int level;
	unsigned index;
};

#define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker)	\
	for (shadow_walk_init_using_root(&(_walker), (_vcpu),		\
					 (_root), (_addr));		\
	     shadow_walk_okay(&(_walker));				\
	     shadow_walk_next(&(_walker)))

#define for_each_shadow_entry(_vcpu, _addr, _walker)		\
	for (shadow_walk_init(&(_walker), _vcpu, _addr);	\
	     shadow_walk_okay(&(_walker));			\
	     shadow_walk_next(&(_walker)))

#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte)	\
	for (shadow_walk_init(&(_walker), _vcpu, _addr);		\
	     shadow_walk_okay(&(_walker)) &&				\
		({ spte = mmu_spte_get_lockless(_walker.sptep); 1; });	\
	     __shadow_walk_next(&(_walker), spte))

static struct kmem_cache *pte_list_desc_cache;
struct kmem_cache *mmu_page_header_cache;
static struct percpu_counter kvm_total_used_mmu_pages;

static void mmu_spte_set(u64 *sptep, u64 spte);
static union kvm_mmu_page_role
kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);

struct kvm_mmu_role_regs {
	const unsigned long cr0;
	const unsigned long cr4;
	const u64 efer;
};

#define CREATE_TRACE_POINTS
#include "mmutrace.h"

/*
 * Yes, lots of underscores.  They're a hint that you probably shouldn't be
 * reading from the role_regs.  Once the mmu_role is constructed, it becomes
 * the single source of truth for the MMU's state.
 */
#define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag)			\
static inline bool ____is_##reg##_##name(struct kvm_mmu_role_regs *regs)\
{									\
	return !!(regs->reg & flag);					\
}
BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, pg, X86_CR0_PG);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, wp, X86_CR0_WP);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pse, X86_CR4_PSE);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pae, X86_CR4_PAE);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smep, X86_CR4_SMEP);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smap, X86_CR4_SMAP);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pke, X86_CR4_PKE);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, la57, X86_CR4_LA57);
BUILD_MMU_ROLE_REGS_ACCESSOR(efer, nx, EFER_NX);
BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA);

/*
 * The MMU itself (with a valid role) is the single source of truth for the
 * MMU.  Do not use the regs used to build the MMU/role, nor the vCPU.  The
 * regs don't account for dependencies, e.g. clearing CR4 bits if CR0.PG=1,
 * and the vCPU may be incorrect/irrelevant.
 */
#define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name)		\
static inline bool is_##reg##_##name(struct kvm_mmu *mmu)	\
{								\
	return !!(mmu->mmu_role. base_or_ext . reg##_##name);	\
}
BUILD_MMU_ROLE_ACCESSOR(ext, cr0, pg);
BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp);
BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pse);
BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pae);
BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smep);
BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smap);
BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pke);
BUILD_MMU_ROLE_ACCESSOR(ext, cr4, la57);
BUILD_MMU_ROLE_ACCESSOR(base, efer, nx);

static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_role_regs regs = {
		.cr0 = kvm_read_cr0_bits(vcpu, KVM_MMU_CR0_ROLE_BITS),
		.cr4 = kvm_read_cr4_bits(vcpu, KVM_MMU_CR4_ROLE_BITS),
		.efer = vcpu->arch.efer,
	};

	return regs;
}

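/*
 * For illustration, the CR0.PG pair generated by the invocations above
 * expands to roughly the following (a sketch of the macro output, shown only
 * to make the naming convention concrete):
 *
 *	static inline bool ____is_cr0_pg(struct kvm_mmu_role_regs *regs)
 *	{
 *		return !!(regs->cr0 & X86_CR0_PG);
 *	}
 *	static inline bool is_cr0_pg(struct kvm_mmu *mmu)
 *	{
 *		return !!(mmu->mmu_role.ext.cr0_pg);
 *	}
 *
 * The four-underscore helpers read the raw registers captured in
 * kvm_mmu_role_regs; the short helpers read the constructed mmu_role.
 */
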
static int role_regs_to_root_level(struct kvm_mmu_role_regs *regs)
{
	if (!____is_cr0_pg(regs))
		return 0;
	else if (____is_efer_lma(regs))
		return ____is_cr4_la57(regs) ? PT64_ROOT_5LEVEL :
					       PT64_ROOT_4LEVEL;
	else if (____is_cr4_pae(regs))
		return PT32E_ROOT_LEVEL;
	else
		return PT32_ROOT_LEVEL;
}

static inline bool kvm_available_flush_tlb_with_range(void)
{
	return kvm_x86_ops.tlb_remote_flush_with_range;
}

static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm,
		struct kvm_tlb_range *range)
{
	int ret = -ENOTSUPP;

	if (range && kvm_x86_ops.tlb_remote_flush_with_range)
		ret = static_call(kvm_x86_tlb_remote_flush_with_range)(kvm, range);

	if (ret)
		kvm_flush_remote_tlbs(kvm);
}

void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
		u64 start_gfn, u64 pages)
{
	struct kvm_tlb_range range;

	range.start_gfn = start_gfn;
	range.pages = pages;

	kvm_flush_remote_tlbs_with_range(kvm, &range);
}

static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
			   unsigned int access)
{
	u64 spte = make_mmio_spte(vcpu, gfn, access);

	trace_mark_mmio_spte(sptep, gfn, spte);
	mmu_spte_set(sptep, spte);
}

static gfn_t get_mmio_spte_gfn(u64 spte)
{
	u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;

	gpa |= (spte >> SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)
	       & shadow_nonpresent_or_rsvd_mask;

	return gpa >> PAGE_SHIFT;
}

static unsigned get_mmio_spte_access(u64 spte)
{
	return spte & shadow_mmio_access_mask;
}

static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
{
	u64 kvm_gen, spte_gen, gen;

	gen = kvm_vcpu_memslots(vcpu)->generation;
	if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
		return false;

	kvm_gen = gen & MMIO_SPTE_GEN_MASK;
	spte_gen = get_mmio_spte_generation(spte);

	trace_check_mmio_spte(spte, kvm_gen, spte_gen);
	return likely(kvm_gen == spte_gen);
}

static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
			   struct x86_exception *exception)
{
	/* Check if guest physical address doesn't exceed guest maximum */
	if (kvm_vcpu_is_illegal_gpa(vcpu, gpa)) {
		exception->error_code |= PFERR_RSVD_MASK;
		return UNMAPPED_GVA;
	}

	return gpa;
}

static int is_cpuid_PSE36(void)
{
	return 1;
}

static gfn_t pse36_gfn_delta(u32 gpte)
{
	int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;

	return (gpte & PT32_DIR_PSE36_MASK) << shift;
}

#ifdef CONFIG_X86_64
static void __set_spte(u64 *sptep, u64 spte)
{
	WRITE_ONCE(*sptep, spte);
}

static void __update_clear_spte_fast(u64 *sptep, u64 spte)
{
	WRITE_ONCE(*sptep, spte);
}

static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
{
	return xchg(sptep, spte);
}

static u64 __get_spte_lockless(u64 *sptep)
{
	return READ_ONCE(*sptep);
}
#else
union split_spte {
	struct {
		u32 spte_low;
		u32 spte_high;
	};
	u64 spte;
};

static void count_spte_clear(u64 *sptep, u64 spte)
{
	struct kvm_mmu_page *sp = sptep_to_sp(sptep);

	if (is_shadow_present_pte(spte))
		return;

	/* Ensure the spte is completely set before we increase the count */
	smp_wmb();
	sp->clear_spte_count++;
}

static void __set_spte(u64 *sptep, u64 spte)
{
	union split_spte *ssptep, sspte;

	ssptep = (union split_spte *)sptep;
	sspte = (union split_spte)spte;

	ssptep->spte_high = sspte.spte_high;

	/*
	 * If we map the spte from nonpresent to present, we should store
	 * the high bits first, then set the present bit, so the CPU cannot
	 * fetch this spte while we are setting it.
	 */
	smp_wmb();

	WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
}

static void __update_clear_spte_fast(u64 *sptep, u64 spte)
{
	union split_spte *ssptep, sspte;

	ssptep = (union split_spte *)sptep;
	sspte = (union split_spte)spte;

	WRITE_ONCE(ssptep->spte_low, sspte.spte_low);

	/*
	 * If we map the spte from present to nonpresent, we should clear
	 * the present bit first to avoid the vcpu fetching the old high bits.
	 */
	smp_wmb();

	ssptep->spte_high = sspte.spte_high;
	count_spte_clear(sptep, spte);
}

static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
{
	union split_spte *ssptep, sspte, orig;

	ssptep = (union split_spte *)sptep;
	sspte = (union split_spte)spte;

	/* xchg acts as a barrier before the setting of the high bits */
	orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
	orig.spte_high = ssptep->spte_high;
	ssptep->spte_high = sspte.spte_high;
	count_spte_clear(sptep, spte);

	return orig.spte;
}

/*
 * The idea of using this lightweight way to get the spte on x86_32 guests
 * comes from gup_get_pte (mm/gup.c).
 *
 * An spte tlb flush may be pending, because kvm_set_pte_rmapp
 * coalesces them and we are running outside of the MMU lock.  Therefore
 * we need to protect against in-progress updates of the spte.
 *
 * Reading the spte while an update is in progress may get the old value
 * for the high part of the spte.  The race is fine for a present->non-present
 * change (because the high part of the spte is ignored for non-present sptes),
 * but for a present->present change we must reread the spte.
 *
 * All such changes are done in two steps (present->non-present and
 * non-present->present), hence it is enough to count the number of
 * present->non-present updates: if it changed while reading the spte,
 * we might have hit the race.  This is done using clear_spte_count.
 */
static u64 __get_spte_lockless(u64 *sptep)
{
	struct kvm_mmu_page *sp = sptep_to_sp(sptep);
	union split_spte spte, *orig = (union split_spte *)sptep;
	int count;

retry:
	count = sp->clear_spte_count;
	smp_rmb();

	spte.spte_low = orig->spte_low;
	smp_rmb();

	spte.spte_high = orig->spte_high;
	smp_rmb();

	if (unlikely(spte.spte_low != orig->spte_low ||
		     count != sp->clear_spte_count))
		goto retry;

	return spte.spte;
}
#endif

static bool spte_has_volatile_bits(u64 spte)
{
	if (!is_shadow_present_pte(spte))
		return false;

	/*
	 * Always atomically update the spte if it can be updated
	 * outside of the mmu-lock: this ensures the dirty bit is not lost
	 * and also gives us a stable is_writable_pte(), so that a needed
	 * TLB flush is not missed.
	 */
	if (spte_can_locklessly_be_made_writable(spte) ||
	    is_access_track_spte(spte))
		return true;

	if (spte_ad_enabled(spte)) {
		if ((spte & shadow_accessed_mask) == 0 ||
		    (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
			return true;
	}

	return false;
}

/* Rules for using mmu_spte_set:
 * Set the sptep from nonpresent to present.
 * Note: the sptep being assigned *must* be either not present
 * or in a state where the hardware will not attempt to update
 * the spte.
 */
static void mmu_spte_set(u64 *sptep, u64 new_spte)
{
	WARN_ON(is_shadow_present_pte(*sptep));
	__set_spte(sptep, new_spte);
}

/*
 * Update the SPTE (excluding the PFN), but do not track changes in its
 * accessed/dirty status.
 */
static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
{
	u64 old_spte = *sptep;

	WARN_ON(!is_shadow_present_pte(new_spte));

	if (!is_shadow_present_pte(old_spte)) {
		mmu_spte_set(sptep, new_spte);
		return old_spte;
	}

	if (!spte_has_volatile_bits(old_spte))
		__update_clear_spte_fast(sptep, new_spte);
	else
		old_spte = __update_clear_spte_slow(sptep, new_spte);

	WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));

	return old_spte;
}

/* Rules for using mmu_spte_update:
 * Update the state bits; the mapped pfn is not changed.
 *
 * Whenever we overwrite a writable spte with a read-only one we
 * should flush remote TLBs.  Otherwise rmap_write_protect
 * will find a read-only spte, even though the writable spte
 * might be cached in a CPU's TLB; the return value indicates this
 * case.
 *
 * Returns true if the TLB needs to be flushed
 */
static bool mmu_spte_update(u64 *sptep, u64 new_spte)
{
	bool flush = false;
	u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);

	if (!is_shadow_present_pte(old_spte))
		return false;

	/*
	 * Updating the spte outside of the mmu-lock is safe, since
	 * we always update it atomically; see the comments in
	 * spte_has_volatile_bits().
	 */
	if (spte_can_locklessly_be_made_writable(old_spte) &&
	    !is_writable_pte(new_spte))
		flush = true;

	/*
	 * Flush TLB when accessed/dirty states are changed in the page tables,
	 * to guarantee consistency between TLB and page tables.
	 */

	if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
		flush = true;
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
	}

	if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
		flush = true;
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
	}

	return flush;
}

/*
 * Rules for using mmu_spte_clear_track_bits:
 * It sets the sptep from present to nonpresent and tracks the
 * state bits; it is used to clear a last-level sptep.
 * Returns non-zero if the PTE was previously valid.
 */
static int mmu_spte_clear_track_bits(u64 *sptep)
{
	kvm_pfn_t pfn;
	u64 old_spte = *sptep;

	if (!spte_has_volatile_bits(old_spte))
		__update_clear_spte_fast(sptep, 0ull);
	else
		old_spte = __update_clear_spte_slow(sptep, 0ull);

	if (!is_shadow_present_pte(old_spte))
		return 0;

	pfn = spte_to_pfn(old_spte);

	/*
	 * KVM does not hold a refcount on the pages used by the kvm mmu;
	 * before a page is reclaimed, it must be unmapped from the mmu
	 * first.
	 */
	WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));

	if (is_accessed_spte(old_spte))
		kvm_set_pfn_accessed(pfn);

	if (is_dirty_spte(old_spte))
		kvm_set_pfn_dirty(pfn);

	return 1;
}

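/*
 * Taken together, the helpers above cover the three SPTE transitions:
 * mmu_spte_set() installs a nonpresent->present spte, mmu_spte_update()
 * handles present->present changes (returning whether a TLB flush is
 * needed), and mmu_spte_clear_track_bits() tears a present spte down to
 * nonpresent while propagating its accessed/dirty state to the backing pfn.
 */
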
/*
 * Rules for using mmu_spte_clear_no_track:
 * Directly clear the spte without caring about the state bits of the sptep;
 * it is used to set the upper level spte.
 */
static void mmu_spte_clear_no_track(u64 *sptep)
{
	__update_clear_spte_fast(sptep, 0ull);
}

static u64 mmu_spte_get_lockless(u64 *sptep)
{
	return __get_spte_lockless(sptep);
}

/* Restore an acc-track PTE back to a regular PTE */
static u64 restore_acc_track_spte(u64 spte)
{
	u64 new_spte = spte;
	u64 saved_bits = (spte >> SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
			 & SHADOW_ACC_TRACK_SAVED_BITS_MASK;

	WARN_ON_ONCE(spte_ad_enabled(spte));
	WARN_ON_ONCE(!is_access_track_spte(spte));

	new_spte &= ~shadow_acc_track_mask;
	new_spte &= ~(SHADOW_ACC_TRACK_SAVED_BITS_MASK <<
		      SHADOW_ACC_TRACK_SAVED_BITS_SHIFT);
	new_spte |= saved_bits;

	return new_spte;
}

/* Returns the Accessed status of the PTE and resets it at the same time. */
static bool mmu_spte_age(u64 *sptep)
{
	u64 spte = mmu_spte_get_lockless(sptep);

	if (!is_accessed_spte(spte))
		return false;

	if (spte_ad_enabled(spte)) {
		clear_bit((ffs(shadow_accessed_mask) - 1),
			  (unsigned long *)sptep);
	} else {
		/*
		 * Capture the dirty status of the page, so that it doesn't get
		 * lost when the SPTE is marked for access tracking.
		 */
		if (is_writable_pte(spte))
			kvm_set_pfn_dirty(spte_to_pfn(spte));

		spte = mark_spte_for_access_track(spte);
		mmu_spte_update_no_track(sptep, spte);
	}

	return true;
}

static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
{
	/*
	 * Prevent page table teardown by making any free-er wait during
	 * kvm_flush_remote_tlbs() IPI to all active vcpus.
	 */
	local_irq_disable();

	/*
	 * Make sure a following spte read is not reordered ahead of the write
	 * to vcpu->mode.
	 */
	smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
}

static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
{
	/*
	 * Make sure the write to vcpu->mode is not reordered in front of
	 * reads to sptes.  If it does, kvm_mmu_commit_zap_page() can see us
	 * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
	 */
	smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
	local_irq_enable();
}

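/*
 * A sketch of how these brackets are meant to be paired with the lockless
 * iterator defined earlier (the variable names here are illustrative only):
 *
 *	walk_shadow_page_lockless_begin(vcpu);
 *	for_each_shadow_entry_lockless(vcpu, addr, iterator, spte)
 *		leaf_level = iterator.level;
 *	walk_shadow_page_lockless_end(vcpu);
 *
 * Everything between begin/end runs with IRQs disabled and vcpu->mode set to
 * READING_SHADOW_PAGE_TABLES, which is what keeps the shadow page tables
 * from being freed out from under the walker.
 */
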
static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
{
	int r;

	/* 1 rmap, 1 parent PTE per level, and the prefetched rmaps. */
	r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
				       1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
	if (r)
		return r;
	r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
				       PT64_ROOT_MAX_LEVEL);
	if (r)
		return r;
	if (maybe_indirect) {
		r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_gfn_array_cache,
					       PT64_ROOT_MAX_LEVEL);
		if (r)
			return r;
	}
	return kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
					  PT64_ROOT_MAX_LEVEL);
}

static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_gfn_array_cache);
	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
}

static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
{
	return kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
}

static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
{
	kmem_cache_free(pte_list_desc_cache, pte_list_desc);
}

static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
{
	if (!sp->role.direct)
		return sp->gfns[index];

	return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
}

static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
{
	if (!sp->role.direct) {
		sp->gfns[index] = gfn;
		return;
	}

	if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index)))
		pr_err_ratelimited("gfn mismatch under direct page %llx "
				   "(expected %llx, got %llx)\n",
				   sp->gfn,
				   kvm_mmu_page_get_gfn(sp, index), gfn);
}

/*
 * Return the pointer to the large page information for a given gfn,
 * handling slots that are not large page aligned.
 */
static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
		const struct kvm_memory_slot *slot, int level)
{
	unsigned long idx;

	idx = gfn_to_index(gfn, slot->base_gfn, level);
	return &slot->arch.lpage_info[level - 2][idx];
}

static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot,
					    gfn_t gfn, int count)
{
	struct kvm_lpage_info *linfo;
	int i;

	for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
		linfo = lpage_info_slot(gfn, slot, i);
		linfo->disallow_lpage += count;
		WARN_ON(linfo->disallow_lpage < 0);
	}
}

void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
{
	update_gfn_disallow_lpage_count(slot, gfn, 1);
}

void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
{
	update_gfn_disallow_lpage_count(slot, gfn, -1);
}

static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *slot;
	gfn_t gfn;

	kvm->arch.indirect_shadow_pages++;
	gfn = sp->gfn;
	slots = kvm_memslots_for_spte_role(kvm, sp->role);
	slot = __gfn_to_memslot(slots, gfn);

	/* the non-leaf shadow pages are kept read-only. */
	if (sp->role.level > PG_LEVEL_4K)
		return kvm_slot_page_track_add_page(kvm, slot, gfn,
						    KVM_PAGE_TRACK_WRITE);

	kvm_mmu_gfn_disallow_lpage(slot, gfn);
}

void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	if (sp->lpage_disallowed)
		return;

	++kvm->stat.nx_lpage_splits;
	list_add_tail(&sp->lpage_disallowed_link,
		      &kvm->arch.lpage_disallowed_mmu_pages);
	sp->lpage_disallowed = true;
}

static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *slot;
	gfn_t gfn;

	kvm->arch.indirect_shadow_pages--;
	gfn = sp->gfn;
	slots = kvm_memslots_for_spte_role(kvm, sp->role);
	slot = __gfn_to_memslot(slots, gfn);
	if (sp->role.level > PG_LEVEL_4K)
		return kvm_slot_page_track_remove_page(kvm, slot, gfn,
						       KVM_PAGE_TRACK_WRITE);

	kvm_mmu_gfn_allow_lpage(slot, gfn);
}

void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	--kvm->stat.nx_lpage_splits;
	sp->lpage_disallowed = false;
	list_del(&sp->lpage_disallowed_link);
}

static struct kvm_memory_slot *
gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
			    bool no_dirty_log)
{
	struct kvm_memory_slot *slot;

	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
		return NULL;
	if (no_dirty_log && kvm_slot_dirty_track_enabled(slot))
		return NULL;

	return slot;
}

/*
 * About rmap_head encoding:
 *
 * If the bit zero of rmap_head->val is clear, then it points to the only spte
 * in this rmap chain.  Otherwise, (rmap_head->val & ~1) points to a struct
 * pte_list_desc containing more mappings.
 */

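/*
 * A minimal sketch of that encoding (relying only on the pointer alignment
 * of sptes and pte_list_desc, which leaves bit zero free):
 *
 *	one spte:	rmap_head->val == (unsigned long)sptep;		bit 0 clear
 *	many sptes:	rmap_head->val == (unsigned long)desc | 1;	bit 0 set
 *
 * so a reader first tests bit zero and then either dereferences the single
 * spte pointer or walks the pte_list_desc chain via desc->more.
 */
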
/*
 * Returns the number of pointers in the rmap chain, not counting the new one.
 */
static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
			struct kvm_rmap_head *rmap_head)
{
	struct pte_list_desc *desc;
	int i, count = 0;

	if (!rmap_head->val) {
		rmap_printk("%p %llx 0->1\n", spte, *spte);
		rmap_head->val = (unsigned long)spte;
	} else if (!(rmap_head->val & 1)) {
		rmap_printk("%p %llx 1->many\n", spte, *spte);
		desc = mmu_alloc_pte_list_desc(vcpu);
		desc->sptes[0] = (u64 *)rmap_head->val;
		desc->sptes[1] = spte;
		rmap_head->val = (unsigned long)desc | 1;
		++count;
	} else {
		rmap_printk("%p %llx many->many\n", spte, *spte);
		desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
		while (desc->sptes[PTE_LIST_EXT-1]) {
			count += PTE_LIST_EXT;

			if (!desc->more) {
				desc->more = mmu_alloc_pte_list_desc(vcpu);
				desc = desc->more;
				break;
			}
			desc = desc->more;
		}
		for (i = 0; desc->sptes[i]; ++i)
			++count;
		desc->sptes[i] = spte;
	}
	return count;
}

static void
pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
			   struct pte_list_desc *desc, int i,
			   struct pte_list_desc *prev_desc)
{
	int j;

	for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j)
		;
	desc->sptes[i] = desc->sptes[j];
	desc->sptes[j] = NULL;
	if (j != 0)
		return;
	if (!prev_desc && !desc->more)
		rmap_head->val = 0;
	else
		if (prev_desc)
			prev_desc->more = desc->more;
		else
			rmap_head->val = (unsigned long)desc->more | 1;
	mmu_free_pte_list_desc(desc);
}

static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
{
	struct pte_list_desc *desc;
	struct pte_list_desc *prev_desc;
	int i;

	if (!rmap_head->val) {
		pr_err("%s: %p 0->BUG\n", __func__, spte);
		BUG();
	} else if (!(rmap_head->val & 1)) {
		rmap_printk("%p 1->0\n", spte);
		if ((u64 *)rmap_head->val != spte) {
			pr_err("%s: %p 1->BUG\n", __func__, spte);
			BUG();
		}
		rmap_head->val = 0;
	} else {
		rmap_printk("%p many->many\n", spte);
		desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
		prev_desc = NULL;
		while (desc) {
			for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
				if (desc->sptes[i] == spte) {
					pte_list_desc_remove_entry(rmap_head,
							desc, i, prev_desc);
					return;
				}
			}
			prev_desc = desc;
			desc = desc->more;
		}
		pr_err("%s: %p many->many\n", __func__, spte);
		BUG();
	}
}

static void pte_list_remove(struct kvm_rmap_head *rmap_head, u64 *sptep)
{
	mmu_spte_clear_track_bits(sptep);
	__pte_list_remove(sptep, rmap_head);
}

static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level,
					   struct kvm_memory_slot *slot)
{
	unsigned long idx;

	idx = gfn_to_index(gfn, slot->base_gfn, level);
	return &slot->arch.rmap[level - PG_LEVEL_4K][idx];
}

static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn,
					 struct kvm_mmu_page *sp)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *slot;

	slots = kvm_memslots_for_spte_role(kvm, sp->role);
	slot = __gfn_to_memslot(slots, gfn);
	return __gfn_to_rmap(gfn, sp->role.level, slot);
}

static bool rmap_can_add(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_memory_cache *mc;

	mc = &vcpu->arch.mmu_pte_list_desc_cache;
	return kvm_mmu_memory_cache_nr_free_objects(mc);
}

static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
{
	struct kvm_mmu_page *sp;
	struct kvm_rmap_head *rmap_head;

	sp = sptep_to_sp(spte);
	kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
	rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
	return pte_list_add(vcpu, spte, rmap_head);
}

static void rmap_remove(struct kvm *kvm, u64 *spte)
{
	struct kvm_mmu_page *sp;
	gfn_t gfn;
	struct kvm_rmap_head *rmap_head;

	sp = sptep_to_sp(spte);
	gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
	rmap_head = gfn_to_rmap(kvm, gfn, sp);
	__pte_list_remove(spte, rmap_head);
}

/*
 * Used by the following functions to iterate through the sptes linked by a
 * rmap.  All fields are private and not assumed to be used outside.
 */
struct rmap_iterator {
	/* private fields */
	struct pte_list_desc *desc;	/* holds the sptep if not NULL */
	int pos;			/* index of the sptep */
};

/*
 * Iteration must be started by this function.  This should also be used after
 * removing/dropping sptes from the rmap link because in such cases the
 * information in the iterator may not be valid.
 *
 * Returns sptep if found, NULL otherwise.
 */
static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
			   struct rmap_iterator *iter)
{
	u64 *sptep;

	if (!rmap_head->val)
		return NULL;

	if (!(rmap_head->val & 1)) {
		iter->desc = NULL;
		sptep = (u64 *)rmap_head->val;
		goto out;
	}

	iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
	iter->pos = 0;
	sptep = iter->desc->sptes[iter->pos];
out:
	BUG_ON(!is_shadow_present_pte(*sptep));
	return sptep;
}

/*
 * Must be used with a valid iterator: e.g. after rmap_get_first().
 *
 * Returns sptep if found, NULL otherwise.
 */
static u64 *rmap_get_next(struct rmap_iterator *iter)
{
	u64 *sptep;

	if (iter->desc) {
		if (iter->pos < PTE_LIST_EXT - 1) {
			++iter->pos;
			sptep = iter->desc->sptes[iter->pos];
			if (sptep)
				goto out;
		}

		iter->desc = iter->desc->more;

		if (iter->desc) {
			iter->pos = 0;
			/* desc->sptes[0] cannot be NULL */
			sptep = iter->desc->sptes[iter->pos];
			goto out;
		}
	}

	return NULL;
out:
	BUG_ON(!is_shadow_present_pte(*sptep));
	return sptep;
}

#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_)			\
	for (_spte_ = rmap_get_first(_rmap_head_, _iter_);		\
	     _spte_; _spte_ = rmap_get_next(_iter_))

static void drop_spte(struct kvm *kvm, u64 *sptep)
{
	if (mmu_spte_clear_track_bits(sptep))
		rmap_remove(kvm, sptep);
}


static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
{
	if (is_large_pte(*sptep)) {
		WARN_ON(sptep_to_sp(sptep)->role.level == PG_LEVEL_4K);
		drop_spte(kvm, sptep);
		--kvm->stat.lpages;
		return true;
	}

	return false;
}

static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
{
	if (__drop_large_spte(vcpu->kvm, sptep)) {
		struct kvm_mmu_page *sp = sptep_to_sp(sptep);

		kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
			KVM_PAGES_PER_HPAGE(sp->role.level));
	}
}

/*
 * Write-protect the specified @sptep; @pt_protect indicates whether the
 * spte write-protection is caused by protecting a shadow page table.
 *
 * Note: write protection is different for dirty logging and spte
 * protection:
 * - for dirty logging, the spte can be set to writable at anytime if
 *   its dirty bitmap is properly set.
 * - for spte protection, the spte can be writable only after unsync-ing
 *   shadow page.
 *
 * Return true if the TLB needs to be flushed.
 */
static bool spte_write_protect(u64 *sptep, bool pt_protect)
{
	u64 spte = *sptep;

	if (!is_writable_pte(spte) &&
	    !(pt_protect && spte_can_locklessly_be_made_writable(spte)))
		return false;

	rmap_printk("spte %p %llx\n", sptep, *sptep);

	if (pt_protect)
		spte &= ~shadow_mmu_writable_mask;
	spte = spte & ~PT_WRITABLE_MASK;

	return mmu_spte_update(sptep, spte);
}

static bool __rmap_write_protect(struct kvm *kvm,
				 struct kvm_rmap_head *rmap_head,
				 bool pt_protect)
{
	u64 *sptep;
	struct rmap_iterator iter;
	bool flush = false;

	for_each_rmap_spte(rmap_head, &iter, sptep)
		flush |= spte_write_protect(sptep, pt_protect);

	return flush;
}

static bool spte_clear_dirty(u64 *sptep)
{
	u64 spte = *sptep;

	rmap_printk("spte %p %llx\n", sptep, *sptep);

	MMU_WARN_ON(!spte_ad_enabled(spte));
	spte &= ~shadow_dirty_mask;
	return mmu_spte_update(sptep, spte);
}

static bool spte_wrprot_for_clear_dirty(u64 *sptep)
{
	bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
					       (unsigned long *)sptep);
	if (was_writable && !spte_ad_enabled(*sptep))
		kvm_set_pfn_dirty(spte_to_pfn(*sptep));

	return was_writable;
}

/*
 * Gets the GFN ready for another round of dirty logging by clearing the
 *	- D bit on ad-enabled SPTEs, and
 *	- W bit on ad-disabled SPTEs.
 * Returns true iff any D or W bits were cleared.
 */
static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
			       struct kvm_memory_slot *slot)
{
	u64 *sptep;
	struct rmap_iterator iter;
	bool flush = false;

	for_each_rmap_spte(rmap_head, &iter, sptep)
		if (spte_ad_need_write_protect(*sptep))
			flush |= spte_wrprot_for_clear_dirty(sptep);
		else
			flush |= spte_clear_dirty(sptep);

	return flush;
}

/**
 * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
 * @kvm: kvm instance
 * @slot: slot to protect
 * @gfn_offset: start of the BITS_PER_LONG pages we care about
 * @mask: indicates which pages we should protect
 *
 * Used when we do not need to care about huge page mappings.
 */
static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
				     struct kvm_memory_slot *slot,
				     gfn_t gfn_offset, unsigned long mask)
{
	struct kvm_rmap_head *rmap_head;

	if (is_tdp_mmu_enabled(kvm))
		kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
				slot->base_gfn + gfn_offset, mask, true);

	if (!kvm_memslots_have_rmaps(kvm))
		return;

	while (mask) {
		rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
					  PG_LEVEL_4K, slot);
		__rmap_write_protect(kvm, rmap_head, false);

		/* clear the first set bit */
		mask &= mask - 1;
	}
}

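/*
 * To make the mask handling above concrete: with gfn_offset == 0 and
 * mask == 0b1010, the loop write-protects slot->base_gfn + 1 and
 * slot->base_gfn + 3; __ffs(mask) picks the lowest set bit and
 * "mask &= mask - 1" clears it, so each set bit is visited exactly once.
 */
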
/**
 * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
 * protect the page if the D-bit isn't supported.
 * @kvm: kvm instance
 * @slot: slot to clear D-bit
 * @gfn_offset: start of the BITS_PER_LONG pages we care about
 * @mask: indicates which pages we should clear D-bit
 *
 * Used for PML to re-log the dirty GPAs after userspace queries the dirty_bitmap.
 */
static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
					  struct kvm_memory_slot *slot,
					  gfn_t gfn_offset, unsigned long mask)
{
	struct kvm_rmap_head *rmap_head;

	if (is_tdp_mmu_enabled(kvm))
		kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
				slot->base_gfn + gfn_offset, mask, false);

	if (!kvm_memslots_have_rmaps(kvm))
		return;

	while (mask) {
		rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
					  PG_LEVEL_4K, slot);
		__rmap_clear_dirty(kvm, rmap_head, slot);

		/* clear the first set bit */
		mask &= mask - 1;
	}
}

/**
 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
 * PT level pages.
 *
 * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
 * enable dirty logging for them.
 *
 * We need to care about huge page mappings: e.g. during dirty logging we may
 * have such mappings.
 */
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
				struct kvm_memory_slot *slot,
				gfn_t gfn_offset, unsigned long mask)
{
	/*
	 * Huge pages are NOT write protected when we start dirty logging in
	 * initially-all-set mode; must write protect them here so that they
	 * are split to 4K on the first write.
	 *
	 * The gfn_offset is guaranteed to be aligned to 64, but the base_gfn
	 * of memslot has no such restriction, so the range can cross two large
	 * pages.
	 */
	if (kvm_dirty_log_manual_protect_and_init_set(kvm)) {
		gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask);
		gfn_t end = slot->base_gfn + gfn_offset + __fls(mask);

		kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M);

		/* Cross two large pages? */
		if (ALIGN(start << PAGE_SHIFT, PMD_SIZE) !=
		    ALIGN(end << PAGE_SHIFT, PMD_SIZE))
			kvm_mmu_slot_gfn_write_protect(kvm, slot, end,
						       PG_LEVEL_2M);
	}

	/* Now handle 4K PTEs. */
	if (kvm_x86_ops.cpu_dirty_log_size)
		kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask);
	else
		kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
}

int kvm_cpu_dirty_log_size(void)
{
	return kvm_x86_ops.cpu_dirty_log_size;
}

bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
				    struct kvm_memory_slot *slot, u64 gfn,
				    int min_level)
{
	struct kvm_rmap_head *rmap_head;
	int i;
	bool write_protected = false;

	if (kvm_memslots_have_rmaps(kvm)) {
		for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
			rmap_head = __gfn_to_rmap(gfn, i, slot);
			write_protected |= __rmap_write_protect(kvm, rmap_head, true);
		}
	}

	if (is_tdp_mmu_enabled(kvm))
		write_protected |=
			kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, min_level);

	return write_protected;
}

static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
{
	struct kvm_memory_slot *slot;

	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
	return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K);
}

static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
			  struct kvm_memory_slot *slot)
{
	u64 *sptep;
	struct rmap_iterator iter;
	bool flush = false;

	while ((sptep = rmap_get_first(rmap_head, &iter))) {
		rmap_printk("spte %p %llx.\n", sptep, *sptep);

		pte_list_remove(rmap_head, sptep);
		flush = true;
	}

	return flush;
}

static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
			    struct kvm_memory_slot *slot, gfn_t gfn, int level,
			    pte_t unused)
{
	return kvm_zap_rmapp(kvm, rmap_head, slot);
}

static bool kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
			      struct kvm_memory_slot *slot, gfn_t gfn, int level,
			      pte_t pte)
{
	u64 *sptep;
	struct rmap_iterator iter;
	int need_flush = 0;
	u64 new_spte;
	kvm_pfn_t new_pfn;

	WARN_ON(pte_huge(pte));
	new_pfn = pte_pfn(pte);

restart:
	for_each_rmap_spte(rmap_head, &iter, sptep) {
		rmap_printk("spte %p %llx gfn %llx (%d)\n",
			    sptep, *sptep, gfn, level);

		need_flush = 1;

		if (pte_write(pte)) {
			pte_list_remove(rmap_head, sptep);
			goto restart;
		} else {
			new_spte = kvm_mmu_changed_pte_notifier_make_spte(
					*sptep, new_pfn);

			mmu_spte_clear_track_bits(sptep);
			mmu_spte_set(sptep, new_spte);
		}
	}

	if (need_flush && kvm_available_flush_tlb_with_range()) {
		kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
		return 0;
	}

	return need_flush;
}

struct slot_rmap_walk_iterator {
	/* input fields. */
	struct kvm_memory_slot *slot;
	gfn_t start_gfn;
	gfn_t end_gfn;
	int start_level;
	int end_level;

	/* output fields. */
	gfn_t gfn;
	struct kvm_rmap_head *rmap;
	int level;

	/* private field. */
	struct kvm_rmap_head *end_rmap;
};

static void
rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
{
	iterator->level = level;
	iterator->gfn = iterator->start_gfn;
	iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot);
	iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level,
					   iterator->slot);
}

static void
slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
		    struct kvm_memory_slot *slot, int start_level,
		    int end_level, gfn_t start_gfn, gfn_t end_gfn)
{
	iterator->slot = slot;
	iterator->start_level = start_level;
	iterator->end_level = end_level;
	iterator->start_gfn = start_gfn;
	iterator->end_gfn = end_gfn;

	rmap_walk_init_level(iterator, iterator->start_level);
}

static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
{
	return !!iterator->rmap;
}

static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
{
	if (++iterator->rmap <= iterator->end_rmap) {
		iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
		return;
	}

	if (++iterator->level > iterator->end_level) {
		iterator->rmap = NULL;
		return;
	}

	rmap_walk_init_level(iterator, iterator->level);
}

#define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_,	\
	   _start_gfn, _end_gfn, _iter_)				\
	for (slot_rmap_walk_init(_iter_, _slot_, _start_level_,		\
				 _end_level_, _start_gfn, _end_gfn);	\
	     slot_rmap_walk_okay(_iter_);				\
	     slot_rmap_walk_next(_iter_))

typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
			       struct kvm_memory_slot *slot, gfn_t gfn,
			       int level, pte_t pte);

static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm,
						 struct kvm_gfn_range *range,
						 rmap_handler_t handler)
{
	struct slot_rmap_walk_iterator iterator;
	bool ret = false;

	for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
				 range->start, range->end - 1, &iterator)
		ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn,
			       iterator.level, range->pte);

	return ret;
}

bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	bool flush = false;

	if (kvm_memslots_have_rmaps(kvm))
		flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp);

	if (is_tdp_mmu_enabled(kvm))
		flush |= kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);

	return flush;
}

bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	bool flush = false;

	if (kvm_memslots_have_rmaps(kvm))
		flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmapp);

	if (is_tdp_mmu_enabled(kvm))
		flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range);

	return flush;
}

static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
			  struct kvm_memory_slot *slot, gfn_t gfn, int level,
			  pte_t unused)
{
	u64 *sptep;
	struct rmap_iterator iter;
	int young = 0;

	for_each_rmap_spte(rmap_head, &iter, sptep)
		young |= mmu_spte_age(sptep);

	return young;
}

static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
			       struct kvm_memory_slot *slot, gfn_t gfn,
			       int level, pte_t unused)
{
	u64 *sptep;
	struct rmap_iterator iter;

	for_each_rmap_spte(rmap_head, &iter, sptep)
		if (is_accessed_spte(*sptep))
			return 1;
	return 0;
}

#define RMAP_RECYCLE_THRESHOLD 1000

static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
{
	struct kvm_rmap_head *rmap_head;
	struct kvm_mmu_page *sp;

	sp = sptep_to_sp(spte);

	rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);

	kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0));
	kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
			KVM_PAGES_PER_HPAGE(sp->role.level));
}

bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	bool young = false;

	if (kvm_memslots_have_rmaps(kvm))
		young = kvm_handle_gfn_range(kvm, range, kvm_age_rmapp);

	if (is_tdp_mmu_enabled(kvm))
		young |= kvm_tdp_mmu_age_gfn_range(kvm, range);

	return young;
}

bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	bool young = false;

	if (kvm_memslots_have_rmaps(kvm))
		young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmapp);

	if (is_tdp_mmu_enabled(kvm))
		young |= kvm_tdp_mmu_test_age_gfn(kvm, range);

	return young;
}

#ifdef MMU_DEBUG
static int is_empty_shadow_page(u64 *spt)
{
	u64 *pos;
	u64 *end;

	for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
		if (is_shadow_present_pte(*pos)) {
			printk(KERN_ERR "%s: %p %llx\n", __func__,
			       pos, *pos);
			return 0;
		}
	return 1;
}
#endif

/*
 * This value is the sum of all of the kvm instances'
 * kvm->arch.n_used_mmu_pages values.  We need a global,
 * aggregate version in order to make the slab shrinker
 * faster.
 */
static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, unsigned long nr)
{
	kvm->arch.n_used_mmu_pages += nr;
	percpu_counter_add(&kvm_total_used_mmu_pages, nr);
}

static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
{
	MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
	hlist_del(&sp->hash_link);
	list_del(&sp->link);
	free_page((unsigned long)sp->spt);
	if (!sp->role.direct)
		free_page((unsigned long)sp->gfns);
	kmem_cache_free(mmu_page_header_cache, sp);
}

static unsigned kvm_page_table_hashfn(gfn_t gfn)
{
	return hash_64(gfn, KVM_MMU_HASH_SHIFT);
}

static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
				    struct kvm_mmu_page *sp, u64 *parent_pte)
{
	if (!parent_pte)
		return;

	pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
}

static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
				       u64 *parent_pte)
{
	__pte_list_remove(parent_pte, &sp->parent_ptes);
}

static void drop_parent_pte(struct kvm_mmu_page *sp,
			    u64 *parent_pte)
{
	mmu_page_remove_parent_pte(sp, parent_pte);
	mmu_spte_clear_no_track(parent_pte);
}

static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct)
{
	struct kvm_mmu_page *sp;

	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
	if (!direct)
		sp->gfns = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache);
	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

	/*
	 * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
	 * depends on valid pages being added to the head of the list.  See
	 * comments in kvm_zap_obsolete_pages().
	 */
	sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
	kvm_mod_used_mmu_pages(vcpu->kvm, +1);
	return sp;
}

static void mark_unsync(u64 *spte);
static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
{
	u64 *sptep;
	struct rmap_iterator iter;

	for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
		mark_unsync(sptep);
	}
}

static void mark_unsync(u64 *spte)
{
	struct kvm_mmu_page *sp;
	unsigned int index;

	sp = sptep_to_sp(spte);
	index = spte - sp->spt;
	if (__test_and_set_bit(index, sp->unsync_child_bitmap))
		return;
	if (sp->unsync_children++)
		return;
	kvm_mmu_mark_parents_unsync(sp);
}

static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
			       struct kvm_mmu_page *sp)
{
	return 0;
}

#define KVM_PAGE_ARRAY_NR 16

struct kvm_mmu_pages {
	struct mmu_page_and_offset {
		struct kvm_mmu_page *sp;
		unsigned int idx;
	} page[KVM_PAGE_ARRAY_NR];
	unsigned int nr;
};

static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
			 int idx)
{
	int i;

	if (sp->unsync)
		for (i = 0; i < pvec->nr; i++)
			if (pvec->page[i].sp == sp)
				return 0;

	pvec->page[pvec->nr].sp = sp;
	pvec->page[pvec->nr].idx = idx;
	pvec->nr++;
	return (pvec->nr == KVM_PAGE_ARRAY_NR);
}

static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
{
	--sp->unsync_children;
	WARN_ON((int)sp->unsync_children < 0);
	__clear_bit(idx, sp->unsync_child_bitmap);
}

static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
			     struct kvm_mmu_pages *pvec)
{
	int i, ret, nr_unsync_leaf = 0;

	for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
		struct kvm_mmu_page *child;
		u64 ent = sp->spt[i];

		if (!is_shadow_present_pte(ent) || is_large_pte(ent)) {
			clear_unsync_child_bit(sp, i);
			continue;
		}

		child = to_shadow_page(ent & PT64_BASE_ADDR_MASK);

		if (child->unsync_children) {
			if (mmu_pages_add(pvec, child, i))
				return -ENOSPC;

			ret = __mmu_unsync_walk(child, pvec);
			if (!ret) {
				clear_unsync_child_bit(sp, i);
				continue;
			} else if (ret > 0) {
				nr_unsync_leaf += ret;
			} else
				return ret;
		} else if (child->unsync) {
			nr_unsync_leaf++;
			if (mmu_pages_add(pvec, child, i))
				return -ENOSPC;
		} else
			clear_unsync_child_bit(sp, i);
	}

	return nr_unsync_leaf;
}

#define INVALID_INDEX (-1)

static int mmu_unsync_walk(struct kvm_mmu_page *sp,
			   struct kvm_mmu_pages *pvec)
{
	pvec->nr = 0;
	if (!sp->unsync_children)
		return 0;

	mmu_pages_add(pvec, sp, INVALID_INDEX);
	return __mmu_unsync_walk(sp, pvec);
}

static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	WARN_ON(!sp->unsync);
	trace_kvm_mmu_sync_page(sp);
	sp->unsync = 0;
	--kvm->stat.mmu_unsync;
}

static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
				     struct list_head *invalid_list);
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
				    struct list_head *invalid_list);

#define for_each_valid_sp(_kvm, _sp, _list)				\
	hlist_for_each_entry(_sp, _list, hash_link)			\
		if (is_obsolete_sp((_kvm), (_sp))) {			\
		} else

#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn)			\
	for_each_valid_sp(_kvm, _sp,					\
	  &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)])	\
		if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else

static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
			  struct list_head *invalid_list)
{
	if (vcpu->arch.mmu->sync_page(vcpu, sp) == 0) {
		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
		return false;
	}

	return true;
}

static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
					struct list_head *invalid_list,
					bool remote_flush)
{
	if (!remote_flush && list_empty(invalid_list))
		return false;

	if (!list_empty(invalid_list))
		kvm_mmu_commit_zap_page(kvm, invalid_list);
	else
		kvm_flush_remote_tlbs(kvm);
	return true;
}

static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu,
				 struct list_head *invalid_list,
				 bool remote_flush, bool local_flush)
{
	if (kvm_mmu_remote_flush_or_zap(vcpu->kvm, invalid_list, remote_flush))
		return;

	if (local_flush)
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}

#ifdef CONFIG_KVM_MMU_AUDIT
#include "mmu_audit.c"
#else
static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
static void mmu_audit_disable(void) { }
#endif

static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	return sp->role.invalid ||
	       unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
}

struct mmu_page_path {
	struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
	unsigned int idx[PT64_ROOT_MAX_LEVEL];
};

#define for_each_sp(pvec, sp, parents, i)			\
		for (i = mmu_pages_first(&pvec, &parents);	\
			i < pvec.nr && ({ sp = pvec.page[i].sp; 1;});	\
			i = mmu_pages_next(&pvec, &parents, i))

static int mmu_pages_next(struct kvm_mmu_pages *pvec,
			  struct mmu_page_path *parents,
			  int i)
{
	int n;

	for (n = i+1; n < pvec->nr; n++) {
		struct kvm_mmu_page *sp = pvec->page[n].sp;
		unsigned idx = pvec->page[n].idx;
		int level = sp->role.level;

		parents->idx[level-1] = idx;
		if (level == PG_LEVEL_4K)
			break;

		parents->parent[level-2] = sp;
	}

	return n;
}

static int mmu_pages_first(struct kvm_mmu_pages *pvec,
			   struct mmu_page_path *parents)
{
	struct kvm_mmu_page *sp;
	int level;

	if (pvec->nr == 0)
		return 0;

	WARN_ON(pvec->page[0].idx != INVALID_INDEX);

	sp = pvec->page[0].sp;
	level = sp->role.level;
	WARN_ON(level == PG_LEVEL_4K);

	parents->parent[level-2] = sp;

	/* Also set up a sentinel.  Further entries in pvec are all
	 * children of sp, so this element is never overwritten.
	 */
	parents->parent[level-1] = NULL;
	return mmu_pages_next(pvec, parents, 0);
}

static void mmu_pages_clear_parents(struct mmu_page_path *parents)
{
	struct kvm_mmu_page *sp;
	unsigned int level = 0;

	do {
		unsigned int idx = parents->idx[level];
		sp = parents->parent[level];
		if (!sp)
			return;

		WARN_ON(idx == INVALID_INDEX);
		clear_unsync_child_bit(sp, idx);
		level++;
	} while (!sp->unsync_children);
}

static void mmu_sync_children(struct kvm_vcpu *vcpu,
			      struct kvm_mmu_page *parent)
{
	int i;
	struct kvm_mmu_page *sp;
	struct mmu_page_path parents;
	struct kvm_mmu_pages pages;
	LIST_HEAD(invalid_list);
	bool flush = false;

	while (mmu_unsync_walk(parent, &pages)) {
		bool protected = false;

		for_each_sp(pages, sp, parents, i)
			protected |= rmap_write_protect(vcpu, sp->gfn);

		if (protected) {
			kvm_flush_remote_tlbs(vcpu->kvm);
			flush = false;
		}

		for_each_sp(pages, sp, parents, i) {
			kvm_unlink_unsync_page(vcpu->kvm, sp);
			flush |= kvm_sync_page(vcpu, sp, &invalid_list);
			mmu_pages_clear_parents(&parents);
		}
		if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) {
			kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
			cond_resched_rwlock_write(&vcpu->kvm->mmu_lock);
			flush = false;
		}
	}

	kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
}

static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
{
	atomic_set(&sp->write_flooding_count, 0);
}

static void clear_sp_write_flooding_count(u64 *spte)
{
	__clear_sp_write_flooding_count(sptep_to_sp(spte));
}

static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
					     gfn_t gfn,
					     gva_t gaddr,
					     unsigned level,
					     int direct,
					     unsigned int access)
{
	bool direct_mmu = vcpu->arch.mmu->direct_map;
	union kvm_mmu_page_role role;
	struct hlist_head *sp_list;
	unsigned quadrant;
	struct kvm_mmu_page *sp;
	int collisions = 0;
	LIST_HEAD(invalid_list);

	role = vcpu->arch.mmu->mmu_role.base;
	role.level = level;
	role.direct = direct;
	if (role.direct)
		role.gpte_is_8_bytes = true;
	role.access = access;
	if (!direct_mmu && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) {
		quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
		quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
		role.quadrant = quadrant;
	}

	sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
	for_each_valid_sp(vcpu->kvm, sp, sp_list) {
		if (sp->gfn != gfn) {
			collisions++;
			continue;
		}

		if (sp->role.word != role.word) {
			/*
			 * If the guest is creating an upper-level page, zap
			 * unsync pages for the same gfn.  While it's possible
			 * the guest is using recursive page tables, in all
			 * likelihood the guest has stopped using the unsync
			 * page and is installing a completely unrelated page.
			 * Unsync pages must not be left as is, because the new
			 * upper-level page will be write-protected.
			 */
			if (level > PG_LEVEL_4K && sp->unsync)
				kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
							 &invalid_list);
			continue;
		}

		if (direct_mmu)
			goto trace_get_page;

		if (sp->unsync) {
kvm_sync_page does 2072 * get the latest guest state, but (unlike mmu_unsync_children) 2073 * it doesn't write-protect the page or mark it synchronized! 2074 * This way the validity of the mapping is ensured, but the 2075 * overhead of write protection is not incurred until the 2076 * guest invalidates the TLB mapping. This allows multiple 2077 * SPs for a single gfn to be unsync. 2078 * 2079 * If the sync fails, the page is zapped. If so, break 2080 * in order to rebuild it. 2081 */ 2082 if (!kvm_sync_page(vcpu, sp, &invalid_list)) 2083 break; 2084 2085 WARN_ON(!list_empty(&invalid_list)); 2086 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); 2087 } 2088 2089 if (sp->unsync_children) 2090 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); 2091 2092 __clear_sp_write_flooding_count(sp); 2093 2094 trace_get_page: 2095 trace_kvm_mmu_get_page(sp, false); 2096 goto out; 2097 } 2098 2099 ++vcpu->kvm->stat.mmu_cache_miss; 2100 2101 sp = kvm_mmu_alloc_page(vcpu, direct); 2102 2103 sp->gfn = gfn; 2104 sp->role = role; 2105 hlist_add_head(&sp->hash_link, sp_list); 2106 if (!direct) { 2107 account_shadowed(vcpu->kvm, sp); 2108 if (level == PG_LEVEL_4K && rmap_write_protect(vcpu, gfn)) 2109 kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1); 2110 } 2111 trace_kvm_mmu_get_page(sp, true); 2112 out: 2113 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 2114 2115 if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions) 2116 vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions; 2117 return sp; 2118 } 2119 2120 static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator, 2121 struct kvm_vcpu *vcpu, hpa_t root, 2122 u64 addr) 2123 { 2124 iterator->addr = addr; 2125 iterator->shadow_addr = root; 2126 iterator->level = vcpu->arch.mmu->shadow_root_level; 2127 2128 if (iterator->level == PT64_ROOT_4LEVEL && 2129 vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL && 2130 !vcpu->arch.mmu->direct_map) 2131 --iterator->level; 2132 2133 if (iterator->level == PT32E_ROOT_LEVEL) { 2134 /* 2135 * prev_root is currently only used for 64-bit hosts. So only 2136 * the active root_hpa is valid here. 
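 *
 * For example, "(addr >> 30) & 3" below selects one of the four
 * 1GiB regions covered by the PAE PDPTEs: addr == 0xC0000000 picks
 * pae_root[3], while any addr below 0x40000000 picks pae_root[0].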
2137 */
2138 BUG_ON(root != vcpu->arch.mmu->root_hpa);
2139
2140 iterator->shadow_addr
2141 = vcpu->arch.mmu->pae_root[(addr >> 30) & 3];
2142 iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
2143 --iterator->level;
2144 if (!iterator->shadow_addr)
2145 iterator->level = 0;
2146 }
2147 }
2148
2149 static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
2150 struct kvm_vcpu *vcpu, u64 addr)
2151 {
2152 shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root_hpa,
2153 addr);
2154 }
2155
2156 static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
2157 {
2158 if (iterator->level < PG_LEVEL_4K)
2159 return false;
2160
2161 iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
2162 iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
2163 return true;
2164 }
2165
2166 static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
2167 u64 spte)
2168 {
2169 if (is_last_spte(spte, iterator->level)) {
2170 iterator->level = 0;
2171 return;
2172 }
2173
2174 iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
2175 --iterator->level;
2176 }
2177
2178 static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
2179 {
2180 __shadow_walk_next(iterator, *iterator->sptep);
2181 }
2182
2183 static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
2184 struct kvm_mmu_page *sp)
2185 {
2186 u64 spte;
2187
2188 BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
2189
2190 spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp));
2191
2192 mmu_spte_set(sptep, spte);
2193
2194 mmu_page_add_parent_pte(vcpu, sp, sptep);
2195
2196 if (sp->unsync_children || sp->unsync)
2197 mark_unsync(sptep);
2198 }
2199
2200 static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2201 unsigned direct_access)
2202 {
2203 if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
2204 struct kvm_mmu_page *child;
2205
2206 /*
2207 * For the direct sp, if the guest pte's dirty bit
2208 * changed from clean to dirty, it would corrupt the
2209 * sp's access by allowing writes through a read-only
2210 * sp, so update the spte at this point to get a new
2211 * sp with the correct access.
2212 */
2213 child = to_shadow_page(*sptep & PT64_BASE_ADDR_MASK);
2214 if (child->role.access == direct_access)
2215 return;
2216
2217 drop_parent_pte(child, sptep);
2218 kvm_flush_remote_tlbs_with_address(vcpu->kvm, child->gfn, 1);
2219 }
2220 }
2221
2222 /* Returns the number of zapped non-leaf child shadow pages. */
2223 static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
2224 u64 *spte, struct list_head *invalid_list)
2225 {
2226 u64 pte;
2227 struct kvm_mmu_page *child;
2228
2229 pte = *spte;
2230 if (is_shadow_present_pte(pte)) {
2231 if (is_last_spte(pte, sp->role.level)) {
2232 drop_spte(kvm, spte);
2233 if (is_large_pte(pte))
2234 --kvm->stat.lpages;
2235 } else {
2236 child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
2237 drop_parent_pte(child, spte);
2238
2239 /*
2240 * Recursively zap nested TDP SPs; parentless SPs are
2241 * unlikely to be used again in the near future. This
2242 * avoids retaining a large number of stale nested SPs.
2243 */ 2244 if (tdp_enabled && invalid_list && 2245 child->role.guest_mode && !child->parent_ptes.val) 2246 return kvm_mmu_prepare_zap_page(kvm, child, 2247 invalid_list); 2248 } 2249 } else if (is_mmio_spte(pte)) { 2250 mmu_spte_clear_no_track(spte); 2251 } 2252 return 0; 2253 } 2254 2255 static int kvm_mmu_page_unlink_children(struct kvm *kvm, 2256 struct kvm_mmu_page *sp, 2257 struct list_head *invalid_list) 2258 { 2259 int zapped = 0; 2260 unsigned i; 2261 2262 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) 2263 zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list); 2264 2265 return zapped; 2266 } 2267 2268 static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) 2269 { 2270 u64 *sptep; 2271 struct rmap_iterator iter; 2272 2273 while ((sptep = rmap_get_first(&sp->parent_ptes, &iter))) 2274 drop_parent_pte(sp, sptep); 2275 } 2276 2277 static int mmu_zap_unsync_children(struct kvm *kvm, 2278 struct kvm_mmu_page *parent, 2279 struct list_head *invalid_list) 2280 { 2281 int i, zapped = 0; 2282 struct mmu_page_path parents; 2283 struct kvm_mmu_pages pages; 2284 2285 if (parent->role.level == PG_LEVEL_4K) 2286 return 0; 2287 2288 while (mmu_unsync_walk(parent, &pages)) { 2289 struct kvm_mmu_page *sp; 2290 2291 for_each_sp(pages, sp, parents, i) { 2292 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); 2293 mmu_pages_clear_parents(&parents); 2294 zapped++; 2295 } 2296 } 2297 2298 return zapped; 2299 } 2300 2301 static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, 2302 struct kvm_mmu_page *sp, 2303 struct list_head *invalid_list, 2304 int *nr_zapped) 2305 { 2306 bool list_unstable; 2307 2308 trace_kvm_mmu_prepare_zap_page(sp); 2309 ++kvm->stat.mmu_shadow_zapped; 2310 *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list); 2311 *nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list); 2312 kvm_mmu_unlink_parents(kvm, sp); 2313 2314 /* Zapping children means active_mmu_pages has become unstable. */ 2315 list_unstable = *nr_zapped; 2316 2317 if (!sp->role.invalid && !sp->role.direct) 2318 unaccount_shadowed(kvm, sp); 2319 2320 if (sp->unsync) 2321 kvm_unlink_unsync_page(kvm, sp); 2322 if (!sp->root_count) { 2323 /* Count self */ 2324 (*nr_zapped)++; 2325 2326 /* 2327 * Already invalid pages (previously active roots) are not on 2328 * the active page list. See list_del() in the "else" case of 2329 * !sp->root_count. 2330 */ 2331 if (sp->role.invalid) 2332 list_add(&sp->link, invalid_list); 2333 else 2334 list_move(&sp->link, invalid_list); 2335 kvm_mod_used_mmu_pages(kvm, -1); 2336 } else { 2337 /* 2338 * Remove the active root from the active page list, the root 2339 * will be explicitly freed when the root_count hits zero. 2340 */ 2341 list_del(&sp->link); 2342 2343 /* 2344 * Obsolete pages cannot be used on any vCPUs, see the comment 2345 * in kvm_mmu_zap_all_fast(). Note, is_obsolete_sp() also 2346 * treats invalid shadow pages as being obsolete. 
2347 */ 2348 if (!is_obsolete_sp(kvm, sp)) 2349 kvm_reload_remote_mmus(kvm); 2350 } 2351 2352 if (sp->lpage_disallowed) 2353 unaccount_huge_nx_page(kvm, sp); 2354 2355 sp->role.invalid = 1; 2356 return list_unstable; 2357 } 2358 2359 static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, 2360 struct list_head *invalid_list) 2361 { 2362 int nr_zapped; 2363 2364 __kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped); 2365 return nr_zapped; 2366 } 2367 2368 static void kvm_mmu_commit_zap_page(struct kvm *kvm, 2369 struct list_head *invalid_list) 2370 { 2371 struct kvm_mmu_page *sp, *nsp; 2372 2373 if (list_empty(invalid_list)) 2374 return; 2375 2376 /* 2377 * We need to make sure everyone sees our modifications to 2378 * the page tables and see changes to vcpu->mode here. The barrier 2379 * in the kvm_flush_remote_tlbs() achieves this. This pairs 2380 * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end. 2381 * 2382 * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit 2383 * guest mode and/or lockless shadow page table walks. 2384 */ 2385 kvm_flush_remote_tlbs(kvm); 2386 2387 list_for_each_entry_safe(sp, nsp, invalid_list, link) { 2388 WARN_ON(!sp->role.invalid || sp->root_count); 2389 kvm_mmu_free_page(sp); 2390 } 2391 } 2392 2393 static unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm, 2394 unsigned long nr_to_zap) 2395 { 2396 unsigned long total_zapped = 0; 2397 struct kvm_mmu_page *sp, *tmp; 2398 LIST_HEAD(invalid_list); 2399 bool unstable; 2400 int nr_zapped; 2401 2402 if (list_empty(&kvm->arch.active_mmu_pages)) 2403 return 0; 2404 2405 restart: 2406 list_for_each_entry_safe_reverse(sp, tmp, &kvm->arch.active_mmu_pages, link) { 2407 /* 2408 * Don't zap active root pages, the page itself can't be freed 2409 * and zapping it will just force vCPUs to realloc and reload. 2410 */ 2411 if (sp->root_count) 2412 continue; 2413 2414 unstable = __kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, 2415 &nr_zapped); 2416 total_zapped += nr_zapped; 2417 if (total_zapped >= nr_to_zap) 2418 break; 2419 2420 if (unstable) 2421 goto restart; 2422 } 2423 2424 kvm_mmu_commit_zap_page(kvm, &invalid_list); 2425 2426 kvm->stat.mmu_recycled += total_zapped; 2427 return total_zapped; 2428 } 2429 2430 static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm) 2431 { 2432 if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages) 2433 return kvm->arch.n_max_mmu_pages - 2434 kvm->arch.n_used_mmu_pages; 2435 2436 return 0; 2437 } 2438 2439 static int make_mmu_pages_available(struct kvm_vcpu *vcpu) 2440 { 2441 unsigned long avail = kvm_mmu_available_pages(vcpu->kvm); 2442 2443 if (likely(avail >= KVM_MIN_FREE_MMU_PAGES)) 2444 return 0; 2445 2446 kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail); 2447 2448 /* 2449 * Note, this check is intentionally soft, it only guarantees that one 2450 * page is available, while the caller may end up allocating as many as 2451 * four pages, e.g. for PAE roots or for 5-level paging. Temporarily 2452 * exceeding the (arbitrary by default) limit will not harm the host, 2453 * being too aggressive may unnecessarily kill the guest, and getting an 2454 * exact count is far more trouble than it's worth, especially in the 2455 * page fault paths. 
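 *
 * Illustrative numbers only (the constants are defined elsewhere and
 * may change): with KVM_MIN_FREE_MMU_PAGES == 5 and KVM_REFILL_PAGES
 * == 25, a vCPU that sees just 3 available pages asks to zap 22,
 * refilling the pool well past the minimum so the check stays cheap
 * for subsequent faults.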
2456 */ 2457 if (!kvm_mmu_available_pages(vcpu->kvm)) 2458 return -ENOSPC; 2459 return 0; 2460 } 2461 2462 /* 2463 * Changing the number of mmu pages allocated to the vm 2464 * Note: if goal_nr_mmu_pages is too small, you will get dead lock 2465 */ 2466 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages) 2467 { 2468 write_lock(&kvm->mmu_lock); 2469 2470 if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) { 2471 kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages - 2472 goal_nr_mmu_pages); 2473 2474 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages; 2475 } 2476 2477 kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages; 2478 2479 write_unlock(&kvm->mmu_lock); 2480 } 2481 2482 int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) 2483 { 2484 struct kvm_mmu_page *sp; 2485 LIST_HEAD(invalid_list); 2486 int r; 2487 2488 pgprintk("%s: looking for gfn %llx\n", __func__, gfn); 2489 r = 0; 2490 write_lock(&kvm->mmu_lock); 2491 for_each_gfn_indirect_valid_sp(kvm, sp, gfn) { 2492 pgprintk("%s: gfn %llx role %x\n", __func__, gfn, 2493 sp->role.word); 2494 r = 1; 2495 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); 2496 } 2497 kvm_mmu_commit_zap_page(kvm, &invalid_list); 2498 write_unlock(&kvm->mmu_lock); 2499 2500 return r; 2501 } 2502 2503 static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) 2504 { 2505 gpa_t gpa; 2506 int r; 2507 2508 if (vcpu->arch.mmu->direct_map) 2509 return 0; 2510 2511 gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); 2512 2513 r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); 2514 2515 return r; 2516 } 2517 2518 static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 2519 { 2520 trace_kvm_mmu_unsync_page(sp); 2521 ++vcpu->kvm->stat.mmu_unsync; 2522 sp->unsync = 1; 2523 2524 kvm_mmu_mark_parents_unsync(sp); 2525 } 2526 2527 /* 2528 * Attempt to unsync any shadow pages that can be reached by the specified gfn, 2529 * KVM is creating a writable mapping for said gfn. Returns 0 if all pages 2530 * were marked unsync (or if there is no shadow page), -EPERM if the SPTE must 2531 * be write-protected. 2532 */ 2533 int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) 2534 { 2535 struct kvm_mmu_page *sp; 2536 2537 /* 2538 * Force write-protection if the page is being tracked. Note, the page 2539 * track machinery is used to write-protect upper-level shadow pages, 2540 * i.e. this guards the role.level == 4K assertion below! 2541 */ 2542 if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE)) 2543 return -EPERM; 2544 2545 /* 2546 * The page is not write-tracked, mark existing shadow pages unsync 2547 * unless KVM is synchronizing an unsync SP (can_unsync = false). In 2548 * that case, KVM must complete emulation of the guest TLB flush before 2549 * allowing shadow pages to become unsync (writable by the guest). 2550 */ 2551 for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) { 2552 if (!can_unsync) 2553 return -EPERM; 2554 2555 if (sp->unsync) 2556 continue; 2557 2558 WARN_ON(sp->role.level != PG_LEVEL_4K); 2559 kvm_unsync_page(vcpu, sp); 2560 } 2561 2562 /* 2563 * We need to ensure that the marking of unsync pages is visible 2564 * before the SPTE is updated to allow writes because 2565 * kvm_mmu_sync_roots() checks the unsync flags without holding 2566 * the MMU lock and so can race with this. 
If the SPTE was updated 2567 * before the page had been marked as unsync-ed, something like the 2568 * following could happen: 2569 * 2570 * CPU 1 CPU 2 2571 * --------------------------------------------------------------------- 2572 * 1.2 Host updates SPTE 2573 * to be writable 2574 * 2.1 Guest writes a GPTE for GVA X. 2575 * (GPTE being in the guest page table shadowed 2576 * by the SP from CPU 1.) 2577 * This reads SPTE during the page table walk. 2578 * Since SPTE.W is read as 1, there is no 2579 * fault. 2580 * 2581 * 2.2 Guest issues TLB flush. 2582 * That causes a VM Exit. 2583 * 2584 * 2.3 Walking of unsync pages sees sp->unsync is 2585 * false and skips the page. 2586 * 2587 * 2.4 Guest accesses GVA X. 2588 * Since the mapping in the SP was not updated, 2589 * so the old mapping for GVA X incorrectly 2590 * gets used. 2591 * 1.1 Host marks SP 2592 * as unsync 2593 * (sp->unsync = true) 2594 * 2595 * The write barrier below ensures that 1.1 happens before 1.2 and thus 2596 * the situation in 2.4 does not arise. The implicit barrier in 2.2 2597 * pairs with this write barrier. 2598 */ 2599 smp_wmb(); 2600 2601 return 0; 2602 } 2603 2604 static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, 2605 unsigned int pte_access, int level, 2606 gfn_t gfn, kvm_pfn_t pfn, bool speculative, 2607 bool can_unsync, bool host_writable) 2608 { 2609 u64 spte; 2610 struct kvm_mmu_page *sp; 2611 int ret; 2612 2613 sp = sptep_to_sp(sptep); 2614 2615 ret = make_spte(vcpu, pte_access, level, gfn, pfn, *sptep, speculative, 2616 can_unsync, host_writable, sp_ad_disabled(sp), &spte); 2617 2618 if (spte & PT_WRITABLE_MASK) 2619 kvm_vcpu_mark_page_dirty(vcpu, gfn); 2620 2621 if (*sptep == spte) 2622 ret |= SET_SPTE_SPURIOUS; 2623 else if (mmu_spte_update(sptep, spte)) 2624 ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH; 2625 return ret; 2626 } 2627 2628 static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, 2629 unsigned int pte_access, bool write_fault, int level, 2630 gfn_t gfn, kvm_pfn_t pfn, bool speculative, 2631 bool host_writable) 2632 { 2633 int was_rmapped = 0; 2634 int rmap_count; 2635 int set_spte_ret; 2636 int ret = RET_PF_FIXED; 2637 bool flush = false; 2638 2639 pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, 2640 *sptep, write_fault, gfn); 2641 2642 if (unlikely(is_noslot_pfn(pfn))) { 2643 mark_mmio_spte(vcpu, sptep, gfn, pte_access); 2644 return RET_PF_EMULATE; 2645 } 2646 2647 if (is_shadow_present_pte(*sptep)) { 2648 /* 2649 * If we overwrite a PTE page pointer with a 2MB PMD, unlink 2650 * the parent of the now unreachable PTE. 
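 *
 * For example, if a gfn previously mapped with 4KiB SPTEs is
 * re-faulted as a 2MiB mapping, the old SPTE points at a lower-level
 * page-table page; its parent link is dropped below, the huge SPTE
 * is then installed by set_spte(), and a remote TLB flush follows.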
2651 */ 2652 if (level > PG_LEVEL_4K && !is_large_pte(*sptep)) { 2653 struct kvm_mmu_page *child; 2654 u64 pte = *sptep; 2655 2656 child = to_shadow_page(pte & PT64_BASE_ADDR_MASK); 2657 drop_parent_pte(child, sptep); 2658 flush = true; 2659 } else if (pfn != spte_to_pfn(*sptep)) { 2660 pgprintk("hfn old %llx new %llx\n", 2661 spte_to_pfn(*sptep), pfn); 2662 drop_spte(vcpu->kvm, sptep); 2663 flush = true; 2664 } else 2665 was_rmapped = 1; 2666 } 2667 2668 set_spte_ret = set_spte(vcpu, sptep, pte_access, level, gfn, pfn, 2669 speculative, true, host_writable); 2670 if (set_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) { 2671 if (write_fault) 2672 ret = RET_PF_EMULATE; 2673 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); 2674 } 2675 2676 if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush) 2677 kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 2678 KVM_PAGES_PER_HPAGE(level)); 2679 2680 /* 2681 * The fault is fully spurious if and only if the new SPTE and old SPTE 2682 * are identical, and emulation is not required. 2683 */ 2684 if ((set_spte_ret & SET_SPTE_SPURIOUS) && ret == RET_PF_FIXED) { 2685 WARN_ON_ONCE(!was_rmapped); 2686 return RET_PF_SPURIOUS; 2687 } 2688 2689 pgprintk("%s: setting spte %llx\n", __func__, *sptep); 2690 trace_kvm_mmu_set_spte(level, gfn, sptep); 2691 if (!was_rmapped && is_large_pte(*sptep)) 2692 ++vcpu->kvm->stat.lpages; 2693 2694 if (is_shadow_present_pte(*sptep)) { 2695 if (!was_rmapped) { 2696 rmap_count = rmap_add(vcpu, sptep, gfn); 2697 if (rmap_count > RMAP_RECYCLE_THRESHOLD) 2698 rmap_recycle(vcpu, sptep, gfn); 2699 } 2700 } 2701 2702 return ret; 2703 } 2704 2705 static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, 2706 bool no_dirty_log) 2707 { 2708 struct kvm_memory_slot *slot; 2709 2710 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log); 2711 if (!slot) 2712 return KVM_PFN_ERR_FAULT; 2713 2714 return gfn_to_pfn_memslot_atomic(slot, gfn); 2715 } 2716 2717 static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, 2718 struct kvm_mmu_page *sp, 2719 u64 *start, u64 *end) 2720 { 2721 struct page *pages[PTE_PREFETCH_NUM]; 2722 struct kvm_memory_slot *slot; 2723 unsigned int access = sp->role.access; 2724 int i, ret; 2725 gfn_t gfn; 2726 2727 gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt); 2728 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK); 2729 if (!slot) 2730 return -1; 2731 2732 ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start); 2733 if (ret <= 0) 2734 return -1; 2735 2736 for (i = 0; i < ret; i++, gfn++, start++) { 2737 mmu_set_spte(vcpu, start, access, false, sp->role.level, gfn, 2738 page_to_pfn(pages[i]), true, true); 2739 put_page(pages[i]); 2740 } 2741 2742 return 0; 2743 } 2744 2745 static void __direct_pte_prefetch(struct kvm_vcpu *vcpu, 2746 struct kvm_mmu_page *sp, u64 *sptep) 2747 { 2748 u64 *spte, *start = NULL; 2749 int i; 2750 2751 WARN_ON(!sp->role.direct); 2752 2753 i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1); 2754 spte = sp->spt + i; 2755 2756 for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { 2757 if (is_shadow_present_pte(*spte) || spte == sptep) { 2758 if (!start) 2759 continue; 2760 if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0) 2761 break; 2762 start = NULL; 2763 } else if (!start) 2764 start = spte; 2765 } 2766 } 2767 2768 static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) 2769 { 2770 struct kvm_mmu_page *sp; 2771 2772 sp = sptep_to_sp(sptep); 2773 2774 /* 2775 * Without accessed bits, there's no way to distinguish between 2776 * 
actually accessed translations and prefetched, so disable pte 2777 * prefetch if accessed bits aren't available. 2778 */ 2779 if (sp_ad_disabled(sp)) 2780 return; 2781 2782 if (sp->role.level > PG_LEVEL_4K) 2783 return; 2784 2785 /* 2786 * If addresses are being invalidated, skip prefetching to avoid 2787 * accidentally prefetching those addresses. 2788 */ 2789 if (unlikely(vcpu->kvm->mmu_notifier_count)) 2790 return; 2791 2792 __direct_pte_prefetch(vcpu, sp, sptep); 2793 } 2794 2795 static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, 2796 const struct kvm_memory_slot *slot) 2797 { 2798 unsigned long hva; 2799 pte_t *pte; 2800 int level; 2801 2802 if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn)) 2803 return PG_LEVEL_4K; 2804 2805 /* 2806 * Note, using the already-retrieved memslot and __gfn_to_hva_memslot() 2807 * is not solely for performance, it's also necessary to avoid the 2808 * "writable" check in __gfn_to_hva_many(), which will always fail on 2809 * read-only memslots due to gfn_to_hva() assuming writes. Earlier 2810 * page fault steps have already verified the guest isn't writing a 2811 * read-only memslot. 2812 */ 2813 hva = __gfn_to_hva_memslot(slot, gfn); 2814 2815 pte = lookup_address_in_mm(kvm->mm, hva, &level); 2816 if (unlikely(!pte)) 2817 return PG_LEVEL_4K; 2818 2819 return level; 2820 } 2821 2822 int kvm_mmu_max_mapping_level(struct kvm *kvm, 2823 const struct kvm_memory_slot *slot, gfn_t gfn, 2824 kvm_pfn_t pfn, int max_level) 2825 { 2826 struct kvm_lpage_info *linfo; 2827 2828 max_level = min(max_level, max_huge_page_level); 2829 for ( ; max_level > PG_LEVEL_4K; max_level--) { 2830 linfo = lpage_info_slot(gfn, slot, max_level); 2831 if (!linfo->disallow_lpage) 2832 break; 2833 } 2834 2835 if (max_level == PG_LEVEL_4K) 2836 return PG_LEVEL_4K; 2837 2838 return host_pfn_mapping_level(kvm, gfn, pfn, slot); 2839 } 2840 2841 int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, 2842 int max_level, kvm_pfn_t *pfnp, 2843 bool huge_page_disallowed, int *req_level) 2844 { 2845 struct kvm_memory_slot *slot; 2846 kvm_pfn_t pfn = *pfnp; 2847 kvm_pfn_t mask; 2848 int level; 2849 2850 *req_level = PG_LEVEL_4K; 2851 2852 if (unlikely(max_level == PG_LEVEL_4K)) 2853 return PG_LEVEL_4K; 2854 2855 if (is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn)) 2856 return PG_LEVEL_4K; 2857 2858 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, true); 2859 if (!slot) 2860 return PG_LEVEL_4K; 2861 2862 level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, gfn, pfn, max_level); 2863 if (level == PG_LEVEL_4K) 2864 return level; 2865 2866 *req_level = level = min(level, max_level); 2867 2868 /* 2869 * Enforce the iTLB multihit workaround after capturing the requested 2870 * level, which will be used to do precise, accurate accounting. 2871 */ 2872 if (huge_page_disallowed) 2873 return PG_LEVEL_4K; 2874 2875 /* 2876 * mmu_notifier_retry() was successful and mmu_lock is held, so 2877 * the pmd can't be split from under us. 
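 *
 * Worked example for the adjustment below: at level 2 (a 2MiB
 * mapping) KVM_PAGES_PER_HPAGE(2) == 512, so mask == 0x1ff; gfn and
 * pfn must agree in their low 9 bits and *pfnp is rounded down to
 * the pfn of the 2MiB-aligned head page.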
2878 */
2879 mask = KVM_PAGES_PER_HPAGE(level) - 1;
2880 VM_BUG_ON((gfn & mask) != (pfn & mask));
2881 *pfnp = pfn & ~mask;
2882
2883 return level;
2884 }
2885
2886 void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level,
2887 kvm_pfn_t *pfnp, int *goal_levelp)
2888 {
2889 int level = *goal_levelp;
2890
2891 if (cur_level == level && level > PG_LEVEL_4K &&
2892 is_shadow_present_pte(spte) &&
2893 !is_large_pte(spte)) {
2894 /*
2895 * A small SPTE exists for this pfn, but FNAME(fetch)
2896 * and __direct_map would like to create a large PTE
2897 * instead: just force them to go down another level,
2898 * patching the next 9 bits of the address back into
2899 * pfn for them.
2900 */
2901 u64 page_mask = KVM_PAGES_PER_HPAGE(level) -
2902 KVM_PAGES_PER_HPAGE(level - 1);
2903 *pfnp |= gfn & page_mask;
2904 (*goal_levelp)--;
2905 }
2906 }
2907
2908 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
2909 int map_writable, int max_level, kvm_pfn_t pfn,
2910 bool prefault, bool is_tdp)
2911 {
2912 bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
2913 bool write = error_code & PFERR_WRITE_MASK;
2914 bool exec = error_code & PFERR_FETCH_MASK;
2915 bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
2916 struct kvm_shadow_walk_iterator it;
2917 struct kvm_mmu_page *sp;
2918 int level, req_level, ret;
2919 gfn_t gfn = gpa >> PAGE_SHIFT;
2920 gfn_t base_gfn = gfn;
2921
2922 level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
2923 huge_page_disallowed, &req_level);
2924
2925 trace_kvm_mmu_spte_requested(gpa, level, pfn);
2926 for_each_shadow_entry(vcpu, gpa, it) {
2927 /*
2928 * We cannot overwrite existing page tables with an NX
2929 * large page, as the leaf could be executable.
2930 */
2931 if (nx_huge_page_workaround_enabled)
2932 disallowed_hugepage_adjust(*it.sptep, gfn, it.level,
2933 &pfn, &level);
2934
2935 base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
2936 if (it.level == level)
2937 break;
2938
2939 drop_large_spte(vcpu, it.sptep);
2940 if (!is_shadow_present_pte(*it.sptep)) {
2941 sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
2942 it.level - 1, true, ACC_ALL);
2943
2944 link_shadow_page(vcpu, it.sptep, sp);
2945 if (is_tdp && huge_page_disallowed &&
2946 req_level >= it.level)
2947 account_huge_nx_page(vcpu->kvm, sp);
2948 }
2949 }
2950
2951 ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL,
2952 write, level, base_gfn, pfn, prefault,
2953 map_writable);
2954 if (ret == RET_PF_SPURIOUS)
2955 return ret;
2956
2957 direct_pte_prefetch(vcpu, it.sptep);
2958 ++vcpu->stat.pf_fixed;
2959 return ret;
2960 }
2961
2962 static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
2963 {
2964 send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk);
2965 }
2966
2967 static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
2968 {
2969 /*
2970 * Do not cache the mmio info caused by writing the readonly gfn
2971 * into the spte, otherwise a read access on the readonly gfn can
2972 * also cause an mmio page fault and be treated as an mmio access.
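 *
 * For example, KVM_PFN_ERR_RO_FAULT (a write to a read-only memslot)
 * is emulated every time instead of being cached as MMIO, since a
 * later read of the same gfn must still be served from memory.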
2973 */ 2974 if (pfn == KVM_PFN_ERR_RO_FAULT) 2975 return RET_PF_EMULATE; 2976 2977 if (pfn == KVM_PFN_ERR_HWPOISON) { 2978 kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current); 2979 return RET_PF_RETRY; 2980 } 2981 2982 return -EFAULT; 2983 } 2984 2985 static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, 2986 kvm_pfn_t pfn, unsigned int access, 2987 int *ret_val) 2988 { 2989 /* The pfn is invalid, report the error! */ 2990 if (unlikely(is_error_pfn(pfn))) { 2991 *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn); 2992 return true; 2993 } 2994 2995 if (unlikely(is_noslot_pfn(pfn))) { 2996 vcpu_cache_mmio_info(vcpu, gva, gfn, 2997 access & shadow_mmio_access_mask); 2998 /* 2999 * If MMIO caching is disabled, emulate immediately without 3000 * touching the shadow page tables as attempting to install an 3001 * MMIO SPTE will just be an expensive nop. 3002 */ 3003 if (unlikely(!shadow_mmio_value)) { 3004 *ret_val = RET_PF_EMULATE; 3005 return true; 3006 } 3007 } 3008 3009 return false; 3010 } 3011 3012 static bool page_fault_can_be_fast(u32 error_code) 3013 { 3014 /* 3015 * Do not fix the mmio spte with invalid generation number which 3016 * need to be updated by slow page fault path. 3017 */ 3018 if (unlikely(error_code & PFERR_RSVD_MASK)) 3019 return false; 3020 3021 /* See if the page fault is due to an NX violation */ 3022 if (unlikely(((error_code & (PFERR_FETCH_MASK | PFERR_PRESENT_MASK)) 3023 == (PFERR_FETCH_MASK | PFERR_PRESENT_MASK)))) 3024 return false; 3025 3026 /* 3027 * #PF can be fast if: 3028 * 1. The shadow page table entry is not present, which could mean that 3029 * the fault is potentially caused by access tracking (if enabled). 3030 * 2. The shadow page table entry is present and the fault 3031 * is caused by write-protect, that means we just need change the W 3032 * bit of the spte which can be done out of mmu-lock. 3033 * 3034 * However, if access tracking is disabled we know that a non-present 3035 * page must be a genuine page fault where we have to create a new SPTE. 3036 * So, if access tracking is disabled, we return true only for write 3037 * accesses to a present page. 3038 */ 3039 3040 return shadow_acc_track_mask != 0 || 3041 ((error_code & (PFERR_WRITE_MASK | PFERR_PRESENT_MASK)) 3042 == (PFERR_WRITE_MASK | PFERR_PRESENT_MASK)); 3043 } 3044 3045 /* 3046 * Returns true if the SPTE was fixed successfully. Otherwise, 3047 * someone else modified the SPTE from its original value. 3048 */ 3049 static bool 3050 fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 3051 u64 *sptep, u64 old_spte, u64 new_spte) 3052 { 3053 gfn_t gfn; 3054 3055 WARN_ON(!sp->role.direct); 3056 3057 /* 3058 * Theoretically we could also set dirty bit (and flush TLB) here in 3059 * order to eliminate unnecessary PML logging. See comments in 3060 * set_spte. But fast_page_fault is very unlikely to happen with PML 3061 * enabled, so we do not do this. This might result in the same GPA 3062 * to be logged in PML buffer again when the write really happens, and 3063 * eventually to be called by mark_page_dirty twice. But it's also no 3064 * harm. This also avoids the TLB flush needed after setting dirty bit 3065 * so non-PML cases won't be impacted. 3066 * 3067 * Compare with set_spte where instead shadow_dirty_mask is set. 
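 *
 * Note on the lockless update below: cmpxchg64() installs new_spte
 * only if the SPTE still holds old_spte.  If another CPU changed it
 * in the meantime, false is returned and the fault is simply retried
 * (see the retry loop in fast_page_fault()).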
3068 */ 3069 if (cmpxchg64(sptep, old_spte, new_spte) != old_spte) 3070 return false; 3071 3072 if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) { 3073 /* 3074 * The gfn of direct spte is stable since it is 3075 * calculated by sp->gfn. 3076 */ 3077 gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); 3078 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3079 } 3080 3081 return true; 3082 } 3083 3084 static bool is_access_allowed(u32 fault_err_code, u64 spte) 3085 { 3086 if (fault_err_code & PFERR_FETCH_MASK) 3087 return is_executable_pte(spte); 3088 3089 if (fault_err_code & PFERR_WRITE_MASK) 3090 return is_writable_pte(spte); 3091 3092 /* Fault was on Read access */ 3093 return spte & PT_PRESENT_MASK; 3094 } 3095 3096 /* 3097 * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS. 3098 */ 3099 static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, 3100 u32 error_code) 3101 { 3102 struct kvm_shadow_walk_iterator iterator; 3103 struct kvm_mmu_page *sp; 3104 int ret = RET_PF_INVALID; 3105 u64 spte = 0ull; 3106 uint retry_count = 0; 3107 3108 if (!page_fault_can_be_fast(error_code)) 3109 return ret; 3110 3111 walk_shadow_page_lockless_begin(vcpu); 3112 3113 do { 3114 u64 new_spte; 3115 3116 for_each_shadow_entry_lockless(vcpu, cr2_or_gpa, iterator, spte) 3117 if (!is_shadow_present_pte(spte)) 3118 break; 3119 3120 if (!is_shadow_present_pte(spte)) 3121 break; 3122 3123 sp = sptep_to_sp(iterator.sptep); 3124 if (!is_last_spte(spte, sp->role.level)) 3125 break; 3126 3127 /* 3128 * Check whether the memory access that caused the fault would 3129 * still cause it if it were to be performed right now. If not, 3130 * then this is a spurious fault caused by TLB lazily flushed, 3131 * or some other CPU has already fixed the PTE after the 3132 * current CPU took the fault. 3133 * 3134 * Need not check the access of upper level table entries since 3135 * they are always ACC_ALL. 3136 */ 3137 if (is_access_allowed(error_code, spte)) { 3138 ret = RET_PF_SPURIOUS; 3139 break; 3140 } 3141 3142 new_spte = spte; 3143 3144 if (is_access_track_spte(spte)) 3145 new_spte = restore_acc_track_spte(new_spte); 3146 3147 /* 3148 * Currently, to simplify the code, write-protection can 3149 * be removed in the fast path only if the SPTE was 3150 * write-protected for dirty-logging or access tracking. 3151 */ 3152 if ((error_code & PFERR_WRITE_MASK) && 3153 spte_can_locklessly_be_made_writable(spte)) { 3154 new_spte |= PT_WRITABLE_MASK; 3155 3156 /* 3157 * Do not fix write-permission on the large spte. Since 3158 * we only dirty the first page into the dirty-bitmap in 3159 * fast_pf_fix_direct_spte(), other pages are missed 3160 * if its slot has dirty logging enabled. 3161 * 3162 * Instead, we let the slow page fault path create a 3163 * normal spte to fix the access. 3164 * 3165 * See the comments in kvm_arch_commit_memory_region(). 3166 */ 3167 if (sp->role.level > PG_LEVEL_4K) 3168 break; 3169 } 3170 3171 /* Verify that the fault can be handled in the fast path */ 3172 if (new_spte == spte || 3173 !is_access_allowed(error_code, new_spte)) 3174 break; 3175 3176 /* 3177 * Currently, fast page fault only works for direct mapping 3178 * since the gfn is not stable for indirect shadow page. See 3179 * Documentation/virt/kvm/locking.rst to get more detail. 
3180 */ 3181 if (fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte, 3182 new_spte)) { 3183 ret = RET_PF_FIXED; 3184 break; 3185 } 3186 3187 if (++retry_count > 4) { 3188 printk_once(KERN_WARNING 3189 "kvm: Fast #PF retrying more than 4 times.\n"); 3190 break; 3191 } 3192 3193 } while (true); 3194 3195 trace_fast_page_fault(vcpu, cr2_or_gpa, error_code, iterator.sptep, 3196 spte, ret); 3197 walk_shadow_page_lockless_end(vcpu); 3198 3199 return ret; 3200 } 3201 3202 static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, 3203 struct list_head *invalid_list) 3204 { 3205 struct kvm_mmu_page *sp; 3206 3207 if (!VALID_PAGE(*root_hpa)) 3208 return; 3209 3210 sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK); 3211 3212 if (is_tdp_mmu_page(sp)) 3213 kvm_tdp_mmu_put_root(kvm, sp, false); 3214 else if (!--sp->root_count && sp->role.invalid) 3215 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); 3216 3217 *root_hpa = INVALID_PAGE; 3218 } 3219 3220 /* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */ 3221 void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, 3222 ulong roots_to_free) 3223 { 3224 struct kvm *kvm = vcpu->kvm; 3225 int i; 3226 LIST_HEAD(invalid_list); 3227 bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT; 3228 3229 BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG); 3230 3231 /* Before acquiring the MMU lock, see if we need to do any real work. */ 3232 if (!(free_active_root && VALID_PAGE(mmu->root_hpa))) { 3233 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) 3234 if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) && 3235 VALID_PAGE(mmu->prev_roots[i].hpa)) 3236 break; 3237 3238 if (i == KVM_MMU_NUM_PREV_ROOTS) 3239 return; 3240 } 3241 3242 write_lock(&kvm->mmu_lock); 3243 3244 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) 3245 if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) 3246 mmu_free_root_page(kvm, &mmu->prev_roots[i].hpa, 3247 &invalid_list); 3248 3249 if (free_active_root) { 3250 if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && 3251 (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) { 3252 mmu_free_root_page(kvm, &mmu->root_hpa, &invalid_list); 3253 } else if (mmu->pae_root) { 3254 for (i = 0; i < 4; ++i) { 3255 if (!IS_VALID_PAE_ROOT(mmu->pae_root[i])) 3256 continue; 3257 3258 mmu_free_root_page(kvm, &mmu->pae_root[i], 3259 &invalid_list); 3260 mmu->pae_root[i] = INVALID_PAE_ROOT; 3261 } 3262 } 3263 mmu->root_hpa = INVALID_PAGE; 3264 mmu->root_pgd = 0; 3265 } 3266 3267 kvm_mmu_commit_zap_page(kvm, &invalid_list); 3268 write_unlock(&kvm->mmu_lock); 3269 } 3270 EXPORT_SYMBOL_GPL(kvm_mmu_free_roots); 3271 3272 void kvm_mmu_free_guest_mode_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) 3273 { 3274 unsigned long roots_to_free = 0; 3275 hpa_t root_hpa; 3276 int i; 3277 3278 /* 3279 * This should not be called while L2 is active, L2 can't invalidate 3280 * _only_ its own roots, e.g. INVVPID unconditionally exits. 
3281 */ 3282 WARN_ON_ONCE(mmu->mmu_role.base.guest_mode); 3283 3284 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 3285 root_hpa = mmu->prev_roots[i].hpa; 3286 if (!VALID_PAGE(root_hpa)) 3287 continue; 3288 3289 if (!to_shadow_page(root_hpa) || 3290 to_shadow_page(root_hpa)->role.guest_mode) 3291 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); 3292 } 3293 3294 kvm_mmu_free_roots(vcpu, mmu, roots_to_free); 3295 } 3296 EXPORT_SYMBOL_GPL(kvm_mmu_free_guest_mode_roots); 3297 3298 3299 static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) 3300 { 3301 int ret = 0; 3302 3303 if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) { 3304 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 3305 ret = 1; 3306 } 3307 3308 return ret; 3309 } 3310 3311 static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva, 3312 u8 level, bool direct) 3313 { 3314 struct kvm_mmu_page *sp; 3315 3316 sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL); 3317 ++sp->root_count; 3318 3319 return __pa(sp->spt); 3320 } 3321 3322 static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) 3323 { 3324 struct kvm_mmu *mmu = vcpu->arch.mmu; 3325 u8 shadow_root_level = mmu->shadow_root_level; 3326 hpa_t root; 3327 unsigned i; 3328 int r; 3329 3330 write_lock(&vcpu->kvm->mmu_lock); 3331 r = make_mmu_pages_available(vcpu); 3332 if (r < 0) 3333 goto out_unlock; 3334 3335 if (is_tdp_mmu_enabled(vcpu->kvm)) { 3336 root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu); 3337 mmu->root_hpa = root; 3338 } else if (shadow_root_level >= PT64_ROOT_4LEVEL) { 3339 root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true); 3340 mmu->root_hpa = root; 3341 } else if (shadow_root_level == PT32E_ROOT_LEVEL) { 3342 if (WARN_ON_ONCE(!mmu->pae_root)) { 3343 r = -EIO; 3344 goto out_unlock; 3345 } 3346 3347 for (i = 0; i < 4; ++i) { 3348 WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i])); 3349 3350 root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), 3351 i << 30, PT32_ROOT_LEVEL, true); 3352 mmu->pae_root[i] = root | PT_PRESENT_MASK | 3353 shadow_me_mask; 3354 } 3355 mmu->root_hpa = __pa(mmu->pae_root); 3356 } else { 3357 WARN_ONCE(1, "Bad TDP root level = %d\n", shadow_root_level); 3358 r = -EIO; 3359 goto out_unlock; 3360 } 3361 3362 /* root_pgd is ignored for direct MMUs. */ 3363 mmu->root_pgd = 0; 3364 out_unlock: 3365 write_unlock(&vcpu->kvm->mmu_lock); 3366 return r; 3367 } 3368 3369 static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) 3370 { 3371 struct kvm_mmu *mmu = vcpu->arch.mmu; 3372 u64 pdptrs[4], pm_mask; 3373 gfn_t root_gfn, root_pgd; 3374 hpa_t root; 3375 unsigned i; 3376 int r; 3377 3378 root_pgd = mmu->get_guest_pgd(vcpu); 3379 root_gfn = root_pgd >> PAGE_SHIFT; 3380 3381 if (mmu_check_root(vcpu, root_gfn)) 3382 return 1; 3383 3384 /* 3385 * On SVM, reading PDPTRs might access guest memory, which might fault 3386 * and thus might sleep. Grab the PDPTRs before acquiring mmu_lock. 3387 */ 3388 if (mmu->root_level == PT32E_ROOT_LEVEL) { 3389 for (i = 0; i < 4; ++i) { 3390 pdptrs[i] = mmu->get_pdptr(vcpu, i); 3391 if (!(pdptrs[i] & PT_PRESENT_MASK)) 3392 continue; 3393 3394 if (mmu_check_root(vcpu, pdptrs[i] >> PAGE_SHIFT)) 3395 return 1; 3396 } 3397 } 3398 3399 r = alloc_all_memslots_rmaps(vcpu->kvm); 3400 if (r) 3401 return r; 3402 3403 write_lock(&vcpu->kvm->mmu_lock); 3404 r = make_mmu_pages_available(vcpu); 3405 if (r < 0) 3406 goto out_unlock; 3407 3408 /* 3409 * Do we shadow a long mode page table? If so we need to 3410 * write-protect the guests page table root. 
3411 */ 3412 if (mmu->root_level >= PT64_ROOT_4LEVEL) { 3413 root = mmu_alloc_root(vcpu, root_gfn, 0, 3414 mmu->shadow_root_level, false); 3415 mmu->root_hpa = root; 3416 goto set_root_pgd; 3417 } 3418 3419 if (WARN_ON_ONCE(!mmu->pae_root)) { 3420 r = -EIO; 3421 goto out_unlock; 3422 } 3423 3424 /* 3425 * We shadow a 32 bit page table. This may be a legacy 2-level 3426 * or a PAE 3-level page table. In either case we need to be aware that 3427 * the shadow page table may be a PAE or a long mode page table. 3428 */ 3429 pm_mask = PT_PRESENT_MASK | shadow_me_mask; 3430 if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) { 3431 pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; 3432 3433 if (WARN_ON_ONCE(!mmu->pml4_root)) { 3434 r = -EIO; 3435 goto out_unlock; 3436 } 3437 3438 mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask; 3439 } 3440 3441 for (i = 0; i < 4; ++i) { 3442 WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i])); 3443 3444 if (mmu->root_level == PT32E_ROOT_LEVEL) { 3445 if (!(pdptrs[i] & PT_PRESENT_MASK)) { 3446 mmu->pae_root[i] = INVALID_PAE_ROOT; 3447 continue; 3448 } 3449 root_gfn = pdptrs[i] >> PAGE_SHIFT; 3450 } 3451 3452 root = mmu_alloc_root(vcpu, root_gfn, i << 30, 3453 PT32_ROOT_LEVEL, false); 3454 mmu->pae_root[i] = root | pm_mask; 3455 } 3456 3457 if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) 3458 mmu->root_hpa = __pa(mmu->pml4_root); 3459 else 3460 mmu->root_hpa = __pa(mmu->pae_root); 3461 3462 set_root_pgd: 3463 mmu->root_pgd = root_pgd; 3464 out_unlock: 3465 write_unlock(&vcpu->kvm->mmu_lock); 3466 3467 return 0; 3468 } 3469 3470 static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) 3471 { 3472 struct kvm_mmu *mmu = vcpu->arch.mmu; 3473 u64 *pml4_root, *pae_root; 3474 3475 /* 3476 * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP 3477 * tables are allocated and initialized at root creation as there is no 3478 * equivalent level in the guest's NPT to shadow. Allocate the tables 3479 * on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare. 3480 */ 3481 if (mmu->direct_map || mmu->root_level >= PT64_ROOT_4LEVEL || 3482 mmu->shadow_root_level < PT64_ROOT_4LEVEL) 3483 return 0; 3484 3485 /* 3486 * This mess only works with 4-level paging and needs to be updated to 3487 * work with 5-level paging. 3488 */ 3489 if (WARN_ON_ONCE(mmu->shadow_root_level != PT64_ROOT_4LEVEL)) 3490 return -EIO; 3491 3492 if (mmu->pae_root && mmu->pml4_root) 3493 return 0; 3494 3495 /* 3496 * The special roots should always be allocated in concert. Yell and 3497 * bail if KVM ends up in a state where only one of the roots is valid. 3498 */ 3499 if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root)) 3500 return -EIO; 3501 3502 /* 3503 * Unlike 32-bit NPT, the PDP table doesn't need to be in low mem, and 3504 * doesn't need to be decrypted. 
3505 */ 3506 pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); 3507 if (!pae_root) 3508 return -ENOMEM; 3509 3510 pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); 3511 if (!pml4_root) { 3512 free_page((unsigned long)pae_root); 3513 return -ENOMEM; 3514 } 3515 3516 mmu->pae_root = pae_root; 3517 mmu->pml4_root = pml4_root; 3518 3519 return 0; 3520 } 3521 3522 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) 3523 { 3524 int i; 3525 struct kvm_mmu_page *sp; 3526 3527 if (vcpu->arch.mmu->direct_map) 3528 return; 3529 3530 if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) 3531 return; 3532 3533 vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); 3534 3535 if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) { 3536 hpa_t root = vcpu->arch.mmu->root_hpa; 3537 sp = to_shadow_page(root); 3538 3539 /* 3540 * Even if another CPU was marking the SP as unsync-ed 3541 * simultaneously, any guest page table changes are not 3542 * guaranteed to be visible anyway until this VCPU issues a TLB 3543 * flush strictly after those changes are made. We only need to 3544 * ensure that the other CPU sets these flags before any actual 3545 * changes to the page tables are made. The comments in 3546 * mmu_try_to_unsync_pages() describe what could go wrong if 3547 * this requirement isn't satisfied. 3548 */ 3549 if (!smp_load_acquire(&sp->unsync) && 3550 !smp_load_acquire(&sp->unsync_children)) 3551 return; 3552 3553 write_lock(&vcpu->kvm->mmu_lock); 3554 kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); 3555 3556 mmu_sync_children(vcpu, sp); 3557 3558 kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); 3559 write_unlock(&vcpu->kvm->mmu_lock); 3560 return; 3561 } 3562 3563 write_lock(&vcpu->kvm->mmu_lock); 3564 kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); 3565 3566 for (i = 0; i < 4; ++i) { 3567 hpa_t root = vcpu->arch.mmu->pae_root[i]; 3568 3569 if (IS_VALID_PAE_ROOT(root)) { 3570 root &= PT64_BASE_ADDR_MASK; 3571 sp = to_shadow_page(root); 3572 mmu_sync_children(vcpu, sp); 3573 } 3574 } 3575 3576 kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); 3577 write_unlock(&vcpu->kvm->mmu_lock); 3578 } 3579 3580 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gpa_t vaddr, 3581 u32 access, struct x86_exception *exception) 3582 { 3583 if (exception) 3584 exception->error_code = 0; 3585 return vaddr; 3586 } 3587 3588 static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gpa_t vaddr, 3589 u32 access, 3590 struct x86_exception *exception) 3591 { 3592 if (exception) 3593 exception->error_code = 0; 3594 return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception); 3595 } 3596 3597 static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) 3598 { 3599 /* 3600 * A nested guest cannot use the MMIO cache if it is using nested 3601 * page tables, because cr2 is a nGPA while the cache stores GPAs. 3602 */ 3603 if (mmu_is_nested(vcpu)) 3604 return false; 3605 3606 if (direct) 3607 return vcpu_match_mmio_gpa(vcpu, addr); 3608 3609 return vcpu_match_mmio_gva(vcpu, addr); 3610 } 3611 3612 /* 3613 * Return the level of the lowest level SPTE added to sptes. 3614 * That SPTE may be non-present. 
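 *
 * For example, with a 4-level root and a walk that stops at a
 * non-present level-3 entry: sptes[4] holds the present level-4
 * entry, sptes[3] holds the non-present entry that ended the walk,
 * and 3 is returned as the leaf level.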
3615 */ 3616 static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level) 3617 { 3618 struct kvm_shadow_walk_iterator iterator; 3619 int leaf = -1; 3620 u64 spte; 3621 3622 walk_shadow_page_lockless_begin(vcpu); 3623 3624 for (shadow_walk_init(&iterator, vcpu, addr), 3625 *root_level = iterator.level; 3626 shadow_walk_okay(&iterator); 3627 __shadow_walk_next(&iterator, spte)) { 3628 leaf = iterator.level; 3629 spte = mmu_spte_get_lockless(iterator.sptep); 3630 3631 sptes[leaf] = spte; 3632 3633 if (!is_shadow_present_pte(spte)) 3634 break; 3635 } 3636 3637 walk_shadow_page_lockless_end(vcpu); 3638 3639 return leaf; 3640 } 3641 3642 /* return true if reserved bit(s) are detected on a valid, non-MMIO SPTE. */ 3643 static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) 3644 { 3645 u64 sptes[PT64_ROOT_MAX_LEVEL + 1]; 3646 struct rsvd_bits_validate *rsvd_check; 3647 int root, leaf, level; 3648 bool reserved = false; 3649 3650 if (is_tdp_mmu(vcpu->arch.mmu)) 3651 leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root); 3652 else 3653 leaf = get_walk(vcpu, addr, sptes, &root); 3654 3655 if (unlikely(leaf < 0)) { 3656 *sptep = 0ull; 3657 return reserved; 3658 } 3659 3660 *sptep = sptes[leaf]; 3661 3662 /* 3663 * Skip reserved bits checks on the terminal leaf if it's not a valid 3664 * SPTE. Note, this also (intentionally) skips MMIO SPTEs, which, by 3665 * design, always have reserved bits set. The purpose of the checks is 3666 * to detect reserved bits on non-MMIO SPTEs. i.e. buggy SPTEs. 3667 */ 3668 if (!is_shadow_present_pte(sptes[leaf])) 3669 leaf++; 3670 3671 rsvd_check = &vcpu->arch.mmu->shadow_zero_check; 3672 3673 for (level = root; level >= leaf; level--) 3674 reserved |= is_rsvd_spte(rsvd_check, sptes[level], level); 3675 3676 if (reserved) { 3677 pr_err("%s: reserved bits set on MMU-present spte, addr 0x%llx, hierarchy:\n", 3678 __func__, addr); 3679 for (level = root; level >= leaf; level--) 3680 pr_err("------ spte = 0x%llx level = %d, rsvd bits = 0x%llx", 3681 sptes[level], level, 3682 get_rsvd_bits(rsvd_check, sptes[level], level)); 3683 } 3684 3685 return reserved; 3686 } 3687 3688 static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) 3689 { 3690 u64 spte; 3691 bool reserved; 3692 3693 if (mmio_info_in_cache(vcpu, addr, direct)) 3694 return RET_PF_EMULATE; 3695 3696 reserved = get_mmio_spte(vcpu, addr, &spte); 3697 if (WARN_ON(reserved)) 3698 return -EINVAL; 3699 3700 if (is_mmio_spte(spte)) { 3701 gfn_t gfn = get_mmio_spte_gfn(spte); 3702 unsigned int access = get_mmio_spte_access(spte); 3703 3704 if (!check_mmio_spte(vcpu, spte)) 3705 return RET_PF_INVALID; 3706 3707 if (direct) 3708 addr = 0; 3709 3710 trace_handle_mmio_page_fault(addr, gfn, access); 3711 vcpu_cache_mmio_info(vcpu, addr, gfn, access); 3712 return RET_PF_EMULATE; 3713 } 3714 3715 /* 3716 * If the page table is zapped by other cpus, let CPU fault again on 3717 * the address. 3718 */ 3719 return RET_PF_RETRY; 3720 } 3721 3722 static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu, 3723 u32 error_code, gfn_t gfn) 3724 { 3725 if (unlikely(error_code & PFERR_RSVD_MASK)) 3726 return false; 3727 3728 if (!(error_code & PFERR_PRESENT_MASK) || 3729 !(error_code & PFERR_WRITE_MASK)) 3730 return false; 3731 3732 /* 3733 * guest is writing the page which is write tracked which can 3734 * not be fixed by page fault handler. 
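 *
 * (Write-tracked gfns are typically the guest's own page-table pages
 * backing shadow pages; writes to them have to be emulated so that
 * KVM can keep the corresponding SPTEs in sync.)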
3735 */ 3736 if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE)) 3737 return true; 3738 3739 return false; 3740 } 3741 3742 static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr) 3743 { 3744 struct kvm_shadow_walk_iterator iterator; 3745 u64 spte; 3746 3747 walk_shadow_page_lockless_begin(vcpu); 3748 for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) { 3749 clear_sp_write_flooding_count(iterator.sptep); 3750 if (!is_shadow_present_pte(spte)) 3751 break; 3752 } 3753 walk_shadow_page_lockless_end(vcpu); 3754 } 3755 3756 static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, 3757 gfn_t gfn) 3758 { 3759 struct kvm_arch_async_pf arch; 3760 3761 arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id; 3762 arch.gfn = gfn; 3763 arch.direct_map = vcpu->arch.mmu->direct_map; 3764 arch.cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu); 3765 3766 return kvm_setup_async_pf(vcpu, cr2_or_gpa, 3767 kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); 3768 } 3769 3770 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, 3771 gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva, 3772 bool write, bool *writable) 3773 { 3774 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 3775 bool async; 3776 3777 /* 3778 * Retry the page fault if the gfn hit a memslot that is being deleted 3779 * or moved. This ensures any existing SPTEs for the old memslot will 3780 * be zapped before KVM inserts a new MMIO SPTE for the gfn. 3781 */ 3782 if (slot && (slot->flags & KVM_MEMSLOT_INVALID)) 3783 return true; 3784 3785 /* Don't expose private memslots to L2. */ 3786 if (is_guest_mode(vcpu) && !kvm_is_visible_memslot(slot)) { 3787 *pfn = KVM_PFN_NOSLOT; 3788 *writable = false; 3789 return false; 3790 } 3791 3792 async = false; 3793 *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, 3794 write, writable, hva); 3795 if (!async) 3796 return false; /* *pfn has correct page already */ 3797 3798 if (!prefault && kvm_can_do_async_pf(vcpu)) { 3799 trace_kvm_try_async_get_page(cr2_or_gpa, gfn); 3800 if (kvm_find_async_pf_gfn(vcpu, gfn)) { 3801 trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn); 3802 kvm_make_request(KVM_REQ_APF_HALT, vcpu); 3803 return true; 3804 } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn)) 3805 return true; 3806 } 3807 3808 *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, 3809 write, writable, hva); 3810 return false; 3811 } 3812 3813 static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, 3814 bool prefault, int max_level, bool is_tdp) 3815 { 3816 bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu); 3817 bool write = error_code & PFERR_WRITE_MASK; 3818 bool map_writable; 3819 3820 gfn_t gfn = gpa >> PAGE_SHIFT; 3821 unsigned long mmu_seq; 3822 kvm_pfn_t pfn; 3823 hva_t hva; 3824 int r; 3825 3826 if (page_fault_handle_page_track(vcpu, error_code, gfn)) 3827 return RET_PF_EMULATE; 3828 3829 if (!is_tdp_mmu_fault) { 3830 r = fast_page_fault(vcpu, gpa, error_code); 3831 if (r != RET_PF_INVALID) 3832 return r; 3833 } 3834 3835 r = mmu_topup_memory_caches(vcpu, false); 3836 if (r) 3837 return r; 3838 3839 mmu_seq = vcpu->kvm->mmu_notifier_seq; 3840 smp_rmb(); 3841 3842 if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, &hva, 3843 write, &map_writable)) 3844 return RET_PF_RETRY; 3845 3846 if (handle_abnormal_pfn(vcpu, is_tdp ? 
0 : gpa, gfn, pfn, ACC_ALL, &r)) 3847 return r; 3848 3849 r = RET_PF_RETRY; 3850 3851 if (is_tdp_mmu_fault) 3852 read_lock(&vcpu->kvm->mmu_lock); 3853 else 3854 write_lock(&vcpu->kvm->mmu_lock); 3855 3856 if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva)) 3857 goto out_unlock; 3858 r = make_mmu_pages_available(vcpu); 3859 if (r) 3860 goto out_unlock; 3861 3862 if (is_tdp_mmu_fault) 3863 r = kvm_tdp_mmu_map(vcpu, gpa, error_code, map_writable, max_level, 3864 pfn, prefault); 3865 else 3866 r = __direct_map(vcpu, gpa, error_code, map_writable, max_level, pfn, 3867 prefault, is_tdp); 3868 3869 out_unlock: 3870 if (is_tdp_mmu_fault) 3871 read_unlock(&vcpu->kvm->mmu_lock); 3872 else 3873 write_unlock(&vcpu->kvm->mmu_lock); 3874 kvm_release_pfn_clean(pfn); 3875 return r; 3876 } 3877 3878 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, 3879 u32 error_code, bool prefault) 3880 { 3881 pgprintk("%s: gva %lx error %x\n", __func__, gpa, error_code); 3882 3883 /* This path builds a PAE pagetable, we can map 2mb pages at maximum. */ 3884 return direct_page_fault(vcpu, gpa & PAGE_MASK, error_code, prefault, 3885 PG_LEVEL_2M, false); 3886 } 3887 3888 int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, 3889 u64 fault_address, char *insn, int insn_len) 3890 { 3891 int r = 1; 3892 u32 flags = vcpu->arch.apf.host_apf_flags; 3893 3894 #ifndef CONFIG_X86_64 3895 /* A 64-bit CR2 should be impossible on 32-bit KVM. */ 3896 if (WARN_ON_ONCE(fault_address >> 32)) 3897 return -EFAULT; 3898 #endif 3899 3900 vcpu->arch.l1tf_flush_l1d = true; 3901 if (!flags) { 3902 trace_kvm_page_fault(fault_address, error_code); 3903 3904 if (kvm_event_needs_reinjection(vcpu)) 3905 kvm_mmu_unprotect_page_virt(vcpu, fault_address); 3906 r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn, 3907 insn_len); 3908 } else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) { 3909 vcpu->arch.apf.host_apf_flags = 0; 3910 local_irq_disable(); 3911 kvm_async_pf_task_wait_schedule(fault_address); 3912 local_irq_enable(); 3913 } else { 3914 WARN_ONCE(1, "Unexpected host async PF flags: %x\n", flags); 3915 } 3916 3917 return r; 3918 } 3919 EXPORT_SYMBOL_GPL(kvm_handle_page_fault); 3920 3921 int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, 3922 bool prefault) 3923 { 3924 int max_level; 3925 3926 for (max_level = KVM_MAX_HUGEPAGE_LEVEL; 3927 max_level > PG_LEVEL_4K; 3928 max_level--) { 3929 int page_num = KVM_PAGES_PER_HPAGE(max_level); 3930 gfn_t base = (gpa >> PAGE_SHIFT) & ~(page_num - 1); 3931 3932 if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num)) 3933 break; 3934 } 3935 3936 return direct_page_fault(vcpu, gpa, error_code, prefault, 3937 max_level, true); 3938 } 3939 3940 static void nonpaging_init_context(struct kvm_mmu *context) 3941 { 3942 context->page_fault = nonpaging_page_fault; 3943 context->gva_to_gpa = nonpaging_gva_to_gpa; 3944 context->sync_page = nonpaging_sync_page; 3945 context->invlpg = NULL; 3946 context->direct_map = true; 3947 } 3948 3949 static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd, 3950 union kvm_mmu_page_role role) 3951 { 3952 return (role.direct || pgd == root->pgd) && 3953 VALID_PAGE(root->hpa) && to_shadow_page(root->hpa) && 3954 role.word == to_shadow_page(root->hpa)->role.word; 3955 } 3956 3957 /* 3958 * Find out if a previously cached root matching the new pgd/role is available. 3959 * The current root is also inserted into the cache. 
3960 * If a matching root was found, it is assigned to kvm_mmu->root_hpa and true is 3961 * returned. 3962 * Otherwise, the LRU root from the cache is assigned to kvm_mmu->root_hpa and 3963 * false is returned. This root should now be freed by the caller. 3964 */ 3965 static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_pgd, 3966 union kvm_mmu_page_role new_role) 3967 { 3968 uint i; 3969 struct kvm_mmu_root_info root; 3970 struct kvm_mmu *mmu = vcpu->arch.mmu; 3971 3972 root.pgd = mmu->root_pgd; 3973 root.hpa = mmu->root_hpa; 3974 3975 if (is_root_usable(&root, new_pgd, new_role)) 3976 return true; 3977 3978 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 3979 swap(root, mmu->prev_roots[i]); 3980 3981 if (is_root_usable(&root, new_pgd, new_role)) 3982 break; 3983 } 3984 3985 mmu->root_hpa = root.hpa; 3986 mmu->root_pgd = root.pgd; 3987 3988 return i < KVM_MMU_NUM_PREV_ROOTS; 3989 } 3990 3991 static bool fast_pgd_switch(struct kvm_vcpu *vcpu, gpa_t new_pgd, 3992 union kvm_mmu_page_role new_role) 3993 { 3994 struct kvm_mmu *mmu = vcpu->arch.mmu; 3995 3996 /* 3997 * For now, limit the fast switch to 64-bit hosts+VMs in order to avoid 3998 * having to deal with PDPTEs. We may add support for 32-bit hosts/VMs 3999 * later if necessary. 4000 */ 4001 if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && 4002 mmu->root_level >= PT64_ROOT_4LEVEL) 4003 return cached_root_available(vcpu, new_pgd, new_role); 4004 4005 return false; 4006 } 4007 4008 static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, 4009 union kvm_mmu_page_role new_role) 4010 { 4011 if (!fast_pgd_switch(vcpu, new_pgd, new_role)) { 4012 kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, KVM_MMU_ROOT_CURRENT); 4013 return; 4014 } 4015 4016 /* 4017 * It's possible that the cached previous root page is obsolete because 4018 * of a change in the MMU generation number. However, changing the 4019 * generation number is accompanied by KVM_REQ_MMU_RELOAD, which will 4020 * free the root set here and allocate a new one. 4021 */ 4022 kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu); 4023 4024 if (force_flush_and_sync_on_reuse) { 4025 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); 4026 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); 4027 } 4028 4029 /* 4030 * The last MMIO access's GVA and GPA are cached in the VCPU. When 4031 * switching to a new CR3, that GVA->GPA mapping may no longer be 4032 * valid. So clear any cached MMIO info even when we don't need to sync 4033 * the shadow page tables. 4034 */ 4035 vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); 4036 4037 /* 4038 * If this is a direct root page, it doesn't have a write flooding 4039 * count. Otherwise, clear the write flooding count. 
4040 */ 4041 if (!new_role.direct) 4042 __clear_sp_write_flooding_count( 4043 to_shadow_page(vcpu->arch.mmu->root_hpa)); 4044 } 4045 4046 void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd) 4047 { 4048 __kvm_mmu_new_pgd(vcpu, new_pgd, kvm_mmu_calc_root_page_role(vcpu)); 4049 } 4050 EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd); 4051 4052 static unsigned long get_cr3(struct kvm_vcpu *vcpu) 4053 { 4054 return kvm_read_cr3(vcpu); 4055 } 4056 4057 static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, 4058 unsigned int access, int *nr_present) 4059 { 4060 if (unlikely(is_mmio_spte(*sptep))) { 4061 if (gfn != get_mmio_spte_gfn(*sptep)) { 4062 mmu_spte_clear_no_track(sptep); 4063 return true; 4064 } 4065 4066 (*nr_present)++; 4067 mark_mmio_spte(vcpu, sptep, gfn, access); 4068 return true; 4069 } 4070 4071 return false; 4072 } 4073 4074 #define PTTYPE_EPT 18 /* arbitrary */ 4075 #define PTTYPE PTTYPE_EPT 4076 #include "paging_tmpl.h" 4077 #undef PTTYPE 4078 4079 #define PTTYPE 64 4080 #include "paging_tmpl.h" 4081 #undef PTTYPE 4082 4083 #define PTTYPE 32 4084 #include "paging_tmpl.h" 4085 #undef PTTYPE 4086 4087 static void 4088 __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check, 4089 u64 pa_bits_rsvd, int level, bool nx, bool gbpages, 4090 bool pse, bool amd) 4091 { 4092 u64 gbpages_bit_rsvd = 0; 4093 u64 nonleaf_bit8_rsvd = 0; 4094 u64 high_bits_rsvd; 4095 4096 rsvd_check->bad_mt_xwr = 0; 4097 4098 if (!gbpages) 4099 gbpages_bit_rsvd = rsvd_bits(7, 7); 4100 4101 if (level == PT32E_ROOT_LEVEL) 4102 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 62); 4103 else 4104 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51); 4105 4106 /* Note, NX doesn't exist in PDPTEs, this is handled below. */ 4107 if (!nx) 4108 high_bits_rsvd |= rsvd_bits(63, 63); 4109 4110 /* 4111 * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for 4112 * leaf entries) on AMD CPUs only. 
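 * rsvd_bits(s, e) builds a mask with bits s..e (inclusive) set, e.g.
 * rsvd_bits(8, 8) below flags only bit 8, while the rsvd_bits(13, 20) and
 * rsvd_bits(13, 29) masks applied to the 2MB/1GB entries cover the address
 * bits below the huge-page frame that must be zero.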
4113 */ 4114 if (amd) 4115 nonleaf_bit8_rsvd = rsvd_bits(8, 8); 4116 4117 switch (level) { 4118 case PT32_ROOT_LEVEL: 4119 /* no rsvd bits for 2 level 4K page table entries */ 4120 rsvd_check->rsvd_bits_mask[0][1] = 0; 4121 rsvd_check->rsvd_bits_mask[0][0] = 0; 4122 rsvd_check->rsvd_bits_mask[1][0] = 4123 rsvd_check->rsvd_bits_mask[0][0]; 4124 4125 if (!pse) { 4126 rsvd_check->rsvd_bits_mask[1][1] = 0; 4127 break; 4128 } 4129 4130 if (is_cpuid_PSE36()) 4131 /* 36bits PSE 4MB page */ 4132 rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); 4133 else 4134 /* 32 bits PSE 4MB page */ 4135 rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); 4136 break; 4137 case PT32E_ROOT_LEVEL: 4138 rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(63, 63) | 4139 high_bits_rsvd | 4140 rsvd_bits(5, 8) | 4141 rsvd_bits(1, 2); /* PDPTE */ 4142 rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd; /* PDE */ 4143 rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; /* PTE */ 4144 rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | 4145 rsvd_bits(13, 20); /* large page */ 4146 rsvd_check->rsvd_bits_mask[1][0] = 4147 rsvd_check->rsvd_bits_mask[0][0]; 4148 break; 4149 case PT64_ROOT_5LEVEL: 4150 rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | 4151 nonleaf_bit8_rsvd | 4152 rsvd_bits(7, 7); 4153 rsvd_check->rsvd_bits_mask[1][4] = 4154 rsvd_check->rsvd_bits_mask[0][4]; 4155 fallthrough; 4156 case PT64_ROOT_4LEVEL: 4157 rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | 4158 nonleaf_bit8_rsvd | 4159 rsvd_bits(7, 7); 4160 rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | 4161 gbpages_bit_rsvd; 4162 rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd; 4163 rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; 4164 rsvd_check->rsvd_bits_mask[1][3] = 4165 rsvd_check->rsvd_bits_mask[0][3]; 4166 rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | 4167 gbpages_bit_rsvd | 4168 rsvd_bits(13, 29); 4169 rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | 4170 rsvd_bits(13, 20); /* large page */ 4171 rsvd_check->rsvd_bits_mask[1][0] = 4172 rsvd_check->rsvd_bits_mask[0][0]; 4173 break; 4174 } 4175 } 4176 4177 static bool guest_can_use_gbpages(struct kvm_vcpu *vcpu) 4178 { 4179 /* 4180 * If TDP is enabled, let the guest use GBPAGES if they're supported in 4181 * hardware. The hardware page walker doesn't let KVM disable GBPAGES, 4182 * i.e. won't treat them as reserved, and KVM doesn't redo the GVA->GPA 4183 * walk for performance and complexity reasons. Not to mention KVM 4184 * _can't_ solve the problem because GVA->GPA walks aren't visible to 4185 * KVM once a TDP translation is installed. Mimic hardware behavior so 4186 * that KVM's is at least consistent, i.e. doesn't randomly inject #PF. 4187 */ 4188 return tdp_enabled ? 
boot_cpu_has(X86_FEATURE_GBPAGES) : 4189 guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES); 4190 } 4191 4192 static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, 4193 struct kvm_mmu *context) 4194 { 4195 __reset_rsvds_bits_mask(&context->guest_rsvd_check, 4196 vcpu->arch.reserved_gpa_bits, 4197 context->root_level, is_efer_nx(context), 4198 guest_can_use_gbpages(vcpu), 4199 is_cr4_pse(context), 4200 guest_cpuid_is_amd_or_hygon(vcpu)); 4201 } 4202 4203 static void 4204 __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, 4205 u64 pa_bits_rsvd, bool execonly) 4206 { 4207 u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51); 4208 u64 bad_mt_xwr; 4209 4210 rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7); 4211 rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7); 4212 rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6); 4213 rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6); 4214 rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; 4215 4216 /* large page */ 4217 rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4]; 4218 rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3]; 4219 rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29); 4220 rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20); 4221 rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; 4222 4223 bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */ 4224 bad_mt_xwr |= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */ 4225 bad_mt_xwr |= 0xFFull << (7 * 8); /* bits 3..5 must not be 7 */ 4226 bad_mt_xwr |= REPEAT_BYTE(1ull << 2); /* bits 0..2 must not be 010 */ 4227 bad_mt_xwr |= REPEAT_BYTE(1ull << 6); /* bits 0..2 must not be 110 */ 4228 if (!execonly) { 4229 /* bits 0..2 must not be 100 unless VMX capabilities allow it */ 4230 bad_mt_xwr |= REPEAT_BYTE(1ull << 4); 4231 } 4232 rsvd_check->bad_mt_xwr = bad_mt_xwr; 4233 } 4234 4235 static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, 4236 struct kvm_mmu *context, bool execonly) 4237 { 4238 __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check, 4239 vcpu->arch.reserved_gpa_bits, execonly); 4240 } 4241 4242 static inline u64 reserved_hpa_bits(void) 4243 { 4244 return rsvd_bits(shadow_phys_bits, 63); 4245 } 4246 4247 /* 4248 * the page table on host is the shadow page table for the page 4249 * table in guest or amd nested guest, its mmu features completely 4250 * follow the features in guest. 4251 */ 4252 static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, 4253 struct kvm_mmu *context) 4254 { 4255 /* 4256 * KVM uses NX when TDP is disabled to handle a variety of scenarios, 4257 * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and 4258 * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0. 4259 * The iTLB multi-hit workaround can be toggled at any time, so assume 4260 * NX can be used by any non-nested shadow MMU to avoid having to reset 4261 * MMU contexts. Note, KVM forces EFER.NX=1 when TDP is disabled. 4262 */ 4263 bool uses_nx = is_efer_nx(context) || !tdp_enabled; 4264 4265 /* @amd adds a check on bit of SPTEs, which KVM shouldn't use anyways. */ 4266 bool is_amd = true; 4267 /* KVM doesn't use 2-level page tables for the shadow MMU. 
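 * 4MB PSE mappings exist only in 2-level 32-bit paging; the shadow MMU
 * always builds at least a PAE (3-level) page table (see the WARN_ON_ONCE
 * below), so @pse is hard-coded to false.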
*/ 4268 bool is_pse = false; 4269 struct rsvd_bits_validate *shadow_zero_check; 4270 int i; 4271 4272 WARN_ON_ONCE(context->shadow_root_level < PT32E_ROOT_LEVEL); 4273 4274 shadow_zero_check = &context->shadow_zero_check; 4275 __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(), 4276 context->shadow_root_level, uses_nx, 4277 guest_can_use_gbpages(vcpu), is_pse, is_amd); 4278 4279 if (!shadow_me_mask) 4280 return; 4281 4282 for (i = context->shadow_root_level; --i >= 0;) { 4283 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask; 4284 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask; 4285 } 4286 4287 } 4288 4289 static inline bool boot_cpu_is_amd(void) 4290 { 4291 WARN_ON_ONCE(!tdp_enabled); 4292 return shadow_x_mask == 0; 4293 } 4294 4295 /* 4296 * the direct page table on host, use as much mmu features as 4297 * possible, however, kvm currently does not do execution-protection. 4298 */ 4299 static void 4300 reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, 4301 struct kvm_mmu *context) 4302 { 4303 struct rsvd_bits_validate *shadow_zero_check; 4304 int i; 4305 4306 shadow_zero_check = &context->shadow_zero_check; 4307 4308 if (boot_cpu_is_amd()) 4309 __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(), 4310 context->shadow_root_level, false, 4311 boot_cpu_has(X86_FEATURE_GBPAGES), 4312 false, true); 4313 else 4314 __reset_rsvds_bits_mask_ept(shadow_zero_check, 4315 reserved_hpa_bits(), false); 4316 4317 if (!shadow_me_mask) 4318 return; 4319 4320 for (i = context->shadow_root_level; --i >= 0;) { 4321 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask; 4322 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask; 4323 } 4324 } 4325 4326 /* 4327 * as the comments in reset_shadow_zero_bits_mask() except it 4328 * is the shadow page table for intel nested guest. 4329 */ 4330 static void 4331 reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, 4332 struct kvm_mmu *context, bool execonly) 4333 { 4334 __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, 4335 reserved_hpa_bits(), execonly); 4336 } 4337 4338 #define BYTE_MASK(access) \ 4339 ((1 & (access) ? 2 : 0) | \ 4340 (2 & (access) ? 4 : 0) | \ 4341 (3 & (access) ? 8 : 0) | \ 4342 (4 & (access) ? 16 : 0) | \ 4343 (5 & (access) ? 32 : 0) | \ 4344 (6 & (access) ? 64 : 0) | \ 4345 (7 & (access) ? 128 : 0)) 4346 4347 4348 static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept) 4349 { 4350 unsigned byte; 4351 4352 const u8 x = BYTE_MASK(ACC_EXEC_MASK); 4353 const u8 w = BYTE_MASK(ACC_WRITE_MASK); 4354 const u8 u = BYTE_MASK(ACC_USER_MASK); 4355 4356 bool cr4_smep = is_cr4_smep(mmu); 4357 bool cr4_smap = is_cr4_smap(mmu); 4358 bool cr0_wp = is_cr0_wp(mmu); 4359 bool efer_nx = is_efer_nx(mmu); 4360 4361 for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) { 4362 unsigned pfec = byte << 1; 4363 4364 /* 4365 * Each "*f" variable has a 1 bit for each UWX value 4366 * that causes a fault with the given PFEC. 4367 */ 4368 4369 /* Faults from writes to non-writable pages */ 4370 u8 wf = (pfec & PFERR_WRITE_MASK) ? (u8)~w : 0; 4371 /* Faults from user mode accesses to supervisor pages */ 4372 u8 uf = (pfec & PFERR_USER_MASK) ? (u8)~u : 0; 4373 /* Faults from fetches of non-executable pages*/ 4374 u8 ff = (pfec & PFERR_FETCH_MASK) ? 
(u8)~x : 0; 4375 /* Faults from kernel mode fetches of user pages */ 4376 u8 smepf = 0; 4377 /* Faults from kernel mode accesses of user pages */ 4378 u8 smapf = 0; 4379 4380 if (!ept) { 4381 /* Faults from kernel mode accesses to user pages */ 4382 u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u; 4383 4384 /* Not really needed: !nx will cause pte.nx to fault */ 4385 if (!efer_nx) 4386 ff = 0; 4387 4388 /* Allow supervisor writes if !cr0.wp */ 4389 if (!cr0_wp) 4390 wf = (pfec & PFERR_USER_MASK) ? wf : 0; 4391 4392 /* Disallow supervisor fetches of user code if cr4.smep */ 4393 if (cr4_smep) 4394 smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0; 4395 4396 /* 4397 * SMAP:kernel-mode data accesses from user-mode 4398 * mappings should fault. A fault is considered 4399 * as a SMAP violation if all of the following 4400 * conditions are true: 4401 * - X86_CR4_SMAP is set in CR4 4402 * - A user page is accessed 4403 * - The access is not a fetch 4404 * - Page fault in kernel mode 4405 * - if CPL = 3 or X86_EFLAGS_AC is clear 4406 * 4407 * Here, we cover the first three conditions. 4408 * The fourth is computed dynamically in permission_fault(); 4409 * PFERR_RSVD_MASK bit will be set in PFEC if the access is 4410 * *not* subject to SMAP restrictions. 4411 */ 4412 if (cr4_smap) 4413 smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf; 4414 } 4415 4416 mmu->permissions[byte] = ff | uf | wf | smepf | smapf; 4417 } 4418 } 4419 4420 /* 4421 * PKU is an additional mechanism by which the paging controls access to 4422 * user-mode addresses based on the value in the PKRU register. Protection 4423 * key violations are reported through a bit in the page fault error code. 4424 * Unlike other bits of the error code, the PK bit is not known at the 4425 * call site of e.g. gva_to_gpa; it must be computed directly in 4426 * permission_fault based on two bits of PKRU, on some machine state (CR4, 4427 * CR0, EFER, CPL), and on other bits of the error code and the page tables. 4428 * 4429 * In particular the following conditions come from the error code, the 4430 * page tables and the machine state: 4431 * - PK is always zero unless CR4.PKE=1 and EFER.LMA=1 4432 * - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch) 4433 * - PK is always zero if U=0 in the page tables 4434 * - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access. 4435 * 4436 * The PKRU bitmask caches the result of these four conditions. The error 4437 * code (minus the P bit) and the page table's U bit form an index into the 4438 * PKRU bitmask. Two bits of the PKRU bitmask are then extracted and ANDed 4439 * with the two bits of the PKRU register corresponding to the protection key. 4440 * For the first three conditions above the bits will be 00, thus masking 4441 * away both AD and WD. For all reads or if the last condition holds, WD 4442 * only will be masked away. 4443 */ 4444 static void update_pkru_bitmask(struct kvm_mmu *mmu) 4445 { 4446 unsigned bit; 4447 bool wp; 4448 4449 if (!is_cr4_pke(mmu)) { 4450 mmu->pkru_mask = 0; 4451 return; 4452 } 4453 4454 wp = is_cr0_wp(mmu); 4455 4456 for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) { 4457 unsigned pfec, pkey_bits; 4458 bool check_pkey, check_write, ff, uf, wf, pte_user; 4459 4460 pfec = bit << 1; 4461 ff = pfec & PFERR_FETCH_MASK; 4462 uf = pfec & PFERR_USER_MASK; 4463 wf = pfec & PFERR_WRITE_MASK; 4464 4465 /* PFEC.RSVD is replaced by ACC_USER_MASK. 
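 * In the pkru_mask index built by permission_fault(), the position that
 * normally holds PFERR_RSVD carries the gpte's U bit instead, so testing
 * PFERR_RSVD_MASK here really asks whether the translation maps a user page.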
*/ 4466 pte_user = pfec & PFERR_RSVD_MASK; 4467 4468 /* 4469 * Only need to check the access which is not an 4470 * instruction fetch and is to a user page. 4471 */ 4472 check_pkey = (!ff && pte_user); 4473 /* 4474 * write access is controlled by PKRU if it is a 4475 * user access or CR0.WP = 1. 4476 */ 4477 check_write = check_pkey && wf && (uf || wp); 4478 4479 /* PKRU.AD stops both read and write access. */ 4480 pkey_bits = !!check_pkey; 4481 /* PKRU.WD stops write access. */ 4482 pkey_bits |= (!!check_write) << 1; 4483 4484 mmu->pkru_mask |= (pkey_bits & 3) << pfec; 4485 } 4486 } 4487 4488 static void reset_guest_paging_metadata(struct kvm_vcpu *vcpu, 4489 struct kvm_mmu *mmu) 4490 { 4491 if (!is_cr0_pg(mmu)) 4492 return; 4493 4494 reset_rsvds_bits_mask(vcpu, mmu); 4495 update_permission_bitmask(mmu, false); 4496 update_pkru_bitmask(mmu); 4497 } 4498 4499 static void paging64_init_context(struct kvm_mmu *context) 4500 { 4501 context->page_fault = paging64_page_fault; 4502 context->gva_to_gpa = paging64_gva_to_gpa; 4503 context->sync_page = paging64_sync_page; 4504 context->invlpg = paging64_invlpg; 4505 context->direct_map = false; 4506 } 4507 4508 static void paging32_init_context(struct kvm_mmu *context) 4509 { 4510 context->page_fault = paging32_page_fault; 4511 context->gva_to_gpa = paging32_gva_to_gpa; 4512 context->sync_page = paging32_sync_page; 4513 context->invlpg = paging32_invlpg; 4514 context->direct_map = false; 4515 } 4516 4517 static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu, 4518 struct kvm_mmu_role_regs *regs) 4519 { 4520 union kvm_mmu_extended_role ext = {0}; 4521 4522 if (____is_cr0_pg(regs)) { 4523 ext.cr0_pg = 1; 4524 ext.cr4_pae = ____is_cr4_pae(regs); 4525 ext.cr4_smep = ____is_cr4_smep(regs); 4526 ext.cr4_smap = ____is_cr4_smap(regs); 4527 ext.cr4_pse = ____is_cr4_pse(regs); 4528 4529 /* PKEY and LA57 are active iff long mode is active. */ 4530 ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs); 4531 ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs); 4532 } 4533 4534 ext.valid = 1; 4535 4536 return ext; 4537 } 4538 4539 static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu, 4540 struct kvm_mmu_role_regs *regs, 4541 bool base_only) 4542 { 4543 union kvm_mmu_role role = {0}; 4544 4545 role.base.access = ACC_ALL; 4546 if (____is_cr0_pg(regs)) { 4547 role.base.efer_nx = ____is_efer_nx(regs); 4548 role.base.cr0_wp = ____is_cr0_wp(regs); 4549 } 4550 role.base.smm = is_smm(vcpu); 4551 role.base.guest_mode = is_guest_mode(vcpu); 4552 4553 if (base_only) 4554 return role; 4555 4556 role.ext = kvm_calc_mmu_role_ext(vcpu, regs); 4557 4558 return role; 4559 } 4560 4561 static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu) 4562 { 4563 /* Use 5-level TDP if and only if it's useful/necessary. 
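 * A guest MAXPHYADDR of 48 or less fits entirely within the 48-bit address
 * space covered by 4-level paging, so a 5th level would only add an extra
 * step to every page walk (and an extra page per root) for no benefit.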
*/ 4564 if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48) 4565 return 4; 4566 4567 return max_tdp_level; 4568 } 4569 4570 static union kvm_mmu_role 4571 kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, 4572 struct kvm_mmu_role_regs *regs, bool base_only) 4573 { 4574 union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, regs, base_only); 4575 4576 role.base.ad_disabled = (shadow_accessed_mask == 0); 4577 role.base.level = kvm_mmu_get_tdp_level(vcpu); 4578 role.base.direct = true; 4579 role.base.gpte_is_8_bytes = true; 4580 4581 return role; 4582 } 4583 4584 static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) 4585 { 4586 struct kvm_mmu *context = &vcpu->arch.root_mmu; 4587 struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu); 4588 union kvm_mmu_role new_role = 4589 kvm_calc_tdp_mmu_root_page_role(vcpu, ®s, false); 4590 4591 if (new_role.as_u64 == context->mmu_role.as_u64) 4592 return; 4593 4594 context->mmu_role.as_u64 = new_role.as_u64; 4595 context->page_fault = kvm_tdp_page_fault; 4596 context->sync_page = nonpaging_sync_page; 4597 context->invlpg = NULL; 4598 context->shadow_root_level = kvm_mmu_get_tdp_level(vcpu); 4599 context->direct_map = true; 4600 context->get_guest_pgd = get_cr3; 4601 context->get_pdptr = kvm_pdptr_read; 4602 context->inject_page_fault = kvm_inject_page_fault; 4603 context->root_level = role_regs_to_root_level(®s); 4604 4605 if (!is_cr0_pg(context)) 4606 context->gva_to_gpa = nonpaging_gva_to_gpa; 4607 else if (is_cr4_pae(context)) 4608 context->gva_to_gpa = paging64_gva_to_gpa; 4609 else 4610 context->gva_to_gpa = paging32_gva_to_gpa; 4611 4612 reset_guest_paging_metadata(vcpu, context); 4613 reset_tdp_shadow_zero_bits_mask(vcpu, context); 4614 } 4615 4616 static union kvm_mmu_role 4617 kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu, 4618 struct kvm_mmu_role_regs *regs, bool base_only) 4619 { 4620 union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, regs, base_only); 4621 4622 role.base.smep_andnot_wp = role.ext.cr4_smep && !____is_cr0_wp(regs); 4623 role.base.smap_andnot_wp = role.ext.cr4_smap && !____is_cr0_wp(regs); 4624 role.base.gpte_is_8_bytes = ____is_cr0_pg(regs) && ____is_cr4_pae(regs); 4625 4626 return role; 4627 } 4628 4629 static union kvm_mmu_role 4630 kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, 4631 struct kvm_mmu_role_regs *regs, bool base_only) 4632 { 4633 union kvm_mmu_role role = 4634 kvm_calc_shadow_root_page_role_common(vcpu, regs, base_only); 4635 4636 role.base.direct = !____is_cr0_pg(regs); 4637 4638 if (!____is_efer_lma(regs)) 4639 role.base.level = PT32E_ROOT_LEVEL; 4640 else if (____is_cr4_la57(regs)) 4641 role.base.level = PT64_ROOT_5LEVEL; 4642 else 4643 role.base.level = PT64_ROOT_4LEVEL; 4644 4645 return role; 4646 } 4647 4648 static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context, 4649 struct kvm_mmu_role_regs *regs, 4650 union kvm_mmu_role new_role) 4651 { 4652 if (new_role.as_u64 == context->mmu_role.as_u64) 4653 return; 4654 4655 context->mmu_role.as_u64 = new_role.as_u64; 4656 4657 if (!is_cr0_pg(context)) 4658 nonpaging_init_context(context); 4659 else if (is_cr4_pae(context)) 4660 paging64_init_context(context); 4661 else 4662 paging32_init_context(context); 4663 context->root_level = role_regs_to_root_level(regs); 4664 4665 reset_guest_paging_metadata(vcpu, context); 4666 context->shadow_root_level = new_role.base.level; 4667 4668 reset_shadow_zero_bits_mask(vcpu, context); 4669 } 4670 4671 static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, 4672 
struct kvm_mmu_role_regs *regs) 4673 { 4674 struct kvm_mmu *context = &vcpu->arch.root_mmu; 4675 union kvm_mmu_role new_role = 4676 kvm_calc_shadow_mmu_root_page_role(vcpu, regs, false); 4677 4678 shadow_mmu_init_context(vcpu, context, regs, new_role); 4679 } 4680 4681 static union kvm_mmu_role 4682 kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu, 4683 struct kvm_mmu_role_regs *regs) 4684 { 4685 union kvm_mmu_role role = 4686 kvm_calc_shadow_root_page_role_common(vcpu, regs, false); 4687 4688 role.base.direct = false; 4689 role.base.level = kvm_mmu_get_tdp_level(vcpu); 4690 4691 return role; 4692 } 4693 4694 void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0, 4695 unsigned long cr4, u64 efer, gpa_t nested_cr3) 4696 { 4697 struct kvm_mmu *context = &vcpu->arch.guest_mmu; 4698 struct kvm_mmu_role_regs regs = { 4699 .cr0 = cr0, 4700 .cr4 = cr4, 4701 .efer = efer, 4702 }; 4703 union kvm_mmu_role new_role; 4704 4705 new_role = kvm_calc_shadow_npt_root_page_role(vcpu, ®s); 4706 4707 __kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base); 4708 4709 shadow_mmu_init_context(vcpu, context, ®s, new_role); 4710 } 4711 EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu); 4712 4713 static union kvm_mmu_role 4714 kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty, 4715 bool execonly, u8 level) 4716 { 4717 union kvm_mmu_role role = {0}; 4718 4719 /* SMM flag is inherited from root_mmu */ 4720 role.base.smm = vcpu->arch.root_mmu.mmu_role.base.smm; 4721 4722 role.base.level = level; 4723 role.base.gpte_is_8_bytes = true; 4724 role.base.direct = false; 4725 role.base.ad_disabled = !accessed_dirty; 4726 role.base.guest_mode = true; 4727 role.base.access = ACC_ALL; 4728 4729 /* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. 
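 * The extended role is therefore built from scratch below instead of being
 * derived from the vCPU's register state; only 'execonly' and 'valid' are
 * meaningful for an EPT context.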
*/ 4730 role.ext.word = 0; 4731 role.ext.execonly = execonly; 4732 role.ext.valid = 1; 4733 4734 return role; 4735 } 4736 4737 void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, 4738 bool accessed_dirty, gpa_t new_eptp) 4739 { 4740 struct kvm_mmu *context = &vcpu->arch.guest_mmu; 4741 u8 level = vmx_eptp_page_walk_level(new_eptp); 4742 union kvm_mmu_role new_role = 4743 kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty, 4744 execonly, level); 4745 4746 __kvm_mmu_new_pgd(vcpu, new_eptp, new_role.base); 4747 4748 if (new_role.as_u64 == context->mmu_role.as_u64) 4749 return; 4750 4751 context->mmu_role.as_u64 = new_role.as_u64; 4752 4753 context->shadow_root_level = level; 4754 4755 context->ept_ad = accessed_dirty; 4756 context->page_fault = ept_page_fault; 4757 context->gva_to_gpa = ept_gva_to_gpa; 4758 context->sync_page = ept_sync_page; 4759 context->invlpg = ept_invlpg; 4760 context->root_level = level; 4761 context->direct_map = false; 4762 4763 update_permission_bitmask(context, true); 4764 update_pkru_bitmask(context); 4765 reset_rsvds_bits_mask_ept(vcpu, context, execonly); 4766 reset_ept_shadow_zero_bits_mask(vcpu, context, execonly); 4767 } 4768 EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); 4769 4770 static void init_kvm_softmmu(struct kvm_vcpu *vcpu) 4771 { 4772 struct kvm_mmu *context = &vcpu->arch.root_mmu; 4773 struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu); 4774 4775 kvm_init_shadow_mmu(vcpu, ®s); 4776 4777 context->get_guest_pgd = get_cr3; 4778 context->get_pdptr = kvm_pdptr_read; 4779 context->inject_page_fault = kvm_inject_page_fault; 4780 } 4781 4782 static union kvm_mmu_role 4783 kvm_calc_nested_mmu_role(struct kvm_vcpu *vcpu, struct kvm_mmu_role_regs *regs) 4784 { 4785 union kvm_mmu_role role; 4786 4787 role = kvm_calc_shadow_root_page_role_common(vcpu, regs, false); 4788 4789 /* 4790 * Nested MMUs are used only for walking L2's gva->gpa, they never have 4791 * shadow pages of their own and so "direct" has no meaning. Set it 4792 * to "true" to try to detect bogus usage of the nested MMU. 4793 */ 4794 role.base.direct = true; 4795 role.base.level = role_regs_to_root_level(regs); 4796 return role; 4797 } 4798 4799 static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) 4800 { 4801 struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu); 4802 union kvm_mmu_role new_role = kvm_calc_nested_mmu_role(vcpu, ®s); 4803 struct kvm_mmu *g_context = &vcpu->arch.nested_mmu; 4804 4805 if (new_role.as_u64 == g_context->mmu_role.as_u64) 4806 return; 4807 4808 g_context->mmu_role.as_u64 = new_role.as_u64; 4809 g_context->get_guest_pgd = get_cr3; 4810 g_context->get_pdptr = kvm_pdptr_read; 4811 g_context->inject_page_fault = kvm_inject_page_fault; 4812 g_context->root_level = new_role.base.level; 4813 4814 /* 4815 * L2 page tables are never shadowed, so there is no need to sync 4816 * SPTEs. 4817 */ 4818 g_context->invlpg = NULL; 4819 4820 /* 4821 * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using 4822 * L1's nested page tables (e.g. EPT12). The nested translation 4823 * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using 4824 * L2's page tables as the first level of translation and L1's 4825 * nested page tables as the second level of translation. Basically 4826 * the gva_to_gpa functions between mmu and nested_mmu are swapped. 
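 * E.g. resolving an L2 virtual address first walks L2's own page tables to
 * obtain an L2 GPA, and every guest-physical access made along the way is
 * itself translated through L1's nested page tables via the nested_mmu's
 * translate_gpa hook (translate_nested_gpa).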
4827 */ 4828 if (!is_paging(vcpu)) 4829 g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested; 4830 else if (is_long_mode(vcpu)) 4831 g_context->gva_to_gpa = paging64_gva_to_gpa_nested; 4832 else if (is_pae(vcpu)) 4833 g_context->gva_to_gpa = paging64_gva_to_gpa_nested; 4834 else 4835 g_context->gva_to_gpa = paging32_gva_to_gpa_nested; 4836 4837 reset_guest_paging_metadata(vcpu, g_context); 4838 } 4839 4840 void kvm_init_mmu(struct kvm_vcpu *vcpu) 4841 { 4842 if (mmu_is_nested(vcpu)) 4843 init_kvm_nested_mmu(vcpu); 4844 else if (tdp_enabled) 4845 init_kvm_tdp_mmu(vcpu); 4846 else 4847 init_kvm_softmmu(vcpu); 4848 } 4849 EXPORT_SYMBOL_GPL(kvm_init_mmu); 4850 4851 static union kvm_mmu_page_role 4852 kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu) 4853 { 4854 struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu); 4855 union kvm_mmu_role role; 4856 4857 if (tdp_enabled) 4858 role = kvm_calc_tdp_mmu_root_page_role(vcpu, ®s, true); 4859 else 4860 role = kvm_calc_shadow_mmu_root_page_role(vcpu, ®s, true); 4861 4862 return role.base; 4863 } 4864 4865 void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu) 4866 { 4867 /* 4868 * Invalidate all MMU roles to force them to reinitialize as CPUID 4869 * information is factored into reserved bit calculations. 4870 */ 4871 vcpu->arch.root_mmu.mmu_role.ext.valid = 0; 4872 vcpu->arch.guest_mmu.mmu_role.ext.valid = 0; 4873 vcpu->arch.nested_mmu.mmu_role.ext.valid = 0; 4874 kvm_mmu_reset_context(vcpu); 4875 4876 /* 4877 * KVM does not correctly handle changing guest CPUID after KVM_RUN, as 4878 * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't 4879 * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page 4880 * faults due to reusing SPs/SPTEs. Alert userspace, but otherwise 4881 * sweep the problem under the rug. 4882 * 4883 * KVM's horrific CPUID ABI makes the problem all but impossible to 4884 * solve, as correctly handling multiple vCPU models (with respect to 4885 * paging and physical address properties) in a single VM would require 4886 * tracking all relevant CPUID information in kvm_mmu_page_role. That 4887 * is very undesirable as it would double the memory requirements for 4888 * gfn_track (see struct kvm_mmu_page_role comments), and in practice 4889 * no sane VMM mucks with the core vCPU model on the fly. 
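 * last_vmentry_cpu stays -1 until the vCPU's first VM-Enter, so the check
 * below effectively means "CPUID was changed after the guest has run".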
4890 */ 4891 if (vcpu->arch.last_vmentry_cpu != -1) { 4892 pr_warn_ratelimited("KVM: KVM_SET_CPUID{,2} after KVM_RUN may cause guest instability\n"); 4893 pr_warn_ratelimited("KVM: KVM_SET_CPUID{,2} will fail after KVM_RUN starting with Linux 5.16\n"); 4894 } 4895 } 4896 4897 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) 4898 { 4899 kvm_mmu_unload(vcpu); 4900 kvm_init_mmu(vcpu); 4901 } 4902 EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); 4903 4904 int kvm_mmu_load(struct kvm_vcpu *vcpu) 4905 { 4906 int r; 4907 4908 r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->direct_map); 4909 if (r) 4910 goto out; 4911 r = mmu_alloc_special_roots(vcpu); 4912 if (r) 4913 goto out; 4914 if (vcpu->arch.mmu->direct_map) 4915 r = mmu_alloc_direct_roots(vcpu); 4916 else 4917 r = mmu_alloc_shadow_roots(vcpu); 4918 if (r) 4919 goto out; 4920 4921 kvm_mmu_sync_roots(vcpu); 4922 4923 kvm_mmu_load_pgd(vcpu); 4924 static_call(kvm_x86_tlb_flush_current)(vcpu); 4925 out: 4926 return r; 4927 } 4928 4929 void kvm_mmu_unload(struct kvm_vcpu *vcpu) 4930 { 4931 kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL); 4932 WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root_hpa)); 4933 kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 4934 WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root_hpa)); 4935 } 4936 4937 static bool need_remote_flush(u64 old, u64 new) 4938 { 4939 if (!is_shadow_present_pte(old)) 4940 return false; 4941 if (!is_shadow_present_pte(new)) 4942 return true; 4943 if ((old ^ new) & PT64_BASE_ADDR_MASK) 4944 return true; 4945 old ^= shadow_nx_mask; 4946 new ^= shadow_nx_mask; 4947 return (old & ~new & PT64_PERM_MASK) != 0; 4948 } 4949 4950 static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, 4951 int *bytes) 4952 { 4953 u64 gentry = 0; 4954 int r; 4955 4956 /* 4957 * Assume that the pte write on a page table of the same type 4958 * as the current vcpu paging mode since we update the sptes only 4959 * when they have the same mode. 4960 */ 4961 if (is_pae(vcpu) && *bytes == 4) { 4962 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ 4963 *gpa &= ~(gpa_t)7; 4964 *bytes = 8; 4965 } 4966 4967 if (*bytes == 4 || *bytes == 8) { 4968 r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes); 4969 if (r) 4970 gentry = 0; 4971 } 4972 4973 return gentry; 4974 } 4975 4976 /* 4977 * If we're seeing too many writes to a page, it may no longer be a page table, 4978 * or we may be forking, in which case it is better to unmap the page. 4979 */ 4980 static bool detect_write_flooding(struct kvm_mmu_page *sp) 4981 { 4982 /* 4983 * Skip write-flooding detected for the sp whose level is 1, because 4984 * it can become unsync, then the guest page is not write-protected. 4985 */ 4986 if (sp->role.level == PG_LEVEL_4K) 4987 return false; 4988 4989 atomic_inc(&sp->write_flooding_count); 4990 return atomic_read(&sp->write_flooding_count) >= 3; 4991 } 4992 4993 /* 4994 * Misaligned accesses are too much trouble to fix up; also, they usually 4995 * indicate a page is not used as a page table. 4996 */ 4997 static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa, 4998 int bytes) 4999 { 5000 unsigned offset, pte_size, misaligned; 5001 5002 pgprintk("misaligned: gpa %llx bytes %d role %x\n", 5003 gpa, bytes, sp->role.word); 5004 5005 offset = offset_in_page(gpa); 5006 pte_size = sp->role.gpte_is_8_bytes ? 8 : 4; 5007 5008 /* 5009 * Sometimes, the OS only writes the last one bytes to update status 5010 * bits, for example, in linux, andb instruction is used in clear_bit(). 
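 * Such an update touches only the low byte of the gpte (the Accessed and
 * Dirty bits live in bits 5 and 6), i.e. the write is pte-aligned with
 * bytes == 1 and is deliberately not reported as misaligned below.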
5011 */ 5012 if (!(offset & (pte_size - 1)) && bytes == 1) 5013 return false; 5014 5015 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); 5016 misaligned |= bytes < 4; 5017 5018 return misaligned; 5019 } 5020 5021 static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte) 5022 { 5023 unsigned page_offset, quadrant; 5024 u64 *spte; 5025 int level; 5026 5027 page_offset = offset_in_page(gpa); 5028 level = sp->role.level; 5029 *nspte = 1; 5030 if (!sp->role.gpte_is_8_bytes) { 5031 page_offset <<= 1; /* 32->64 */ 5032 /* 5033 * A 32-bit pde maps 4MB while the shadow pdes map 5034 * only 2MB. So we need to double the offset again 5035 * and zap two pdes instead of one. 5036 */ 5037 if (level == PT32_ROOT_LEVEL) { 5038 page_offset &= ~7; /* kill rounding error */ 5039 page_offset <<= 1; 5040 *nspte = 2; 5041 } 5042 quadrant = page_offset >> PAGE_SHIFT; 5043 page_offset &= ~PAGE_MASK; 5044 if (quadrant != sp->role.quadrant) 5045 return NULL; 5046 } 5047 5048 spte = &sp->spt[page_offset / sizeof(*spte)]; 5049 return spte; 5050 } 5051 5052 static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 5053 const u8 *new, int bytes, 5054 struct kvm_page_track_notifier_node *node) 5055 { 5056 gfn_t gfn = gpa >> PAGE_SHIFT; 5057 struct kvm_mmu_page *sp; 5058 LIST_HEAD(invalid_list); 5059 u64 entry, gentry, *spte; 5060 int npte; 5061 bool remote_flush, local_flush; 5062 5063 /* 5064 * If we don't have indirect shadow pages, it means no page is 5065 * write-protected, so we can exit simply. 5066 */ 5067 if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) 5068 return; 5069 5070 remote_flush = local_flush = false; 5071 5072 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); 5073 5074 /* 5075 * No need to care whether allocation memory is successful 5076 * or not since pte prefetch is skipped if it does not have 5077 * enough objects in the cache. 
5078 */ 5079 mmu_topup_memory_caches(vcpu, true); 5080 5081 write_lock(&vcpu->kvm->mmu_lock); 5082 5083 gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes); 5084 5085 ++vcpu->kvm->stat.mmu_pte_write; 5086 kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); 5087 5088 for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) { 5089 if (detect_write_misaligned(sp, gpa, bytes) || 5090 detect_write_flooding(sp)) { 5091 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); 5092 ++vcpu->kvm->stat.mmu_flooded; 5093 continue; 5094 } 5095 5096 spte = get_written_sptes(sp, gpa, &npte); 5097 if (!spte) 5098 continue; 5099 5100 local_flush = true; 5101 while (npte--) { 5102 entry = *spte; 5103 mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL); 5104 if (gentry && sp->role.level != PG_LEVEL_4K) 5105 ++vcpu->kvm->stat.mmu_pde_zapped; 5106 if (need_remote_flush(entry, *spte)) 5107 remote_flush = true; 5108 ++spte; 5109 } 5110 } 5111 kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush); 5112 kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); 5113 write_unlock(&vcpu->kvm->mmu_lock); 5114 } 5115 5116 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, 5117 void *insn, int insn_len) 5118 { 5119 int r, emulation_type = EMULTYPE_PF; 5120 bool direct = vcpu->arch.mmu->direct_map; 5121 5122 if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) 5123 return RET_PF_RETRY; 5124 5125 r = RET_PF_INVALID; 5126 if (unlikely(error_code & PFERR_RSVD_MASK)) { 5127 r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct); 5128 if (r == RET_PF_EMULATE) 5129 goto emulate; 5130 } 5131 5132 if (r == RET_PF_INVALID) { 5133 r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, 5134 lower_32_bits(error_code), false); 5135 if (WARN_ON_ONCE(r == RET_PF_INVALID)) 5136 return -EIO; 5137 } 5138 5139 if (r < 0) 5140 return r; 5141 if (r != RET_PF_EMULATE) 5142 return 1; 5143 5144 /* 5145 * Before emulating the instruction, check if the error code 5146 * was due to a RO violation while translating the guest page. 5147 * This can occur when using nested virtualization with nested 5148 * paging in both guests. If true, we simply unprotect the page 5149 * and resume the guest. 5150 */ 5151 if (vcpu->arch.mmu->direct_map && 5152 (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) { 5153 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa)); 5154 return 1; 5155 } 5156 5157 /* 5158 * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still 5159 * optimistically try to just unprotect the page and let the processor 5160 * re-execute the instruction that caused the page fault. Do not allow 5161 * retrying MMIO emulation, as it's not only pointless but could also 5162 * cause us to enter an infinite loop because the processor will keep 5163 * faulting on the non-existent MMIO address. Retrying an instruction 5164 * from a nested guest is also pointless and dangerous as we are only 5165 * explicitly shadowing L1's page tables, i.e. unprotecting something 5166 * for L1 isn't going to magically fix whatever issue cause L2 to fail. 5167 */ 5168 if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu)) 5169 emulation_type |= EMULTYPE_ALLOW_RETRY_PF; 5170 emulate: 5171 return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn, 5172 insn_len); 5173 } 5174 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); 5175 5176 void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, 5177 gva_t gva, hpa_t root_hpa) 5178 { 5179 int i; 5180 5181 /* It's actually a GPA for vcpu->arch.guest_mmu. 
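 * The guest_mmu shadows L1's nested page tables, so the address passed in
 * is an L2 GPA: it has no canonical form to check and no GVA-tagged TLB
 * entries to flush, hence both steps below are skipped for it.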
*/ 5182 if (mmu != &vcpu->arch.guest_mmu) { 5183 /* INVLPG on a non-canonical address is a NOP according to the SDM. */ 5184 if (is_noncanonical_address(gva, vcpu)) 5185 return; 5186 5187 static_call(kvm_x86_tlb_flush_gva)(vcpu, gva); 5188 } 5189 5190 if (!mmu->invlpg) 5191 return; 5192 5193 if (root_hpa == INVALID_PAGE) { 5194 mmu->invlpg(vcpu, gva, mmu->root_hpa); 5195 5196 /* 5197 * INVLPG is required to invalidate any global mappings for the VA, 5198 * irrespective of PCID. Since it would take us roughly similar amount 5199 * of work to determine whether any of the prev_root mappings of the VA 5200 * is marked global, or to just sync it blindly, so we might as well 5201 * just always sync it. 5202 * 5203 * Mappings not reachable via the current cr3 or the prev_roots will be 5204 * synced when switching to that cr3, so nothing needs to be done here 5205 * for them. 5206 */ 5207 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) 5208 if (VALID_PAGE(mmu->prev_roots[i].hpa)) 5209 mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); 5210 } else { 5211 mmu->invlpg(vcpu, gva, root_hpa); 5212 } 5213 } 5214 5215 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) 5216 { 5217 kvm_mmu_invalidate_gva(vcpu, vcpu->arch.mmu, gva, INVALID_PAGE); 5218 ++vcpu->stat.invlpg; 5219 } 5220 EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); 5221 5222 5223 void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) 5224 { 5225 struct kvm_mmu *mmu = vcpu->arch.mmu; 5226 bool tlb_flush = false; 5227 uint i; 5228 5229 if (pcid == kvm_get_active_pcid(vcpu)) { 5230 mmu->invlpg(vcpu, gva, mmu->root_hpa); 5231 tlb_flush = true; 5232 } 5233 5234 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 5235 if (VALID_PAGE(mmu->prev_roots[i].hpa) && 5236 pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd)) { 5237 mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); 5238 tlb_flush = true; 5239 } 5240 } 5241 5242 if (tlb_flush) 5243 static_call(kvm_x86_tlb_flush_gva)(vcpu, gva); 5244 5245 ++vcpu->stat.invlpg; 5246 5247 /* 5248 * Mappings not reachable via the current cr3 or the prev_roots will be 5249 * synced when switching to that cr3, so nothing needs to be done here 5250 * for them. 5251 */ 5252 } 5253 5254 void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level, 5255 int tdp_huge_page_level) 5256 { 5257 tdp_enabled = enable_tdp; 5258 max_tdp_level = tdp_max_root_level; 5259 5260 /* 5261 * max_huge_page_level reflects KVM's MMU capabilities irrespective 5262 * of kernel support, e.g. KVM may be capable of using 1GB pages when 5263 * the kernel is not. But, KVM never creates a page size greater than 5264 * what is used by the kernel for any given HVA, i.e. the kernel's 5265 * capabilities are ultimately consulted by kvm_mmu_hugepage_adjust(). 5266 */ 5267 if (tdp_enabled) 5268 max_huge_page_level = tdp_huge_page_level; 5269 else if (boot_cpu_has(X86_FEATURE_GBPAGES)) 5270 max_huge_page_level = PG_LEVEL_1G; 5271 else 5272 max_huge_page_level = PG_LEVEL_2M; 5273 } 5274 EXPORT_SYMBOL_GPL(kvm_configure_mmu); 5275 5276 /* The return value indicates if tlb flush on all vcpus is needed. */ 5277 typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head, 5278 struct kvm_memory_slot *slot); 5279 5280 /* The caller should hold mmu-lock before calling this function. 
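 * Specifically the write side of mmu_lock: the walk below may yield via
 * cond_resched_rwlock_write(), which is only legal for the write lock.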
*/ 5281 static __always_inline bool 5282 slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, 5283 slot_level_handler fn, int start_level, int end_level, 5284 gfn_t start_gfn, gfn_t end_gfn, bool flush_on_yield, 5285 bool flush) 5286 { 5287 struct slot_rmap_walk_iterator iterator; 5288 5289 for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn, 5290 end_gfn, &iterator) { 5291 if (iterator.rmap) 5292 flush |= fn(kvm, iterator.rmap, memslot); 5293 5294 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { 5295 if (flush && flush_on_yield) { 5296 kvm_flush_remote_tlbs_with_address(kvm, 5297 start_gfn, 5298 iterator.gfn - start_gfn + 1); 5299 flush = false; 5300 } 5301 cond_resched_rwlock_write(&kvm->mmu_lock); 5302 } 5303 } 5304 5305 return flush; 5306 } 5307 5308 static __always_inline bool 5309 slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot, 5310 slot_level_handler fn, int start_level, int end_level, 5311 bool flush_on_yield) 5312 { 5313 return slot_handle_level_range(kvm, memslot, fn, start_level, 5314 end_level, memslot->base_gfn, 5315 memslot->base_gfn + memslot->npages - 1, 5316 flush_on_yield, false); 5317 } 5318 5319 static __always_inline bool 5320 slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot, 5321 slot_level_handler fn, bool flush_on_yield) 5322 { 5323 return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K, 5324 PG_LEVEL_4K, flush_on_yield); 5325 } 5326 5327 static void free_mmu_pages(struct kvm_mmu *mmu) 5328 { 5329 if (!tdp_enabled && mmu->pae_root) 5330 set_memory_encrypted((unsigned long)mmu->pae_root, 1); 5331 free_page((unsigned long)mmu->pae_root); 5332 free_page((unsigned long)mmu->pml4_root); 5333 } 5334 5335 static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) 5336 { 5337 struct page *page; 5338 int i; 5339 5340 mmu->root_hpa = INVALID_PAGE; 5341 mmu->root_pgd = 0; 5342 mmu->translate_gpa = translate_gpa; 5343 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) 5344 mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; 5345 5346 /* 5347 * When using PAE paging, the four PDPTEs are treated as 'root' pages, 5348 * while the PDP table is a per-vCPU construct that's allocated at MMU 5349 * creation. When emulating 32-bit mode, cr3 is only 32 bits even on 5350 * x86_64. Therefore we need to allocate the PDP table in the first 5351 * 4GB of memory, which happens to fit the DMA32 zone. TDP paging 5352 * generally doesn't use PAE paging and can skip allocating the PDP 5353 * table. The main exception, handled here, is SVM's 32-bit NPT. The 5354 * other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit 5355 * KVM; that horror is handled on-demand by mmu_alloc_shadow_roots(). 5356 */ 5357 if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL) 5358 return 0; 5359 5360 page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32); 5361 if (!page) 5362 return -ENOMEM; 5363 5364 mmu->pae_root = page_address(page); 5365 5366 /* 5367 * CR3 is only 32 bits when PAE paging is used, thus it's impossible to 5368 * get the CPU to treat the PDPTEs as encrypted. Decrypt the page so 5369 * that KVM's writes and the CPU's reads get along. Note, this is 5370 * only necessary when using shadow paging, as 64-bit NPT can get at 5371 * the C-bit even when shadowing 32-bit NPT, and SME isn't supported 5372 * by 32-bit kernels (when KVM itself uses 32-bit NPT). 
5373 */ 5374 if (!tdp_enabled) 5375 set_memory_decrypted((unsigned long)mmu->pae_root, 1); 5376 else 5377 WARN_ON_ONCE(shadow_me_mask); 5378 5379 for (i = 0; i < 4; ++i) 5380 mmu->pae_root[i] = INVALID_PAE_ROOT; 5381 5382 return 0; 5383 } 5384 5385 int kvm_mmu_create(struct kvm_vcpu *vcpu) 5386 { 5387 int ret; 5388 5389 vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache; 5390 vcpu->arch.mmu_pte_list_desc_cache.gfp_zero = __GFP_ZERO; 5391 5392 vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache; 5393 vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO; 5394 5395 vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO; 5396 5397 vcpu->arch.mmu = &vcpu->arch.root_mmu; 5398 vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; 5399 5400 vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; 5401 5402 ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu); 5403 if (ret) 5404 return ret; 5405 5406 ret = __kvm_mmu_create(vcpu, &vcpu->arch.root_mmu); 5407 if (ret) 5408 goto fail_allocate_root; 5409 5410 return ret; 5411 fail_allocate_root: 5412 free_mmu_pages(&vcpu->arch.guest_mmu); 5413 return ret; 5414 } 5415 5416 #define BATCH_ZAP_PAGES 10 5417 static void kvm_zap_obsolete_pages(struct kvm *kvm) 5418 { 5419 struct kvm_mmu_page *sp, *node; 5420 int nr_zapped, batch = 0; 5421 5422 restart: 5423 list_for_each_entry_safe_reverse(sp, node, 5424 &kvm->arch.active_mmu_pages, link) { 5425 /* 5426 * No obsolete valid page exists before a newly created page 5427 * since active_mmu_pages is a FIFO list. 5428 */ 5429 if (!is_obsolete_sp(kvm, sp)) 5430 break; 5431 5432 /* 5433 * Invalid pages should never land back on the list of active 5434 * pages. Skip the bogus page, otherwise we'll get stuck in an 5435 * infinite loop if the page gets put back on the list (again). 5436 */ 5437 if (WARN_ON(sp->role.invalid)) 5438 continue; 5439 5440 /* 5441 * No need to flush the TLB since we're only zapping shadow 5442 * pages with an obsolete generation number and all vCPUS have 5443 * loaded a new root, i.e. the shadow pages being zapped cannot 5444 * be in active use by the guest. 5445 */ 5446 if (batch >= BATCH_ZAP_PAGES && 5447 cond_resched_rwlock_write(&kvm->mmu_lock)) { 5448 batch = 0; 5449 goto restart; 5450 } 5451 5452 if (__kvm_mmu_prepare_zap_page(kvm, sp, 5453 &kvm->arch.zapped_obsolete_pages, &nr_zapped)) { 5454 batch += nr_zapped; 5455 goto restart; 5456 } 5457 } 5458 5459 /* 5460 * Trigger a remote TLB flush before freeing the page tables to ensure 5461 * KVM is not in the middle of a lockless shadow page table walk, which 5462 * may reference the pages. 5463 */ 5464 kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages); 5465 } 5466 5467 /* 5468 * Fast invalidate all shadow pages and use lock-break technique 5469 * to zap obsolete pages. 5470 * 5471 * It's required when memslot is being deleted or VM is being 5472 * destroyed, in these cases, we should ensure that KVM MMU does 5473 * not use any resource of the being-deleted slot or all slots 5474 * after calling the function. 5475 */ 5476 static void kvm_mmu_zap_all_fast(struct kvm *kvm) 5477 { 5478 lockdep_assert_held(&kvm->slots_lock); 5479 5480 write_lock(&kvm->mmu_lock); 5481 trace_kvm_mmu_zap_all_fast(kvm); 5482 5483 /* 5484 * Toggle mmu_valid_gen between '0' and '1'. Because slots_lock is 5485 * held for the entire duration of zapping obsolete pages, it's 5486 * impossible for there to be multiple invalid generations associated 5487 * with *valid* shadow pages at any given time, i.e. 
there is exactly 5488 * one valid generation and (at most) one invalid generation. 5489 */ 5490 kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1; 5491 5492 /* In order to ensure all threads see this change when 5493 * handling the MMU reload signal, this must happen in the 5494 * same critical section as kvm_reload_remote_mmus, and 5495 * before kvm_zap_obsolete_pages as kvm_zap_obsolete_pages 5496 * could drop the MMU lock and yield. 5497 */ 5498 if (is_tdp_mmu_enabled(kvm)) 5499 kvm_tdp_mmu_invalidate_all_roots(kvm); 5500 5501 /* 5502 * Notify all vcpus to reload its shadow page table and flush TLB. 5503 * Then all vcpus will switch to new shadow page table with the new 5504 * mmu_valid_gen. 5505 * 5506 * Note: we need to do this under the protection of mmu_lock, 5507 * otherwise, vcpu would purge shadow page but miss tlb flush. 5508 */ 5509 kvm_reload_remote_mmus(kvm); 5510 5511 kvm_zap_obsolete_pages(kvm); 5512 5513 write_unlock(&kvm->mmu_lock); 5514 5515 if (is_tdp_mmu_enabled(kvm)) { 5516 read_lock(&kvm->mmu_lock); 5517 kvm_tdp_mmu_zap_invalidated_roots(kvm); 5518 read_unlock(&kvm->mmu_lock); 5519 } 5520 } 5521 5522 static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) 5523 { 5524 return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); 5525 } 5526 5527 static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm, 5528 struct kvm_memory_slot *slot, 5529 struct kvm_page_track_notifier_node *node) 5530 { 5531 kvm_mmu_zap_all_fast(kvm); 5532 } 5533 5534 void kvm_mmu_init_vm(struct kvm *kvm) 5535 { 5536 struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; 5537 5538 if (!kvm_mmu_init_tdp_mmu(kvm)) 5539 /* 5540 * No smp_load/store wrappers needed here as we are in 5541 * VM init and there cannot be any memslots / other threads 5542 * accessing this struct kvm yet. 
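 * When the TDP MMU cannot be used, the legacy/shadow MMU is the only MMU
 * and it relies on rmaps for every memslot, so flag that up front.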
5543 */ 5544 kvm->arch.memslots_have_rmaps = true; 5545 5546 node->track_write = kvm_mmu_pte_write; 5547 node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; 5548 kvm_page_track_register_notifier(kvm, node); 5549 } 5550 5551 void kvm_mmu_uninit_vm(struct kvm *kvm) 5552 { 5553 struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; 5554 5555 kvm_page_track_unregister_notifier(kvm, node); 5556 5557 kvm_mmu_uninit_tdp_mmu(kvm); 5558 } 5559 5560 void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) 5561 { 5562 struct kvm_memslots *slots; 5563 struct kvm_memory_slot *memslot; 5564 int i; 5565 bool flush = false; 5566 5567 if (kvm_memslots_have_rmaps(kvm)) { 5568 write_lock(&kvm->mmu_lock); 5569 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { 5570 slots = __kvm_memslots(kvm, i); 5571 kvm_for_each_memslot(memslot, slots) { 5572 gfn_t start, end; 5573 5574 start = max(gfn_start, memslot->base_gfn); 5575 end = min(gfn_end, memslot->base_gfn + memslot->npages); 5576 if (start >= end) 5577 continue; 5578 5579 flush = slot_handle_level_range(kvm, memslot, 5580 kvm_zap_rmapp, PG_LEVEL_4K, 5581 KVM_MAX_HUGEPAGE_LEVEL, start, 5582 end - 1, true, flush); 5583 } 5584 } 5585 if (flush) 5586 kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end); 5587 write_unlock(&kvm->mmu_lock); 5588 } 5589 5590 if (is_tdp_mmu_enabled(kvm)) { 5591 flush = false; 5592 5593 read_lock(&kvm->mmu_lock); 5594 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) 5595 flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start, 5596 gfn_end, flush, true); 5597 if (flush) 5598 kvm_flush_remote_tlbs_with_address(kvm, gfn_start, 5599 gfn_end); 5600 5601 read_unlock(&kvm->mmu_lock); 5602 } 5603 } 5604 5605 static bool slot_rmap_write_protect(struct kvm *kvm, 5606 struct kvm_rmap_head *rmap_head, 5607 struct kvm_memory_slot *slot) 5608 { 5609 return __rmap_write_protect(kvm, rmap_head, false); 5610 } 5611 5612 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, 5613 struct kvm_memory_slot *memslot, 5614 int start_level) 5615 { 5616 bool flush = false; 5617 5618 if (kvm_memslots_have_rmaps(kvm)) { 5619 write_lock(&kvm->mmu_lock); 5620 flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect, 5621 start_level, KVM_MAX_HUGEPAGE_LEVEL, 5622 false); 5623 write_unlock(&kvm->mmu_lock); 5624 } 5625 5626 if (is_tdp_mmu_enabled(kvm)) { 5627 read_lock(&kvm->mmu_lock); 5628 flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level); 5629 read_unlock(&kvm->mmu_lock); 5630 } 5631 5632 /* 5633 * We can flush all the TLBs out of the mmu lock without TLB 5634 * corruption since we just change the spte from writable to 5635 * readonly so that we only need to care the case of changing 5636 * spte from present to present (changing the spte from present 5637 * to nonpresent will flush all the TLBs immediately), in other 5638 * words, the only case we care is mmu_spte_update() where we 5639 * have checked Host-writable | MMU-writable instead of 5640 * PT_WRITABLE_MASK, that means it does not depend on PT_WRITABLE_MASK 5641 * anymore. 
5642 */ 5643 if (flush) 5644 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); 5645 } 5646 5647 static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, 5648 struct kvm_rmap_head *rmap_head, 5649 struct kvm_memory_slot *slot) 5650 { 5651 u64 *sptep; 5652 struct rmap_iterator iter; 5653 int need_tlb_flush = 0; 5654 kvm_pfn_t pfn; 5655 struct kvm_mmu_page *sp; 5656 5657 restart: 5658 for_each_rmap_spte(rmap_head, &iter, sptep) { 5659 sp = sptep_to_sp(sptep); 5660 pfn = spte_to_pfn(*sptep); 5661 5662 /* 5663 * We cannot do huge page mapping for indirect shadow pages, 5664 * which are found on the last rmap (level = 1) when not using 5665 * tdp; such shadow pages are synced with the page table in 5666 * the guest, and the guest page table is using 4K page size 5667 * mapping if the indirect sp has level = 1. 5668 */ 5669 if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && 5670 sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn, 5671 pfn, PG_LEVEL_NUM)) { 5672 pte_list_remove(rmap_head, sptep); 5673 5674 if (kvm_available_flush_tlb_with_range()) 5675 kvm_flush_remote_tlbs_with_address(kvm, sp->gfn, 5676 KVM_PAGES_PER_HPAGE(sp->role.level)); 5677 else 5678 need_tlb_flush = 1; 5679 5680 goto restart; 5681 } 5682 } 5683 5684 return need_tlb_flush; 5685 } 5686 5687 void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, 5688 const struct kvm_memory_slot *memslot) 5689 { 5690 /* FIXME: const-ify all uses of struct kvm_memory_slot. */ 5691 struct kvm_memory_slot *slot = (struct kvm_memory_slot *)memslot; 5692 bool flush = false; 5693 5694 if (kvm_memslots_have_rmaps(kvm)) { 5695 write_lock(&kvm->mmu_lock); 5696 flush = slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true); 5697 if (flush) 5698 kvm_arch_flush_remote_tlbs_memslot(kvm, slot); 5699 write_unlock(&kvm->mmu_lock); 5700 } 5701 5702 if (is_tdp_mmu_enabled(kvm)) { 5703 read_lock(&kvm->mmu_lock); 5704 flush = kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot, flush); 5705 if (flush) 5706 kvm_arch_flush_remote_tlbs_memslot(kvm, slot); 5707 read_unlock(&kvm->mmu_lock); 5708 } 5709 } 5710 5711 void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, 5712 const struct kvm_memory_slot *memslot) 5713 { 5714 /* 5715 * All current use cases for flushing the TLBs for a specific memslot 5716 * related to dirty logging, and many do the TLB flush out of mmu_lock. 5717 * The interaction between the various operations on memslot must be 5718 * serialized by slots_locks to ensure the TLB flush from one operation 5719 * is observed by any other operation on the same memslot. 5720 */ 5721 lockdep_assert_held(&kvm->slots_lock); 5722 kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn, 5723 memslot->npages); 5724 } 5725 5726 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, 5727 struct kvm_memory_slot *memslot) 5728 { 5729 bool flush = false; 5730 5731 if (kvm_memslots_have_rmaps(kvm)) { 5732 write_lock(&kvm->mmu_lock); 5733 flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, 5734 false); 5735 write_unlock(&kvm->mmu_lock); 5736 } 5737 5738 if (is_tdp_mmu_enabled(kvm)) { 5739 read_lock(&kvm->mmu_lock); 5740 flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot); 5741 read_unlock(&kvm->mmu_lock); 5742 } 5743 5744 /* 5745 * It's also safe to flush TLBs out of mmu lock here as currently this 5746 * function is only used for dirty logging, in which case flushing TLB 5747 * out of mmu lock also guarantees no dirty pages will be lost in 5748 * dirty_bitmap. 
5749 */ 5750 if (flush) 5751 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); 5752 } 5753 5754 void kvm_mmu_zap_all(struct kvm *kvm) 5755 { 5756 struct kvm_mmu_page *sp, *node; 5757 LIST_HEAD(invalid_list); 5758 int ign; 5759 5760 write_lock(&kvm->mmu_lock); 5761 restart: 5762 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { 5763 if (WARN_ON(sp->role.invalid)) 5764 continue; 5765 if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) 5766 goto restart; 5767 if (cond_resched_rwlock_write(&kvm->mmu_lock)) 5768 goto restart; 5769 } 5770 5771 kvm_mmu_commit_zap_page(kvm, &invalid_list); 5772 5773 if (is_tdp_mmu_enabled(kvm)) 5774 kvm_tdp_mmu_zap_all(kvm); 5775 5776 write_unlock(&kvm->mmu_lock); 5777 } 5778 5779 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) 5780 { 5781 WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); 5782 5783 gen &= MMIO_SPTE_GEN_MASK; 5784 5785 /* 5786 * Generation numbers are incremented in multiples of the number of 5787 * address spaces in order to provide unique generations across all 5788 * address spaces. Strip what is effectively the address space 5789 * modifier prior to checking for a wrap of the MMIO generation so 5790 * that a wrap in any address space is detected. 5791 */ 5792 gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1); 5793 5794 /* 5795 * The very rare case: if the MMIO generation number has wrapped, 5796 * zap all shadow pages. 5797 */ 5798 if (unlikely(gen == 0)) { 5799 kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n"); 5800 kvm_mmu_zap_all_fast(kvm); 5801 } 5802 } 5803 5804 static unsigned long 5805 mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) 5806 { 5807 struct kvm *kvm; 5808 int nr_to_scan = sc->nr_to_scan; 5809 unsigned long freed = 0; 5810 5811 mutex_lock(&kvm_lock); 5812 5813 list_for_each_entry(kvm, &vm_list, vm_list) { 5814 int idx; 5815 LIST_HEAD(invalid_list); 5816 5817 /* 5818 * Never scan more than sc->nr_to_scan VM instances. 5819 * Will not hit this condition practically since we do not try 5820 * to shrink more than one VM and it is very unlikely to see 5821 * !n_used_mmu_pages so many times. 5822 */ 5823 if (!nr_to_scan--) 5824 break; 5825 /* 5826 * n_used_mmu_pages is accessed without holding kvm->mmu_lock 5827 * here. We may skip a VM instance erroneously, but we do not 5828 * want to shrink a VM that only started to populate its MMU 5829 * anyway.
static unsigned long
mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
	struct kvm *kvm;
	int nr_to_scan = sc->nr_to_scan;
	unsigned long freed = 0;

	mutex_lock(&kvm_lock);

	list_for_each_entry(kvm, &vm_list, vm_list) {
		int idx;
		LIST_HEAD(invalid_list);

		/*
		 * Never scan more than sc->nr_to_scan VM instances.
		 * Will not hit this condition practically since we do not try
		 * to shrink more than one VM and it is very unlikely to see
		 * !n_used_mmu_pages so many times.
		 */
		if (!nr_to_scan--)
			break;
		/*
		 * n_used_mmu_pages is accessed without holding kvm->mmu_lock
		 * here.  We may skip a VM instance erroneously, but we do not
		 * want to shrink a VM that only started to populate its MMU
		 * anyway.
		 */
		if (!kvm->arch.n_used_mmu_pages &&
		    !kvm_has_zapped_obsolete_pages(kvm))
			continue;

		idx = srcu_read_lock(&kvm->srcu);
		write_lock(&kvm->mmu_lock);

		if (kvm_has_zapped_obsolete_pages(kvm)) {
			kvm_mmu_commit_zap_page(kvm,
						&kvm->arch.zapped_obsolete_pages);
			goto unlock;
		}

		freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);

unlock:
		write_unlock(&kvm->mmu_lock);
		srcu_read_unlock(&kvm->srcu, idx);

		/*
		 * unfair on small ones
		 * per-vm shrinkers cry out
		 * sadness comes quickly
		 */
		list_move_tail(&kvm->vm_list, &vm_list);
		break;
	}

	mutex_unlock(&kvm_lock);
	return freed;
}

static unsigned long
mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
	return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
}

static struct shrinker mmu_shrinker = {
	.count_objects = mmu_shrink_count,
	.scan_objects = mmu_shrink_scan,
	.seeks = DEFAULT_SEEKS * 10,
};

static void mmu_destroy_caches(void)
{
	kmem_cache_destroy(pte_list_desc_cache);
	kmem_cache_destroy(mmu_page_header_cache);
}

static bool get_nx_auto_mode(void)
{
	/* Return true when CPU has the bug, and mitigations are ON */
	return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off();
}

static void __set_nx_huge_pages(bool val)
{
	nx_huge_pages = itlb_multihit_kvm_mitigation = val;
}

static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
{
	bool old_val = nx_huge_pages;
	bool new_val;

	/* In "auto" mode deploy workaround only if CPU has the bug. */
	if (sysfs_streq(val, "off"))
		new_val = 0;
	else if (sysfs_streq(val, "force"))
		new_val = 1;
	else if (sysfs_streq(val, "auto"))
		new_val = get_nx_auto_mode();
	else if (strtobool(val, &new_val) < 0)
		return -EINVAL;

	__set_nx_huge_pages(new_val);

	if (new_val != old_val) {
		struct kvm *kvm;

		mutex_lock(&kvm_lock);

		list_for_each_entry(kvm, &vm_list, vm_list) {
			mutex_lock(&kvm->slots_lock);
			kvm_mmu_zap_all_fast(kvm);
			mutex_unlock(&kvm->slots_lock);

			wake_up_process(kvm->arch.nx_lpage_recovery_thread);
		}
		mutex_unlock(&kvm_lock);
	}

	return 0;
}
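
/*
 * One-time module initialization: resolve the "auto" NX huge page setting,
 * reset the shadow PTE masks, create the pte_list_desc and mmu page header
 * caches, and register the MMU shrinker.  Everything set up here is torn
 * down by kvm_mmu_module_exit().
 */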
int kvm_mmu_module_init(void)
{
	int ret = -ENOMEM;

	if (nx_huge_pages == -1)
		__set_nx_huge_pages(get_nx_auto_mode());

	/*
	 * MMU roles use union aliasing which is, generally speaking, an
	 * undefined behavior.  However, we supposedly know how compilers
	 * behave and the current status quo is unlikely to change.  The
	 * guardians below are supposed to let us know if the assumption
	 * becomes false.
	 */
	BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32));
	BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32));
	BUILD_BUG_ON(sizeof(union kvm_mmu_role) != sizeof(u64));

	kvm_mmu_reset_all_pte_masks();

	pte_list_desc_cache = kmem_cache_create("pte_list_desc",
						sizeof(struct pte_list_desc),
						0, SLAB_ACCOUNT, NULL);
	if (!pte_list_desc_cache)
		goto out;

	mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
						  sizeof(struct kvm_mmu_page),
						  0, SLAB_ACCOUNT, NULL);
	if (!mmu_page_header_cache)
		goto out;

	if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
		goto out;

	ret = register_shrinker(&mmu_shrinker);
	if (ret)
		goto out;

	return 0;

out:
	mmu_destroy_caches();
	return ret;
}

/*
 * Calculate mmu pages needed for kvm.
 */
unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm)
{
	unsigned long nr_mmu_pages;
	unsigned long nr_pages = 0;
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int i;

	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
		slots = __kvm_memslots(kvm, i);

		kvm_for_each_memslot(memslot, slots)
			nr_pages += memslot->npages;
	}

	nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
	nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES);

	return nr_mmu_pages;
}

void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
	kvm_mmu_unload(vcpu);
	free_mmu_pages(&vcpu->arch.root_mmu);
	free_mmu_pages(&vcpu->arch.guest_mmu);
	mmu_free_memory_caches(vcpu);
}

void kvm_mmu_module_exit(void)
{
	mmu_destroy_caches();
	percpu_counter_destroy(&kvm_total_used_mmu_pages);
	unregister_shrinker(&mmu_shrinker);
	mmu_audit_disable();
}

static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp)
{
	unsigned int old_val;
	int err;

	old_val = nx_huge_pages_recovery_ratio;
	err = param_set_uint(val, kp);
	if (err)
		return err;

	if (READ_ONCE(nx_huge_pages) &&
	    !old_val && nx_huge_pages_recovery_ratio) {
		struct kvm *kvm;

		mutex_lock(&kvm_lock);

		list_for_each_entry(kvm, &vm_list, vm_list)
			wake_up_process(kvm->arch.nx_lpage_recovery_thread);

		mutex_unlock(&kvm_lock);
	}

	return err;
}
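
/*
 * Zap roughly 1/nx_huge_pages_recovery_ratio of the shadow pages that were
 * forced to 4K granularity by the NX huge page mitigation (tracked on
 * lpage_disallowed_mmu_pages), giving the affected guest memory a chance to
 * be mapped with huge pages again.  Called from the per-VM
 * "kvm-nx-lpage-recovery" worker thread.
 */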
static void kvm_recover_nx_lpages(struct kvm *kvm)
{
	unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits;
	int rcu_idx;
	struct kvm_mmu_page *sp;
	unsigned int ratio;
	LIST_HEAD(invalid_list);
	bool flush = false;
	ulong to_zap;

	rcu_idx = srcu_read_lock(&kvm->srcu);
	write_lock(&kvm->mmu_lock);

	ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
	to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0;
	for ( ; to_zap; --to_zap) {
		if (list_empty(&kvm->arch.lpage_disallowed_mmu_pages))
			break;

		/*
		 * We use a separate list instead of just using active_mmu_pages
		 * because the number of lpage_disallowed pages is expected to
		 * be relatively small compared to the total.
		 */
		sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages,
				      struct kvm_mmu_page,
				      lpage_disallowed_link);
		WARN_ON_ONCE(!sp->lpage_disallowed);
		if (is_tdp_mmu_page(sp)) {
			flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
		} else {
			kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
			WARN_ON_ONCE(sp->lpage_disallowed);
		}

		if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
			kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
			cond_resched_rwlock_write(&kvm->mmu_lock);
			flush = false;
		}
	}
	kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);

	write_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, rcu_idx);
}

static long get_nx_lpage_recovery_timeout(u64 start_time)
{
	return READ_ONCE(nx_huge_pages) && READ_ONCE(nx_huge_pages_recovery_ratio)
		? start_time + 60 * HZ - get_jiffies_64()
		: MAX_SCHEDULE_TIMEOUT;
}

static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
{
	u64 start_time;
	long remaining_time;

	while (true) {
		start_time = get_jiffies_64();
		remaining_time = get_nx_lpage_recovery_timeout(start_time);

		set_current_state(TASK_INTERRUPTIBLE);
		while (!kthread_should_stop() && remaining_time > 0) {
			schedule_timeout(remaining_time);
			remaining_time = get_nx_lpage_recovery_timeout(start_time);
			set_current_state(TASK_INTERRUPTIBLE);
		}

		set_current_state(TASK_RUNNING);

		if (kthread_should_stop())
			return 0;

		kvm_recover_nx_lpages(kvm);
	}
}

int kvm_mmu_post_init_vm(struct kvm *kvm)
{
	int err;

	err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0,
					  "kvm-nx-lpage-recovery",
					  &kvm->arch.nx_lpage_recovery_thread);
	if (!err)
		kthread_unpark(kvm->arch.nx_lpage_recovery_thread);

	return err;
}

void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
{
	if (kvm->arch.nx_lpage_recovery_thread)
		kthread_stop(kvm->arch.nx_lpage_recovery_thread);
}