1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Kernel-based Virtual Machine driver for Linux 4 * 5 * This module enables machines with Intel VT-x extensions to run virtual 6 * machines without emulation or binary translation. 7 * 8 * MMU support 9 * 10 * Copyright (C) 2006 Qumranet, Inc. 11 * Copyright 2010 Red Hat, Inc. and/or its affiliates. 12 * 13 * Authors: 14 * Yaniv Kamay <yaniv@qumranet.com> 15 * Avi Kivity <avi@qumranet.com> 16 */ 17 18 #include "irq.h" 19 #include "ioapic.h" 20 #include "mmu.h" 21 #include "mmu_internal.h" 22 #include "tdp_mmu.h" 23 #include "x86.h" 24 #include "kvm_cache_regs.h" 25 #include "kvm_emulate.h" 26 #include "cpuid.h" 27 #include "spte.h" 28 29 #include <linux/kvm_host.h> 30 #include <linux/types.h> 31 #include <linux/string.h> 32 #include <linux/mm.h> 33 #include <linux/highmem.h> 34 #include <linux/moduleparam.h> 35 #include <linux/export.h> 36 #include <linux/swap.h> 37 #include <linux/hugetlb.h> 38 #include <linux/compiler.h> 39 #include <linux/srcu.h> 40 #include <linux/slab.h> 41 #include <linux/sched/signal.h> 42 #include <linux/uaccess.h> 43 #include <linux/hash.h> 44 #include <linux/kern_levels.h> 45 #include <linux/kthread.h> 46 47 #include <asm/page.h> 48 #include <asm/memtype.h> 49 #include <asm/cmpxchg.h> 50 #include <asm/io.h> 51 #include <asm/set_memory.h> 52 #include <asm/vmx.h> 53 #include <asm/kvm_page_track.h> 54 #include "trace.h" 55 56 #include "paging.h" 57 58 extern bool itlb_multihit_kvm_mitigation; 59 60 int __read_mostly nx_huge_pages = -1; 61 static uint __read_mostly nx_huge_pages_recovery_period_ms; 62 #ifdef CONFIG_PREEMPT_RT 63 /* Recovery can cause latency spikes, disable it for PREEMPT_RT. */ 64 static uint __read_mostly nx_huge_pages_recovery_ratio = 0; 65 #else 66 static uint __read_mostly nx_huge_pages_recovery_ratio = 60; 67 #endif 68 69 static int set_nx_huge_pages(const char *val, const struct kernel_param *kp); 70 static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp); 71 72 static const struct kernel_param_ops nx_huge_pages_ops = { 73 .set = set_nx_huge_pages, 74 .get = param_get_bool, 75 }; 76 77 static const struct kernel_param_ops nx_huge_pages_recovery_param_ops = { 78 .set = set_nx_huge_pages_recovery_param, 79 .get = param_get_uint, 80 }; 81 82 module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644); 83 __MODULE_PARM_TYPE(nx_huge_pages, "bool"); 84 module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_param_ops, 85 &nx_huge_pages_recovery_ratio, 0644); 86 __MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint"); 87 module_param_cb(nx_huge_pages_recovery_period_ms, &nx_huge_pages_recovery_param_ops, 88 &nx_huge_pages_recovery_period_ms, 0644); 89 __MODULE_PARM_TYPE(nx_huge_pages_recovery_period_ms, "uint"); 90 91 static bool __read_mostly force_flush_and_sync_on_reuse; 92 module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644); 93 94 /* 95 * When setting this variable to true it enables Two-Dimensional-Paging 96 * where the hardware walks 2 page tables: 97 * 1. the guest-virtual to guest-physical 98 * 2. while doing 1. it walks guest-physical to host-physical 99 * If the hardware supports that we don't need to do shadow paging. 
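 * On x86 the second dimension of the walk is provided in hardware by EPT
 * (Intel) or NPT (AMD).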
 */
bool tdp_enabled = false;

static int max_huge_page_level __read_mostly;
static int tdp_root_level __read_mostly;
static int max_tdp_level __read_mostly;

enum {
	AUDIT_PRE_PAGE_FAULT,
	AUDIT_POST_PAGE_FAULT,
	AUDIT_PRE_PTE_WRITE,
	AUDIT_POST_PTE_WRITE,
	AUDIT_PRE_SYNC,
	AUDIT_POST_SYNC
};

#ifdef MMU_DEBUG
bool dbg = 0;
module_param(dbg, bool, 0644);
#endif

#define PTE_PREFETCH_NUM		8

#define PT32_LEVEL_BITS 10

#define PT32_LEVEL_SHIFT(level) \
		(PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)

#define PT32_LVL_OFFSET_MASK(level) \
	(PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
						* PT32_LEVEL_BITS))) - 1))

#define PT32_INDEX(address, level) \
	(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))


#define PT32_BASE_ADDR_MASK PAGE_MASK
#define PT32_DIR_BASE_ADDR_MASK \
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
#define PT32_LVL_ADDR_MASK(level) \
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
					    * PT32_LEVEL_BITS))) - 1))

#include <trace/events/kvm.h>

/* make pte_list_desc fit well in cache lines */
#define PTE_LIST_EXT 14

/*
 * Slightly optimize the cacheline layout by putting `more' and `spte_count'
 * at the start; accessing the descriptor then touches only a single cacheline
 * both in the full case (spte_count == PTE_LIST_EXT) and when spte_count <= 6.
 */
struct pte_list_desc {
	struct pte_list_desc *more;
	/*
	 * The number of entries stored in this pte_list_desc.  It doesn't
	 * need to be a u64, but u64 keeps the layout aligned.  A value of
	 * PTE_LIST_EXT means the descriptor is full.
	 */
	u64 spte_count;
	u64 *sptes[PTE_LIST_EXT];
};

struct kvm_shadow_walk_iterator {
	u64 addr;
	hpa_t shadow_addr;
	u64 *sptep;
	int level;
	unsigned index;
};

#define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker)	\
	for (shadow_walk_init_using_root(&(_walker), (_vcpu),		\
					 (_root), (_addr));		\
	     shadow_walk_okay(&(_walker));				\
	     shadow_walk_next(&(_walker)))

#define for_each_shadow_entry(_vcpu, _addr, _walker)		\
	for (shadow_walk_init(&(_walker), _vcpu, _addr);	\
	     shadow_walk_okay(&(_walker));			\
	     shadow_walk_next(&(_walker)))

#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte)	\
	for (shadow_walk_init(&(_walker), _vcpu, _addr);		\
	     shadow_walk_okay(&(_walker)) &&				\
		({ spte = mmu_spte_get_lockless(_walker.sptep); 1; });	\
	     __shadow_walk_next(&(_walker), spte))

static struct kmem_cache *pte_list_desc_cache;
struct kmem_cache *mmu_page_header_cache;
static struct percpu_counter kvm_total_used_mmu_pages;

static void mmu_spte_set(u64 *sptep, u64 spte);
static union kvm_mmu_page_role
kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);

struct kvm_mmu_role_regs {
	const unsigned long cr0;
	const unsigned long cr4;
	const u64 efer;
};

#define CREATE_TRACE_POINTS
#include "mmutrace.h"

/*
 * Yes, lots of underscores.  They're a hint that you probably shouldn't be
 * reading from the role_regs.  Once the mmu_role is constructed, it becomes
 * the single source of truth for the MMU's state.
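 * The BUILD_MMU_ROLE_REGS_ACCESSOR() invocations below generate the
 * ____is_<reg>_<name>() helpers that are only meant for building that role.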
209 */ 210 #define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag) \ 211 static inline bool __maybe_unused ____is_##reg##_##name(struct kvm_mmu_role_regs *regs)\ 212 { \ 213 return !!(regs->reg & flag); \ 214 } 215 BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, pg, X86_CR0_PG); 216 BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, wp, X86_CR0_WP); 217 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pse, X86_CR4_PSE); 218 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pae, X86_CR4_PAE); 219 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smep, X86_CR4_SMEP); 220 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smap, X86_CR4_SMAP); 221 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pke, X86_CR4_PKE); 222 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, la57, X86_CR4_LA57); 223 BUILD_MMU_ROLE_REGS_ACCESSOR(efer, nx, EFER_NX); 224 BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA); 225 226 /* 227 * The MMU itself (with a valid role) is the single source of truth for the 228 * MMU. Do not use the regs used to build the MMU/role, nor the vCPU. The 229 * regs don't account for dependencies, e.g. clearing CR4 bits if CR0.PG=1, 230 * and the vCPU may be incorrect/irrelevant. 231 */ 232 #define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name) \ 233 static inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu) \ 234 { \ 235 return !!(mmu->mmu_role. base_or_ext . reg##_##name); \ 236 } 237 BUILD_MMU_ROLE_ACCESSOR(ext, cr0, pg); 238 BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp); 239 BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pse); 240 BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pae); 241 BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smep); 242 BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smap); 243 BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pke); 244 BUILD_MMU_ROLE_ACCESSOR(ext, cr4, la57); 245 BUILD_MMU_ROLE_ACCESSOR(base, efer, nx); 246 247 static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu) 248 { 249 struct kvm_mmu_role_regs regs = { 250 .cr0 = kvm_read_cr0_bits(vcpu, KVM_MMU_CR0_ROLE_BITS), 251 .cr4 = kvm_read_cr4_bits(vcpu, KVM_MMU_CR4_ROLE_BITS), 252 .efer = vcpu->arch.efer, 253 }; 254 255 return regs; 256 } 257 258 static int role_regs_to_root_level(struct kvm_mmu_role_regs *regs) 259 { 260 if (!____is_cr0_pg(regs)) 261 return 0; 262 else if (____is_efer_lma(regs)) 263 return ____is_cr4_la57(regs) ? 
PT64_ROOT_5LEVEL : 264 PT64_ROOT_4LEVEL; 265 else if (____is_cr4_pae(regs)) 266 return PT32E_ROOT_LEVEL; 267 else 268 return PT32_ROOT_LEVEL; 269 } 270 271 static inline bool kvm_available_flush_tlb_with_range(void) 272 { 273 return kvm_x86_ops.tlb_remote_flush_with_range; 274 } 275 276 static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm, 277 struct kvm_tlb_range *range) 278 { 279 int ret = -ENOTSUPP; 280 281 if (range && kvm_x86_ops.tlb_remote_flush_with_range) 282 ret = static_call(kvm_x86_tlb_remote_flush_with_range)(kvm, range); 283 284 if (ret) 285 kvm_flush_remote_tlbs(kvm); 286 } 287 288 void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, 289 u64 start_gfn, u64 pages) 290 { 291 struct kvm_tlb_range range; 292 293 range.start_gfn = start_gfn; 294 range.pages = pages; 295 296 kvm_flush_remote_tlbs_with_range(kvm, &range); 297 } 298 299 static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, 300 unsigned int access) 301 { 302 u64 spte = make_mmio_spte(vcpu, gfn, access); 303 304 trace_mark_mmio_spte(sptep, gfn, spte); 305 mmu_spte_set(sptep, spte); 306 } 307 308 static gfn_t get_mmio_spte_gfn(u64 spte) 309 { 310 u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask; 311 312 gpa |= (spte >> SHADOW_NONPRESENT_OR_RSVD_MASK_LEN) 313 & shadow_nonpresent_or_rsvd_mask; 314 315 return gpa >> PAGE_SHIFT; 316 } 317 318 static unsigned get_mmio_spte_access(u64 spte) 319 { 320 return spte & shadow_mmio_access_mask; 321 } 322 323 static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) 324 { 325 u64 kvm_gen, spte_gen, gen; 326 327 gen = kvm_vcpu_memslots(vcpu)->generation; 328 if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS)) 329 return false; 330 331 kvm_gen = gen & MMIO_SPTE_GEN_MASK; 332 spte_gen = get_mmio_spte_generation(spte); 333 334 trace_check_mmio_spte(spte, kvm_gen, spte_gen); 335 return likely(kvm_gen == spte_gen); 336 } 337 338 static int is_cpuid_PSE36(void) 339 { 340 return 1; 341 } 342 343 static gfn_t pse36_gfn_delta(u32 gpte) 344 { 345 int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT; 346 347 return (gpte & PT32_DIR_PSE36_MASK) << shift; 348 } 349 350 #ifdef CONFIG_X86_64 351 static void __set_spte(u64 *sptep, u64 spte) 352 { 353 WRITE_ONCE(*sptep, spte); 354 } 355 356 static void __update_clear_spte_fast(u64 *sptep, u64 spte) 357 { 358 WRITE_ONCE(*sptep, spte); 359 } 360 361 static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) 362 { 363 return xchg(sptep, spte); 364 } 365 366 static u64 __get_spte_lockless(u64 *sptep) 367 { 368 return READ_ONCE(*sptep); 369 } 370 #else 371 union split_spte { 372 struct { 373 u32 spte_low; 374 u32 spte_high; 375 }; 376 u64 spte; 377 }; 378 379 static void count_spte_clear(u64 *sptep, u64 spte) 380 { 381 struct kvm_mmu_page *sp = sptep_to_sp(sptep); 382 383 if (is_shadow_present_pte(spte)) 384 return; 385 386 /* Ensure the spte is completely set before we increase the count */ 387 smp_wmb(); 388 sp->clear_spte_count++; 389 } 390 391 static void __set_spte(u64 *sptep, u64 spte) 392 { 393 union split_spte *ssptep, sspte; 394 395 ssptep = (union split_spte *)sptep; 396 sspte = (union split_spte)spte; 397 398 ssptep->spte_high = sspte.spte_high; 399 400 /* 401 * If we map the spte from nonpresent to present, We should store 402 * the high bits firstly, then set present bit, so cpu can not 403 * fetch this spte while we are setting the spte. 
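	 * (The smp_wmb() below also pairs with the smp_rmb()s in the lockless
	 * reader, __get_spte_lockless().)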
	 */
	smp_wmb();

	WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
}

static void __update_clear_spte_fast(u64 *sptep, u64 spte)
{
	union split_spte *ssptep, sspte;

	ssptep = (union split_spte *)sptep;
	sspte = (union split_spte)spte;

	WRITE_ONCE(ssptep->spte_low, sspte.spte_low);

	/*
	 * If the spte is being changed from present to non-present, clear
	 * the present bit first so that the vCPU cannot fetch the stale
	 * high bits.
	 */
	smp_wmb();

	ssptep->spte_high = sspte.spte_high;
	count_spte_clear(sptep, spte);
}

static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
{
	union split_spte *ssptep, sspte, orig;

	ssptep = (union split_spte *)sptep;
	sspte = (union split_spte)spte;

	/* xchg acts as a barrier before the setting of the high bits */
	orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
	orig.spte_high = ssptep->spte_high;
	ssptep->spte_high = sspte.spte_high;
	count_spte_clear(sptep, spte);

	return orig.spte;
}

/*
 * The idea of reading the spte the lightweight way on x86_32 comes from
 * gup_get_pte (mm/gup.c).
 *
 * An spte TLB flush may be pending, because kvm_set_pte_rmapp coalesces
 * them and we are running outside of the MMU lock.  Therefore we need to
 * protect against in-progress updates of the spte.
 *
 * Reading the spte while an update is in progress may get the old value
 * for the high part of the spte.  The race is fine for a present->non-present
 * change (because the high part of the spte is ignored for non-present sptes),
 * but for a present->present change we must reread the spte.
 *
 * All such changes are done in two steps (present->non-present and
 * non-present->present), hence it is enough to count the number of
 * present->non-present updates: if it changed while reading the spte,
 * we might have hit the race.  This is done using clear_spte_count.
 */
static u64 __get_spte_lockless(u64 *sptep)
{
	struct kvm_mmu_page *sp = sptep_to_sp(sptep);
	union split_spte spte, *orig = (union split_spte *)sptep;
	int count;

retry:
	count = sp->clear_spte_count;
	smp_rmb();

	spte.spte_low = orig->spte_low;
	smp_rmb();

	spte.spte_high = orig->spte_high;
	smp_rmb();

	if (unlikely(spte.spte_low != orig->spte_low ||
	      count != sp->clear_spte_count))
		goto retry;

	return spte.spte;
}
#endif

static bool spte_has_volatile_bits(u64 spte)
{
	if (!is_shadow_present_pte(spte))
		return false;

	/*
	 * Always update the spte atomically if it can be updated outside
	 * of the mmu-lock: this ensures the dirty bit is not lost and also
	 * gives us a stable is_writable_pte() so that a required TLB flush
	 * is not missed.
	 */
	if (spte_can_locklessly_be_made_writable(spte) ||
	    is_access_track_spte(spte))
		return true;

	if (spte_ad_enabled(spte)) {
		if ((spte & shadow_accessed_mask) == 0 ||
		    (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
			return true;
	}

	return false;
}

/* Rules for using mmu_spte_set:
 * Set the sptep from non-present to present.
 * Note: the sptep being assigned *must* be either not present,
 * or in a state where the hardware will not attempt to update
 * the spte.
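 * (mmu_spte_set() has a WARN_ON() that checks the old spte is indeed
 * not present.)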
 */
static void mmu_spte_set(u64 *sptep, u64 new_spte)
{
	WARN_ON(is_shadow_present_pte(*sptep));
	__set_spte(sptep, new_spte);
}

/*
 * Update the SPTE (excluding the PFN), but do not track changes in its
 * accessed/dirty status.
 */
static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
{
	u64 old_spte = *sptep;

	WARN_ON(!is_shadow_present_pte(new_spte));

	if (!is_shadow_present_pte(old_spte)) {
		mmu_spte_set(sptep, new_spte);
		return old_spte;
	}

	if (!spte_has_volatile_bits(old_spte))
		__update_clear_spte_fast(sptep, new_spte);
	else
		old_spte = __update_clear_spte_slow(sptep, new_spte);

	WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));

	return old_spte;
}

/* Rules for using mmu_spte_update:
 * Update the state bits; the mapped pfn must not change.
 *
 * Whenever we overwrite a writable spte with a read-only one, we should
 * flush remote TLBs.  Otherwise rmap_write_protect will find a read-only
 * spte even though the writable spte might still be cached in a CPU's TLB;
 * the return value indicates this case.
 *
 * Returns true if the TLB needs to be flushed
 */
static bool mmu_spte_update(u64 *sptep, u64 new_spte)
{
	bool flush = false;
	u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);

	if (!is_shadow_present_pte(old_spte))
		return false;

	/*
	 * Updating the spte outside of the mmu-lock is safe, since we always
	 * update it atomically; see the comments in spte_has_volatile_bits().
	 */
	if (spte_can_locklessly_be_made_writable(old_spte) &&
	      !is_writable_pte(new_spte))
		flush = true;

	/*
	 * Flush TLB when accessed/dirty states are changed in the page tables,
	 * to guarantee consistency between TLB and page tables.
	 */

	if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
		flush = true;
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
	}

	if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
		flush = true;
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
	}

	return flush;
}

/*
 * Rules for using mmu_spte_clear_track_bits:
 * It sets the sptep from present to non-present and tracks the state bits;
 * it is used to clear a last-level sptep.
 * Returns the old PTE.
 */
static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
{
	kvm_pfn_t pfn;
	u64 old_spte = *sptep;
	int level = sptep_to_sp(sptep)->role.level;

	if (!spte_has_volatile_bits(old_spte))
		__update_clear_spte_fast(sptep, 0ull);
	else
		old_spte = __update_clear_spte_slow(sptep, 0ull);

	if (!is_shadow_present_pte(old_spte))
		return old_spte;

	kvm_update_page_stats(kvm, level, -1);

	pfn = spte_to_pfn(old_spte);

	/*
	 * KVM does not hold a refcount on the pages used by the KVM MMU;
	 * before such a page is reclaimed, it must first be unmapped from
	 * the MMU.
	 */
	WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));

	if (is_accessed_spte(old_spte))
		kvm_set_pfn_accessed(pfn);

	if (is_dirty_spte(old_spte))
		kvm_set_pfn_dirty(pfn);

	return old_spte;
}

/*
 * Rules for using mmu_spte_clear_no_track:
 * Directly clear the spte without caring about its state bits; it is
 * used when clearing an upper-level spte.
638 */ 639 static void mmu_spte_clear_no_track(u64 *sptep) 640 { 641 __update_clear_spte_fast(sptep, 0ull); 642 } 643 644 static u64 mmu_spte_get_lockless(u64 *sptep) 645 { 646 return __get_spte_lockless(sptep); 647 } 648 649 /* Restore an acc-track PTE back to a regular PTE */ 650 static u64 restore_acc_track_spte(u64 spte) 651 { 652 u64 new_spte = spte; 653 u64 saved_bits = (spte >> SHADOW_ACC_TRACK_SAVED_BITS_SHIFT) 654 & SHADOW_ACC_TRACK_SAVED_BITS_MASK; 655 656 WARN_ON_ONCE(spte_ad_enabled(spte)); 657 WARN_ON_ONCE(!is_access_track_spte(spte)); 658 659 new_spte &= ~shadow_acc_track_mask; 660 new_spte &= ~(SHADOW_ACC_TRACK_SAVED_BITS_MASK << 661 SHADOW_ACC_TRACK_SAVED_BITS_SHIFT); 662 new_spte |= saved_bits; 663 664 return new_spte; 665 } 666 667 /* Returns the Accessed status of the PTE and resets it at the same time. */ 668 static bool mmu_spte_age(u64 *sptep) 669 { 670 u64 spte = mmu_spte_get_lockless(sptep); 671 672 if (!is_accessed_spte(spte)) 673 return false; 674 675 if (spte_ad_enabled(spte)) { 676 clear_bit((ffs(shadow_accessed_mask) - 1), 677 (unsigned long *)sptep); 678 } else { 679 /* 680 * Capture the dirty status of the page, so that it doesn't get 681 * lost when the SPTE is marked for access tracking. 682 */ 683 if (is_writable_pte(spte)) 684 kvm_set_pfn_dirty(spte_to_pfn(spte)); 685 686 spte = mark_spte_for_access_track(spte); 687 mmu_spte_update_no_track(sptep, spte); 688 } 689 690 return true; 691 } 692 693 static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) 694 { 695 if (is_tdp_mmu(vcpu->arch.mmu)) { 696 kvm_tdp_mmu_walk_lockless_begin(); 697 } else { 698 /* 699 * Prevent page table teardown by making any free-er wait during 700 * kvm_flush_remote_tlbs() IPI to all active vcpus. 701 */ 702 local_irq_disable(); 703 704 /* 705 * Make sure a following spte read is not reordered ahead of the write 706 * to vcpu->mode. 707 */ 708 smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES); 709 } 710 } 711 712 static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) 713 { 714 if (is_tdp_mmu(vcpu->arch.mmu)) { 715 kvm_tdp_mmu_walk_lockless_end(); 716 } else { 717 /* 718 * Make sure the write to vcpu->mode is not reordered in front of 719 * reads to sptes. If it does, kvm_mmu_commit_zap_page() can see us 720 * OUTSIDE_GUEST_MODE and proceed to free the shadow page table. 721 */ 722 smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE); 723 local_irq_enable(); 724 } 725 } 726 727 static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect) 728 { 729 int r; 730 731 /* 1 rmap, 1 parent PTE per level, and the prefetched rmaps. 
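	 * That is, up to 1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM pte_list_desc
	 * objects, which matches the topup count used just below.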
*/ 732 r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache, 733 1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM); 734 if (r) 735 return r; 736 r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache, 737 PT64_ROOT_MAX_LEVEL); 738 if (r) 739 return r; 740 if (maybe_indirect) { 741 r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_gfn_array_cache, 742 PT64_ROOT_MAX_LEVEL); 743 if (r) 744 return r; 745 } 746 return kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, 747 PT64_ROOT_MAX_LEVEL); 748 } 749 750 static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) 751 { 752 kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache); 753 kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache); 754 kvm_mmu_free_memory_cache(&vcpu->arch.mmu_gfn_array_cache); 755 kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); 756 } 757 758 static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu) 759 { 760 return kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache); 761 } 762 763 static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc) 764 { 765 kmem_cache_free(pte_list_desc_cache, pte_list_desc); 766 } 767 768 static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) 769 { 770 if (!sp->role.direct) 771 return sp->gfns[index]; 772 773 return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS)); 774 } 775 776 static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) 777 { 778 if (!sp->role.direct) { 779 sp->gfns[index] = gfn; 780 return; 781 } 782 783 if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index))) 784 pr_err_ratelimited("gfn mismatch under direct page %llx " 785 "(expected %llx, got %llx)\n", 786 sp->gfn, 787 kvm_mmu_page_get_gfn(sp, index), gfn); 788 } 789 790 /* 791 * Return the pointer to the large page information for a given gfn, 792 * handling slots that are not large page aligned. 793 */ 794 static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, 795 const struct kvm_memory_slot *slot, int level) 796 { 797 unsigned long idx; 798 799 idx = gfn_to_index(gfn, slot->base_gfn, level); 800 return &slot->arch.lpage_info[level - 2][idx]; 801 } 802 803 static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot, 804 gfn_t gfn, int count) 805 { 806 struct kvm_lpage_info *linfo; 807 int i; 808 809 for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { 810 linfo = lpage_info_slot(gfn, slot, i); 811 linfo->disallow_lpage += count; 812 WARN_ON(linfo->disallow_lpage < 0); 813 } 814 } 815 816 void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn) 817 { 818 update_gfn_disallow_lpage_count(slot, gfn, 1); 819 } 820 821 void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn) 822 { 823 update_gfn_disallow_lpage_count(slot, gfn, -1); 824 } 825 826 static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) 827 { 828 struct kvm_memslots *slots; 829 struct kvm_memory_slot *slot; 830 gfn_t gfn; 831 832 kvm->arch.indirect_shadow_pages++; 833 gfn = sp->gfn; 834 slots = kvm_memslots_for_spte_role(kvm, sp->role); 835 slot = __gfn_to_memslot(slots, gfn); 836 837 /* the non-leaf shadow pages are keeping readonly. 
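	 * Their gfn is added to the write-track pool (KVM_PAGE_TRACK_WRITE)
	 * below so that guest writes to the shadowed page table are intercepted.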
*/ 838 if (sp->role.level > PG_LEVEL_4K) 839 return kvm_slot_page_track_add_page(kvm, slot, gfn, 840 KVM_PAGE_TRACK_WRITE); 841 842 kvm_mmu_gfn_disallow_lpage(slot, gfn); 843 } 844 845 void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) 846 { 847 if (sp->lpage_disallowed) 848 return; 849 850 ++kvm->stat.nx_lpage_splits; 851 list_add_tail(&sp->lpage_disallowed_link, 852 &kvm->arch.lpage_disallowed_mmu_pages); 853 sp->lpage_disallowed = true; 854 } 855 856 static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) 857 { 858 struct kvm_memslots *slots; 859 struct kvm_memory_slot *slot; 860 gfn_t gfn; 861 862 kvm->arch.indirect_shadow_pages--; 863 gfn = sp->gfn; 864 slots = kvm_memslots_for_spte_role(kvm, sp->role); 865 slot = __gfn_to_memslot(slots, gfn); 866 if (sp->role.level > PG_LEVEL_4K) 867 return kvm_slot_page_track_remove_page(kvm, slot, gfn, 868 KVM_PAGE_TRACK_WRITE); 869 870 kvm_mmu_gfn_allow_lpage(slot, gfn); 871 } 872 873 void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) 874 { 875 --kvm->stat.nx_lpage_splits; 876 sp->lpage_disallowed = false; 877 list_del(&sp->lpage_disallowed_link); 878 } 879 880 static struct kvm_memory_slot * 881 gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, 882 bool no_dirty_log) 883 { 884 struct kvm_memory_slot *slot; 885 886 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 887 if (!slot || slot->flags & KVM_MEMSLOT_INVALID) 888 return NULL; 889 if (no_dirty_log && kvm_slot_dirty_track_enabled(slot)) 890 return NULL; 891 892 return slot; 893 } 894 895 /* 896 * About rmap_head encoding: 897 * 898 * If the bit zero of rmap_head->val is clear, then it points to the only spte 899 * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct 900 * pte_list_desc containing more mappings. 901 */ 902 903 /* 904 * Returns the number of pointers in the rmap chain, not counting the new one. 
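 * The caller must have topped up the pte_list_desc cache beforehand;
 * see mmu_topup_memory_caches() and rmap_can_add().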
905 */ 906 static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, 907 struct kvm_rmap_head *rmap_head) 908 { 909 struct pte_list_desc *desc; 910 int count = 0; 911 912 if (!rmap_head->val) { 913 rmap_printk("%p %llx 0->1\n", spte, *spte); 914 rmap_head->val = (unsigned long)spte; 915 } else if (!(rmap_head->val & 1)) { 916 rmap_printk("%p %llx 1->many\n", spte, *spte); 917 desc = mmu_alloc_pte_list_desc(vcpu); 918 desc->sptes[0] = (u64 *)rmap_head->val; 919 desc->sptes[1] = spte; 920 desc->spte_count = 2; 921 rmap_head->val = (unsigned long)desc | 1; 922 ++count; 923 } else { 924 rmap_printk("%p %llx many->many\n", spte, *spte); 925 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); 926 while (desc->spte_count == PTE_LIST_EXT) { 927 count += PTE_LIST_EXT; 928 if (!desc->more) { 929 desc->more = mmu_alloc_pte_list_desc(vcpu); 930 desc = desc->more; 931 desc->spte_count = 0; 932 break; 933 } 934 desc = desc->more; 935 } 936 count += desc->spte_count; 937 desc->sptes[desc->spte_count++] = spte; 938 } 939 return count; 940 } 941 942 static void 943 pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head, 944 struct pte_list_desc *desc, int i, 945 struct pte_list_desc *prev_desc) 946 { 947 int j = desc->spte_count - 1; 948 949 desc->sptes[i] = desc->sptes[j]; 950 desc->sptes[j] = NULL; 951 desc->spte_count--; 952 if (desc->spte_count) 953 return; 954 if (!prev_desc && !desc->more) 955 rmap_head->val = 0; 956 else 957 if (prev_desc) 958 prev_desc->more = desc->more; 959 else 960 rmap_head->val = (unsigned long)desc->more | 1; 961 mmu_free_pte_list_desc(desc); 962 } 963 964 static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) 965 { 966 struct pte_list_desc *desc; 967 struct pte_list_desc *prev_desc; 968 int i; 969 970 if (!rmap_head->val) { 971 pr_err("%s: %p 0->BUG\n", __func__, spte); 972 BUG(); 973 } else if (!(rmap_head->val & 1)) { 974 rmap_printk("%p 1->0\n", spte); 975 if ((u64 *)rmap_head->val != spte) { 976 pr_err("%s: %p 1->BUG\n", __func__, spte); 977 BUG(); 978 } 979 rmap_head->val = 0; 980 } else { 981 rmap_printk("%p many->many\n", spte); 982 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); 983 prev_desc = NULL; 984 while (desc) { 985 for (i = 0; i < desc->spte_count; ++i) { 986 if (desc->sptes[i] == spte) { 987 pte_list_desc_remove_entry(rmap_head, 988 desc, i, prev_desc); 989 return; 990 } 991 } 992 prev_desc = desc; 993 desc = desc->more; 994 } 995 pr_err("%s: %p many->many\n", __func__, spte); 996 BUG(); 997 } 998 } 999 1000 static void pte_list_remove(struct kvm *kvm, struct kvm_rmap_head *rmap_head, 1001 u64 *sptep) 1002 { 1003 mmu_spte_clear_track_bits(kvm, sptep); 1004 __pte_list_remove(sptep, rmap_head); 1005 } 1006 1007 /* Return true if rmap existed, false otherwise */ 1008 static bool pte_list_destroy(struct kvm *kvm, struct kvm_rmap_head *rmap_head) 1009 { 1010 struct pte_list_desc *desc, *next; 1011 int i; 1012 1013 if (!rmap_head->val) 1014 return false; 1015 1016 if (!(rmap_head->val & 1)) { 1017 mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val); 1018 goto out; 1019 } 1020 1021 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); 1022 1023 for (; desc; desc = next) { 1024 for (i = 0; i < desc->spte_count; i++) 1025 mmu_spte_clear_track_bits(kvm, desc->sptes[i]); 1026 next = desc->more; 1027 mmu_free_pte_list_desc(desc); 1028 } 1029 out: 1030 /* rmap_head is meaningless now, remember to reset it */ 1031 rmap_head->val = 0; 1032 return true; 1033 } 1034 1035 unsigned int pte_list_count(struct kvm_rmap_head *rmap_head) 1036 
{ 1037 struct pte_list_desc *desc; 1038 unsigned int count = 0; 1039 1040 if (!rmap_head->val) 1041 return 0; 1042 else if (!(rmap_head->val & 1)) 1043 return 1; 1044 1045 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); 1046 1047 while (desc) { 1048 count += desc->spte_count; 1049 desc = desc->more; 1050 } 1051 1052 return count; 1053 } 1054 1055 static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level, 1056 const struct kvm_memory_slot *slot) 1057 { 1058 unsigned long idx; 1059 1060 idx = gfn_to_index(gfn, slot->base_gfn, level); 1061 return &slot->arch.rmap[level - PG_LEVEL_4K][idx]; 1062 } 1063 1064 static bool rmap_can_add(struct kvm_vcpu *vcpu) 1065 { 1066 struct kvm_mmu_memory_cache *mc; 1067 1068 mc = &vcpu->arch.mmu_pte_list_desc_cache; 1069 return kvm_mmu_memory_cache_nr_free_objects(mc); 1070 } 1071 1072 static void rmap_remove(struct kvm *kvm, u64 *spte) 1073 { 1074 struct kvm_memslots *slots; 1075 struct kvm_memory_slot *slot; 1076 struct kvm_mmu_page *sp; 1077 gfn_t gfn; 1078 struct kvm_rmap_head *rmap_head; 1079 1080 sp = sptep_to_sp(spte); 1081 gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); 1082 1083 /* 1084 * Unlike rmap_add, rmap_remove does not run in the context of a vCPU 1085 * so we have to determine which memslots to use based on context 1086 * information in sp->role. 1087 */ 1088 slots = kvm_memslots_for_spte_role(kvm, sp->role); 1089 1090 slot = __gfn_to_memslot(slots, gfn); 1091 rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); 1092 1093 __pte_list_remove(spte, rmap_head); 1094 } 1095 1096 /* 1097 * Used by the following functions to iterate through the sptes linked by a 1098 * rmap. All fields are private and not assumed to be used outside. 1099 */ 1100 struct rmap_iterator { 1101 /* private fields */ 1102 struct pte_list_desc *desc; /* holds the sptep if not NULL */ 1103 int pos; /* index of the sptep */ 1104 }; 1105 1106 /* 1107 * Iteration must be started by this function. This should also be used after 1108 * removing/dropping sptes from the rmap link because in such cases the 1109 * information in the iterator may not be valid. 1110 * 1111 * Returns sptep if found, NULL otherwise. 1112 */ 1113 static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head, 1114 struct rmap_iterator *iter) 1115 { 1116 u64 *sptep; 1117 1118 if (!rmap_head->val) 1119 return NULL; 1120 1121 if (!(rmap_head->val & 1)) { 1122 iter->desc = NULL; 1123 sptep = (u64 *)rmap_head->val; 1124 goto out; 1125 } 1126 1127 iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); 1128 iter->pos = 0; 1129 sptep = iter->desc->sptes[iter->pos]; 1130 out: 1131 BUG_ON(!is_shadow_present_pte(*sptep)); 1132 return sptep; 1133 } 1134 1135 /* 1136 * Must be used with a valid iterator: e.g. after rmap_get_first(). 1137 * 1138 * Returns sptep if found, NULL otherwise. 
 */
static u64 *rmap_get_next(struct rmap_iterator *iter)
{
	u64 *sptep;

	if (iter->desc) {
		if (iter->pos < PTE_LIST_EXT - 1) {
			++iter->pos;
			sptep = iter->desc->sptes[iter->pos];
			if (sptep)
				goto out;
		}

		iter->desc = iter->desc->more;

		if (iter->desc) {
			iter->pos = 0;
			/* desc->sptes[0] cannot be NULL */
			sptep = iter->desc->sptes[iter->pos];
			goto out;
		}
	}

	return NULL;
out:
	BUG_ON(!is_shadow_present_pte(*sptep));
	return sptep;
}

#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_)			\
	for (_spte_ = rmap_get_first(_rmap_head_, _iter_);		\
	     _spte_; _spte_ = rmap_get_next(_iter_))

static void drop_spte(struct kvm *kvm, u64 *sptep)
{
	u64 old_spte = mmu_spte_clear_track_bits(kvm, sptep);

	if (is_shadow_present_pte(old_spte))
		rmap_remove(kvm, sptep);
}


static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
{
	if (is_large_pte(*sptep)) {
		WARN_ON(sptep_to_sp(sptep)->role.level == PG_LEVEL_4K);
		drop_spte(kvm, sptep);
		return true;
	}

	return false;
}

static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
{
	if (__drop_large_spte(vcpu->kvm, sptep)) {
		struct kvm_mmu_page *sp = sptep_to_sp(sptep);

		kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
			KVM_PAGES_PER_HPAGE(sp->role.level));
	}
}

/*
 * Write-protect the specified @sptep; @pt_protect indicates whether the
 * write-protection is being done to protect a shadow page table.
 *
 * Note: write protection means different things for dirty logging and for
 * spte protection:
 * - for dirty logging, the spte can be made writable at any time as long
 *   as its dirty bitmap is properly set.
 * - for spte protection, the spte can be made writable only after the
 *   shadow page has been unsynced.
 *
 * Return true if the TLB needs to be flushed.
 */
static bool spte_write_protect(u64 *sptep, bool pt_protect)
{
	u64 spte = *sptep;

	if (!is_writable_pte(spte) &&
	      !(pt_protect && spte_can_locklessly_be_made_writable(spte)))
		return false;

	rmap_printk("spte %p %llx\n", sptep, *sptep);

	if (pt_protect)
		spte &= ~shadow_mmu_writable_mask;
	spte = spte & ~PT_WRITABLE_MASK;

	return mmu_spte_update(sptep, spte);
}

static bool __rmap_write_protect(struct kvm *kvm,
				 struct kvm_rmap_head *rmap_head,
				 bool pt_protect)
{
	u64 *sptep;
	struct rmap_iterator iter;
	bool flush = false;

	for_each_rmap_spte(rmap_head, &iter, sptep)
		flush |= spte_write_protect(sptep, pt_protect);

	return flush;
}

static bool spte_clear_dirty(u64 *sptep)
{
	u64 spte = *sptep;

	rmap_printk("spte %p %llx\n", sptep, *sptep);

	MMU_WARN_ON(!spte_ad_enabled(spte));
	spte &= ~shadow_dirty_mask;
	return mmu_spte_update(sptep, spte);
}

static bool spte_wrprot_for_clear_dirty(u64 *sptep)
{
	bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
					       (unsigned long *)sptep);
	if (was_writable && !spte_ad_enabled(*sptep))
		kvm_set_pfn_dirty(spte_to_pfn(*sptep));

	return was_writable;
}

/*
 * Gets the GFN ready for another round of dirty logging by clearing the
 * - D bit on ad-enabled SPTEs, and
 * - W bit on ad-disabled SPTEs.
1271 * Returns true iff any D or W bits were cleared. 1272 */ 1273 static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head, 1274 const struct kvm_memory_slot *slot) 1275 { 1276 u64 *sptep; 1277 struct rmap_iterator iter; 1278 bool flush = false; 1279 1280 for_each_rmap_spte(rmap_head, &iter, sptep) 1281 if (spte_ad_need_write_protect(*sptep)) 1282 flush |= spte_wrprot_for_clear_dirty(sptep); 1283 else 1284 flush |= spte_clear_dirty(sptep); 1285 1286 return flush; 1287 } 1288 1289 /** 1290 * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages 1291 * @kvm: kvm instance 1292 * @slot: slot to protect 1293 * @gfn_offset: start of the BITS_PER_LONG pages we care about 1294 * @mask: indicates which pages we should protect 1295 * 1296 * Used when we do not need to care about huge page mappings. 1297 */ 1298 static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, 1299 struct kvm_memory_slot *slot, 1300 gfn_t gfn_offset, unsigned long mask) 1301 { 1302 struct kvm_rmap_head *rmap_head; 1303 1304 if (is_tdp_mmu_enabled(kvm)) 1305 kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, 1306 slot->base_gfn + gfn_offset, mask, true); 1307 1308 if (!kvm_memslots_have_rmaps(kvm)) 1309 return; 1310 1311 while (mask) { 1312 rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), 1313 PG_LEVEL_4K, slot); 1314 __rmap_write_protect(kvm, rmap_head, false); 1315 1316 /* clear the first set bit */ 1317 mask &= mask - 1; 1318 } 1319 } 1320 1321 /** 1322 * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write 1323 * protect the page if the D-bit isn't supported. 1324 * @kvm: kvm instance 1325 * @slot: slot to clear D-bit 1326 * @gfn_offset: start of the BITS_PER_LONG pages we care about 1327 * @mask: indicates which pages we should clear D-bit 1328 * 1329 * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap. 1330 */ 1331 static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, 1332 struct kvm_memory_slot *slot, 1333 gfn_t gfn_offset, unsigned long mask) 1334 { 1335 struct kvm_rmap_head *rmap_head; 1336 1337 if (is_tdp_mmu_enabled(kvm)) 1338 kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, 1339 slot->base_gfn + gfn_offset, mask, false); 1340 1341 if (!kvm_memslots_have_rmaps(kvm)) 1342 return; 1343 1344 while (mask) { 1345 rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), 1346 PG_LEVEL_4K, slot); 1347 __rmap_clear_dirty(kvm, rmap_head, slot); 1348 1349 /* clear the first set bit */ 1350 mask &= mask - 1; 1351 } 1352 } 1353 1354 /** 1355 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected 1356 * PT level pages. 1357 * 1358 * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to 1359 * enable dirty logging for them. 1360 * 1361 * We need to care about huge page mappings: e.g. during dirty logging we may 1362 * have such mappings. 1363 */ 1364 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, 1365 struct kvm_memory_slot *slot, 1366 gfn_t gfn_offset, unsigned long mask) 1367 { 1368 /* 1369 * Huge pages are NOT write protected when we start dirty logging in 1370 * initially-all-set mode; must write protect them here so that they 1371 * are split to 4K on the first write. 1372 * 1373 * The gfn_offset is guaranteed to be aligned to 64, but the base_gfn 1374 * of memslot has no such restriction, so the range can cross two large 1375 * pages. 
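	 * Hence both the first and the last gfn covered by the mask may need
	 * to be write-protected at the 2M level below.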
1376 */ 1377 if (kvm_dirty_log_manual_protect_and_init_set(kvm)) { 1378 gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask); 1379 gfn_t end = slot->base_gfn + gfn_offset + __fls(mask); 1380 1381 kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M); 1382 1383 /* Cross two large pages? */ 1384 if (ALIGN(start << PAGE_SHIFT, PMD_SIZE) != 1385 ALIGN(end << PAGE_SHIFT, PMD_SIZE)) 1386 kvm_mmu_slot_gfn_write_protect(kvm, slot, end, 1387 PG_LEVEL_2M); 1388 } 1389 1390 /* Now handle 4K PTEs. */ 1391 if (kvm_x86_ops.cpu_dirty_log_size) 1392 kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask); 1393 else 1394 kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); 1395 } 1396 1397 int kvm_cpu_dirty_log_size(void) 1398 { 1399 return kvm_x86_ops.cpu_dirty_log_size; 1400 } 1401 1402 bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, 1403 struct kvm_memory_slot *slot, u64 gfn, 1404 int min_level) 1405 { 1406 struct kvm_rmap_head *rmap_head; 1407 int i; 1408 bool write_protected = false; 1409 1410 if (kvm_memslots_have_rmaps(kvm)) { 1411 for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { 1412 rmap_head = gfn_to_rmap(gfn, i, slot); 1413 write_protected |= __rmap_write_protect(kvm, rmap_head, true); 1414 } 1415 } 1416 1417 if (is_tdp_mmu_enabled(kvm)) 1418 write_protected |= 1419 kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, min_level); 1420 1421 return write_protected; 1422 } 1423 1424 static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) 1425 { 1426 struct kvm_memory_slot *slot; 1427 1428 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 1429 return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K); 1430 } 1431 1432 static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, 1433 const struct kvm_memory_slot *slot) 1434 { 1435 return pte_list_destroy(kvm, rmap_head); 1436 } 1437 1438 static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, 1439 struct kvm_memory_slot *slot, gfn_t gfn, int level, 1440 pte_t unused) 1441 { 1442 return kvm_zap_rmapp(kvm, rmap_head, slot); 1443 } 1444 1445 static bool kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, 1446 struct kvm_memory_slot *slot, gfn_t gfn, int level, 1447 pte_t pte) 1448 { 1449 u64 *sptep; 1450 struct rmap_iterator iter; 1451 bool need_flush = false; 1452 u64 new_spte; 1453 kvm_pfn_t new_pfn; 1454 1455 WARN_ON(pte_huge(pte)); 1456 new_pfn = pte_pfn(pte); 1457 1458 restart: 1459 for_each_rmap_spte(rmap_head, &iter, sptep) { 1460 rmap_printk("spte %p %llx gfn %llx (%d)\n", 1461 sptep, *sptep, gfn, level); 1462 1463 need_flush = true; 1464 1465 if (pte_write(pte)) { 1466 pte_list_remove(kvm, rmap_head, sptep); 1467 goto restart; 1468 } else { 1469 new_spte = kvm_mmu_changed_pte_notifier_make_spte( 1470 *sptep, new_pfn); 1471 1472 mmu_spte_clear_track_bits(kvm, sptep); 1473 mmu_spte_set(sptep, new_spte); 1474 } 1475 } 1476 1477 if (need_flush && kvm_available_flush_tlb_with_range()) { 1478 kvm_flush_remote_tlbs_with_address(kvm, gfn, 1); 1479 return false; 1480 } 1481 1482 return need_flush; 1483 } 1484 1485 struct slot_rmap_walk_iterator { 1486 /* input fields. */ 1487 const struct kvm_memory_slot *slot; 1488 gfn_t start_gfn; 1489 gfn_t end_gfn; 1490 int start_level; 1491 int end_level; 1492 1493 /* output fields. */ 1494 gfn_t gfn; 1495 struct kvm_rmap_head *rmap; 1496 int level; 1497 1498 /* private field. 
*/ 1499 struct kvm_rmap_head *end_rmap; 1500 }; 1501 1502 static void 1503 rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level) 1504 { 1505 iterator->level = level; 1506 iterator->gfn = iterator->start_gfn; 1507 iterator->rmap = gfn_to_rmap(iterator->gfn, level, iterator->slot); 1508 iterator->end_rmap = gfn_to_rmap(iterator->end_gfn, level, iterator->slot); 1509 } 1510 1511 static void 1512 slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator, 1513 const struct kvm_memory_slot *slot, int start_level, 1514 int end_level, gfn_t start_gfn, gfn_t end_gfn) 1515 { 1516 iterator->slot = slot; 1517 iterator->start_level = start_level; 1518 iterator->end_level = end_level; 1519 iterator->start_gfn = start_gfn; 1520 iterator->end_gfn = end_gfn; 1521 1522 rmap_walk_init_level(iterator, iterator->start_level); 1523 } 1524 1525 static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator) 1526 { 1527 return !!iterator->rmap; 1528 } 1529 1530 static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator) 1531 { 1532 if (++iterator->rmap <= iterator->end_rmap) { 1533 iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level)); 1534 return; 1535 } 1536 1537 if (++iterator->level > iterator->end_level) { 1538 iterator->rmap = NULL; 1539 return; 1540 } 1541 1542 rmap_walk_init_level(iterator, iterator->level); 1543 } 1544 1545 #define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_, \ 1546 _start_gfn, _end_gfn, _iter_) \ 1547 for (slot_rmap_walk_init(_iter_, _slot_, _start_level_, \ 1548 _end_level_, _start_gfn, _end_gfn); \ 1549 slot_rmap_walk_okay(_iter_); \ 1550 slot_rmap_walk_next(_iter_)) 1551 1552 typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head, 1553 struct kvm_memory_slot *slot, gfn_t gfn, 1554 int level, pte_t pte); 1555 1556 static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm, 1557 struct kvm_gfn_range *range, 1558 rmap_handler_t handler) 1559 { 1560 struct slot_rmap_walk_iterator iterator; 1561 bool ret = false; 1562 1563 for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, 1564 range->start, range->end - 1, &iterator) 1565 ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn, 1566 iterator.level, range->pte); 1567 1568 return ret; 1569 } 1570 1571 bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) 1572 { 1573 bool flush = false; 1574 1575 if (kvm_memslots_have_rmaps(kvm)) 1576 flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp); 1577 1578 if (is_tdp_mmu_enabled(kvm)) 1579 flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush); 1580 1581 return flush; 1582 } 1583 1584 bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) 1585 { 1586 bool flush = false; 1587 1588 if (kvm_memslots_have_rmaps(kvm)) 1589 flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmapp); 1590 1591 if (is_tdp_mmu_enabled(kvm)) 1592 flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range); 1593 1594 return flush; 1595 } 1596 1597 static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, 1598 struct kvm_memory_slot *slot, gfn_t gfn, int level, 1599 pte_t unused) 1600 { 1601 u64 *sptep; 1602 struct rmap_iterator iter; 1603 int young = 0; 1604 1605 for_each_rmap_spte(rmap_head, &iter, sptep) 1606 young |= mmu_spte_age(sptep); 1607 1608 return young; 1609 } 1610 1611 static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, 1612 struct kvm_memory_slot *slot, gfn_t gfn, 1613 int level, pte_t unused) 1614 { 1615 u64 *sptep; 
1616 struct rmap_iterator iter; 1617 1618 for_each_rmap_spte(rmap_head, &iter, sptep) 1619 if (is_accessed_spte(*sptep)) 1620 return true; 1621 return false; 1622 } 1623 1624 #define RMAP_RECYCLE_THRESHOLD 1000 1625 1626 static void rmap_add(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, 1627 u64 *spte, gfn_t gfn) 1628 { 1629 struct kvm_mmu_page *sp; 1630 struct kvm_rmap_head *rmap_head; 1631 int rmap_count; 1632 1633 sp = sptep_to_sp(spte); 1634 kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); 1635 rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); 1636 rmap_count = pte_list_add(vcpu, spte, rmap_head); 1637 1638 if (rmap_count > RMAP_RECYCLE_THRESHOLD) { 1639 kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0)); 1640 kvm_flush_remote_tlbs_with_address( 1641 vcpu->kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); 1642 } 1643 } 1644 1645 bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) 1646 { 1647 bool young = false; 1648 1649 if (kvm_memslots_have_rmaps(kvm)) 1650 young = kvm_handle_gfn_range(kvm, range, kvm_age_rmapp); 1651 1652 if (is_tdp_mmu_enabled(kvm)) 1653 young |= kvm_tdp_mmu_age_gfn_range(kvm, range); 1654 1655 return young; 1656 } 1657 1658 bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) 1659 { 1660 bool young = false; 1661 1662 if (kvm_memslots_have_rmaps(kvm)) 1663 young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmapp); 1664 1665 if (is_tdp_mmu_enabled(kvm)) 1666 young |= kvm_tdp_mmu_test_age_gfn(kvm, range); 1667 1668 return young; 1669 } 1670 1671 #ifdef MMU_DEBUG 1672 static int is_empty_shadow_page(u64 *spt) 1673 { 1674 u64 *pos; 1675 u64 *end; 1676 1677 for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++) 1678 if (is_shadow_present_pte(*pos)) { 1679 printk(KERN_ERR "%s: %p %llx\n", __func__, 1680 pos, *pos); 1681 return 0; 1682 } 1683 return 1; 1684 } 1685 #endif 1686 1687 /* 1688 * This value is the sum of all of the kvm instances's 1689 * kvm->arch.n_used_mmu_pages values. 
We need a global, 1690 * aggregate version in order to make the slab shrinker 1691 * faster 1692 */ 1693 static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr) 1694 { 1695 kvm->arch.n_used_mmu_pages += nr; 1696 percpu_counter_add(&kvm_total_used_mmu_pages, nr); 1697 } 1698 1699 static void kvm_mmu_free_page(struct kvm_mmu_page *sp) 1700 { 1701 MMU_WARN_ON(!is_empty_shadow_page(sp->spt)); 1702 hlist_del(&sp->hash_link); 1703 list_del(&sp->link); 1704 free_page((unsigned long)sp->spt); 1705 if (!sp->role.direct) 1706 free_page((unsigned long)sp->gfns); 1707 kmem_cache_free(mmu_page_header_cache, sp); 1708 } 1709 1710 static unsigned kvm_page_table_hashfn(gfn_t gfn) 1711 { 1712 return hash_64(gfn, KVM_MMU_HASH_SHIFT); 1713 } 1714 1715 static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, 1716 struct kvm_mmu_page *sp, u64 *parent_pte) 1717 { 1718 if (!parent_pte) 1719 return; 1720 1721 pte_list_add(vcpu, parent_pte, &sp->parent_ptes); 1722 } 1723 1724 static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, 1725 u64 *parent_pte) 1726 { 1727 __pte_list_remove(parent_pte, &sp->parent_ptes); 1728 } 1729 1730 static void drop_parent_pte(struct kvm_mmu_page *sp, 1731 u64 *parent_pte) 1732 { 1733 mmu_page_remove_parent_pte(sp, parent_pte); 1734 mmu_spte_clear_no_track(parent_pte); 1735 } 1736 1737 static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct) 1738 { 1739 struct kvm_mmu_page *sp; 1740 1741 sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); 1742 sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache); 1743 if (!direct) 1744 sp->gfns = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache); 1745 set_page_private(virt_to_page(sp->spt), (unsigned long)sp); 1746 1747 /* 1748 * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages() 1749 * depends on valid pages being added to the head of the list. See 1750 * comments in kvm_zap_obsolete_pages(). 
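	 * (list_add() below inserts the new page at the head of
	 * active_mmu_pages, preserving that ordering.)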
1751 */ 1752 sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; 1753 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); 1754 kvm_mod_used_mmu_pages(vcpu->kvm, +1); 1755 return sp; 1756 } 1757 1758 static void mark_unsync(u64 *spte); 1759 static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) 1760 { 1761 u64 *sptep; 1762 struct rmap_iterator iter; 1763 1764 for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) { 1765 mark_unsync(sptep); 1766 } 1767 } 1768 1769 static void mark_unsync(u64 *spte) 1770 { 1771 struct kvm_mmu_page *sp; 1772 unsigned int index; 1773 1774 sp = sptep_to_sp(spte); 1775 index = spte - sp->spt; 1776 if (__test_and_set_bit(index, sp->unsync_child_bitmap)) 1777 return; 1778 if (sp->unsync_children++) 1779 return; 1780 kvm_mmu_mark_parents_unsync(sp); 1781 } 1782 1783 static int nonpaging_sync_page(struct kvm_vcpu *vcpu, 1784 struct kvm_mmu_page *sp) 1785 { 1786 return -1; 1787 } 1788 1789 #define KVM_PAGE_ARRAY_NR 16 1790 1791 struct kvm_mmu_pages { 1792 struct mmu_page_and_offset { 1793 struct kvm_mmu_page *sp; 1794 unsigned int idx; 1795 } page[KVM_PAGE_ARRAY_NR]; 1796 unsigned int nr; 1797 }; 1798 1799 static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, 1800 int idx) 1801 { 1802 int i; 1803 1804 if (sp->unsync) 1805 for (i=0; i < pvec->nr; i++) 1806 if (pvec->page[i].sp == sp) 1807 return 0; 1808 1809 pvec->page[pvec->nr].sp = sp; 1810 pvec->page[pvec->nr].idx = idx; 1811 pvec->nr++; 1812 return (pvec->nr == KVM_PAGE_ARRAY_NR); 1813 } 1814 1815 static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx) 1816 { 1817 --sp->unsync_children; 1818 WARN_ON((int)sp->unsync_children < 0); 1819 __clear_bit(idx, sp->unsync_child_bitmap); 1820 } 1821 1822 static int __mmu_unsync_walk(struct kvm_mmu_page *sp, 1823 struct kvm_mmu_pages *pvec) 1824 { 1825 int i, ret, nr_unsync_leaf = 0; 1826 1827 for_each_set_bit(i, sp->unsync_child_bitmap, 512) { 1828 struct kvm_mmu_page *child; 1829 u64 ent = sp->spt[i]; 1830 1831 if (!is_shadow_present_pte(ent) || is_large_pte(ent)) { 1832 clear_unsync_child_bit(sp, i); 1833 continue; 1834 } 1835 1836 child = to_shadow_page(ent & PT64_BASE_ADDR_MASK); 1837 1838 if (child->unsync_children) { 1839 if (mmu_pages_add(pvec, child, i)) 1840 return -ENOSPC; 1841 1842 ret = __mmu_unsync_walk(child, pvec); 1843 if (!ret) { 1844 clear_unsync_child_bit(sp, i); 1845 continue; 1846 } else if (ret > 0) { 1847 nr_unsync_leaf += ret; 1848 } else 1849 return ret; 1850 } else if (child->unsync) { 1851 nr_unsync_leaf++; 1852 if (mmu_pages_add(pvec, child, i)) 1853 return -ENOSPC; 1854 } else 1855 clear_unsync_child_bit(sp, i); 1856 } 1857 1858 return nr_unsync_leaf; 1859 } 1860 1861 #define INVALID_INDEX (-1) 1862 1863 static int mmu_unsync_walk(struct kvm_mmu_page *sp, 1864 struct kvm_mmu_pages *pvec) 1865 { 1866 pvec->nr = 0; 1867 if (!sp->unsync_children) 1868 return 0; 1869 1870 mmu_pages_add(pvec, sp, INVALID_INDEX); 1871 return __mmu_unsync_walk(sp, pvec); 1872 } 1873 1874 static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1875 { 1876 WARN_ON(!sp->unsync); 1877 trace_kvm_mmu_sync_page(sp); 1878 sp->unsync = 0; 1879 --kvm->stat.mmu_unsync; 1880 } 1881 1882 static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, 1883 struct list_head *invalid_list); 1884 static void kvm_mmu_commit_zap_page(struct kvm *kvm, 1885 struct list_head *invalid_list); 1886 1887 #define for_each_valid_sp(_kvm, _sp, _list) \ 1888 hlist_for_each_entry(_sp, _list, hash_link) \ 1889 if 
	(is_obsolete_sp((_kvm), (_sp))) {		\
	} else

#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn)			\
	for_each_valid_sp(_kvm, _sp,					\
	  &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)])	\
		if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else

static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
			 struct list_head *invalid_list)
{
	int ret = vcpu->arch.mmu->sync_page(vcpu, sp);

	if (ret < 0) {
		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
		return false;
	}

	return !!ret;
}

static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
					struct list_head *invalid_list,
					bool remote_flush)
{
	if (!remote_flush && list_empty(invalid_list))
		return false;

	if (!list_empty(invalid_list))
		kvm_mmu_commit_zap_page(kvm, invalid_list);
	else
		kvm_flush_remote_tlbs(kvm);
	return true;
}

#ifdef CONFIG_KVM_MMU_AUDIT
#include "mmu_audit.c"
#else
static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
static void mmu_audit_disable(void) { }
#endif

static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	if (sp->role.invalid)
		return true;

	/* TDP MMU pages do not use the MMU generation. */
	return !sp->tdp_mmu_page &&
	       unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
}

struct mmu_page_path {
	struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
	unsigned int idx[PT64_ROOT_MAX_LEVEL];
};

#define for_each_sp(pvec, sp, parents, i)			\
		for (i = mmu_pages_first(&pvec, &parents);	\
			i < pvec.nr && ({ sp = pvec.page[i].sp; 1;});	\
			i = mmu_pages_next(&pvec, &parents, i))

static int mmu_pages_next(struct kvm_mmu_pages *pvec,
			  struct mmu_page_path *parents,
			  int i)
{
	int n;

	for (n = i+1; n < pvec->nr; n++) {
		struct kvm_mmu_page *sp = pvec->page[n].sp;
		unsigned idx = pvec->page[n].idx;
		int level = sp->role.level;

		parents->idx[level-1] = idx;
		if (level == PG_LEVEL_4K)
			break;

		parents->parent[level-2] = sp;
	}

	return n;
}

static int mmu_pages_first(struct kvm_mmu_pages *pvec,
			   struct mmu_page_path *parents)
{
	struct kvm_mmu_page *sp;
	int level;

	if (pvec->nr == 0)
		return 0;

	WARN_ON(pvec->page[0].idx != INVALID_INDEX);

	sp = pvec->page[0].sp;
	level = sp->role.level;
	WARN_ON(level == PG_LEVEL_4K);

	parents->parent[level-2] = sp;

	/* Also set up a sentinel.  Further entries in pvec are all
	 * children of sp, so this element is never overwritten.
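	 * mmu_pages_clear_parents() relies on hitting this NULL entry to stop
	 * walking up the parent chain.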
1991 */ 1992 parents->parent[level-1] = NULL; 1993 return mmu_pages_next(pvec, parents, 0); 1994 } 1995 1996 static void mmu_pages_clear_parents(struct mmu_page_path *parents) 1997 { 1998 struct kvm_mmu_page *sp; 1999 unsigned int level = 0; 2000 2001 do { 2002 unsigned int idx = parents->idx[level]; 2003 sp = parents->parent[level]; 2004 if (!sp) 2005 return; 2006 2007 WARN_ON(idx == INVALID_INDEX); 2008 clear_unsync_child_bit(sp, idx); 2009 level++; 2010 } while (!sp->unsync_children); 2011 } 2012 2013 static int mmu_sync_children(struct kvm_vcpu *vcpu, 2014 struct kvm_mmu_page *parent, bool can_yield) 2015 { 2016 int i; 2017 struct kvm_mmu_page *sp; 2018 struct mmu_page_path parents; 2019 struct kvm_mmu_pages pages; 2020 LIST_HEAD(invalid_list); 2021 bool flush = false; 2022 2023 while (mmu_unsync_walk(parent, &pages)) { 2024 bool protected = false; 2025 2026 for_each_sp(pages, sp, parents, i) 2027 protected |= rmap_write_protect(vcpu, sp->gfn); 2028 2029 if (protected) { 2030 kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, true); 2031 flush = false; 2032 } 2033 2034 for_each_sp(pages, sp, parents, i) { 2035 kvm_unlink_unsync_page(vcpu->kvm, sp); 2036 flush |= kvm_sync_page(vcpu, sp, &invalid_list); 2037 mmu_pages_clear_parents(&parents); 2038 } 2039 if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) { 2040 kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush); 2041 if (!can_yield) { 2042 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); 2043 return -EINTR; 2044 } 2045 2046 cond_resched_rwlock_write(&vcpu->kvm->mmu_lock); 2047 flush = false; 2048 } 2049 } 2050 2051 kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush); 2052 return 0; 2053 } 2054 2055 static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp) 2056 { 2057 atomic_set(&sp->write_flooding_count, 0); 2058 } 2059 2060 static void clear_sp_write_flooding_count(u64 *spte) 2061 { 2062 __clear_sp_write_flooding_count(sptep_to_sp(spte)); 2063 } 2064 2065 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, 2066 gfn_t gfn, 2067 gva_t gaddr, 2068 unsigned level, 2069 int direct, 2070 unsigned int access) 2071 { 2072 bool direct_mmu = vcpu->arch.mmu->direct_map; 2073 union kvm_mmu_page_role role; 2074 struct hlist_head *sp_list; 2075 unsigned quadrant; 2076 struct kvm_mmu_page *sp; 2077 int collisions = 0; 2078 LIST_HEAD(invalid_list); 2079 2080 role = vcpu->arch.mmu->mmu_role.base; 2081 role.level = level; 2082 role.direct = direct; 2083 role.access = access; 2084 if (role.has_4_byte_gpte) { 2085 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); 2086 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; 2087 role.quadrant = quadrant; 2088 } 2089 2090 sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]; 2091 for_each_valid_sp(vcpu->kvm, sp, sp_list) { 2092 if (sp->gfn != gfn) { 2093 collisions++; 2094 continue; 2095 } 2096 2097 if (sp->role.word != role.word) { 2098 /* 2099 * If the guest is creating an upper-level page, zap 2100 * unsync pages for the same gfn. While it's possible 2101 * the guest is using recursive page tables, in all 2102 * likelihood the guest has stopped using the unsync 2103 * page and is installing a completely unrelated page. 2104 * Unsync pages must not be left as is, because the new 2105 * upper-level page will be write-protected. 
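 * The zap below is only queued on the local invalid_list; it is
 * committed via kvm_mmu_commit_zap_page() at the "out" label before
 * this function returns.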
2106 */ 2107 if (level > PG_LEVEL_4K && sp->unsync) 2108 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, 2109 &invalid_list); 2110 continue; 2111 } 2112 2113 if (direct_mmu) 2114 goto trace_get_page; 2115 2116 if (sp->unsync) { 2117 /* 2118 * The page is good, but is stale. kvm_sync_page does 2119 * get the latest guest state, but (unlike mmu_unsync_children) 2120 * it doesn't write-protect the page or mark it synchronized! 2121 * This way the validity of the mapping is ensured, but the 2122 * overhead of write protection is not incurred until the 2123 * guest invalidates the TLB mapping. This allows multiple 2124 * SPs for a single gfn to be unsync. 2125 * 2126 * If the sync fails, the page is zapped. If so, break 2127 * in order to rebuild it. 2128 */ 2129 if (!kvm_sync_page(vcpu, sp, &invalid_list)) 2130 break; 2131 2132 WARN_ON(!list_empty(&invalid_list)); 2133 kvm_flush_remote_tlbs(vcpu->kvm); 2134 } 2135 2136 __clear_sp_write_flooding_count(sp); 2137 2138 trace_get_page: 2139 trace_kvm_mmu_get_page(sp, false); 2140 goto out; 2141 } 2142 2143 ++vcpu->kvm->stat.mmu_cache_miss; 2144 2145 sp = kvm_mmu_alloc_page(vcpu, direct); 2146 2147 sp->gfn = gfn; 2148 sp->role = role; 2149 hlist_add_head(&sp->hash_link, sp_list); 2150 if (!direct) { 2151 account_shadowed(vcpu->kvm, sp); 2152 if (level == PG_LEVEL_4K && rmap_write_protect(vcpu, gfn)) 2153 kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1); 2154 } 2155 trace_kvm_mmu_get_page(sp, true); 2156 out: 2157 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 2158 2159 if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions) 2160 vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions; 2161 return sp; 2162 } 2163 2164 static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator, 2165 struct kvm_vcpu *vcpu, hpa_t root, 2166 u64 addr) 2167 { 2168 iterator->addr = addr; 2169 iterator->shadow_addr = root; 2170 iterator->level = vcpu->arch.mmu->shadow_root_level; 2171 2172 if (iterator->level >= PT64_ROOT_4LEVEL && 2173 vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL && 2174 !vcpu->arch.mmu->direct_map) 2175 iterator->level = PT32E_ROOT_LEVEL; 2176 2177 if (iterator->level == PT32E_ROOT_LEVEL) { 2178 /* 2179 * prev_root is currently only used for 64-bit hosts. So only 2180 * the active root_hpa is valid here. 
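 * With a PAE root the walk starts one level down, at the PDPTE
 * selected by bits 31:30 of the address; a zero entry terminates the
 * walk immediately (level is forced to 0).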
2181 */ 2182 BUG_ON(root != vcpu->arch.mmu->root_hpa); 2183 2184 iterator->shadow_addr 2185 = vcpu->arch.mmu->pae_root[(addr >> 30) & 3]; 2186 iterator->shadow_addr &= PT64_BASE_ADDR_MASK; 2187 --iterator->level; 2188 if (!iterator->shadow_addr) 2189 iterator->level = 0; 2190 } 2191 } 2192 2193 static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, 2194 struct kvm_vcpu *vcpu, u64 addr) 2195 { 2196 shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root_hpa, 2197 addr); 2198 } 2199 2200 static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) 2201 { 2202 if (iterator->level < PG_LEVEL_4K) 2203 return false; 2204 2205 iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level); 2206 iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index; 2207 return true; 2208 } 2209 2210 static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator, 2211 u64 spte) 2212 { 2213 if (!is_shadow_present_pte(spte) || is_last_spte(spte, iterator->level)) { 2214 iterator->level = 0; 2215 return; 2216 } 2217 2218 iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK; 2219 --iterator->level; 2220 } 2221 2222 static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) 2223 { 2224 __shadow_walk_next(iterator, *iterator->sptep); 2225 } 2226 2227 static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, 2228 struct kvm_mmu_page *sp) 2229 { 2230 u64 spte; 2231 2232 BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); 2233 2234 spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp)); 2235 2236 mmu_spte_set(sptep, spte); 2237 2238 mmu_page_add_parent_pte(vcpu, sp, sptep); 2239 2240 if (sp->unsync_children || sp->unsync) 2241 mark_unsync(sptep); 2242 } 2243 2244 static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, 2245 unsigned direct_access) 2246 { 2247 if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) { 2248 struct kvm_mmu_page *child; 2249 2250 /* 2251 * For the direct sp, if the guest pte's dirty bit 2252 * changed form clean to dirty, it will corrupt the 2253 * sp's access: allow writable in the read-only sp, 2254 * so we should update the spte at this point to get 2255 * a new sp with the correct access. 2256 */ 2257 child = to_shadow_page(*sptep & PT64_BASE_ADDR_MASK); 2258 if (child->role.access == direct_access) 2259 return; 2260 2261 drop_parent_pte(child, sptep); 2262 kvm_flush_remote_tlbs_with_address(vcpu->kvm, child->gfn, 1); 2263 } 2264 } 2265 2266 /* Returns the number of zapped non-leaf child shadow pages. */ 2267 static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, 2268 u64 *spte, struct list_head *invalid_list) 2269 { 2270 u64 pte; 2271 struct kvm_mmu_page *child; 2272 2273 pte = *spte; 2274 if (is_shadow_present_pte(pte)) { 2275 if (is_last_spte(pte, sp->role.level)) { 2276 drop_spte(kvm, spte); 2277 } else { 2278 child = to_shadow_page(pte & PT64_BASE_ADDR_MASK); 2279 drop_parent_pte(child, spte); 2280 2281 /* 2282 * Recursively zap nested TDP SPs, parentless SPs are 2283 * unlikely to be used again in the near future. This 2284 * avoids retaining a large number of stale nested SPs. 
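 * Only guest_mode (nested TDP) pages whose last parent PTE was just
 * dropped qualify; everything else is left to the regular zap paths.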
2285 */ 2286 if (tdp_enabled && invalid_list && 2287 child->role.guest_mode && !child->parent_ptes.val) 2288 return kvm_mmu_prepare_zap_page(kvm, child, 2289 invalid_list); 2290 } 2291 } else if (is_mmio_spte(pte)) { 2292 mmu_spte_clear_no_track(spte); 2293 } 2294 return 0; 2295 } 2296 2297 static int kvm_mmu_page_unlink_children(struct kvm *kvm, 2298 struct kvm_mmu_page *sp, 2299 struct list_head *invalid_list) 2300 { 2301 int zapped = 0; 2302 unsigned i; 2303 2304 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) 2305 zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list); 2306 2307 return zapped; 2308 } 2309 2310 static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) 2311 { 2312 u64 *sptep; 2313 struct rmap_iterator iter; 2314 2315 while ((sptep = rmap_get_first(&sp->parent_ptes, &iter))) 2316 drop_parent_pte(sp, sptep); 2317 } 2318 2319 static int mmu_zap_unsync_children(struct kvm *kvm, 2320 struct kvm_mmu_page *parent, 2321 struct list_head *invalid_list) 2322 { 2323 int i, zapped = 0; 2324 struct mmu_page_path parents; 2325 struct kvm_mmu_pages pages; 2326 2327 if (parent->role.level == PG_LEVEL_4K) 2328 return 0; 2329 2330 while (mmu_unsync_walk(parent, &pages)) { 2331 struct kvm_mmu_page *sp; 2332 2333 for_each_sp(pages, sp, parents, i) { 2334 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); 2335 mmu_pages_clear_parents(&parents); 2336 zapped++; 2337 } 2338 } 2339 2340 return zapped; 2341 } 2342 2343 static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, 2344 struct kvm_mmu_page *sp, 2345 struct list_head *invalid_list, 2346 int *nr_zapped) 2347 { 2348 bool list_unstable; 2349 2350 trace_kvm_mmu_prepare_zap_page(sp); 2351 ++kvm->stat.mmu_shadow_zapped; 2352 *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list); 2353 *nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list); 2354 kvm_mmu_unlink_parents(kvm, sp); 2355 2356 /* Zapping children means active_mmu_pages has become unstable. */ 2357 list_unstable = *nr_zapped; 2358 2359 if (!sp->role.invalid && !sp->role.direct) 2360 unaccount_shadowed(kvm, sp); 2361 2362 if (sp->unsync) 2363 kvm_unlink_unsync_page(kvm, sp); 2364 if (!sp->root_count) { 2365 /* Count self */ 2366 (*nr_zapped)++; 2367 2368 /* 2369 * Already invalid pages (previously active roots) are not on 2370 * the active page list. See list_del() in the "else" case of 2371 * !sp->root_count. 2372 */ 2373 if (sp->role.invalid) 2374 list_add(&sp->link, invalid_list); 2375 else 2376 list_move(&sp->link, invalid_list); 2377 kvm_mod_used_mmu_pages(kvm, -1); 2378 } else { 2379 /* 2380 * Remove the active root from the active page list, the root 2381 * will be explicitly freed when the root_count hits zero. 2382 */ 2383 list_del(&sp->link); 2384 2385 /* 2386 * Obsolete pages cannot be used on any vCPUs, see the comment 2387 * in kvm_mmu_zap_all_fast(). Note, is_obsolete_sp() also 2388 * treats invalid shadow pages as being obsolete. 
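 * Hence the remote reload below is only requested for roots that are
 * not already obsolete or invalid.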
2389 */ 2390 if (!is_obsolete_sp(kvm, sp)) 2391 kvm_reload_remote_mmus(kvm); 2392 } 2393 2394 if (sp->lpage_disallowed) 2395 unaccount_huge_nx_page(kvm, sp); 2396 2397 sp->role.invalid = 1; 2398 return list_unstable; 2399 } 2400 2401 static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, 2402 struct list_head *invalid_list) 2403 { 2404 int nr_zapped; 2405 2406 __kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped); 2407 return nr_zapped; 2408 } 2409 2410 static void kvm_mmu_commit_zap_page(struct kvm *kvm, 2411 struct list_head *invalid_list) 2412 { 2413 struct kvm_mmu_page *sp, *nsp; 2414 2415 if (list_empty(invalid_list)) 2416 return; 2417 2418 /* 2419 * We need to make sure everyone sees our modifications to 2420 * the page tables and see changes to vcpu->mode here. The barrier 2421 * in the kvm_flush_remote_tlbs() achieves this. This pairs 2422 * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end. 2423 * 2424 * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit 2425 * guest mode and/or lockless shadow page table walks. 2426 */ 2427 kvm_flush_remote_tlbs(kvm); 2428 2429 list_for_each_entry_safe(sp, nsp, invalid_list, link) { 2430 WARN_ON(!sp->role.invalid || sp->root_count); 2431 kvm_mmu_free_page(sp); 2432 } 2433 } 2434 2435 static unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm, 2436 unsigned long nr_to_zap) 2437 { 2438 unsigned long total_zapped = 0; 2439 struct kvm_mmu_page *sp, *tmp; 2440 LIST_HEAD(invalid_list); 2441 bool unstable; 2442 int nr_zapped; 2443 2444 if (list_empty(&kvm->arch.active_mmu_pages)) 2445 return 0; 2446 2447 restart: 2448 list_for_each_entry_safe_reverse(sp, tmp, &kvm->arch.active_mmu_pages, link) { 2449 /* 2450 * Don't zap active root pages, the page itself can't be freed 2451 * and zapping it will just force vCPUs to realloc and reload. 2452 */ 2453 if (sp->root_count) 2454 continue; 2455 2456 unstable = __kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, 2457 &nr_zapped); 2458 total_zapped += nr_zapped; 2459 if (total_zapped >= nr_to_zap) 2460 break; 2461 2462 if (unstable) 2463 goto restart; 2464 } 2465 2466 kvm_mmu_commit_zap_page(kvm, &invalid_list); 2467 2468 kvm->stat.mmu_recycled += total_zapped; 2469 return total_zapped; 2470 } 2471 2472 static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm) 2473 { 2474 if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages) 2475 return kvm->arch.n_max_mmu_pages - 2476 kvm->arch.n_used_mmu_pages; 2477 2478 return 0; 2479 } 2480 2481 static int make_mmu_pages_available(struct kvm_vcpu *vcpu) 2482 { 2483 unsigned long avail = kvm_mmu_available_pages(vcpu->kvm); 2484 2485 if (likely(avail >= KVM_MIN_FREE_MMU_PAGES)) 2486 return 0; 2487 2488 kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail); 2489 2490 /* 2491 * Note, this check is intentionally soft, it only guarantees that one 2492 * page is available, while the caller may end up allocating as many as 2493 * four pages, e.g. for PAE roots or for 5-level paging. Temporarily 2494 * exceeding the (arbitrary by default) limit will not harm the host, 2495 * being too aggressive may unnecessarily kill the guest, and getting an 2496 * exact count is far more trouble than it's worth, especially in the 2497 * page fault paths. 
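 * Only if zapping freed nothing at all does this fail; the -ENOSPC is
 * propagated to the caller rather than proceeding with zero free pages.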
2498 */ 2499 if (!kvm_mmu_available_pages(vcpu->kvm)) 2500 return -ENOSPC; 2501 return 0; 2502 } 2503 2504 /* 2505 * Changing the number of mmu pages allocated to the vm 2506 * Note: if goal_nr_mmu_pages is too small, you will get dead lock 2507 */ 2508 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages) 2509 { 2510 write_lock(&kvm->mmu_lock); 2511 2512 if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) { 2513 kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages - 2514 goal_nr_mmu_pages); 2515 2516 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages; 2517 } 2518 2519 kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages; 2520 2521 write_unlock(&kvm->mmu_lock); 2522 } 2523 2524 int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) 2525 { 2526 struct kvm_mmu_page *sp; 2527 LIST_HEAD(invalid_list); 2528 int r; 2529 2530 pgprintk("%s: looking for gfn %llx\n", __func__, gfn); 2531 r = 0; 2532 write_lock(&kvm->mmu_lock); 2533 for_each_gfn_indirect_valid_sp(kvm, sp, gfn) { 2534 pgprintk("%s: gfn %llx role %x\n", __func__, gfn, 2535 sp->role.word); 2536 r = 1; 2537 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); 2538 } 2539 kvm_mmu_commit_zap_page(kvm, &invalid_list); 2540 write_unlock(&kvm->mmu_lock); 2541 2542 return r; 2543 } 2544 2545 static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) 2546 { 2547 gpa_t gpa; 2548 int r; 2549 2550 if (vcpu->arch.mmu->direct_map) 2551 return 0; 2552 2553 gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); 2554 2555 r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); 2556 2557 return r; 2558 } 2559 2560 static void kvm_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) 2561 { 2562 trace_kvm_mmu_unsync_page(sp); 2563 ++kvm->stat.mmu_unsync; 2564 sp->unsync = 1; 2565 2566 kvm_mmu_mark_parents_unsync(sp); 2567 } 2568 2569 /* 2570 * Attempt to unsync any shadow pages that can be reached by the specified gfn, 2571 * KVM is creating a writable mapping for said gfn. Returns 0 if all pages 2572 * were marked unsync (or if there is no shadow page), -EPERM if the SPTE must 2573 * be write-protected. 2574 */ 2575 int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, 2576 gfn_t gfn, bool can_unsync, bool prefetch) 2577 { 2578 struct kvm_mmu_page *sp; 2579 bool locked = false; 2580 2581 /* 2582 * Force write-protection if the page is being tracked. Note, the page 2583 * track machinery is used to write-protect upper-level shadow pages, 2584 * i.e. this guards the role.level == 4K assertion below! 2585 */ 2586 if (kvm_slot_page_track_is_active(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE)) 2587 return -EPERM; 2588 2589 /* 2590 * The page is not write-tracked, mark existing shadow pages unsync 2591 * unless KVM is synchronizing an unsync SP (can_unsync = false). In 2592 * that case, KVM must complete emulation of the guest TLB flush before 2593 * allowing shadow pages to become unsync (writable by the guest). 2594 */ 2595 for_each_gfn_indirect_valid_sp(kvm, sp, gfn) { 2596 if (!can_unsync) 2597 return -EPERM; 2598 2599 if (sp->unsync) 2600 continue; 2601 2602 if (prefetch) 2603 return -EEXIST; 2604 2605 /* 2606 * TDP MMU page faults require an additional spinlock as they 2607 * run with mmu_lock held for read, not write, and the unsync 2608 * logic is not thread safe. Take the spinklock regardless of 2609 * the MMU type to avoid extra conditionals/parameters, there's 2610 * no meaningful penalty if mmu_lock is held for write. 
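 * The spinlock is taken at most once per call and is dropped after the
 * loop, see the spin_unlock() below.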
2611 */ 2612 if (!locked) { 2613 locked = true; 2614 spin_lock(&kvm->arch.mmu_unsync_pages_lock); 2615 2616 /* 2617 * Recheck after taking the spinlock, a different vCPU 2618 * may have since marked the page unsync. A false 2619 * positive on the unprotected check above is not 2620 * possible as clearing sp->unsync _must_ hold mmu_lock 2621 * for write, i.e. unsync cannot transition from 0->1 2622 * while this CPU holds mmu_lock for read (or write). 2623 */ 2624 if (READ_ONCE(sp->unsync)) 2625 continue; 2626 } 2627 2628 WARN_ON(sp->role.level != PG_LEVEL_4K); 2629 kvm_unsync_page(kvm, sp); 2630 } 2631 if (locked) 2632 spin_unlock(&kvm->arch.mmu_unsync_pages_lock); 2633 2634 /* 2635 * We need to ensure that the marking of unsync pages is visible 2636 * before the SPTE is updated to allow writes because 2637 * kvm_mmu_sync_roots() checks the unsync flags without holding 2638 * the MMU lock and so can race with this. If the SPTE was updated 2639 * before the page had been marked as unsync-ed, something like the 2640 * following could happen: 2641 * 2642 * CPU 1 CPU 2 2643 * --------------------------------------------------------------------- 2644 * 1.2 Host updates SPTE 2645 * to be writable 2646 * 2.1 Guest writes a GPTE for GVA X. 2647 * (GPTE being in the guest page table shadowed 2648 * by the SP from CPU 1.) 2649 * This reads SPTE during the page table walk. 2650 * Since SPTE.W is read as 1, there is no 2651 * fault. 2652 * 2653 * 2.2 Guest issues TLB flush. 2654 * That causes a VM Exit. 2655 * 2656 * 2.3 Walking of unsync pages sees sp->unsync is 2657 * false and skips the page. 2658 * 2659 * 2.4 Guest accesses GVA X. 2660 * Since the mapping in the SP was not updated, 2661 * so the old mapping for GVA X incorrectly 2662 * gets used. 2663 * 1.1 Host marks SP 2664 * as unsync 2665 * (sp->unsync = true) 2666 * 2667 * The write barrier below ensures that 1.1 happens before 1.2 and thus 2668 * the situation in 2.4 does not arise. It pairs with the read barrier 2669 * in is_unsync_root(), placed between 2.1's load of SPTE.W and 2.3. 2670 */ 2671 smp_wmb(); 2672 2673 return 0; 2674 } 2675 2676 static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, 2677 u64 *sptep, unsigned int pte_access, gfn_t gfn, 2678 kvm_pfn_t pfn, struct kvm_page_fault *fault) 2679 { 2680 struct kvm_mmu_page *sp = sptep_to_sp(sptep); 2681 int level = sp->role.level; 2682 int was_rmapped = 0; 2683 int ret = RET_PF_FIXED; 2684 bool flush = false; 2685 bool wrprot; 2686 u64 spte; 2687 2688 /* Prefetching always gets a writable pfn. */ 2689 bool host_writable = !fault || fault->map_writable; 2690 bool prefetch = !fault || fault->prefetch; 2691 bool write_fault = fault && fault->write; 2692 2693 pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, 2694 *sptep, write_fault, gfn); 2695 2696 if (unlikely(is_noslot_pfn(pfn))) { 2697 mark_mmio_spte(vcpu, sptep, gfn, pte_access); 2698 return RET_PF_EMULATE; 2699 } 2700 2701 if (is_shadow_present_pte(*sptep)) { 2702 /* 2703 * If we overwrite a PTE page pointer with a 2MB PMD, unlink 2704 * the parent of the now unreachable PTE. 
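 * Only the link from this parent is severed here; the orphaned child
 * shadow page is reclaimed later by the normal zap paths, and the TLB
 * flush below covers the range that just became unreachable.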
2705 */ 2706 if (level > PG_LEVEL_4K && !is_large_pte(*sptep)) { 2707 struct kvm_mmu_page *child; 2708 u64 pte = *sptep; 2709 2710 child = to_shadow_page(pte & PT64_BASE_ADDR_MASK); 2711 drop_parent_pte(child, sptep); 2712 flush = true; 2713 } else if (pfn != spte_to_pfn(*sptep)) { 2714 pgprintk("hfn old %llx new %llx\n", 2715 spte_to_pfn(*sptep), pfn); 2716 drop_spte(vcpu->kvm, sptep); 2717 flush = true; 2718 } else 2719 was_rmapped = 1; 2720 } 2721 2722 wrprot = make_spte(vcpu, sp, slot, pte_access, gfn, pfn, *sptep, prefetch, 2723 true, host_writable, &spte); 2724 2725 if (*sptep == spte) { 2726 ret = RET_PF_SPURIOUS; 2727 } else { 2728 trace_kvm_mmu_set_spte(level, gfn, sptep); 2729 flush |= mmu_spte_update(sptep, spte); 2730 } 2731 2732 if (wrprot) { 2733 if (write_fault) 2734 ret = RET_PF_EMULATE; 2735 } 2736 2737 if (flush) 2738 kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 2739 KVM_PAGES_PER_HPAGE(level)); 2740 2741 pgprintk("%s: setting spte %llx\n", __func__, *sptep); 2742 2743 if (!was_rmapped) { 2744 WARN_ON_ONCE(ret == RET_PF_SPURIOUS); 2745 kvm_update_page_stats(vcpu->kvm, level, 1); 2746 rmap_add(vcpu, slot, sptep, gfn); 2747 } 2748 2749 return ret; 2750 } 2751 2752 static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, 2753 struct kvm_mmu_page *sp, 2754 u64 *start, u64 *end) 2755 { 2756 struct page *pages[PTE_PREFETCH_NUM]; 2757 struct kvm_memory_slot *slot; 2758 unsigned int access = sp->role.access; 2759 int i, ret; 2760 gfn_t gfn; 2761 2762 gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt); 2763 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK); 2764 if (!slot) 2765 return -1; 2766 2767 ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start); 2768 if (ret <= 0) 2769 return -1; 2770 2771 for (i = 0; i < ret; i++, gfn++, start++) { 2772 mmu_set_spte(vcpu, slot, start, access, gfn, 2773 page_to_pfn(pages[i]), NULL); 2774 put_page(pages[i]); 2775 } 2776 2777 return 0; 2778 } 2779 2780 static void __direct_pte_prefetch(struct kvm_vcpu *vcpu, 2781 struct kvm_mmu_page *sp, u64 *sptep) 2782 { 2783 u64 *spte, *start = NULL; 2784 int i; 2785 2786 WARN_ON(!sp->role.direct); 2787 2788 i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1); 2789 spte = sp->spt + i; 2790 2791 for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { 2792 if (is_shadow_present_pte(*spte) || spte == sptep) { 2793 if (!start) 2794 continue; 2795 if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0) 2796 return; 2797 start = NULL; 2798 } else if (!start) 2799 start = spte; 2800 } 2801 if (start) 2802 direct_pte_prefetch_many(vcpu, sp, start, spte); 2803 } 2804 2805 static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) 2806 { 2807 struct kvm_mmu_page *sp; 2808 2809 sp = sptep_to_sp(sptep); 2810 2811 /* 2812 * Without accessed bits, there's no way to distinguish between 2813 * actually accessed translations and prefetched, so disable pte 2814 * prefetch if accessed bits aren't available. 2815 */ 2816 if (sp_ad_disabled(sp)) 2817 return; 2818 2819 if (sp->role.level > PG_LEVEL_4K) 2820 return; 2821 2822 /* 2823 * If addresses are being invalidated, skip prefetching to avoid 2824 * accidentally prefetching those addresses. 
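 * mmu_notifier_count is elevated while an invalidation is in progress;
 * skipping the prefetch in that window is harmless, the pages are
 * simply faulted in on demand later.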
2825 */ 2826 if (unlikely(vcpu->kvm->mmu_notifier_count)) 2827 return; 2828 2829 __direct_pte_prefetch(vcpu, sp, sptep); 2830 } 2831 2832 static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, 2833 const struct kvm_memory_slot *slot) 2834 { 2835 unsigned long hva; 2836 pte_t *pte; 2837 int level; 2838 2839 if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn)) 2840 return PG_LEVEL_4K; 2841 2842 /* 2843 * Note, using the already-retrieved memslot and __gfn_to_hva_memslot() 2844 * is not solely for performance, it's also necessary to avoid the 2845 * "writable" check in __gfn_to_hva_many(), which will always fail on 2846 * read-only memslots due to gfn_to_hva() assuming writes. Earlier 2847 * page fault steps have already verified the guest isn't writing a 2848 * read-only memslot. 2849 */ 2850 hva = __gfn_to_hva_memslot(slot, gfn); 2851 2852 pte = lookup_address_in_mm(kvm->mm, hva, &level); 2853 if (unlikely(!pte)) 2854 return PG_LEVEL_4K; 2855 2856 return level; 2857 } 2858 2859 int kvm_mmu_max_mapping_level(struct kvm *kvm, 2860 const struct kvm_memory_slot *slot, gfn_t gfn, 2861 kvm_pfn_t pfn, int max_level) 2862 { 2863 struct kvm_lpage_info *linfo; 2864 int host_level; 2865 2866 max_level = min(max_level, max_huge_page_level); 2867 for ( ; max_level > PG_LEVEL_4K; max_level--) { 2868 linfo = lpage_info_slot(gfn, slot, max_level); 2869 if (!linfo->disallow_lpage) 2870 break; 2871 } 2872 2873 if (max_level == PG_LEVEL_4K) 2874 return PG_LEVEL_4K; 2875 2876 host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot); 2877 return min(host_level, max_level); 2878 } 2879 2880 void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) 2881 { 2882 struct kvm_memory_slot *slot = fault->slot; 2883 kvm_pfn_t mask; 2884 2885 fault->huge_page_disallowed = fault->exec && fault->nx_huge_page_workaround_enabled; 2886 2887 if (unlikely(fault->max_level == PG_LEVEL_4K)) 2888 return; 2889 2890 if (is_error_noslot_pfn(fault->pfn) || kvm_is_reserved_pfn(fault->pfn)) 2891 return; 2892 2893 if (kvm_slot_dirty_track_enabled(slot)) 2894 return; 2895 2896 /* 2897 * Enforce the iTLB multihit workaround after capturing the requested 2898 * level, which will be used to do precise, accurate accounting. 2899 */ 2900 fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, 2901 fault->gfn, fault->pfn, 2902 fault->max_level); 2903 if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed) 2904 return; 2905 2906 /* 2907 * mmu_notifier_retry() was successful and mmu_lock is held, so 2908 * the pmd can't be split from under us. 2909 */ 2910 fault->goal_level = fault->req_level; 2911 mask = KVM_PAGES_PER_HPAGE(fault->goal_level) - 1; 2912 VM_BUG_ON((fault->gfn & mask) != (fault->pfn & mask)); 2913 fault->pfn &= ~mask; 2914 } 2915 2916 void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level) 2917 { 2918 if (cur_level > PG_LEVEL_4K && 2919 cur_level == fault->goal_level && 2920 is_shadow_present_pte(spte) && 2921 !is_large_pte(spte)) { 2922 /* 2923 * A small SPTE exists for this pfn, but FNAME(fetch) 2924 * and __direct_map would like to create a large PTE 2925 * instead: just force them to go down another level, 2926 * patching back for them into pfn the next 9 bits of 2927 * the address. 
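 * page_mask below selects exactly those 9 bits, i.e. the pages covered
 * by a huge page at cur_level but not at cur_level - 1. For example,
 * for cur_level == PG_LEVEL_2M this is 512 - 1 = 0x1ff.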
2928 */
2929 u64 page_mask = KVM_PAGES_PER_HPAGE(cur_level) -
2930 KVM_PAGES_PER_HPAGE(cur_level - 1);
2931 fault->pfn |= fault->gfn & page_mask;
2932 fault->goal_level--;
2933 }
2934 }
2935
2936 static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
2937 {
2938 struct kvm_shadow_walk_iterator it;
2939 struct kvm_mmu_page *sp;
2940 int ret;
2941 gfn_t base_gfn = fault->gfn;
2942
2943 kvm_mmu_hugepage_adjust(vcpu, fault);
2944
2945 trace_kvm_mmu_spte_requested(fault);
2946 for_each_shadow_entry(vcpu, fault->addr, it) {
2947 /*
2948 * We cannot overwrite existing page tables with an NX
2949 * large page, as the leaf could be executable.
2950 */
2951 if (fault->nx_huge_page_workaround_enabled)
2952 disallowed_hugepage_adjust(fault, *it.sptep, it.level);
2953
2954 base_gfn = fault->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
2955 if (it.level == fault->goal_level)
2956 break;
2957
2958 drop_large_spte(vcpu, it.sptep);
2959 if (is_shadow_present_pte(*it.sptep))
2960 continue;
2961
2962 sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
2963 it.level - 1, true, ACC_ALL);
2964
2965 link_shadow_page(vcpu, it.sptep, sp);
2966 if (fault->is_tdp && fault->huge_page_disallowed &&
2967 fault->req_level >= it.level)
2968 account_huge_nx_page(vcpu->kvm, sp);
2969 }
2970
2971 if (WARN_ON_ONCE(it.level != fault->goal_level))
2972 return -EFAULT;
2973
2974 ret = mmu_set_spte(vcpu, fault->slot, it.sptep, ACC_ALL,
2975 base_gfn, fault->pfn, fault);
2976 if (ret == RET_PF_SPURIOUS)
2977 return ret;
2978
2979 direct_pte_prefetch(vcpu, it.sptep);
2980 ++vcpu->stat.pf_fixed;
2981 return ret;
2982 }
2983
2984 static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
2985 {
2986 send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk);
2987 }
2988
2989 static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
2990 {
2991 /*
2992 * Do not cache the mmio info caused by writing the readonly gfn
2993 * into the spte, otherwise a read access on the readonly gfn can
2994 * also cause a mmio page fault and be treated as a mmio access.
2995 */
2996 if (pfn == KVM_PFN_ERR_RO_FAULT)
2997 return RET_PF_EMULATE;
2998
2999 if (pfn == KVM_PFN_ERR_HWPOISON) {
3000 kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
3001 return RET_PF_RETRY;
3002 }
3003
3004 return -EFAULT;
3005 }
3006
3007 static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
3008 unsigned int access, int *ret_val)
3009 {
3010 /* The pfn is invalid, report the error! */
3011 if (unlikely(is_error_pfn(fault->pfn))) {
3012 *ret_val = kvm_handle_bad_page(vcpu, fault->gfn, fault->pfn);
3013 return true;
3014 }
3015
3016 if (unlikely(!fault->slot)) {
3017 gva_t gva = fault->is_tdp ? 0 : fault->addr;
3018
3019 vcpu_cache_mmio_info(vcpu, gva, fault->gfn,
3020 access & shadow_mmio_access_mask);
3021 /*
3022 * If MMIO caching is disabled, emulate immediately without
3023 * touching the shadow page tables as attempting to install an
3024 * MMIO SPTE will just be an expensive nop.
3025 */
3026 if (unlikely(!shadow_mmio_value)) {
3027 *ret_val = RET_PF_EMULATE;
3028 return true;
3029 }
3030 }
3031
3032 return false;
3033 }
3034
3035 static bool page_fault_can_be_fast(struct kvm_page_fault *fault)
3036 {
3037 /*
3038 * Do not fix an mmio spte with an invalid generation number; it
3039 * needs to be updated by the slow page fault path.
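 * Such MMIO faults are reported via fault->rsvd (MMIO SPTEs have
 * reserved bits set by design), hence the early bail below.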
3040 */
3041 if (fault->rsvd)
3042 return false;
3043
3044 /* See if the page fault is due to an NX violation */
3045 if (unlikely(fault->exec && fault->present))
3046 return false;
3047
3048 /*
3049 * #PF can be fast if:
3050 * 1. The shadow page table entry is not present, which could mean that
3051 * the fault is potentially caused by access tracking (if enabled).
3052 * 2. The shadow page table entry is present and the fault is caused
3053 * by write-protect, which means we just need to change the W bit of
3054 * the spte, which can be done out of mmu-lock.
3055 *
3056 * However, if access tracking is disabled we know that a non-present
3057 * page must be a genuine page fault where we have to create a new SPTE.
3058 * So, if access tracking is disabled, we return true only for write
3059 * accesses to a present page.
3060 */
3061
3062 return shadow_acc_track_mask != 0 || (fault->write && fault->present);
3063 }
3064
3065 /*
3066 * Returns true if the SPTE was fixed successfully. Otherwise,
3067 * someone else modified the SPTE from its original value.
3068 */
3069 static bool
3070 fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
3071 u64 *sptep, u64 old_spte, u64 new_spte)
3072 {
3073 /*
3074 * Theoretically we could also set the dirty bit (and flush the TLB)
3075 * here in order to eliminate unnecessary PML logging. See comments in
3076 * set_spte. But fast_page_fault is very unlikely to happen with PML
3077 * enabled, so we do not do this. This might result in the same GPA
3078 * being logged in the PML buffer again when the write really happens,
3079 * and eventually in mark_page_dirty being called twice. But that is
3080 * harmless. This also avoids the TLB flush needed after setting the
3081 * dirty bit, so non-PML cases won't be impacted.
3082 *
3083 * Compare with set_spte where instead shadow_dirty_mask is set.
3084 */
3085 if (cmpxchg64(sptep, old_spte, new_spte) != old_spte)
3086 return false;
3087
3088 if (is_writable_pte(new_spte) && !is_writable_pte(old_spte))
3089 mark_page_dirty_in_slot(vcpu->kvm, fault->slot, fault->gfn);
3090
3091 return true;
3092 }
3093
3094 static bool is_access_allowed(struct kvm_page_fault *fault, u64 spte)
3095 {
3096 if (fault->exec)
3097 return is_executable_pte(spte);
3098
3099 if (fault->write)
3100 return is_writable_pte(spte);
3101
3102 /* Fault was on Read access */
3103 return spte & PT_PRESENT_MASK;
3104 }
3105
3106 /*
3107 * Returns the last level spte pointer of the shadow page walk for the given
3108 * gpa, and sets *spte to the spte value. This spte may be non-present. If no
3109 * walk could be performed, returns NULL and *spte does not contain valid data.
3110 *
3111 * Contract:
3112 * - Must be called between walk_shadow_page_lockless_{begin,end}.
3113 * - The returned sptep must not be used after walk_shadow_page_lockless_end.
3114 */
3115 static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte)
3116 {
3117 struct kvm_shadow_walk_iterator iterator;
3118 u64 old_spte;
3119 u64 *sptep = NULL;
3120
3121 for_each_shadow_entry_lockless(vcpu, gpa, iterator, old_spte) {
3122 sptep = iterator.sptep;
3123 *spte = old_spte;
3124 }
3125
3126 return sptep;
3127 }
3128
3129 /*
3130 * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS.
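 * RET_PF_INVALID means the fast path could not handle the fault and
 * the caller must fall back to the slow path.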
3131 */ 3132 static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) 3133 { 3134 struct kvm_mmu_page *sp; 3135 int ret = RET_PF_INVALID; 3136 u64 spte = 0ull; 3137 u64 *sptep = NULL; 3138 uint retry_count = 0; 3139 3140 if (!page_fault_can_be_fast(fault)) 3141 return ret; 3142 3143 walk_shadow_page_lockless_begin(vcpu); 3144 3145 do { 3146 u64 new_spte; 3147 3148 if (is_tdp_mmu(vcpu->arch.mmu)) 3149 sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->addr, &spte); 3150 else 3151 sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte); 3152 3153 if (!is_shadow_present_pte(spte)) 3154 break; 3155 3156 sp = sptep_to_sp(sptep); 3157 if (!is_last_spte(spte, sp->role.level)) 3158 break; 3159 3160 /* 3161 * Check whether the memory access that caused the fault would 3162 * still cause it if it were to be performed right now. If not, 3163 * then this is a spurious fault caused by TLB lazily flushed, 3164 * or some other CPU has already fixed the PTE after the 3165 * current CPU took the fault. 3166 * 3167 * Need not check the access of upper level table entries since 3168 * they are always ACC_ALL. 3169 */ 3170 if (is_access_allowed(fault, spte)) { 3171 ret = RET_PF_SPURIOUS; 3172 break; 3173 } 3174 3175 new_spte = spte; 3176 3177 if (is_access_track_spte(spte)) 3178 new_spte = restore_acc_track_spte(new_spte); 3179 3180 /* 3181 * Currently, to simplify the code, write-protection can 3182 * be removed in the fast path only if the SPTE was 3183 * write-protected for dirty-logging or access tracking. 3184 */ 3185 if (fault->write && 3186 spte_can_locklessly_be_made_writable(spte)) { 3187 new_spte |= PT_WRITABLE_MASK; 3188 3189 /* 3190 * Do not fix write-permission on the large spte when 3191 * dirty logging is enabled. Since we only dirty the 3192 * first page into the dirty-bitmap in 3193 * fast_pf_fix_direct_spte(), other pages are missed 3194 * if its slot has dirty logging enabled. 3195 * 3196 * Instead, we let the slow page fault path create a 3197 * normal spte to fix the access. 3198 */ 3199 if (sp->role.level > PG_LEVEL_4K && 3200 kvm_slot_dirty_track_enabled(fault->slot)) 3201 break; 3202 } 3203 3204 /* Verify that the fault can be handled in the fast path */ 3205 if (new_spte == spte || 3206 !is_access_allowed(fault, new_spte)) 3207 break; 3208 3209 /* 3210 * Currently, fast page fault only works for direct mapping 3211 * since the gfn is not stable for indirect shadow page. See 3212 * Documentation/virt/kvm/locking.rst to get more detail. 
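 * fast_pf_fix_direct_spte() uses cmpxchg64(), so losing a race against
 * a concurrent SPTE update simply triggers another iteration of the
 * retry loop below (bounded at four retries).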
3213 */ 3214 if (fast_pf_fix_direct_spte(vcpu, fault, sptep, spte, new_spte)) { 3215 ret = RET_PF_FIXED; 3216 break; 3217 } 3218 3219 if (++retry_count > 4) { 3220 printk_once(KERN_WARNING 3221 "kvm: Fast #PF retrying more than 4 times.\n"); 3222 break; 3223 } 3224 3225 } while (true); 3226 3227 trace_fast_page_fault(vcpu, fault, sptep, spte, ret); 3228 walk_shadow_page_lockless_end(vcpu); 3229 3230 return ret; 3231 } 3232 3233 static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, 3234 struct list_head *invalid_list) 3235 { 3236 struct kvm_mmu_page *sp; 3237 3238 if (!VALID_PAGE(*root_hpa)) 3239 return; 3240 3241 sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK); 3242 3243 if (is_tdp_mmu_page(sp)) 3244 kvm_tdp_mmu_put_root(kvm, sp, false); 3245 else if (!--sp->root_count && sp->role.invalid) 3246 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); 3247 3248 *root_hpa = INVALID_PAGE; 3249 } 3250 3251 /* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */ 3252 void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, 3253 ulong roots_to_free) 3254 { 3255 struct kvm *kvm = vcpu->kvm; 3256 int i; 3257 LIST_HEAD(invalid_list); 3258 bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT; 3259 3260 BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG); 3261 3262 /* Before acquiring the MMU lock, see if we need to do any real work. */ 3263 if (!(free_active_root && VALID_PAGE(mmu->root_hpa))) { 3264 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) 3265 if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) && 3266 VALID_PAGE(mmu->prev_roots[i].hpa)) 3267 break; 3268 3269 if (i == KVM_MMU_NUM_PREV_ROOTS) 3270 return; 3271 } 3272 3273 write_lock(&kvm->mmu_lock); 3274 3275 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) 3276 if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) 3277 mmu_free_root_page(kvm, &mmu->prev_roots[i].hpa, 3278 &invalid_list); 3279 3280 if (free_active_root) { 3281 if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && 3282 (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) { 3283 mmu_free_root_page(kvm, &mmu->root_hpa, &invalid_list); 3284 } else if (mmu->pae_root) { 3285 for (i = 0; i < 4; ++i) { 3286 if (!IS_VALID_PAE_ROOT(mmu->pae_root[i])) 3287 continue; 3288 3289 mmu_free_root_page(kvm, &mmu->pae_root[i], 3290 &invalid_list); 3291 mmu->pae_root[i] = INVALID_PAE_ROOT; 3292 } 3293 } 3294 mmu->root_hpa = INVALID_PAGE; 3295 mmu->root_pgd = 0; 3296 } 3297 3298 kvm_mmu_commit_zap_page(kvm, &invalid_list); 3299 write_unlock(&kvm->mmu_lock); 3300 } 3301 EXPORT_SYMBOL_GPL(kvm_mmu_free_roots); 3302 3303 void kvm_mmu_free_guest_mode_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) 3304 { 3305 unsigned long roots_to_free = 0; 3306 hpa_t root_hpa; 3307 int i; 3308 3309 /* 3310 * This should not be called while L2 is active, L2 can't invalidate 3311 * _only_ its own roots, e.g. INVVPID unconditionally exits. 
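 * Only previous roots that either lack a shadow page or were created
 * for L2 (role.guest_mode set) are freed below; L1's other cached
 * roots are left untouched.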
3312 */ 3313 WARN_ON_ONCE(mmu->mmu_role.base.guest_mode); 3314 3315 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 3316 root_hpa = mmu->prev_roots[i].hpa; 3317 if (!VALID_PAGE(root_hpa)) 3318 continue; 3319 3320 if (!to_shadow_page(root_hpa) || 3321 to_shadow_page(root_hpa)->role.guest_mode) 3322 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); 3323 } 3324 3325 kvm_mmu_free_roots(vcpu, mmu, roots_to_free); 3326 } 3327 EXPORT_SYMBOL_GPL(kvm_mmu_free_guest_mode_roots); 3328 3329 3330 static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) 3331 { 3332 int ret = 0; 3333 3334 if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) { 3335 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 3336 ret = 1; 3337 } 3338 3339 return ret; 3340 } 3341 3342 static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva, 3343 u8 level, bool direct) 3344 { 3345 struct kvm_mmu_page *sp; 3346 3347 sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL); 3348 ++sp->root_count; 3349 3350 return __pa(sp->spt); 3351 } 3352 3353 static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) 3354 { 3355 struct kvm_mmu *mmu = vcpu->arch.mmu; 3356 u8 shadow_root_level = mmu->shadow_root_level; 3357 hpa_t root; 3358 unsigned i; 3359 int r; 3360 3361 write_lock(&vcpu->kvm->mmu_lock); 3362 r = make_mmu_pages_available(vcpu); 3363 if (r < 0) 3364 goto out_unlock; 3365 3366 if (is_tdp_mmu_enabled(vcpu->kvm)) { 3367 root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu); 3368 mmu->root_hpa = root; 3369 } else if (shadow_root_level >= PT64_ROOT_4LEVEL) { 3370 root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true); 3371 mmu->root_hpa = root; 3372 } else if (shadow_root_level == PT32E_ROOT_LEVEL) { 3373 if (WARN_ON_ONCE(!mmu->pae_root)) { 3374 r = -EIO; 3375 goto out_unlock; 3376 } 3377 3378 for (i = 0; i < 4; ++i) { 3379 WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i])); 3380 3381 root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), 3382 i << 30, PT32_ROOT_LEVEL, true); 3383 mmu->pae_root[i] = root | PT_PRESENT_MASK | 3384 shadow_me_mask; 3385 } 3386 mmu->root_hpa = __pa(mmu->pae_root); 3387 } else { 3388 WARN_ONCE(1, "Bad TDP root level = %d\n", shadow_root_level); 3389 r = -EIO; 3390 goto out_unlock; 3391 } 3392 3393 /* root_pgd is ignored for direct MMUs. */ 3394 mmu->root_pgd = 0; 3395 out_unlock: 3396 write_unlock(&vcpu->kvm->mmu_lock); 3397 return r; 3398 } 3399 3400 static int mmu_first_shadow_root_alloc(struct kvm *kvm) 3401 { 3402 struct kvm_memslots *slots; 3403 struct kvm_memory_slot *slot; 3404 int r = 0, i, bkt; 3405 3406 /* 3407 * Check if this is the first shadow root being allocated before 3408 * taking the lock. 3409 */ 3410 if (kvm_shadow_root_allocated(kvm)) 3411 return 0; 3412 3413 mutex_lock(&kvm->slots_arch_lock); 3414 3415 /* Recheck, under the lock, whether this is the first shadow root. */ 3416 if (kvm_shadow_root_allocated(kvm)) 3417 goto out_unlock; 3418 3419 /* 3420 * Check if anything actually needs to be allocated, e.g. all metadata 3421 * will be allocated upfront if TDP is disabled. 3422 */ 3423 if (kvm_memslots_have_rmaps(kvm) && 3424 kvm_page_track_write_tracking_enabled(kvm)) 3425 goto out_success; 3426 3427 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { 3428 slots = __kvm_memslots(kvm, i); 3429 kvm_for_each_memslot(slot, bkt, slots) { 3430 /* 3431 * Both of these functions are no-ops if the target is 3432 * already allocated, so unconditionally calling both 3433 * is safe. 
Intentionally do NOT free allocations on 3434 * failure to avoid having to track which allocations 3435 * were made now versus when the memslot was created. 3436 * The metadata is guaranteed to be freed when the slot 3437 * is freed, and will be kept/used if userspace retries 3438 * KVM_RUN instead of killing the VM. 3439 */ 3440 r = memslot_rmap_alloc(slot, slot->npages); 3441 if (r) 3442 goto out_unlock; 3443 r = kvm_page_track_write_tracking_alloc(slot); 3444 if (r) 3445 goto out_unlock; 3446 } 3447 } 3448 3449 /* 3450 * Ensure that shadow_root_allocated becomes true strictly after 3451 * all the related pointers are set. 3452 */ 3453 out_success: 3454 smp_store_release(&kvm->arch.shadow_root_allocated, true); 3455 3456 out_unlock: 3457 mutex_unlock(&kvm->slots_arch_lock); 3458 return r; 3459 } 3460 3461 static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) 3462 { 3463 struct kvm_mmu *mmu = vcpu->arch.mmu; 3464 u64 pdptrs[4], pm_mask; 3465 gfn_t root_gfn, root_pgd; 3466 hpa_t root; 3467 unsigned i; 3468 int r; 3469 3470 root_pgd = mmu->get_guest_pgd(vcpu); 3471 root_gfn = root_pgd >> PAGE_SHIFT; 3472 3473 if (mmu_check_root(vcpu, root_gfn)) 3474 return 1; 3475 3476 /* 3477 * On SVM, reading PDPTRs might access guest memory, which might fault 3478 * and thus might sleep. Grab the PDPTRs before acquiring mmu_lock. 3479 */ 3480 if (mmu->root_level == PT32E_ROOT_LEVEL) { 3481 for (i = 0; i < 4; ++i) { 3482 pdptrs[i] = mmu->get_pdptr(vcpu, i); 3483 if (!(pdptrs[i] & PT_PRESENT_MASK)) 3484 continue; 3485 3486 if (mmu_check_root(vcpu, pdptrs[i] >> PAGE_SHIFT)) 3487 return 1; 3488 } 3489 } 3490 3491 r = mmu_first_shadow_root_alloc(vcpu->kvm); 3492 if (r) 3493 return r; 3494 3495 write_lock(&vcpu->kvm->mmu_lock); 3496 r = make_mmu_pages_available(vcpu); 3497 if (r < 0) 3498 goto out_unlock; 3499 3500 /* 3501 * Do we shadow a long mode page table? If so we need to 3502 * write-protect the guests page table root. 3503 */ 3504 if (mmu->root_level >= PT64_ROOT_4LEVEL) { 3505 root = mmu_alloc_root(vcpu, root_gfn, 0, 3506 mmu->shadow_root_level, false); 3507 mmu->root_hpa = root; 3508 goto set_root_pgd; 3509 } 3510 3511 if (WARN_ON_ONCE(!mmu->pae_root)) { 3512 r = -EIO; 3513 goto out_unlock; 3514 } 3515 3516 /* 3517 * We shadow a 32 bit page table. This may be a legacy 2-level 3518 * or a PAE 3-level page table. In either case we need to be aware that 3519 * the shadow page table may be a PAE or a long mode page table. 
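 * pm_mask collects the bits KVM stamps on the intermediate entries
 * (pae_root/pml4_root/pml5_root) that it synthesizes itself below.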
3520 */ 3521 pm_mask = PT_PRESENT_MASK | shadow_me_mask; 3522 if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL) { 3523 pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; 3524 3525 if (WARN_ON_ONCE(!mmu->pml4_root)) { 3526 r = -EIO; 3527 goto out_unlock; 3528 } 3529 mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask; 3530 3531 if (mmu->shadow_root_level == PT64_ROOT_5LEVEL) { 3532 if (WARN_ON_ONCE(!mmu->pml5_root)) { 3533 r = -EIO; 3534 goto out_unlock; 3535 } 3536 mmu->pml5_root[0] = __pa(mmu->pml4_root) | pm_mask; 3537 } 3538 } 3539 3540 for (i = 0; i < 4; ++i) { 3541 WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i])); 3542 3543 if (mmu->root_level == PT32E_ROOT_LEVEL) { 3544 if (!(pdptrs[i] & PT_PRESENT_MASK)) { 3545 mmu->pae_root[i] = INVALID_PAE_ROOT; 3546 continue; 3547 } 3548 root_gfn = pdptrs[i] >> PAGE_SHIFT; 3549 } 3550 3551 root = mmu_alloc_root(vcpu, root_gfn, i << 30, 3552 PT32_ROOT_LEVEL, false); 3553 mmu->pae_root[i] = root | pm_mask; 3554 } 3555 3556 if (mmu->shadow_root_level == PT64_ROOT_5LEVEL) 3557 mmu->root_hpa = __pa(mmu->pml5_root); 3558 else if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) 3559 mmu->root_hpa = __pa(mmu->pml4_root); 3560 else 3561 mmu->root_hpa = __pa(mmu->pae_root); 3562 3563 set_root_pgd: 3564 mmu->root_pgd = root_pgd; 3565 out_unlock: 3566 write_unlock(&vcpu->kvm->mmu_lock); 3567 3568 return 0; 3569 } 3570 3571 static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) 3572 { 3573 struct kvm_mmu *mmu = vcpu->arch.mmu; 3574 bool need_pml5 = mmu->shadow_root_level > PT64_ROOT_4LEVEL; 3575 u64 *pml5_root = NULL; 3576 u64 *pml4_root = NULL; 3577 u64 *pae_root; 3578 3579 /* 3580 * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP 3581 * tables are allocated and initialized at root creation as there is no 3582 * equivalent level in the guest's NPT to shadow. Allocate the tables 3583 * on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare. 3584 */ 3585 if (mmu->direct_map || mmu->root_level >= PT64_ROOT_4LEVEL || 3586 mmu->shadow_root_level < PT64_ROOT_4LEVEL) 3587 return 0; 3588 3589 /* 3590 * NPT, the only paging mode that uses this horror, uses a fixed number 3591 * of levels for the shadow page tables, e.g. all MMUs are 4-level or 3592 * all MMus are 5-level. Thus, this can safely require that pml5_root 3593 * is allocated if the other roots are valid and pml5 is needed, as any 3594 * prior MMU would also have required pml5. 3595 */ 3596 if (mmu->pae_root && mmu->pml4_root && (!need_pml5 || mmu->pml5_root)) 3597 return 0; 3598 3599 /* 3600 * The special roots should always be allocated in concert. Yell and 3601 * bail if KVM ends up in a state where only one of the roots is valid. 3602 */ 3603 if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root || 3604 (need_pml5 && mmu->pml5_root))) 3605 return -EIO; 3606 3607 /* 3608 * Unlike 32-bit NPT, the PDP table doesn't need to be in low mem, and 3609 * doesn't need to be decrypted. 
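 * A plain zeroed page is therefore sufficient; GFP_KERNEL_ACCOUNT also
 * charges the allocation to the VM's memory cgroup.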
3610 */ 3611 pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); 3612 if (!pae_root) 3613 return -ENOMEM; 3614 3615 #ifdef CONFIG_X86_64 3616 pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); 3617 if (!pml4_root) 3618 goto err_pml4; 3619 3620 if (need_pml5) { 3621 pml5_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); 3622 if (!pml5_root) 3623 goto err_pml5; 3624 } 3625 #endif 3626 3627 mmu->pae_root = pae_root; 3628 mmu->pml4_root = pml4_root; 3629 mmu->pml5_root = pml5_root; 3630 3631 return 0; 3632 3633 #ifdef CONFIG_X86_64 3634 err_pml5: 3635 free_page((unsigned long)pml4_root); 3636 err_pml4: 3637 free_page((unsigned long)pae_root); 3638 return -ENOMEM; 3639 #endif 3640 } 3641 3642 static bool is_unsync_root(hpa_t root) 3643 { 3644 struct kvm_mmu_page *sp; 3645 3646 if (!VALID_PAGE(root)) 3647 return false; 3648 3649 /* 3650 * The read barrier orders the CPU's read of SPTE.W during the page table 3651 * walk before the reads of sp->unsync/sp->unsync_children here. 3652 * 3653 * Even if another CPU was marking the SP as unsync-ed simultaneously, 3654 * any guest page table changes are not guaranteed to be visible anyway 3655 * until this VCPU issues a TLB flush strictly after those changes are 3656 * made. We only need to ensure that the other CPU sets these flags 3657 * before any actual changes to the page tables are made. The comments 3658 * in mmu_try_to_unsync_pages() describe what could go wrong if this 3659 * requirement isn't satisfied. 3660 */ 3661 smp_rmb(); 3662 sp = to_shadow_page(root); 3663 if (sp->unsync || sp->unsync_children) 3664 return true; 3665 3666 return false; 3667 } 3668 3669 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) 3670 { 3671 int i; 3672 struct kvm_mmu_page *sp; 3673 3674 if (vcpu->arch.mmu->direct_map) 3675 return; 3676 3677 if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) 3678 return; 3679 3680 vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); 3681 3682 if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) { 3683 hpa_t root = vcpu->arch.mmu->root_hpa; 3684 sp = to_shadow_page(root); 3685 3686 if (!is_unsync_root(root)) 3687 return; 3688 3689 write_lock(&vcpu->kvm->mmu_lock); 3690 kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); 3691 3692 mmu_sync_children(vcpu, sp, true); 3693 3694 kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); 3695 write_unlock(&vcpu->kvm->mmu_lock); 3696 return; 3697 } 3698 3699 write_lock(&vcpu->kvm->mmu_lock); 3700 kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); 3701 3702 for (i = 0; i < 4; ++i) { 3703 hpa_t root = vcpu->arch.mmu->pae_root[i]; 3704 3705 if (IS_VALID_PAE_ROOT(root)) { 3706 root &= PT64_BASE_ADDR_MASK; 3707 sp = to_shadow_page(root); 3708 mmu_sync_children(vcpu, sp, true); 3709 } 3710 } 3711 3712 kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); 3713 write_unlock(&vcpu->kvm->mmu_lock); 3714 } 3715 3716 void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu) 3717 { 3718 unsigned long roots_to_free = 0; 3719 int i; 3720 3721 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) 3722 if (is_unsync_root(vcpu->arch.mmu->prev_roots[i].hpa)) 3723 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); 3724 3725 /* sync prev_roots by simply freeing them */ 3726 kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free); 3727 } 3728 3729 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, 3730 gpa_t vaddr, u32 access, 3731 struct x86_exception *exception) 3732 { 3733 if (exception) 3734 exception->error_code = 0; 3735 return kvm_translate_gpa(vcpu, mmu, vaddr, access, exception); 3736 } 3737 3738 static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) 
3739 { 3740 /* 3741 * A nested guest cannot use the MMIO cache if it is using nested 3742 * page tables, because cr2 is a nGPA while the cache stores GPAs. 3743 */ 3744 if (mmu_is_nested(vcpu)) 3745 return false; 3746 3747 if (direct) 3748 return vcpu_match_mmio_gpa(vcpu, addr); 3749 3750 return vcpu_match_mmio_gva(vcpu, addr); 3751 } 3752 3753 /* 3754 * Return the level of the lowest level SPTE added to sptes. 3755 * That SPTE may be non-present. 3756 * 3757 * Must be called between walk_shadow_page_lockless_{begin,end}. 3758 */ 3759 static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level) 3760 { 3761 struct kvm_shadow_walk_iterator iterator; 3762 int leaf = -1; 3763 u64 spte; 3764 3765 for (shadow_walk_init(&iterator, vcpu, addr), 3766 *root_level = iterator.level; 3767 shadow_walk_okay(&iterator); 3768 __shadow_walk_next(&iterator, spte)) { 3769 leaf = iterator.level; 3770 spte = mmu_spte_get_lockless(iterator.sptep); 3771 3772 sptes[leaf] = spte; 3773 } 3774 3775 return leaf; 3776 } 3777 3778 /* return true if reserved bit(s) are detected on a valid, non-MMIO SPTE. */ 3779 static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) 3780 { 3781 u64 sptes[PT64_ROOT_MAX_LEVEL + 1]; 3782 struct rsvd_bits_validate *rsvd_check; 3783 int root, leaf, level; 3784 bool reserved = false; 3785 3786 walk_shadow_page_lockless_begin(vcpu); 3787 3788 if (is_tdp_mmu(vcpu->arch.mmu)) 3789 leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root); 3790 else 3791 leaf = get_walk(vcpu, addr, sptes, &root); 3792 3793 walk_shadow_page_lockless_end(vcpu); 3794 3795 if (unlikely(leaf < 0)) { 3796 *sptep = 0ull; 3797 return reserved; 3798 } 3799 3800 *sptep = sptes[leaf]; 3801 3802 /* 3803 * Skip reserved bits checks on the terminal leaf if it's not a valid 3804 * SPTE. Note, this also (intentionally) skips MMIO SPTEs, which, by 3805 * design, always have reserved bits set. The purpose of the checks is 3806 * to detect reserved bits on non-MMIO SPTEs. i.e. buggy SPTEs. 3807 */ 3808 if (!is_shadow_present_pte(sptes[leaf])) 3809 leaf++; 3810 3811 rsvd_check = &vcpu->arch.mmu->shadow_zero_check; 3812 3813 for (level = root; level >= leaf; level--) 3814 reserved |= is_rsvd_spte(rsvd_check, sptes[level], level); 3815 3816 if (reserved) { 3817 pr_err("%s: reserved bits set on MMU-present spte, addr 0x%llx, hierarchy:\n", 3818 __func__, addr); 3819 for (level = root; level >= leaf; level--) 3820 pr_err("------ spte = 0x%llx level = %d, rsvd bits = 0x%llx", 3821 sptes[level], level, 3822 get_rsvd_bits(rsvd_check, sptes[level], level)); 3823 } 3824 3825 return reserved; 3826 } 3827 3828 static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) 3829 { 3830 u64 spte; 3831 bool reserved; 3832 3833 if (mmio_info_in_cache(vcpu, addr, direct)) 3834 return RET_PF_EMULATE; 3835 3836 reserved = get_mmio_spte(vcpu, addr, &spte); 3837 if (WARN_ON(reserved)) 3838 return -EINVAL; 3839 3840 if (is_mmio_spte(spte)) { 3841 gfn_t gfn = get_mmio_spte_gfn(spte); 3842 unsigned int access = get_mmio_spte_access(spte); 3843 3844 if (!check_mmio_spte(vcpu, spte)) 3845 return RET_PF_INVALID; 3846 3847 if (direct) 3848 addr = 0; 3849 3850 trace_handle_mmio_page_fault(addr, gfn, access); 3851 vcpu_cache_mmio_info(vcpu, addr, gfn, access); 3852 return RET_PF_EMULATE; 3853 } 3854 3855 /* 3856 * If the page table is zapped by other cpus, let CPU fault again on 3857 * the address. 
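 * i.e. neither the MMIO cache nor a valid MMIO SPTE matched, so
 * RET_PF_RETRY lets the access be re-executed and the fault re-taken.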
3858 */ 3859 return RET_PF_RETRY; 3860 } 3861 3862 static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu, 3863 struct kvm_page_fault *fault) 3864 { 3865 if (unlikely(fault->rsvd)) 3866 return false; 3867 3868 if (!fault->present || !fault->write) 3869 return false; 3870 3871 /* 3872 * guest is writing the page which is write tracked which can 3873 * not be fixed by page fault handler. 3874 */ 3875 if (kvm_slot_page_track_is_active(vcpu->kvm, fault->slot, fault->gfn, KVM_PAGE_TRACK_WRITE)) 3876 return true; 3877 3878 return false; 3879 } 3880 3881 static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr) 3882 { 3883 struct kvm_shadow_walk_iterator iterator; 3884 u64 spte; 3885 3886 walk_shadow_page_lockless_begin(vcpu); 3887 for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) 3888 clear_sp_write_flooding_count(iterator.sptep); 3889 walk_shadow_page_lockless_end(vcpu); 3890 } 3891 3892 static u32 alloc_apf_token(struct kvm_vcpu *vcpu) 3893 { 3894 /* make sure the token value is not 0 */ 3895 u32 id = vcpu->arch.apf.id; 3896 3897 if (id << 12 == 0) 3898 vcpu->arch.apf.id = 1; 3899 3900 return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id; 3901 } 3902 3903 static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, 3904 gfn_t gfn) 3905 { 3906 struct kvm_arch_async_pf arch; 3907 3908 arch.token = alloc_apf_token(vcpu); 3909 arch.gfn = gfn; 3910 arch.direct_map = vcpu->arch.mmu->direct_map; 3911 arch.cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu); 3912 3913 return kvm_setup_async_pf(vcpu, cr2_or_gpa, 3914 kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); 3915 } 3916 3917 static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, int *r) 3918 { 3919 struct kvm_memory_slot *slot = fault->slot; 3920 bool async; 3921 3922 /* 3923 * Retry the page fault if the gfn hit a memslot that is being deleted 3924 * or moved. This ensures any existing SPTEs for the old memslot will 3925 * be zapped before KVM inserts a new MMIO SPTE for the gfn. 3926 */ 3927 if (slot && (slot->flags & KVM_MEMSLOT_INVALID)) 3928 goto out_retry; 3929 3930 if (!kvm_is_visible_memslot(slot)) { 3931 /* Don't expose private memslots to L2. */ 3932 if (is_guest_mode(vcpu)) { 3933 fault->slot = NULL; 3934 fault->pfn = KVM_PFN_NOSLOT; 3935 fault->map_writable = false; 3936 return false; 3937 } 3938 /* 3939 * If the APIC access page exists but is disabled, go directly 3940 * to emulation without caching the MMIO access or creating a 3941 * MMIO SPTE. That way the cache doesn't need to be purged 3942 * when the AVIC is re-enabled. 
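 * Returning true with *r set short-circuits the fault: the caller
 * returns *r without installing any SPTE.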
3943 */ 3944 if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT && 3945 !kvm_apicv_activated(vcpu->kvm)) { 3946 *r = RET_PF_EMULATE; 3947 return true; 3948 } 3949 } 3950 3951 async = false; 3952 fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, &async, 3953 fault->write, &fault->map_writable, 3954 &fault->hva); 3955 if (!async) 3956 return false; /* *pfn has correct page already */ 3957 3958 if (!fault->prefetch && kvm_can_do_async_pf(vcpu)) { 3959 trace_kvm_try_async_get_page(fault->addr, fault->gfn); 3960 if (kvm_find_async_pf_gfn(vcpu, fault->gfn)) { 3961 trace_kvm_async_pf_doublefault(fault->addr, fault->gfn); 3962 kvm_make_request(KVM_REQ_APF_HALT, vcpu); 3963 goto out_retry; 3964 } else if (kvm_arch_setup_async_pf(vcpu, fault->addr, fault->gfn)) 3965 goto out_retry; 3966 } 3967 3968 fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, NULL, 3969 fault->write, &fault->map_writable, 3970 &fault->hva); 3971 return false; 3972 3973 out_retry: 3974 *r = RET_PF_RETRY; 3975 return true; 3976 } 3977 3978 /* 3979 * Returns true if the page fault is stale and needs to be retried, i.e. if the 3980 * root was invalidated by a memslot update or a relevant mmu_notifier fired. 3981 */ 3982 static bool is_page_fault_stale(struct kvm_vcpu *vcpu, 3983 struct kvm_page_fault *fault, int mmu_seq) 3984 { 3985 struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root_hpa); 3986 3987 /* Special roots, e.g. pae_root, are not backed by shadow pages. */ 3988 if (sp && is_obsolete_sp(vcpu->kvm, sp)) 3989 return true; 3990 3991 /* 3992 * Roots without an associated shadow page are considered invalid if 3993 * there is a pending request to free obsolete roots. The request is 3994 * only a hint that the current root _may_ be obsolete and needs to be 3995 * reloaded, e.g. if the guest frees a PGD that KVM is tracking as a 3996 * previous root, then __kvm_mmu_prepare_zap_page() signals all vCPUs 3997 * to reload even if no vCPU is actively using the root. 
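 *
 * For example (illustrative): a pae_root has no struct kvm_mmu_page and
 * therefore no generation number to compare, so a pending
 * KVM_REQ_MMU_RELOAD is the only hint that the special root may still
 * point at zapped pages.  Treating the fault as stale errs on the safe
 * side; the cost is merely a retry after the root has been reloaded.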
3998 */ 3999 if (!sp && kvm_test_request(KVM_REQ_MMU_RELOAD, vcpu)) 4000 return true; 4001 4002 return fault->slot && 4003 mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva); 4004 } 4005 4006 static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) 4007 { 4008 bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu); 4009 4010 unsigned long mmu_seq; 4011 int r; 4012 4013 fault->gfn = fault->addr >> PAGE_SHIFT; 4014 fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn); 4015 4016 if (page_fault_handle_page_track(vcpu, fault)) 4017 return RET_PF_EMULATE; 4018 4019 r = fast_page_fault(vcpu, fault); 4020 if (r != RET_PF_INVALID) 4021 return r; 4022 4023 r = mmu_topup_memory_caches(vcpu, false); 4024 if (r) 4025 return r; 4026 4027 mmu_seq = vcpu->kvm->mmu_notifier_seq; 4028 smp_rmb(); 4029 4030 if (kvm_faultin_pfn(vcpu, fault, &r)) 4031 return r; 4032 4033 if (handle_abnormal_pfn(vcpu, fault, ACC_ALL, &r)) 4034 return r; 4035 4036 r = RET_PF_RETRY; 4037 4038 if (is_tdp_mmu_fault) 4039 read_lock(&vcpu->kvm->mmu_lock); 4040 else 4041 write_lock(&vcpu->kvm->mmu_lock); 4042 4043 if (is_page_fault_stale(vcpu, fault, mmu_seq)) 4044 goto out_unlock; 4045 4046 r = make_mmu_pages_available(vcpu); 4047 if (r) 4048 goto out_unlock; 4049 4050 if (is_tdp_mmu_fault) 4051 r = kvm_tdp_mmu_map(vcpu, fault); 4052 else 4053 r = __direct_map(vcpu, fault); 4054 4055 out_unlock: 4056 if (is_tdp_mmu_fault) 4057 read_unlock(&vcpu->kvm->mmu_lock); 4058 else 4059 write_unlock(&vcpu->kvm->mmu_lock); 4060 kvm_release_pfn_clean(fault->pfn); 4061 return r; 4062 } 4063 4064 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, 4065 struct kvm_page_fault *fault) 4066 { 4067 pgprintk("%s: gva %lx error %x\n", __func__, fault->addr, fault->error_code); 4068 4069 /* This path builds a PAE pagetable, we can map 2mb pages at maximum. */ 4070 fault->max_level = PG_LEVEL_2M; 4071 return direct_page_fault(vcpu, fault); 4072 } 4073 4074 int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, 4075 u64 fault_address, char *insn, int insn_len) 4076 { 4077 int r = 1; 4078 u32 flags = vcpu->arch.apf.host_apf_flags; 4079 4080 #ifndef CONFIG_X86_64 4081 /* A 64-bit CR2 should be impossible on 32-bit KVM. 
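 * For example (illustrative): a fault_address of 0x100000000 would have
 * bit 32 set and trip the check below; on a 32-bit build CR2 is only 32
 * bits wide, so such a value indicates a KVM bug rather than guest state.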
*/ 4082 if (WARN_ON_ONCE(fault_address >> 32)) 4083 return -EFAULT; 4084 #endif 4085 4086 vcpu->arch.l1tf_flush_l1d = true; 4087 if (!flags) { 4088 trace_kvm_page_fault(fault_address, error_code); 4089 4090 if (kvm_event_needs_reinjection(vcpu)) 4091 kvm_mmu_unprotect_page_virt(vcpu, fault_address); 4092 r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn, 4093 insn_len); 4094 } else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) { 4095 vcpu->arch.apf.host_apf_flags = 0; 4096 local_irq_disable(); 4097 kvm_async_pf_task_wait_schedule(fault_address); 4098 local_irq_enable(); 4099 } else { 4100 WARN_ONCE(1, "Unexpected host async PF flags: %x\n", flags); 4101 } 4102 4103 return r; 4104 } 4105 EXPORT_SYMBOL_GPL(kvm_handle_page_fault); 4106 4107 int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) 4108 { 4109 while (fault->max_level > PG_LEVEL_4K) { 4110 int page_num = KVM_PAGES_PER_HPAGE(fault->max_level); 4111 gfn_t base = (fault->addr >> PAGE_SHIFT) & ~(page_num - 1); 4112 4113 if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num)) 4114 break; 4115 4116 --fault->max_level; 4117 } 4118 4119 return direct_page_fault(vcpu, fault); 4120 } 4121 4122 static void nonpaging_init_context(struct kvm_mmu *context) 4123 { 4124 context->page_fault = nonpaging_page_fault; 4125 context->gva_to_gpa = nonpaging_gva_to_gpa; 4126 context->sync_page = nonpaging_sync_page; 4127 context->invlpg = NULL; 4128 context->direct_map = true; 4129 } 4130 4131 static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd, 4132 union kvm_mmu_page_role role) 4133 { 4134 return (role.direct || pgd == root->pgd) && 4135 VALID_PAGE(root->hpa) && to_shadow_page(root->hpa) && 4136 role.word == to_shadow_page(root->hpa)->role.word; 4137 } 4138 4139 /* 4140 * Find out if a previously cached root matching the new pgd/role is available. 4141 * The current root is also inserted into the cache. 4142 * If a matching root was found, it is assigned to kvm_mmu->root_hpa and true is 4143 * returned. 4144 * Otherwise, the LRU root from the cache is assigned to kvm_mmu->root_hpa and 4145 * false is returned. This root should now be freed by the caller. 4146 */ 4147 static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_pgd, 4148 union kvm_mmu_page_role new_role) 4149 { 4150 uint i; 4151 struct kvm_mmu_root_info root; 4152 struct kvm_mmu *mmu = vcpu->arch.mmu; 4153 4154 root.pgd = mmu->root_pgd; 4155 root.hpa = mmu->root_hpa; 4156 4157 if (is_root_usable(&root, new_pgd, new_role)) 4158 return true; 4159 4160 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 4161 swap(root, mmu->prev_roots[i]); 4162 4163 if (is_root_usable(&root, new_pgd, new_role)) 4164 break; 4165 } 4166 4167 mmu->root_hpa = root.hpa; 4168 mmu->root_pgd = root.pgd; 4169 4170 return i < KVM_MMU_NUM_PREV_ROOTS; 4171 } 4172 4173 static bool fast_pgd_switch(struct kvm_vcpu *vcpu, gpa_t new_pgd, 4174 union kvm_mmu_page_role new_role) 4175 { 4176 struct kvm_mmu *mmu = vcpu->arch.mmu; 4177 4178 /* 4179 * For now, limit the fast switch to 64-bit hosts+VMs in order to avoid 4180 * having to deal with PDPTEs. We may add support for 32-bit hosts/VMs 4181 * later if necessary. 
4182 */ 4183 if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && 4184 mmu->root_level >= PT64_ROOT_4LEVEL) 4185 return cached_root_available(vcpu, new_pgd, new_role); 4186 4187 return false; 4188 } 4189 4190 static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, 4191 union kvm_mmu_page_role new_role) 4192 { 4193 if (!fast_pgd_switch(vcpu, new_pgd, new_role)) { 4194 kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, KVM_MMU_ROOT_CURRENT); 4195 return; 4196 } 4197 4198 /* 4199 * It's possible that the cached previous root page is obsolete because 4200 * of a change in the MMU generation number. However, changing the 4201 * generation number is accompanied by KVM_REQ_MMU_RELOAD, which will 4202 * free the root set here and allocate a new one. 4203 */ 4204 kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu); 4205 4206 if (force_flush_and_sync_on_reuse) { 4207 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); 4208 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); 4209 } 4210 4211 /* 4212 * The last MMIO access's GVA and GPA are cached in the VCPU. When 4213 * switching to a new CR3, that GVA->GPA mapping may no longer be 4214 * valid. So clear any cached MMIO info even when we don't need to sync 4215 * the shadow page tables. 4216 */ 4217 vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); 4218 4219 /* 4220 * If this is a direct root page, it doesn't have a write flooding 4221 * count. Otherwise, clear the write flooding count. 4222 */ 4223 if (!new_role.direct) 4224 __clear_sp_write_flooding_count( 4225 to_shadow_page(vcpu->arch.mmu->root_hpa)); 4226 } 4227 4228 void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd) 4229 { 4230 __kvm_mmu_new_pgd(vcpu, new_pgd, kvm_mmu_calc_root_page_role(vcpu)); 4231 } 4232 EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd); 4233 4234 static unsigned long get_cr3(struct kvm_vcpu *vcpu) 4235 { 4236 return kvm_read_cr3(vcpu); 4237 } 4238 4239 static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, 4240 unsigned int access) 4241 { 4242 if (unlikely(is_mmio_spte(*sptep))) { 4243 if (gfn != get_mmio_spte_gfn(*sptep)) { 4244 mmu_spte_clear_no_track(sptep); 4245 return true; 4246 } 4247 4248 mark_mmio_spte(vcpu, sptep, gfn, access); 4249 return true; 4250 } 4251 4252 return false; 4253 } 4254 4255 #define PTTYPE_EPT 18 /* arbitrary */ 4256 #define PTTYPE PTTYPE_EPT 4257 #include "paging_tmpl.h" 4258 #undef PTTYPE 4259 4260 #define PTTYPE 64 4261 #include "paging_tmpl.h" 4262 #undef PTTYPE 4263 4264 #define PTTYPE 32 4265 #include "paging_tmpl.h" 4266 #undef PTTYPE 4267 4268 static void 4269 __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check, 4270 u64 pa_bits_rsvd, int level, bool nx, bool gbpages, 4271 bool pse, bool amd) 4272 { 4273 u64 gbpages_bit_rsvd = 0; 4274 u64 nonleaf_bit8_rsvd = 0; 4275 u64 high_bits_rsvd; 4276 4277 rsvd_check->bad_mt_xwr = 0; 4278 4279 if (!gbpages) 4280 gbpages_bit_rsvd = rsvd_bits(7, 7); 4281 4282 if (level == PT32E_ROOT_LEVEL) 4283 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 62); 4284 else 4285 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51); 4286 4287 /* Note, NX doesn't exist in PDPTEs, this is handled below. */ 4288 if (!nx) 4289 high_bits_rsvd |= rsvd_bits(63, 63); 4290 4291 /* 4292 * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for 4293 * leaf entries) on AMD CPUs only. 
4294 */ 4295 if (amd) 4296 nonleaf_bit8_rsvd = rsvd_bits(8, 8); 4297 4298 switch (level) { 4299 case PT32_ROOT_LEVEL: 4300 /* no rsvd bits for 2 level 4K page table entries */ 4301 rsvd_check->rsvd_bits_mask[0][1] = 0; 4302 rsvd_check->rsvd_bits_mask[0][0] = 0; 4303 rsvd_check->rsvd_bits_mask[1][0] = 4304 rsvd_check->rsvd_bits_mask[0][0]; 4305 4306 if (!pse) { 4307 rsvd_check->rsvd_bits_mask[1][1] = 0; 4308 break; 4309 } 4310 4311 if (is_cpuid_PSE36()) 4312 /* 36bits PSE 4MB page */ 4313 rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); 4314 else 4315 /* 32 bits PSE 4MB page */ 4316 rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); 4317 break; 4318 case PT32E_ROOT_LEVEL: 4319 rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(63, 63) | 4320 high_bits_rsvd | 4321 rsvd_bits(5, 8) | 4322 rsvd_bits(1, 2); /* PDPTE */ 4323 rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd; /* PDE */ 4324 rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; /* PTE */ 4325 rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | 4326 rsvd_bits(13, 20); /* large page */ 4327 rsvd_check->rsvd_bits_mask[1][0] = 4328 rsvd_check->rsvd_bits_mask[0][0]; 4329 break; 4330 case PT64_ROOT_5LEVEL: 4331 rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | 4332 nonleaf_bit8_rsvd | 4333 rsvd_bits(7, 7); 4334 rsvd_check->rsvd_bits_mask[1][4] = 4335 rsvd_check->rsvd_bits_mask[0][4]; 4336 fallthrough; 4337 case PT64_ROOT_4LEVEL: 4338 rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | 4339 nonleaf_bit8_rsvd | 4340 rsvd_bits(7, 7); 4341 rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | 4342 gbpages_bit_rsvd; 4343 rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd; 4344 rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; 4345 rsvd_check->rsvd_bits_mask[1][3] = 4346 rsvd_check->rsvd_bits_mask[0][3]; 4347 rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | 4348 gbpages_bit_rsvd | 4349 rsvd_bits(13, 29); 4350 rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | 4351 rsvd_bits(13, 20); /* large page */ 4352 rsvd_check->rsvd_bits_mask[1][0] = 4353 rsvd_check->rsvd_bits_mask[0][0]; 4354 break; 4355 } 4356 } 4357 4358 static bool guest_can_use_gbpages(struct kvm_vcpu *vcpu) 4359 { 4360 /* 4361 * If TDP is enabled, let the guest use GBPAGES if they're supported in 4362 * hardware. The hardware page walker doesn't let KVM disable GBPAGES, 4363 * i.e. won't treat them as reserved, and KVM doesn't redo the GVA->GPA 4364 * walk for performance and complexity reasons. Not to mention KVM 4365 * _can't_ solve the problem because GVA->GPA walks aren't visible to 4366 * KVM once a TDP translation is installed. Mimic hardware behavior so 4367 * that KVM's is at least consistent, i.e. doesn't randomly inject #PF. 4368 */ 4369 return tdp_enabled ? 
boot_cpu_has(X86_FEATURE_GBPAGES) : 4370 guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES); 4371 } 4372 4373 static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, 4374 struct kvm_mmu *context) 4375 { 4376 __reset_rsvds_bits_mask(&context->guest_rsvd_check, 4377 vcpu->arch.reserved_gpa_bits, 4378 context->root_level, is_efer_nx(context), 4379 guest_can_use_gbpages(vcpu), 4380 is_cr4_pse(context), 4381 guest_cpuid_is_amd_or_hygon(vcpu)); 4382 } 4383 4384 static void 4385 __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, 4386 u64 pa_bits_rsvd, bool execonly, int huge_page_level) 4387 { 4388 u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51); 4389 u64 large_1g_rsvd = 0, large_2m_rsvd = 0; 4390 u64 bad_mt_xwr; 4391 4392 if (huge_page_level < PG_LEVEL_1G) 4393 large_1g_rsvd = rsvd_bits(7, 7); 4394 if (huge_page_level < PG_LEVEL_2M) 4395 large_2m_rsvd = rsvd_bits(7, 7); 4396 4397 rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7); 4398 rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7); 4399 rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6) | large_1g_rsvd; 4400 rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6) | large_2m_rsvd; 4401 rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; 4402 4403 /* large page */ 4404 rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4]; 4405 rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3]; 4406 rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29) | large_1g_rsvd; 4407 rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20) | large_2m_rsvd; 4408 rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; 4409 4410 bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */ 4411 bad_mt_xwr |= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */ 4412 bad_mt_xwr |= 0xFFull << (7 * 8); /* bits 3..5 must not be 7 */ 4413 bad_mt_xwr |= REPEAT_BYTE(1ull << 2); /* bits 0..2 must not be 010 */ 4414 bad_mt_xwr |= REPEAT_BYTE(1ull << 6); /* bits 0..2 must not be 110 */ 4415 if (!execonly) { 4416 /* bits 0..2 must not be 100 unless VMX capabilities allow it */ 4417 bad_mt_xwr |= REPEAT_BYTE(1ull << 4); 4418 } 4419 rsvd_check->bad_mt_xwr = bad_mt_xwr; 4420 } 4421 4422 static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, 4423 struct kvm_mmu *context, bool execonly, int huge_page_level) 4424 { 4425 __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check, 4426 vcpu->arch.reserved_gpa_bits, execonly, 4427 huge_page_level); 4428 } 4429 4430 static inline u64 reserved_hpa_bits(void) 4431 { 4432 return rsvd_bits(shadow_phys_bits, 63); 4433 } 4434 4435 /* 4436 * the page table on host is the shadow page table for the page 4437 * table in guest or amd nested guest, its mmu features completely 4438 * follow the features in guest. 4439 */ 4440 static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, 4441 struct kvm_mmu *context) 4442 { 4443 /* 4444 * KVM uses NX when TDP is disabled to handle a variety of scenarios, 4445 * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and 4446 * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0. 4447 * The iTLB multi-hit workaround can be toggled at any time, so assume 4448 * NX can be used by any non-nested shadow MMU to avoid having to reset 4449 * MMU contexts. Note, KVM forces EFER.NX=1 when TDP is disabled. 4450 */ 4451 bool uses_nx = is_efer_nx(context) || !tdp_enabled; 4452 4453 /* @amd adds a check on bit of SPTEs, which KVM shouldn't use anyways. 
*/ 4454 bool is_amd = true; 4455 /* KVM doesn't use 2-level page tables for the shadow MMU. */ 4456 bool is_pse = false; 4457 struct rsvd_bits_validate *shadow_zero_check; 4458 int i; 4459 4460 WARN_ON_ONCE(context->shadow_root_level < PT32E_ROOT_LEVEL); 4461 4462 shadow_zero_check = &context->shadow_zero_check; 4463 __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(), 4464 context->shadow_root_level, uses_nx, 4465 guest_can_use_gbpages(vcpu), is_pse, is_amd); 4466 4467 if (!shadow_me_mask) 4468 return; 4469 4470 for (i = context->shadow_root_level; --i >= 0;) { 4471 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask; 4472 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask; 4473 } 4474 4475 } 4476 4477 static inline bool boot_cpu_is_amd(void) 4478 { 4479 WARN_ON_ONCE(!tdp_enabled); 4480 return shadow_x_mask == 0; 4481 } 4482 4483 /* 4484 * the direct page table on host, use as much mmu features as 4485 * possible, however, kvm currently does not do execution-protection. 4486 */ 4487 static void 4488 reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, 4489 struct kvm_mmu *context) 4490 { 4491 struct rsvd_bits_validate *shadow_zero_check; 4492 int i; 4493 4494 shadow_zero_check = &context->shadow_zero_check; 4495 4496 if (boot_cpu_is_amd()) 4497 __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(), 4498 context->shadow_root_level, false, 4499 boot_cpu_has(X86_FEATURE_GBPAGES), 4500 false, true); 4501 else 4502 __reset_rsvds_bits_mask_ept(shadow_zero_check, 4503 reserved_hpa_bits(), false, 4504 max_huge_page_level); 4505 4506 if (!shadow_me_mask) 4507 return; 4508 4509 for (i = context->shadow_root_level; --i >= 0;) { 4510 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask; 4511 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask; 4512 } 4513 } 4514 4515 /* 4516 * as the comments in reset_shadow_zero_bits_mask() except it 4517 * is the shadow page table for intel nested guest. 4518 */ 4519 static void 4520 reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, 4521 struct kvm_mmu *context, bool execonly) 4522 { 4523 __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, 4524 reserved_hpa_bits(), execonly, 4525 max_huge_page_level); 4526 } 4527 4528 #define BYTE_MASK(access) \ 4529 ((1 & (access) ? 2 : 0) | \ 4530 (2 & (access) ? 4 : 0) | \ 4531 (3 & (access) ? 8 : 0) | \ 4532 (4 & (access) ? 16 : 0) | \ 4533 (5 & (access) ? 32 : 0) | \ 4534 (6 & (access) ? 64 : 0) | \ 4535 (7 & (access) ? 128 : 0)) 4536 4537 4538 static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept) 4539 { 4540 unsigned byte; 4541 4542 const u8 x = BYTE_MASK(ACC_EXEC_MASK); 4543 const u8 w = BYTE_MASK(ACC_WRITE_MASK); 4544 const u8 u = BYTE_MASK(ACC_USER_MASK); 4545 4546 bool cr4_smep = is_cr4_smep(mmu); 4547 bool cr4_smap = is_cr4_smap(mmu); 4548 bool cr0_wp = is_cr0_wp(mmu); 4549 bool efer_nx = is_efer_nx(mmu); 4550 4551 for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) { 4552 unsigned pfec = byte << 1; 4553 4554 /* 4555 * Each "*f" variable has a 1 bit for each UWX value 4556 * that causes a fault with the given PFEC. 4557 */ 4558 4559 /* Faults from writes to non-writable pages */ 4560 u8 wf = (pfec & PFERR_WRITE_MASK) ? (u8)~w : 0; 4561 /* Faults from user mode accesses to supervisor pages */ 4562 u8 uf = (pfec & PFERR_USER_MASK) ? (u8)~u : 0; 4563 /* Faults from fetches of non-executable pages*/ 4564 u8 ff = (pfec & PFERR_FETCH_MASK) ? 
(u8)~x : 0; 4565 /* Faults from kernel mode fetches of user pages */ 4566 u8 smepf = 0; 4567 /* Faults from kernel mode accesses of user pages */ 4568 u8 smapf = 0; 4569 4570 if (!ept) { 4571 /* Faults from kernel mode accesses to user pages */ 4572 u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u; 4573 4574 /* Not really needed: !nx will cause pte.nx to fault */ 4575 if (!efer_nx) 4576 ff = 0; 4577 4578 /* Allow supervisor writes if !cr0.wp */ 4579 if (!cr0_wp) 4580 wf = (pfec & PFERR_USER_MASK) ? wf : 0; 4581 4582 /* Disallow supervisor fetches of user code if cr4.smep */ 4583 if (cr4_smep) 4584 smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0; 4585 4586 /* 4587 * SMAP:kernel-mode data accesses from user-mode 4588 * mappings should fault. A fault is considered 4589 * as a SMAP violation if all of the following 4590 * conditions are true: 4591 * - X86_CR4_SMAP is set in CR4 4592 * - A user page is accessed 4593 * - The access is not a fetch 4594 * - Page fault in kernel mode 4595 * - if CPL = 3 or X86_EFLAGS_AC is clear 4596 * 4597 * Here, we cover the first three conditions. 4598 * The fourth is computed dynamically in permission_fault(); 4599 * PFERR_RSVD_MASK bit will be set in PFEC if the access is 4600 * *not* subject to SMAP restrictions. 4601 */ 4602 if (cr4_smap) 4603 smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf; 4604 } 4605 4606 mmu->permissions[byte] = ff | uf | wf | smepf | smapf; 4607 } 4608 } 4609 4610 /* 4611 * PKU is an additional mechanism by which the paging controls access to 4612 * user-mode addresses based on the value in the PKRU register. Protection 4613 * key violations are reported through a bit in the page fault error code. 4614 * Unlike other bits of the error code, the PK bit is not known at the 4615 * call site of e.g. gva_to_gpa; it must be computed directly in 4616 * permission_fault based on two bits of PKRU, on some machine state (CR4, 4617 * CR0, EFER, CPL), and on other bits of the error code and the page tables. 4618 * 4619 * In particular the following conditions come from the error code, the 4620 * page tables and the machine state: 4621 * - PK is always zero unless CR4.PKE=1 and EFER.LMA=1 4622 * - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch) 4623 * - PK is always zero if U=0 in the page tables 4624 * - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access. 4625 * 4626 * The PKRU bitmask caches the result of these four conditions. The error 4627 * code (minus the P bit) and the page table's U bit form an index into the 4628 * PKRU bitmask. Two bits of the PKRU bitmask are then extracted and ANDed 4629 * with the two bits of the PKRU register corresponding to the protection key. 4630 * For the first three conditions above the bits will be 00, thus masking 4631 * away both AD and WD. For all reads or if the last condition holds, WD 4632 * only will be masked away. 4633 */ 4634 static void update_pkru_bitmask(struct kvm_mmu *mmu) 4635 { 4636 unsigned bit; 4637 bool wp; 4638 4639 mmu->pkru_mask = 0; 4640 4641 if (!is_cr4_pke(mmu)) 4642 return; 4643 4644 wp = is_cr0_wp(mmu); 4645 4646 for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) { 4647 unsigned pfec, pkey_bits; 4648 bool check_pkey, check_write, ff, uf, wf, pte_user; 4649 4650 pfec = bit << 1; 4651 ff = pfec & PFERR_FETCH_MASK; 4652 uf = pfec & PFERR_USER_MASK; 4653 wf = pfec & PFERR_WRITE_MASK; 4654 4655 /* PFEC.RSVD is replaced by ACC_USER_MASK. 
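 *
 * Worked example (illustrative): for a user-mode write to a user page the
 * index has W=1, U=1 and the PTE-user bit set, so check_pkey and
 * check_write below are both true and the pair of mask bits is 11 --
 * either PKRU.AD or PKRU.WD of the page's key can deny the access.  For a
 * supervisor read of a user page only check_pkey is true, i.e. the pair
 * is 01, matching the rule that WD never affects reads.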
*/ 4656 pte_user = pfec & PFERR_RSVD_MASK; 4657 4658 /* 4659 * Only need to check the access which is not an 4660 * instruction fetch and is to a user page. 4661 */ 4662 check_pkey = (!ff && pte_user); 4663 /* 4664 * write access is controlled by PKRU if it is a 4665 * user access or CR0.WP = 1. 4666 */ 4667 check_write = check_pkey && wf && (uf || wp); 4668 4669 /* PKRU.AD stops both read and write access. */ 4670 pkey_bits = !!check_pkey; 4671 /* PKRU.WD stops write access. */ 4672 pkey_bits |= (!!check_write) << 1; 4673 4674 mmu->pkru_mask |= (pkey_bits & 3) << pfec; 4675 } 4676 } 4677 4678 static void reset_guest_paging_metadata(struct kvm_vcpu *vcpu, 4679 struct kvm_mmu *mmu) 4680 { 4681 if (!is_cr0_pg(mmu)) 4682 return; 4683 4684 reset_rsvds_bits_mask(vcpu, mmu); 4685 update_permission_bitmask(mmu, false); 4686 update_pkru_bitmask(mmu); 4687 } 4688 4689 static void paging64_init_context(struct kvm_mmu *context) 4690 { 4691 context->page_fault = paging64_page_fault; 4692 context->gva_to_gpa = paging64_gva_to_gpa; 4693 context->sync_page = paging64_sync_page; 4694 context->invlpg = paging64_invlpg; 4695 context->direct_map = false; 4696 } 4697 4698 static void paging32_init_context(struct kvm_mmu *context) 4699 { 4700 context->page_fault = paging32_page_fault; 4701 context->gva_to_gpa = paging32_gva_to_gpa; 4702 context->sync_page = paging32_sync_page; 4703 context->invlpg = paging32_invlpg; 4704 context->direct_map = false; 4705 } 4706 4707 static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu, 4708 struct kvm_mmu_role_regs *regs) 4709 { 4710 union kvm_mmu_extended_role ext = {0}; 4711 4712 if (____is_cr0_pg(regs)) { 4713 ext.cr0_pg = 1; 4714 ext.cr4_pae = ____is_cr4_pae(regs); 4715 ext.cr4_smep = ____is_cr4_smep(regs); 4716 ext.cr4_smap = ____is_cr4_smap(regs); 4717 ext.cr4_pse = ____is_cr4_pse(regs); 4718 4719 /* PKEY and LA57 are active iff long mode is active. */ 4720 ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs); 4721 ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs); 4722 ext.efer_lma = ____is_efer_lma(regs); 4723 } 4724 4725 ext.valid = 1; 4726 4727 return ext; 4728 } 4729 4730 static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu, 4731 struct kvm_mmu_role_regs *regs, 4732 bool base_only) 4733 { 4734 union kvm_mmu_role role = {0}; 4735 4736 role.base.access = ACC_ALL; 4737 if (____is_cr0_pg(regs)) { 4738 role.base.efer_nx = ____is_efer_nx(regs); 4739 role.base.cr0_wp = ____is_cr0_wp(regs); 4740 } 4741 role.base.smm = is_smm(vcpu); 4742 role.base.guest_mode = is_guest_mode(vcpu); 4743 4744 if (base_only) 4745 return role; 4746 4747 role.ext = kvm_calc_mmu_role_ext(vcpu, regs); 4748 4749 return role; 4750 } 4751 4752 static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu) 4753 { 4754 /* tdp_root_level is architecture forced level, use it if nonzero */ 4755 if (tdp_root_level) 4756 return tdp_root_level; 4757 4758 /* Use 5-level TDP if and only if it's useful/necessary. 
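 *
 * Example (illustrative): with max_tdp_level == 5, a vCPU whose
 * MAXPHYADDR is 46 still gets 4-level TDP because every guest physical
 * address it can generate fits in 48 bits; only a vCPU with MAXPHYADDR
 * above 48 actually needs the fifth level.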
*/ 4759 if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48) 4760 return 4; 4761 4762 return max_tdp_level; 4763 } 4764 4765 static union kvm_mmu_role 4766 kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, 4767 struct kvm_mmu_role_regs *regs, bool base_only) 4768 { 4769 union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, regs, base_only); 4770 4771 role.base.ad_disabled = (shadow_accessed_mask == 0); 4772 role.base.level = kvm_mmu_get_tdp_level(vcpu); 4773 role.base.direct = true; 4774 role.base.has_4_byte_gpte = false; 4775 4776 return role; 4777 } 4778 4779 static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) 4780 { 4781 struct kvm_mmu *context = &vcpu->arch.root_mmu; 4782 struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu); 4783 union kvm_mmu_role new_role = 4784 kvm_calc_tdp_mmu_root_page_role(vcpu, ®s, false); 4785 4786 if (new_role.as_u64 == context->mmu_role.as_u64) 4787 return; 4788 4789 context->mmu_role.as_u64 = new_role.as_u64; 4790 context->page_fault = kvm_tdp_page_fault; 4791 context->sync_page = nonpaging_sync_page; 4792 context->invlpg = NULL; 4793 context->shadow_root_level = kvm_mmu_get_tdp_level(vcpu); 4794 context->direct_map = true; 4795 context->get_guest_pgd = get_cr3; 4796 context->get_pdptr = kvm_pdptr_read; 4797 context->inject_page_fault = kvm_inject_page_fault; 4798 context->root_level = role_regs_to_root_level(®s); 4799 4800 if (!is_cr0_pg(context)) 4801 context->gva_to_gpa = nonpaging_gva_to_gpa; 4802 else if (is_cr4_pae(context)) 4803 context->gva_to_gpa = paging64_gva_to_gpa; 4804 else 4805 context->gva_to_gpa = paging32_gva_to_gpa; 4806 4807 reset_guest_paging_metadata(vcpu, context); 4808 reset_tdp_shadow_zero_bits_mask(vcpu, context); 4809 } 4810 4811 static union kvm_mmu_role 4812 kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu, 4813 struct kvm_mmu_role_regs *regs, bool base_only) 4814 { 4815 union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, regs, base_only); 4816 4817 role.base.smep_andnot_wp = role.ext.cr4_smep && !____is_cr0_wp(regs); 4818 role.base.smap_andnot_wp = role.ext.cr4_smap && !____is_cr0_wp(regs); 4819 role.base.has_4_byte_gpte = ____is_cr0_pg(regs) && !____is_cr4_pae(regs); 4820 4821 return role; 4822 } 4823 4824 static union kvm_mmu_role 4825 kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, 4826 struct kvm_mmu_role_regs *regs, bool base_only) 4827 { 4828 union kvm_mmu_role role = 4829 kvm_calc_shadow_root_page_role_common(vcpu, regs, base_only); 4830 4831 role.base.direct = !____is_cr0_pg(regs); 4832 4833 if (!____is_efer_lma(regs)) 4834 role.base.level = PT32E_ROOT_LEVEL; 4835 else if (____is_cr4_la57(regs)) 4836 role.base.level = PT64_ROOT_5LEVEL; 4837 else 4838 role.base.level = PT64_ROOT_4LEVEL; 4839 4840 return role; 4841 } 4842 4843 static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context, 4844 struct kvm_mmu_role_regs *regs, 4845 union kvm_mmu_role new_role) 4846 { 4847 if (new_role.as_u64 == context->mmu_role.as_u64) 4848 return; 4849 4850 context->mmu_role.as_u64 = new_role.as_u64; 4851 4852 if (!is_cr0_pg(context)) 4853 nonpaging_init_context(context); 4854 else if (is_cr4_pae(context)) 4855 paging64_init_context(context); 4856 else 4857 paging32_init_context(context); 4858 context->root_level = role_regs_to_root_level(regs); 4859 4860 reset_guest_paging_metadata(vcpu, context); 4861 context->shadow_root_level = new_role.base.level; 4862 4863 reset_shadow_zero_bits_mask(vcpu, context); 4864 } 4865 4866 static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, 
4867 struct kvm_mmu_role_regs *regs) 4868 { 4869 struct kvm_mmu *context = &vcpu->arch.root_mmu; 4870 union kvm_mmu_role new_role = 4871 kvm_calc_shadow_mmu_root_page_role(vcpu, regs, false); 4872 4873 shadow_mmu_init_context(vcpu, context, regs, new_role); 4874 } 4875 4876 static union kvm_mmu_role 4877 kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu, 4878 struct kvm_mmu_role_regs *regs) 4879 { 4880 union kvm_mmu_role role = 4881 kvm_calc_shadow_root_page_role_common(vcpu, regs, false); 4882 4883 role.base.direct = false; 4884 role.base.level = kvm_mmu_get_tdp_level(vcpu); 4885 4886 return role; 4887 } 4888 4889 void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0, 4890 unsigned long cr4, u64 efer, gpa_t nested_cr3) 4891 { 4892 struct kvm_mmu *context = &vcpu->arch.guest_mmu; 4893 struct kvm_mmu_role_regs regs = { 4894 .cr0 = cr0, 4895 .cr4 = cr4 & ~X86_CR4_PKE, 4896 .efer = efer, 4897 }; 4898 union kvm_mmu_role new_role; 4899 4900 new_role = kvm_calc_shadow_npt_root_page_role(vcpu, ®s); 4901 4902 __kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base); 4903 4904 shadow_mmu_init_context(vcpu, context, ®s, new_role); 4905 } 4906 EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu); 4907 4908 static union kvm_mmu_role 4909 kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty, 4910 bool execonly, u8 level) 4911 { 4912 union kvm_mmu_role role = {0}; 4913 4914 /* SMM flag is inherited from root_mmu */ 4915 role.base.smm = vcpu->arch.root_mmu.mmu_role.base.smm; 4916 4917 role.base.level = level; 4918 role.base.has_4_byte_gpte = false; 4919 role.base.direct = false; 4920 role.base.ad_disabled = !accessed_dirty; 4921 role.base.guest_mode = true; 4922 role.base.access = ACC_ALL; 4923 4924 /* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. 
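 *
 * E.g. (illustrative) L1 toggling CR4.SMEP or EFER.NX does not force a
 * new nested EPT root: the role below is derived only from the EPT level,
 * the A/D-bit and execute-only settings, and the inherited SMM flag.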
*/ 4925 role.ext.word = 0; 4926 role.ext.execonly = execonly; 4927 role.ext.valid = 1; 4928 4929 return role; 4930 } 4931 4932 void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, 4933 int huge_page_level, bool accessed_dirty, 4934 gpa_t new_eptp) 4935 { 4936 struct kvm_mmu *context = &vcpu->arch.guest_mmu; 4937 u8 level = vmx_eptp_page_walk_level(new_eptp); 4938 union kvm_mmu_role new_role = 4939 kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty, 4940 execonly, level); 4941 4942 __kvm_mmu_new_pgd(vcpu, new_eptp, new_role.base); 4943 4944 if (new_role.as_u64 == context->mmu_role.as_u64) 4945 return; 4946 4947 context->mmu_role.as_u64 = new_role.as_u64; 4948 4949 context->shadow_root_level = level; 4950 4951 context->ept_ad = accessed_dirty; 4952 context->page_fault = ept_page_fault; 4953 context->gva_to_gpa = ept_gva_to_gpa; 4954 context->sync_page = ept_sync_page; 4955 context->invlpg = ept_invlpg; 4956 context->root_level = level; 4957 context->direct_map = false; 4958 4959 update_permission_bitmask(context, true); 4960 context->pkru_mask = 0; 4961 reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level); 4962 reset_ept_shadow_zero_bits_mask(vcpu, context, execonly); 4963 } 4964 EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); 4965 4966 static void init_kvm_softmmu(struct kvm_vcpu *vcpu) 4967 { 4968 struct kvm_mmu *context = &vcpu->arch.root_mmu; 4969 struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu); 4970 4971 kvm_init_shadow_mmu(vcpu, ®s); 4972 4973 context->get_guest_pgd = get_cr3; 4974 context->get_pdptr = kvm_pdptr_read; 4975 context->inject_page_fault = kvm_inject_page_fault; 4976 } 4977 4978 static union kvm_mmu_role 4979 kvm_calc_nested_mmu_role(struct kvm_vcpu *vcpu, struct kvm_mmu_role_regs *regs) 4980 { 4981 union kvm_mmu_role role; 4982 4983 role = kvm_calc_shadow_root_page_role_common(vcpu, regs, false); 4984 4985 /* 4986 * Nested MMUs are used only for walking L2's gva->gpa, they never have 4987 * shadow pages of their own and so "direct" has no meaning. Set it 4988 * to "true" to try to detect bogus usage of the nested MMU. 4989 */ 4990 role.base.direct = true; 4991 role.base.level = role_regs_to_root_level(regs); 4992 return role; 4993 } 4994 4995 static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) 4996 { 4997 struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu); 4998 union kvm_mmu_role new_role = kvm_calc_nested_mmu_role(vcpu, ®s); 4999 struct kvm_mmu *g_context = &vcpu->arch.nested_mmu; 5000 5001 if (new_role.as_u64 == g_context->mmu_role.as_u64) 5002 return; 5003 5004 g_context->mmu_role.as_u64 = new_role.as_u64; 5005 g_context->get_guest_pgd = get_cr3; 5006 g_context->get_pdptr = kvm_pdptr_read; 5007 g_context->inject_page_fault = kvm_inject_page_fault; 5008 g_context->root_level = new_role.base.level; 5009 5010 /* 5011 * L2 page tables are never shadowed, so there is no need to sync 5012 * SPTEs. 5013 */ 5014 g_context->invlpg = NULL; 5015 5016 /* 5017 * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using 5018 * L1's nested page tables (e.g. EPT12). The nested translation 5019 * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using 5020 * L2's page tables as the first level of translation and L1's 5021 * nested page tables as the second level of translation. Basically 5022 * the gva_to_gpa functions between mmu and nested_mmu are swapped. 
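 *
 * Concretely (illustrative walk): to translate an L2 virtual address,
 * nested_mmu.gva_to_gpa walks L2's own page tables, and every table entry
 * it fetches lives at an nGPA that is itself translated to an L1 GPA via
 * arch.mmu->gva_to_gpa, i.e. via L1's EPT12/NPT12 tables.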
5023 */ 5024 if (!is_paging(vcpu)) 5025 g_context->gva_to_gpa = nonpaging_gva_to_gpa; 5026 else if (is_long_mode(vcpu)) 5027 g_context->gva_to_gpa = paging64_gva_to_gpa; 5028 else if (is_pae(vcpu)) 5029 g_context->gva_to_gpa = paging64_gva_to_gpa; 5030 else 5031 g_context->gva_to_gpa = paging32_gva_to_gpa; 5032 5033 reset_guest_paging_metadata(vcpu, g_context); 5034 } 5035 5036 void kvm_init_mmu(struct kvm_vcpu *vcpu) 5037 { 5038 if (mmu_is_nested(vcpu)) 5039 init_kvm_nested_mmu(vcpu); 5040 else if (tdp_enabled) 5041 init_kvm_tdp_mmu(vcpu); 5042 else 5043 init_kvm_softmmu(vcpu); 5044 } 5045 EXPORT_SYMBOL_GPL(kvm_init_mmu); 5046 5047 static union kvm_mmu_page_role 5048 kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu) 5049 { 5050 struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu); 5051 union kvm_mmu_role role; 5052 5053 if (tdp_enabled) 5054 role = kvm_calc_tdp_mmu_root_page_role(vcpu, ®s, true); 5055 else 5056 role = kvm_calc_shadow_mmu_root_page_role(vcpu, ®s, true); 5057 5058 return role.base; 5059 } 5060 5061 void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu) 5062 { 5063 /* 5064 * Invalidate all MMU roles to force them to reinitialize as CPUID 5065 * information is factored into reserved bit calculations. 5066 * 5067 * Correctly handling multiple vCPU models with respect to paging and 5068 * physical address properties) in a single VM would require tracking 5069 * all relevant CPUID information in kvm_mmu_page_role. That is very 5070 * undesirable as it would increase the memory requirements for 5071 * gfn_track (see struct kvm_mmu_page_role comments). For now that 5072 * problem is swept under the rug; KVM's CPUID API is horrific and 5073 * it's all but impossible to solve it without introducing a new API. 5074 */ 5075 vcpu->arch.root_mmu.mmu_role.ext.valid = 0; 5076 vcpu->arch.guest_mmu.mmu_role.ext.valid = 0; 5077 vcpu->arch.nested_mmu.mmu_role.ext.valid = 0; 5078 kvm_mmu_reset_context(vcpu); 5079 5080 /* 5081 * Changing guest CPUID after KVM_RUN is forbidden, see the comment in 5082 * kvm_arch_vcpu_ioctl(). 
5083 */ 5084 KVM_BUG_ON(vcpu->arch.last_vmentry_cpu != -1, vcpu->kvm); 5085 } 5086 5087 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) 5088 { 5089 kvm_mmu_unload(vcpu); 5090 kvm_init_mmu(vcpu); 5091 } 5092 EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); 5093 5094 int kvm_mmu_load(struct kvm_vcpu *vcpu) 5095 { 5096 int r; 5097 5098 r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->direct_map); 5099 if (r) 5100 goto out; 5101 r = mmu_alloc_special_roots(vcpu); 5102 if (r) 5103 goto out; 5104 if (vcpu->arch.mmu->direct_map) 5105 r = mmu_alloc_direct_roots(vcpu); 5106 else 5107 r = mmu_alloc_shadow_roots(vcpu); 5108 if (r) 5109 goto out; 5110 5111 kvm_mmu_sync_roots(vcpu); 5112 5113 kvm_mmu_load_pgd(vcpu); 5114 static_call(kvm_x86_tlb_flush_current)(vcpu); 5115 out: 5116 return r; 5117 } 5118 5119 void kvm_mmu_unload(struct kvm_vcpu *vcpu) 5120 { 5121 kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL); 5122 WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root_hpa)); 5123 kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 5124 WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root_hpa)); 5125 } 5126 5127 static bool need_remote_flush(u64 old, u64 new) 5128 { 5129 if (!is_shadow_present_pte(old)) 5130 return false; 5131 if (!is_shadow_present_pte(new)) 5132 return true; 5133 if ((old ^ new) & PT64_BASE_ADDR_MASK) 5134 return true; 5135 old ^= shadow_nx_mask; 5136 new ^= shadow_nx_mask; 5137 return (old & ~new & PT64_PERM_MASK) != 0; 5138 } 5139 5140 static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, 5141 int *bytes) 5142 { 5143 u64 gentry = 0; 5144 int r; 5145 5146 /* 5147 * Assume that the pte write on a page table of the same type 5148 * as the current vcpu paging mode since we update the sptes only 5149 * when they have the same mode. 5150 */ 5151 if (is_pae(vcpu) && *bytes == 4) { 5152 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ 5153 *gpa &= ~(gpa_t)7; 5154 *bytes = 8; 5155 } 5156 5157 if (*bytes == 4 || *bytes == 8) { 5158 r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes); 5159 if (r) 5160 gentry = 0; 5161 } 5162 5163 return gentry; 5164 } 5165 5166 /* 5167 * If we're seeing too many writes to a page, it may no longer be a page table, 5168 * or we may be forking, in which case it is better to unmap the page. 5169 */ 5170 static bool detect_write_flooding(struct kvm_mmu_page *sp) 5171 { 5172 /* 5173 * Skip write-flooding detected for the sp whose level is 1, because 5174 * it can become unsync, then the guest page is not write-protected. 5175 */ 5176 if (sp->role.level == PG_LEVEL_4K) 5177 return false; 5178 5179 atomic_inc(&sp->write_flooding_count); 5180 return atomic_read(&sp->write_flooding_count) >= 3; 5181 } 5182 5183 /* 5184 * Misaligned accesses are too much trouble to fix up; also, they usually 5185 * indicate a page is not used as a page table. 5186 */ 5187 static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa, 5188 int bytes) 5189 { 5190 unsigned offset, pte_size, misaligned; 5191 5192 pgprintk("misaligned: gpa %llx bytes %d role %x\n", 5193 gpa, bytes, sp->role.word); 5194 5195 offset = offset_in_page(gpa); 5196 pte_size = sp->role.has_4_byte_gpte ? 4 : 8; 5197 5198 /* 5199 * Sometimes, the OS only writes the last one bytes to update status 5200 * bits, for example, in linux, andb instruction is used in clear_bit(). 
5201 */ 5202 if (!(offset & (pte_size - 1)) && bytes == 1) 5203 return false; 5204 5205 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); 5206 misaligned |= bytes < 4; 5207 5208 return misaligned; 5209 } 5210 5211 static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte) 5212 { 5213 unsigned page_offset, quadrant; 5214 u64 *spte; 5215 int level; 5216 5217 page_offset = offset_in_page(gpa); 5218 level = sp->role.level; 5219 *nspte = 1; 5220 if (sp->role.has_4_byte_gpte) { 5221 page_offset <<= 1; /* 32->64 */ 5222 /* 5223 * A 32-bit pde maps 4MB while the shadow pdes map 5224 * only 2MB. So we need to double the offset again 5225 * and zap two pdes instead of one. 5226 */ 5227 if (level == PT32_ROOT_LEVEL) { 5228 page_offset &= ~7; /* kill rounding error */ 5229 page_offset <<= 1; 5230 *nspte = 2; 5231 } 5232 quadrant = page_offset >> PAGE_SHIFT; 5233 page_offset &= ~PAGE_MASK; 5234 if (quadrant != sp->role.quadrant) 5235 return NULL; 5236 } 5237 5238 spte = &sp->spt[page_offset / sizeof(*spte)]; 5239 return spte; 5240 } 5241 5242 static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 5243 const u8 *new, int bytes, 5244 struct kvm_page_track_notifier_node *node) 5245 { 5246 gfn_t gfn = gpa >> PAGE_SHIFT; 5247 struct kvm_mmu_page *sp; 5248 LIST_HEAD(invalid_list); 5249 u64 entry, gentry, *spte; 5250 int npte; 5251 bool flush = false; 5252 5253 /* 5254 * If we don't have indirect shadow pages, it means no page is 5255 * write-protected, so we can exit simply. 5256 */ 5257 if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) 5258 return; 5259 5260 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); 5261 5262 /* 5263 * No need to care whether allocation memory is successful 5264 * or not since pte prefetch is skipped if it does not have 5265 * enough objects in the cache. 
5266 */ 5267 mmu_topup_memory_caches(vcpu, true); 5268 5269 write_lock(&vcpu->kvm->mmu_lock); 5270 5271 gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes); 5272 5273 ++vcpu->kvm->stat.mmu_pte_write; 5274 kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); 5275 5276 for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) { 5277 if (detect_write_misaligned(sp, gpa, bytes) || 5278 detect_write_flooding(sp)) { 5279 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); 5280 ++vcpu->kvm->stat.mmu_flooded; 5281 continue; 5282 } 5283 5284 spte = get_written_sptes(sp, gpa, &npte); 5285 if (!spte) 5286 continue; 5287 5288 while (npte--) { 5289 entry = *spte; 5290 mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL); 5291 if (gentry && sp->role.level != PG_LEVEL_4K) 5292 ++vcpu->kvm->stat.mmu_pde_zapped; 5293 if (need_remote_flush(entry, *spte)) 5294 flush = true; 5295 ++spte; 5296 } 5297 } 5298 kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush); 5299 kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); 5300 write_unlock(&vcpu->kvm->mmu_lock); 5301 } 5302 5303 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, 5304 void *insn, int insn_len) 5305 { 5306 int r, emulation_type = EMULTYPE_PF; 5307 bool direct = vcpu->arch.mmu->direct_map; 5308 5309 if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) 5310 return RET_PF_RETRY; 5311 5312 r = RET_PF_INVALID; 5313 if (unlikely(error_code & PFERR_RSVD_MASK)) { 5314 r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct); 5315 if (r == RET_PF_EMULATE) 5316 goto emulate; 5317 } 5318 5319 if (r == RET_PF_INVALID) { 5320 r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, 5321 lower_32_bits(error_code), false); 5322 if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm)) 5323 return -EIO; 5324 } 5325 5326 if (r < 0) 5327 return r; 5328 if (r != RET_PF_EMULATE) 5329 return 1; 5330 5331 /* 5332 * Before emulating the instruction, check if the error code 5333 * was due to a RO violation while translating the guest page. 5334 * This can occur when using nested virtualization with nested 5335 * paging in both guests. If true, we simply unprotect the page 5336 * and resume the guest. 5337 */ 5338 if (vcpu->arch.mmu->direct_map && 5339 (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) { 5340 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa)); 5341 return 1; 5342 } 5343 5344 /* 5345 * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still 5346 * optimistically try to just unprotect the page and let the processor 5347 * re-execute the instruction that caused the page fault. Do not allow 5348 * retrying MMIO emulation, as it's not only pointless but could also 5349 * cause us to enter an infinite loop because the processor will keep 5350 * faulting on the non-existent MMIO address. Retrying an instruction 5351 * from a nested guest is also pointless and dangerous as we are only 5352 * explicitly shadowing L1's page tables, i.e. unprotecting something 5353 * for L1 isn't going to magically fix whatever issue cause L2 to fail. 5354 */ 5355 if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu)) 5356 emulation_type |= EMULTYPE_ALLOW_RETRY_PF; 5357 emulate: 5358 return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn, 5359 insn_len); 5360 } 5361 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); 5362 5363 void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, 5364 gva_t gva, hpa_t root_hpa) 5365 { 5366 int i; 5367 5368 /* It's actually a GPA for vcpu->arch.guest_mmu. 
*/ 5369 if (mmu != &vcpu->arch.guest_mmu) { 5370 /* INVLPG on a non-canonical address is a NOP according to the SDM. */ 5371 if (is_noncanonical_address(gva, vcpu)) 5372 return; 5373 5374 static_call(kvm_x86_tlb_flush_gva)(vcpu, gva); 5375 } 5376 5377 if (!mmu->invlpg) 5378 return; 5379 5380 if (root_hpa == INVALID_PAGE) { 5381 mmu->invlpg(vcpu, gva, mmu->root_hpa); 5382 5383 /* 5384 * INVLPG is required to invalidate any global mappings for the VA, 5385 * irrespective of PCID. Since it would take us roughly similar amount 5386 * of work to determine whether any of the prev_root mappings of the VA 5387 * is marked global, or to just sync it blindly, so we might as well 5388 * just always sync it. 5389 * 5390 * Mappings not reachable via the current cr3 or the prev_roots will be 5391 * synced when switching to that cr3, so nothing needs to be done here 5392 * for them. 5393 */ 5394 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) 5395 if (VALID_PAGE(mmu->prev_roots[i].hpa)) 5396 mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); 5397 } else { 5398 mmu->invlpg(vcpu, gva, root_hpa); 5399 } 5400 } 5401 5402 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) 5403 { 5404 kvm_mmu_invalidate_gva(vcpu, vcpu->arch.walk_mmu, gva, INVALID_PAGE); 5405 ++vcpu->stat.invlpg; 5406 } 5407 EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); 5408 5409 5410 void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) 5411 { 5412 struct kvm_mmu *mmu = vcpu->arch.mmu; 5413 bool tlb_flush = false; 5414 uint i; 5415 5416 if (pcid == kvm_get_active_pcid(vcpu)) { 5417 mmu->invlpg(vcpu, gva, mmu->root_hpa); 5418 tlb_flush = true; 5419 } 5420 5421 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 5422 if (VALID_PAGE(mmu->prev_roots[i].hpa) && 5423 pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd)) { 5424 mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); 5425 tlb_flush = true; 5426 } 5427 } 5428 5429 if (tlb_flush) 5430 static_call(kvm_x86_tlb_flush_gva)(vcpu, gva); 5431 5432 ++vcpu->stat.invlpg; 5433 5434 /* 5435 * Mappings not reachable via the current cr3 or the prev_roots will be 5436 * synced when switching to that cr3, so nothing needs to be done here 5437 * for them. 5438 */ 5439 } 5440 5441 void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, 5442 int tdp_max_root_level, int tdp_huge_page_level) 5443 { 5444 tdp_enabled = enable_tdp; 5445 tdp_root_level = tdp_forced_root_level; 5446 max_tdp_level = tdp_max_root_level; 5447 5448 /* 5449 * max_huge_page_level reflects KVM's MMU capabilities irrespective 5450 * of kernel support, e.g. KVM may be capable of using 1GB pages when 5451 * the kernel is not. But, KVM never creates a page size greater than 5452 * what is used by the kernel for any given HVA, i.e. the kernel's 5453 * capabilities are ultimately consulted by kvm_mmu_hugepage_adjust(). 5454 */ 5455 if (tdp_enabled) 5456 max_huge_page_level = tdp_huge_page_level; 5457 else if (boot_cpu_has(X86_FEATURE_GBPAGES)) 5458 max_huge_page_level = PG_LEVEL_1G; 5459 else 5460 max_huge_page_level = PG_LEVEL_2M; 5461 } 5462 EXPORT_SYMBOL_GPL(kvm_configure_mmu); 5463 5464 /* The return value indicates if tlb flush on all vcpus is needed. */ 5465 typedef bool (*slot_level_handler) (struct kvm *kvm, 5466 struct kvm_rmap_head *rmap_head, 5467 const struct kvm_memory_slot *slot); 5468 5469 /* The caller should hold mmu-lock before calling this function. 
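 *
 * For illustration only -- this is a sketch, not KVM code, and the
 * example_* helpers below are hypothetical stand-ins -- the walkers that
 * follow all share one shape: visit each piece of the range, accumulate
 * whether a TLB flush is owed, and flush the portion processed so far
 * before yielding the lock so that no stale translation outlives a
 * reschedule point.
 */
static inline bool example_should_yield(void) { return false; }
static inline void example_flush_gfns(gfn_t start_gfn, u64 pages) { }
static inline void example_cond_resched(void) { }

static inline bool example_walk_gfn_range(gfn_t start_gfn, gfn_t end_gfn,
					  bool (*visit)(gfn_t gfn),
					  bool flush_on_yield)
{
	bool flush = false;
	gfn_t gfn;

	for (gfn = start_gfn; gfn <= end_gfn; gfn++) {
		/* Remember whether any visited entry still needs a TLB flush. */
		flush |= visit(gfn);

		if (example_should_yield()) {
			/* Flush [start_gfn, gfn] before dropping the lock. */
			if (flush && flush_on_yield) {
				example_flush_gfns(start_gfn, gfn - start_gfn + 1);
				flush = false;
			}
			example_cond_resched();
		}
	}

	/* Any flush still owed at this point is left to the caller. */
	return flush;
}

/*
 * The real range walker; as noted above, the caller must hold mmu_lock.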
*/ 5470 static __always_inline bool 5471 slot_handle_level_range(struct kvm *kvm, const struct kvm_memory_slot *memslot, 5472 slot_level_handler fn, int start_level, int end_level, 5473 gfn_t start_gfn, gfn_t end_gfn, bool flush_on_yield, 5474 bool flush) 5475 { 5476 struct slot_rmap_walk_iterator iterator; 5477 5478 for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn, 5479 end_gfn, &iterator) { 5480 if (iterator.rmap) 5481 flush |= fn(kvm, iterator.rmap, memslot); 5482 5483 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { 5484 if (flush && flush_on_yield) { 5485 kvm_flush_remote_tlbs_with_address(kvm, 5486 start_gfn, 5487 iterator.gfn - start_gfn + 1); 5488 flush = false; 5489 } 5490 cond_resched_rwlock_write(&kvm->mmu_lock); 5491 } 5492 } 5493 5494 return flush; 5495 } 5496 5497 static __always_inline bool 5498 slot_handle_level(struct kvm *kvm, const struct kvm_memory_slot *memslot, 5499 slot_level_handler fn, int start_level, int end_level, 5500 bool flush_on_yield) 5501 { 5502 return slot_handle_level_range(kvm, memslot, fn, start_level, 5503 end_level, memslot->base_gfn, 5504 memslot->base_gfn + memslot->npages - 1, 5505 flush_on_yield, false); 5506 } 5507 5508 static __always_inline bool 5509 slot_handle_level_4k(struct kvm *kvm, const struct kvm_memory_slot *memslot, 5510 slot_level_handler fn, bool flush_on_yield) 5511 { 5512 return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K, 5513 PG_LEVEL_4K, flush_on_yield); 5514 } 5515 5516 static void free_mmu_pages(struct kvm_mmu *mmu) 5517 { 5518 if (!tdp_enabled && mmu->pae_root) 5519 set_memory_encrypted((unsigned long)mmu->pae_root, 1); 5520 free_page((unsigned long)mmu->pae_root); 5521 free_page((unsigned long)mmu->pml4_root); 5522 free_page((unsigned long)mmu->pml5_root); 5523 } 5524 5525 static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) 5526 { 5527 struct page *page; 5528 int i; 5529 5530 mmu->root_hpa = INVALID_PAGE; 5531 mmu->root_pgd = 0; 5532 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) 5533 mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; 5534 5535 /* vcpu->arch.guest_mmu isn't used when !tdp_enabled. */ 5536 if (!tdp_enabled && mmu == &vcpu->arch.guest_mmu) 5537 return 0; 5538 5539 /* 5540 * When using PAE paging, the four PDPTEs are treated as 'root' pages, 5541 * while the PDP table is a per-vCPU construct that's allocated at MMU 5542 * creation. When emulating 32-bit mode, cr3 is only 32 bits even on 5543 * x86_64. Therefore we need to allocate the PDP table in the first 5544 * 4GB of memory, which happens to fit the DMA32 zone. TDP paging 5545 * generally doesn't use PAE paging and can skip allocating the PDP 5546 * table. The main exception, handled here, is SVM's 32-bit NPT. The 5547 * other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit 5548 * KVM; that horror is handled on-demand by mmu_alloc_special_roots(). 5549 */ 5550 if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL) 5551 return 0; 5552 5553 page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32); 5554 if (!page) 5555 return -ENOMEM; 5556 5557 mmu->pae_root = page_address(page); 5558 5559 /* 5560 * CR3 is only 32 bits when PAE paging is used, thus it's impossible to 5561 * get the CPU to treat the PDPTEs as encrypted. Decrypt the page so 5562 * that KVM's writes and the CPU's reads get along. 
Note, this is 5563 * only necessary when using shadow paging, as 64-bit NPT can get at 5564 * the C-bit even when shadowing 32-bit NPT, and SME isn't supported 5565 * by 32-bit kernels (when KVM itself uses 32-bit NPT). 5566 */ 5567 if (!tdp_enabled) 5568 set_memory_decrypted((unsigned long)mmu->pae_root, 1); 5569 else 5570 WARN_ON_ONCE(shadow_me_mask); 5571 5572 for (i = 0; i < 4; ++i) 5573 mmu->pae_root[i] = INVALID_PAE_ROOT; 5574 5575 return 0; 5576 } 5577 5578 int kvm_mmu_create(struct kvm_vcpu *vcpu) 5579 { 5580 int ret; 5581 5582 vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache; 5583 vcpu->arch.mmu_pte_list_desc_cache.gfp_zero = __GFP_ZERO; 5584 5585 vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache; 5586 vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO; 5587 5588 vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO; 5589 5590 vcpu->arch.mmu = &vcpu->arch.root_mmu; 5591 vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; 5592 5593 ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu); 5594 if (ret) 5595 return ret; 5596 5597 ret = __kvm_mmu_create(vcpu, &vcpu->arch.root_mmu); 5598 if (ret) 5599 goto fail_allocate_root; 5600 5601 return ret; 5602 fail_allocate_root: 5603 free_mmu_pages(&vcpu->arch.guest_mmu); 5604 return ret; 5605 } 5606 5607 #define BATCH_ZAP_PAGES 10 5608 static void kvm_zap_obsolete_pages(struct kvm *kvm) 5609 { 5610 struct kvm_mmu_page *sp, *node; 5611 int nr_zapped, batch = 0; 5612 5613 restart: 5614 list_for_each_entry_safe_reverse(sp, node, 5615 &kvm->arch.active_mmu_pages, link) { 5616 /* 5617 * No obsolete valid page exists before a newly created page 5618 * since active_mmu_pages is a FIFO list. 5619 */ 5620 if (!is_obsolete_sp(kvm, sp)) 5621 break; 5622 5623 /* 5624 * Invalid pages should never land back on the list of active 5625 * pages. Skip the bogus page, otherwise we'll get stuck in an 5626 * infinite loop if the page gets put back on the list (again). 5627 */ 5628 if (WARN_ON(sp->role.invalid)) 5629 continue; 5630 5631 /* 5632 * No need to flush the TLB since we're only zapping shadow 5633 * pages with an obsolete generation number and all vCPUS have 5634 * loaded a new root, i.e. the shadow pages being zapped cannot 5635 * be in active use by the guest. 5636 */ 5637 if (batch >= BATCH_ZAP_PAGES && 5638 cond_resched_rwlock_write(&kvm->mmu_lock)) { 5639 batch = 0; 5640 goto restart; 5641 } 5642 5643 if (__kvm_mmu_prepare_zap_page(kvm, sp, 5644 &kvm->arch.zapped_obsolete_pages, &nr_zapped)) { 5645 batch += nr_zapped; 5646 goto restart; 5647 } 5648 } 5649 5650 /* 5651 * Trigger a remote TLB flush before freeing the page tables to ensure 5652 * KVM is not in the middle of a lockless shadow page table walk, which 5653 * may reference the pages. 5654 */ 5655 kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages); 5656 } 5657 5658 /* 5659 * Fast invalidate all shadow pages and use lock-break technique 5660 * to zap obsolete pages. 5661 * 5662 * It's required when memslot is being deleted or VM is being 5663 * destroyed, in these cases, we should ensure that KVM MMU does 5664 * not use any resource of the being-deleted slot or all slots 5665 * after calling the function. 5666 */ 5667 static void kvm_mmu_zap_all_fast(struct kvm *kvm) 5668 { 5669 lockdep_assert_held(&kvm->slots_lock); 5670 5671 write_lock(&kvm->mmu_lock); 5672 trace_kvm_mmu_zap_all_fast(kvm); 5673 5674 /* 5675 * Toggle mmu_valid_gen between '0' and '1'. 
Because slots_lock is 5676 * held for the entire duration of zapping obsolete pages, it's 5677 * impossible for there to be multiple invalid generations associated 5678 * with *valid* shadow pages at any given time, i.e. there is exactly 5679 * one valid generation and (at most) one invalid generation. 5680 */ 5681 kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1; 5682 5683 /* In order to ensure all threads see this change when 5684 * handling the MMU reload signal, this must happen in the 5685 * same critical section as kvm_reload_remote_mmus, and 5686 * before kvm_zap_obsolete_pages as kvm_zap_obsolete_pages 5687 * could drop the MMU lock and yield. 5688 */ 5689 if (is_tdp_mmu_enabled(kvm)) 5690 kvm_tdp_mmu_invalidate_all_roots(kvm); 5691 5692 /* 5693 * Notify all vcpus to reload its shadow page table and flush TLB. 5694 * Then all vcpus will switch to new shadow page table with the new 5695 * mmu_valid_gen. 5696 * 5697 * Note: we need to do this under the protection of mmu_lock, 5698 * otherwise, vcpu would purge shadow page but miss tlb flush. 5699 */ 5700 kvm_reload_remote_mmus(kvm); 5701 5702 kvm_zap_obsolete_pages(kvm); 5703 5704 write_unlock(&kvm->mmu_lock); 5705 5706 if (is_tdp_mmu_enabled(kvm)) { 5707 read_lock(&kvm->mmu_lock); 5708 kvm_tdp_mmu_zap_invalidated_roots(kvm); 5709 read_unlock(&kvm->mmu_lock); 5710 } 5711 } 5712 5713 static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) 5714 { 5715 return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); 5716 } 5717 5718 static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm, 5719 struct kvm_memory_slot *slot, 5720 struct kvm_page_track_notifier_node *node) 5721 { 5722 kvm_mmu_zap_all_fast(kvm); 5723 } 5724 5725 void kvm_mmu_init_vm(struct kvm *kvm) 5726 { 5727 struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; 5728 5729 spin_lock_init(&kvm->arch.mmu_unsync_pages_lock); 5730 5731 kvm_mmu_init_tdp_mmu(kvm); 5732 5733 node->track_write = kvm_mmu_pte_write; 5734 node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; 5735 kvm_page_track_register_notifier(kvm, node); 5736 } 5737 5738 void kvm_mmu_uninit_vm(struct kvm *kvm) 5739 { 5740 struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; 5741 5742 kvm_page_track_unregister_notifier(kvm, node); 5743 5744 kvm_mmu_uninit_tdp_mmu(kvm); 5745 } 5746 5747 static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) 5748 { 5749 const struct kvm_memory_slot *memslot; 5750 struct kvm_memslots *slots; 5751 struct kvm_memslot_iter iter; 5752 bool flush = false; 5753 gfn_t start, end; 5754 int i; 5755 5756 if (!kvm_memslots_have_rmaps(kvm)) 5757 return flush; 5758 5759 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { 5760 slots = __kvm_memslots(kvm, i); 5761 5762 kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) { 5763 memslot = iter.slot; 5764 start = max(gfn_start, memslot->base_gfn); 5765 end = min(gfn_end, memslot->base_gfn + memslot->npages); 5766 if (WARN_ON_ONCE(start >= end)) 5767 continue; 5768 5769 flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, 5770 5771 PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, 5772 start, end - 1, true, flush); 5773 } 5774 } 5775 5776 return flush; 5777 } 5778 5779 /* 5780 * Invalidate (zap) SPTEs that cover GFNs from gfn_start and up to gfn_end 5781 * (not including it) 5782 */ 5783 void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) 5784 { 5785 bool flush; 5786 int i; 5787 5788 if (WARN_ON_ONCE(gfn_end <= gfn_start)) 5789 
		return;

	write_lock(&kvm->mmu_lock);

	kvm_inc_notifier_count(kvm, gfn_start, gfn_end);

	flush = __kvm_zap_rmaps(kvm, gfn_start, gfn_end);

	if (is_tdp_mmu_enabled(kvm)) {
		for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
			flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start,
							  gfn_end, flush);
	}

	if (flush)
		kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
						   gfn_end - gfn_start);

	kvm_dec_notifier_count(kvm, gfn_start, gfn_end);

	write_unlock(&kvm->mmu_lock);
}

static bool slot_rmap_write_protect(struct kvm *kvm,
				    struct kvm_rmap_head *rmap_head,
				    const struct kvm_memory_slot *slot)
{
	return __rmap_write_protect(kvm, rmap_head, false);
}

void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
				      const struct kvm_memory_slot *memslot,
				      int start_level)
{
	bool flush = false;

	if (kvm_memslots_have_rmaps(kvm)) {
		write_lock(&kvm->mmu_lock);
		flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect,
					  start_level, KVM_MAX_HUGEPAGE_LEVEL,
					  false);
		write_unlock(&kvm->mmu_lock);
	}

	if (is_tdp_mmu_enabled(kvm)) {
		read_lock(&kvm->mmu_lock);
		flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level);
		read_unlock(&kvm->mmu_lock);
	}

	/*
	 * Flush TLBs if any SPTEs had to be write-protected to ensure that
	 * guest writes are reflected in the dirty bitmap before the memslot
	 * update completes, i.e. before enabling dirty logging is visible to
	 * userspace.
	 *
	 * Perform the TLB flush outside the mmu_lock to reduce the amount of
	 * time the lock is held.  However, this does mean that another CPU can
	 * now grab mmu_lock and encounter a write-protected SPTE while CPUs
	 * still have a writable mapping for the associated GFN in their TLB.
	 *
	 * This is safe but requires KVM to be careful when making decisions
	 * based on the write-protection status of an SPTE.  Specifically, KVM
	 * also write-protects SPTEs to monitor changes to guest page tables
	 * during shadow paging, and must guarantee no CPUs can write to those
	 * pages before the lock is dropped.  As mentioned in the previous
	 * paragraph, a write-protected SPTE is no guarantee that a CPU cannot
	 * perform writes.  So to determine if a TLB flush is truly required,
	 * KVM will clear a separate software-only bit (MMU-writable) and skip
	 * the flush if-and-only-if this bit was already clear.
	 *
	 * See DEFAULT_SPTE_MMU_WRITEABLE for more details.
	 */
	if (flush)
		kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
}

static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
					 struct kvm_rmap_head *rmap_head,
					 const struct kvm_memory_slot *slot)
{
	u64 *sptep;
	struct rmap_iterator iter;
	int need_tlb_flush = 0;
	kvm_pfn_t pfn;
	struct kvm_mmu_page *sp;

restart:
	for_each_rmap_spte(rmap_head, &iter, sptep) {
		sp = sptep_to_sp(sptep);
		pfn = spte_to_pfn(*sptep);

		/*
		 * We cannot do huge page mapping for indirect shadow pages,
		 * which are found on the last rmap (level = 1) when not using
		 * TDP; such shadow pages are synced with the page table in
		 * the guest, and the guest page table uses 4K pages if the
		 * indirect sp has level = 1.
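		 *
		 * For example (illustrative, not an exhaustive list of cases):
		 * a direct sp at PG_LEVEL_4K whose backing pfn is part of a
		 * larger host mapping satisfies the "role.level <
		 * kvm_mmu_max_mapping_level()" check below and has its SPTE
		 * zapped so that the next fault can install a huge mapping,
		 * whereas an indirect sp in the same situation is left alone
		 * because it must keep mirroring the guest's own 4K page
		 * table.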
		 */
		if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
		    sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn,
							       pfn, PG_LEVEL_NUM)) {
			pte_list_remove(kvm, rmap_head, sptep);

			if (kvm_available_flush_tlb_with_range())
				kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
					KVM_PAGES_PER_HPAGE(sp->role.level));
			else
				need_tlb_flush = 1;

			goto restart;
		}
	}

	return need_tlb_flush;
}

void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
				   const struct kvm_memory_slot *slot)
{
	if (kvm_memslots_have_rmaps(kvm)) {
		write_lock(&kvm->mmu_lock);
		/*
		 * Zap only 4k SPTEs since the legacy MMU only supports dirty
		 * logging at a 4k granularity and never creates collapsible
		 * 2m SPTEs during dirty logging.
		 */
		if (slot_handle_level_4k(kvm, slot, kvm_mmu_zap_collapsible_spte, true))
			kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
		write_unlock(&kvm->mmu_lock);
	}

	if (is_tdp_mmu_enabled(kvm)) {
		read_lock(&kvm->mmu_lock);
		kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
		read_unlock(&kvm->mmu_lock);
	}
}

void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
					const struct kvm_memory_slot *memslot)
{
	/*
	 * All current use cases for flushing the TLBs for a specific memslot
	 * are related to dirty logging, and many do the TLB flush out of
	 * mmu_lock.  The interaction between the various operations on a
	 * memslot must be serialized by slots_lock to ensure the TLB flush
	 * from one operation is observed by any other operation on the same
	 * memslot.
	 */
	lockdep_assert_held(&kvm->slots_lock);
	kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
					   memslot->npages);
}

void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
				   const struct kvm_memory_slot *memslot)
{
	bool flush = false;

	if (kvm_memslots_have_rmaps(kvm)) {
		write_lock(&kvm->mmu_lock);
		/*
		 * Clear dirty bits only on 4k SPTEs since the legacy MMU only
		 * supports dirty logging at a 4k granularity.
		 */
		flush = slot_handle_level_4k(kvm, memslot, __rmap_clear_dirty, false);
		write_unlock(&kvm->mmu_lock);
	}

	if (is_tdp_mmu_enabled(kvm)) {
		read_lock(&kvm->mmu_lock);
		flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
		read_unlock(&kvm->mmu_lock);
	}

	/*
	 * It's also safe to flush TLBs out of mmu_lock here as currently this
	 * function is only used for dirty logging, in which case flushing the
	 * TLB out of mmu_lock also guarantees that no dirty pages will be
	 * lost in the dirty_bitmap.
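	 *
	 * See the comment in kvm_mmu_slot_remove_write_access() above for a
	 * longer discussion of why deferring the flush until after mmu_lock
	 * is dropped is safe when the flush is only needed for dirty logging.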
	 */
	if (flush)
		kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
}

void kvm_mmu_zap_all(struct kvm *kvm)
{
	struct kvm_mmu_page *sp, *node;
	LIST_HEAD(invalid_list);
	int ign;

	write_lock(&kvm->mmu_lock);
restart:
	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
		if (WARN_ON(sp->role.invalid))
			continue;
		if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
			goto restart;
		if (cond_resched_rwlock_write(&kvm->mmu_lock))
			goto restart;
	}

	kvm_mmu_commit_zap_page(kvm, &invalid_list);

	if (is_tdp_mmu_enabled(kvm))
		kvm_tdp_mmu_zap_all(kvm);

	write_unlock(&kvm->mmu_lock);
}

void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
{
	WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);

	gen &= MMIO_SPTE_GEN_MASK;

	/*
	 * Generation numbers are incremented in multiples of the number of
	 * address spaces in order to provide unique generations across all
	 * address spaces.  Strip what is effectively the address space
	 * modifier prior to checking for a wrap of the MMIO generation so
	 * that a wrap in any address space is detected.
	 */
	gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1);

	/*
	 * The very rare case: if the MMIO generation number has wrapped,
	 * zap all shadow pages.
	 */
	if (unlikely(gen == 0)) {
		kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
		kvm_mmu_zap_all_fast(kvm);
	}
}

static unsigned long
mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
	struct kvm *kvm;
	int nr_to_scan = sc->nr_to_scan;
	unsigned long freed = 0;

	mutex_lock(&kvm_lock);

	list_for_each_entry(kvm, &vm_list, vm_list) {
		int idx;
		LIST_HEAD(invalid_list);

		/*
		 * Never scan more than sc->nr_to_scan VM instances.
		 * This condition is practically never hit, since we do not
		 * try to shrink more than one VM and it is very unlikely to
		 * see !n_used_mmu_pages so many times.
		 */
		if (!nr_to_scan--)
			break;
		/*
		 * n_used_mmu_pages is accessed without holding kvm->mmu_lock
		 * here.  We may skip a VM instance erroneously, but we do not
		 * want to shrink a VM that has only started to populate its
		 * MMU anyway.
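		 *
		 * Losing that race is benign: the shrinker simply moves on,
		 * possibly freeing nothing on this invocation, and the VM
		 * remains on vm_list to be considered the next time the
		 * shrinker runs.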
		 */
		if (!kvm->arch.n_used_mmu_pages &&
		    !kvm_has_zapped_obsolete_pages(kvm))
			continue;

		idx = srcu_read_lock(&kvm->srcu);
		write_lock(&kvm->mmu_lock);

		if (kvm_has_zapped_obsolete_pages(kvm)) {
			kvm_mmu_commit_zap_page(kvm,
			      &kvm->arch.zapped_obsolete_pages);
			goto unlock;
		}

		freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);

unlock:
		write_unlock(&kvm->mmu_lock);
		srcu_read_unlock(&kvm->srcu, idx);

		/*
		 * unfair on small ones
		 * per-vm shrinkers cry out
		 * sadness comes quickly
		 */
		list_move_tail(&kvm->vm_list, &vm_list);
		break;
	}

	mutex_unlock(&kvm_lock);
	return freed;
}

static unsigned long
mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
	return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
}

static struct shrinker mmu_shrinker = {
	.count_objects = mmu_shrink_count,
	.scan_objects = mmu_shrink_scan,
	.seeks = DEFAULT_SEEKS * 10,
};

static void mmu_destroy_caches(void)
{
	kmem_cache_destroy(pte_list_desc_cache);
	kmem_cache_destroy(mmu_page_header_cache);
}

static bool get_nx_auto_mode(void)
{
	/* Return true when the CPU has the bug and mitigations are on. */
	return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off();
}

static void __set_nx_huge_pages(bool val)
{
	nx_huge_pages = itlb_multihit_kvm_mitigation = val;
}

static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
{
	bool old_val = nx_huge_pages;
	bool new_val;

	/* In "auto" mode, deploy the workaround only if the CPU has the bug. */
	if (sysfs_streq(val, "off"))
		new_val = 0;
	else if (sysfs_streq(val, "force"))
		new_val = 1;
	else if (sysfs_streq(val, "auto"))
		new_val = get_nx_auto_mode();
	else if (strtobool(val, &new_val) < 0)
		return -EINVAL;

	__set_nx_huge_pages(new_val);

	if (new_val != old_val) {
		struct kvm *kvm;

		mutex_lock(&kvm_lock);

		list_for_each_entry(kvm, &vm_list, vm_list) {
			mutex_lock(&kvm->slots_lock);
			kvm_mmu_zap_all_fast(kvm);
			mutex_unlock(&kvm->slots_lock);

			wake_up_process(kvm->arch.nx_lpage_recovery_thread);
		}
		mutex_unlock(&kvm_lock);
	}

	return 0;
}

int kvm_mmu_module_init(void)
{
	int ret = -ENOMEM;

	if (nx_huge_pages == -1)
		__set_nx_huge_pages(get_nx_auto_mode());

	/*
	 * MMU roles use union aliasing which is, generally speaking, undefined
	 * behavior.  However, we supposedly know how compilers behave and the
	 * current status quo is unlikely to change.  Guardians below are
	 * supposed to let us know if the assumption becomes false.
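	 *
	 * For example, kvm_mmu_page_role is filled in via its bit-field
	 * members but compared as the raw 32-bit 'word' member, which is only
	 * sound if the union stays exactly 32 bits; the BUILD_BUG_ON()s below
	 * are meant to catch any change in those size assumptions.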
	 */
	BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32));
	BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32));
	BUILD_BUG_ON(sizeof(union kvm_mmu_role) != sizeof(u64));

	kvm_mmu_reset_all_pte_masks();

	pte_list_desc_cache = kmem_cache_create("pte_list_desc",
						sizeof(struct pte_list_desc),
						0, SLAB_ACCOUNT, NULL);
	if (!pte_list_desc_cache)
		goto out;

	mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
						  sizeof(struct kvm_mmu_page),
						  0, SLAB_ACCOUNT, NULL);
	if (!mmu_page_header_cache)
		goto out;

	if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
		goto out;

	ret = register_shrinker(&mmu_shrinker);
	if (ret)
		goto out;

	return 0;

out:
	mmu_destroy_caches();
	return ret;
}

void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
	kvm_mmu_unload(vcpu);
	free_mmu_pages(&vcpu->arch.root_mmu);
	free_mmu_pages(&vcpu->arch.guest_mmu);
	mmu_free_memory_caches(vcpu);
}

void kvm_mmu_module_exit(void)
{
	mmu_destroy_caches();
	percpu_counter_destroy(&kvm_total_used_mmu_pages);
	unregister_shrinker(&mmu_shrinker);
	mmu_audit_disable();
}

/*
 * Calculate the effective recovery period, accounting for '0' meaning "let KVM
 * select a halving time of 1 hour".  Returns true if recovery is enabled.
 */
static bool calc_nx_huge_pages_recovery_period(uint *period)
{
	/*
	 * Use READ_ONCE to get the params; this may be called outside of the
	 * param setters, e.g. by the kthread to compute its next timeout.
	 */
	bool enabled = READ_ONCE(nx_huge_pages);
	uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio);

	if (!enabled || !ratio)
		return false;

	*period = READ_ONCE(nx_huge_pages_recovery_period_ms);
	if (!*period) {
		/* Make sure the period is not less than one second. */
		ratio = min(ratio, 3600u);
		*period = 60 * 60 * 1000 / ratio;
	}
	return true;
}

static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp)
{
	bool was_recovery_enabled, is_recovery_enabled;
	uint old_period, new_period;
	int err;

	was_recovery_enabled = calc_nx_huge_pages_recovery_period(&old_period);

	err = param_set_uint(val, kp);
	if (err)
		return err;

	is_recovery_enabled = calc_nx_huge_pages_recovery_period(&new_period);

	if (is_recovery_enabled &&
	    (!was_recovery_enabled || old_period > new_period)) {
		struct kvm *kvm;

		mutex_lock(&kvm_lock);

		list_for_each_entry(kvm, &vm_list, vm_list)
			wake_up_process(kvm->arch.nx_lpage_recovery_thread);

		mutex_unlock(&kvm_lock);
	}

	return err;
}

static void kvm_recover_nx_lpages(struct kvm *kvm)
{
	unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits;
	int rcu_idx;
	struct kvm_mmu_page *sp;
	unsigned int ratio;
	LIST_HEAD(invalid_list);
	bool flush = false;
	ulong to_zap;

	rcu_idx = srcu_read_lock(&kvm->srcu);
	write_lock(&kvm->mmu_lock);

	ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
	to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0;
	for ( ; to_zap; --to_zap) {
		if (list_empty(&kvm->arch.lpage_disallowed_mmu_pages))
			break;

		/*
		 * We use a separate list instead of just using active_mmu_pages
		 * because the number of lpage_disallowed pages is expected to
		 * be relatively small compared to the total.
		 */
		sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages,
				      struct kvm_mmu_page,
				      lpage_disallowed_link);
		WARN_ON_ONCE(!sp->lpage_disallowed);
		if (is_tdp_mmu_page(sp)) {
			flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
		} else {
			kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
			WARN_ON_ONCE(sp->lpage_disallowed);
		}

		if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
			kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
			cond_resched_rwlock_write(&kvm->mmu_lock);
			flush = false;
		}
	}
	kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);

	write_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, rcu_idx);
}

static long get_nx_lpage_recovery_timeout(u64 start_time)
{
	bool enabled;
	uint period;

	enabled = calc_nx_huge_pages_recovery_period(&period);

	return enabled ? start_time + msecs_to_jiffies(period) - get_jiffies_64()
		       : MAX_SCHEDULE_TIMEOUT;
}

static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
{
	u64 start_time;
	long remaining_time;

	while (true) {
		start_time = get_jiffies_64();
		remaining_time = get_nx_lpage_recovery_timeout(start_time);

		set_current_state(TASK_INTERRUPTIBLE);
		while (!kthread_should_stop() && remaining_time > 0) {
			schedule_timeout(remaining_time);
			remaining_time = get_nx_lpage_recovery_timeout(start_time);
			set_current_state(TASK_INTERRUPTIBLE);
		}

		set_current_state(TASK_RUNNING);

		if (kthread_should_stop())
			return 0;

		kvm_recover_nx_lpages(kvm);
	}
}

int kvm_mmu_post_init_vm(struct kvm *kvm)
{
	int err;

	err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0,
					  "kvm-nx-lpage-recovery",
					  &kvm->arch.nx_lpage_recovery_thread);
	if (!err)
		kthread_unpark(kvm->arch.nx_lpage_recovery_thread);

	return err;
}

void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
{
	if (kvm->arch.nx_lpage_recovery_thread)
		kthread_stop(kvm->arch.nx_lpage_recovery_thread);
}
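
/*
 * Illustrative note on the recovery cadence implemented above, assuming the
 * default module parameters (nx_huge_pages_recovery_ratio = 60,
 * nx_huge_pages_recovery_period_ms = 0): calc_nx_huge_pages_recovery_period()
 * computes a period of 60 * 60 * 1000 / 60 = 60000 ms, so the
 * kvm-nx-lpage-recovery worker wakes roughly once per minute, and each wakeup
 * zaps up to DIV_ROUND_UP(nx_lpage_splits, 60) shadow pages, i.e. about 1/60th
 * of the split pages, restoring NX huge pages over the course of roughly an
 * hour (modulo new splits created in the meantime).
 */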