/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * MMU support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Yaniv Kamay <yaniv@qumranet.com>
 *   Avi Kivity <avi@qumranet.com>
 */

/*
 * We need the mmu code to access both 32-bit and 64-bit guest ptes,
 * so the code in this file is compiled multiple times, once per guest
 * page table format (64-bit, 32-bit and EPT).
 */

#if PTTYPE == 64
	#define pt_element_t u64
	#define guest_walker guest_walker64
	#define FNAME(name) paging##64_##name
	#define PT_BASE_ADDR_MASK GUEST_PT64_BASE_ADDR_MASK
	#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
	#define PT_LEVEL_BITS PT64_LEVEL_BITS
	#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
	#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
	#define PT_HAVE_ACCESSED_DIRTY(mmu) true
	#ifdef CONFIG_X86_64
	#define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL
	#define CMPXCHG "cmpxchgq"
	#else
	#define PT_MAX_FULL_LEVELS 2
	#endif
#elif PTTYPE == 32
	#define pt_element_t u32
	#define guest_walker guest_walker32
	#define FNAME(name) paging##32_##name
	#define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
	#define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl)
	#define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl)
	#define PT_INDEX(addr, level) PT32_INDEX(addr, level)
	#define PT_LEVEL_BITS PT32_LEVEL_BITS
	#define PT_MAX_FULL_LEVELS 2
	#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
	#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
	#define PT_HAVE_ACCESSED_DIRTY(mmu) true
	#define CMPXCHG "cmpxchgl"
#elif PTTYPE == PTTYPE_EPT
	#define pt_element_t u64
	#define guest_walker guest_walkerEPT
	#define FNAME(name) ept_##name
	#define PT_BASE_ADDR_MASK GUEST_PT64_BASE_ADDR_MASK
	#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
	#define PT_LEVEL_BITS PT64_LEVEL_BITS
	#define PT_GUEST_DIRTY_SHIFT 9
	#define PT_GUEST_ACCESSED_SHIFT 8
	#define PT_HAVE_ACCESSED_DIRTY(mmu) (!(mmu)->cpu_role.base.ad_disabled)
	#ifdef CONFIG_X86_64
	#define CMPXCHG "cmpxchgq"
	#endif
	#define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL
#else
	#error Invalid PTTYPE value
#endif

#define PT_GUEST_DIRTY_MASK    (1 << PT_GUEST_DIRTY_SHIFT)
#define PT_GUEST_ACCESSED_MASK (1 << PT_GUEST_ACCESSED_SHIFT)

#define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl)
#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PG_LEVEL_4K)

/*
 * The guest_walker structure emulates the behavior of the hardware page
 * table walker.
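 *
 * The per-level arrays below are indexed by (level - 1) and are filled in
 * by FNAME(walk_addr_generic) as it walks from max_level down to level.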
 */
struct guest_walker {
	int level;
	unsigned max_level;
	gfn_t table_gfn[PT_MAX_FULL_LEVELS];
	pt_element_t ptes[PT_MAX_FULL_LEVELS];
	pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
	gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
	pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS];
	bool pte_writable[PT_MAX_FULL_LEVELS];
	unsigned int pt_access[PT_MAX_FULL_LEVELS];
	unsigned int pte_access;
	gfn_t gfn;
	struct x86_exception fault;
};

static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
{
	return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
}

static inline void FNAME(protect_clean_gpte)(struct kvm_mmu *mmu, unsigned *access,
					     unsigned gpte)
{
	unsigned mask;

	/* dirty bit is not supported, so no need to track it */
	if (!PT_HAVE_ACCESSED_DIRTY(mmu))
		return;

	BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);

	mask = (unsigned)~ACC_WRITE_MASK;
	/* Allow write access to dirty gptes */
	mask |= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) &
		PT_WRITABLE_MASK;
	*access &= mask;
}

static inline int FNAME(is_present_gpte)(unsigned long pte)
{
#if PTTYPE != PTTYPE_EPT
	return pte & PT_PRESENT_MASK;
#else
	return pte & 7;
#endif
}

static bool FNAME(is_bad_mt_xwr)(struct rsvd_bits_validate *rsvd_check, u64 gpte)
{
#if PTTYPE != PTTYPE_EPT
	return false;
#else
	return __is_bad_mt_xwr(rsvd_check, gpte);
#endif
}

static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
{
	return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level) ||
	       FNAME(is_bad_mt_xwr)(&mmu->guest_rsvd_check, gpte);
}

static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
					 struct kvm_mmu_page *sp, u64 *spte,
					 u64 gpte)
{
	if (!FNAME(is_present_gpte)(gpte))
		goto no_present;

	/* Prefetch only accessed entries (unless A/D bits are disabled). */
	if (PT_HAVE_ACCESSED_DIRTY(vcpu->arch.mmu) &&
	    !(gpte & PT_GUEST_ACCESSED_MASK))
		goto no_present;

	if (FNAME(is_rsvd_bits_set)(vcpu->arch.mmu, gpte, PG_LEVEL_4K))
		goto no_present;

	return false;

no_present:
	drop_spte(vcpu->kvm, spte);
	return true;
}

/*
 * For PTTYPE_EPT, a page table can be executable but not readable
 * on supported processors. Therefore, set_spte does not automatically
 * set bit 0 if execute only is supported. Here, we repurpose ACC_USER_MASK
 * to signify readability since it isn't used in the EPT case.
 */
static inline unsigned FNAME(gpte_access)(u64 gpte)
{
	unsigned access;
#if PTTYPE == PTTYPE_EPT
	access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) |
		 ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
		 ((gpte & VMX_EPT_READABLE_MASK) ? ACC_USER_MASK : 0);
#else
	BUILD_BUG_ON(ACC_EXEC_MASK != PT_PRESENT_MASK);
	BUILD_BUG_ON(ACC_EXEC_MASK != 1);
	access = gpte & (PT_WRITABLE_MASK | PT_USER_MASK | PT_PRESENT_MASK);
	/*
	 * Combine NX with P (which is set here) to get ACC_EXEC_MASK.
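	 * (gpte >> PT64_NX_SHIFT) leaves only the NX bit in bit 0, so XORing
	 * it into the already-set P bit yields ACC_EXEC_MASK exactly when NX
	 * is clear.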
	 */
	access ^= (gpte >> PT64_NX_SHIFT);
#endif

	return access;
}

static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
					     struct kvm_mmu *mmu,
					     struct guest_walker *walker,
					     gpa_t addr, int write_fault)
{
	unsigned level, index;
	pt_element_t pte, orig_pte;
	pt_element_t __user *ptep_user;
	gfn_t table_gfn;
	int ret;

	/* dirty/accessed bits are not supported, so no need to update them */
	if (!PT_HAVE_ACCESSED_DIRTY(mmu))
		return 0;

	for (level = walker->max_level; level >= walker->level; --level) {
		pte = orig_pte = walker->ptes[level - 1];
		table_gfn = walker->table_gfn[level - 1];
		ptep_user = walker->ptep_user[level - 1];
		index = offset_in_page(ptep_user) / sizeof(pt_element_t);
		if (!(pte & PT_GUEST_ACCESSED_MASK)) {
			trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte));
			pte |= PT_GUEST_ACCESSED_MASK;
		}
		if (level == walker->level && write_fault &&
		    !(pte & PT_GUEST_DIRTY_MASK)) {
			trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
#if PTTYPE == PTTYPE_EPT
			if (kvm_x86_ops.nested_ops->write_log_dirty(vcpu, addr))
				return -EINVAL;
#endif
			pte |= PT_GUEST_DIRTY_MASK;
		}
		if (pte == orig_pte)
			continue;

		/*
		 * If the slot is read-only, simply do not process the accessed
		 * and dirty bits.  This is the correct thing to do if the slot
		 * is ROM, and page tables in read-as-ROM/write-as-MMIO slots
		 * are only supported if the accessed and dirty bits are already
		 * set in the ROM (so that MMIO writes are never needed).
		 *
		 * Note that NPT does not allow this at all and faults, since
		 * it always wants nested page table entries for the guest
		 * page tables to be writable.  And EPT works but will simply
		 * overwrite the read-only memory to set the accessed and dirty
		 * bits.
		 */
		if (unlikely(!walker->pte_writable[level - 1]))
			continue;

		ret = __try_cmpxchg_user(ptep_user, &orig_pte, pte, fault);
		if (ret)
			return ret;

		kvm_vcpu_mark_page_dirty(vcpu, table_gfn);
		walker->ptes[level - 1] = pte;
	}
	return 0;
}

static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte)
{
	unsigned pkeys = 0;
#if PTTYPE == 64
	pte_t pte = {.pte = gpte};

	pkeys = pte_flags_pkey(pte_flags(pte));
#endif
	return pkeys;
}

static inline bool FNAME(is_last_gpte)(struct kvm_mmu *mmu,
				       unsigned int level, unsigned int gpte)
{
	/*
	 * For EPT and PAE paging (both variants), bit 7 is either reserved at
	 * all levels or indicates a huge page (ignoring CR3/EPTP).  In either
	 * case, bit 7 being set terminates the walk.
	 */
#if PTTYPE == 32
	/*
	 * 32-bit paging requires special handling because bit 7 is ignored if
	 * CR4.PSE=0, not reserved.  Clear bit 7 in the gpte if the level is
	 * greater than the last level for which bit 7 is the PAGE_SIZE bit.
	 *
	 * The RHS has bit 7 set iff level < (2 + PSE).  If it is clear, bit 7
	 * is not reserved and does not indicate a large page at this level,
	 * so clear PT_PAGE_SIZE_MASK in gpte if that is the case.
	 */
	gpte &= level - (PT32_ROOT_LEVEL + mmu->cpu_role.ext.cr4_pse);
#endif
	/*
	 * PG_LEVEL_4K always terminates.  The RHS has bit 7 set
	 * iff level <= PG_LEVEL_4K, which for our purpose means
	 * level == PG_LEVEL_4K; set PT_PAGE_SIZE_MASK in gpte then.
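	 * E.g. level == PG_LEVEL_4K gives level - PG_LEVEL_4K - 1 == -1 (all
	 * ones), forcing bit 7 on; any higher level gives a small non-negative
	 * value that leaves bit 7 untouched.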
	 */
	gpte |= level - PG_LEVEL_4K - 1;

	return gpte & PT_PAGE_SIZE_MASK;
}
/*
 * Fetch a guest pte for a guest virtual address, or for an L2's GPA.
 */
static int FNAME(walk_addr_generic)(struct guest_walker *walker,
				    struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
				    gpa_t addr, u64 access)
{
	int ret;
	pt_element_t pte;
	pt_element_t __user *ptep_user;
	gfn_t table_gfn;
	u64 pt_access, pte_access;
	unsigned index, accessed_dirty, pte_pkey;
	u64 nested_access;
	gpa_t pte_gpa;
	bool have_ad;
	int offset;
	u64 walk_nx_mask = 0;
	const int write_fault = access & PFERR_WRITE_MASK;
	const int user_fault  = access & PFERR_USER_MASK;
	const int fetch_fault = access & PFERR_FETCH_MASK;
	u16 errcode = 0;
	gpa_t real_gpa;
	gfn_t gfn;

	trace_kvm_mmu_pagetable_walk(addr, access);
retry_walk:
	walker->level = mmu->cpu_role.base.level;
	pte           = mmu->get_guest_pgd(vcpu);
	have_ad       = PT_HAVE_ACCESSED_DIRTY(mmu);

#if PTTYPE == 64
	walk_nx_mask = 1ULL << PT64_NX_SHIFT;
	if (walker->level == PT32E_ROOT_LEVEL) {
		pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
		trace_kvm_mmu_paging_element(pte, walker->level);
		if (!FNAME(is_present_gpte)(pte))
			goto error;
		--walker->level;
	}
#endif
	walker->max_level = walker->level;
	ASSERT(!(is_long_mode(vcpu) && !is_pae(vcpu)));

	/*
	 * FIXME: on Intel processors, loads of the PDPTE registers for PAE paging
	 * by the MOV to CR instruction are treated as reads and do not cause the
	 * processor to set the dirty flag in any EPT paging-structure entry.
	 */
	nested_access = (have_ad ? PFERR_WRITE_MASK : 0) | PFERR_USER_MASK;

	pte_access = ~0;
	++walker->level;

	do {
		unsigned long host_addr;

		pt_access = pte_access;
		--walker->level;

		index = PT_INDEX(addr, walker->level);
		table_gfn = gpte_to_gfn(pte);
		offset = index * sizeof(pt_element_t);
		pte_gpa = gfn_to_gpa(table_gfn) + offset;

		BUG_ON(walker->level < 1);
		walker->table_gfn[walker->level - 1] = table_gfn;
		walker->pte_gpa[walker->level - 1] = pte_gpa;

		real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(table_gfn),
					     nested_access, &walker->fault);

		/*
		 * FIXME: This can happen if emulation (e.g. of an INS/OUTS
		 * instruction) triggers a nested page fault.  The exit
		 * qualification / exit info field will incorrectly have
		 * "guest page access" as the nested page fault's cause,
		 * instead of "guest page structure access".  To fix this,
		 * the x86_exception struct should be augmented with enough
		 * information to fix the exit_qualification or exit_info_1
		 * fields.
		 */
		if (unlikely(real_gpa == UNMAPPED_GVA))
			return 0;

		host_addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gpa_to_gfn(real_gpa),
						     &walker->pte_writable[walker->level - 1]);
		if (unlikely(kvm_is_error_hva(host_addr)))
			goto error;

		ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
		if (unlikely(__get_user(pte, ptep_user)))
			goto error;
		walker->ptep_user[walker->level - 1] = ptep_user;

		trace_kvm_mmu_paging_element(pte, walker->level);

		/*
		 * Inverting the NX bit lets us AND it like the other
		 * permission bits.
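		 * With NX folded in this way, the walk simply ANDs the gptes
		 * of every level together, so a permission is granted only if
		 * every level of the walk grants it.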
		 */
		pte_access = pt_access & (pte ^ walk_nx_mask);

		if (unlikely(!FNAME(is_present_gpte)(pte)))
			goto error;

		if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte, walker->level))) {
			errcode = PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
			goto error;
		}

		walker->ptes[walker->level - 1] = pte;

		/* Convert to ACC_*_MASK flags for struct guest_walker.  */
		walker->pt_access[walker->level - 1] = FNAME(gpte_access)(pt_access ^ walk_nx_mask);
	} while (!FNAME(is_last_gpte)(mmu, walker->level, pte));

	pte_pkey = FNAME(gpte_pkeys)(vcpu, pte);
	accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0;

	/* Convert to ACC_*_MASK flags for struct guest_walker.  */
	walker->pte_access = FNAME(gpte_access)(pte_access ^ walk_nx_mask);
	errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access);
	if (unlikely(errcode))
		goto error;

	gfn = gpte_to_gfn_lvl(pte, walker->level);
	gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT;

	if (PTTYPE == 32 && walker->level > PG_LEVEL_4K && is_cpuid_PSE36())
		gfn += pse36_gfn_delta(pte);

	real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(gfn), access, &walker->fault);
	if (real_gpa == UNMAPPED_GVA)
		return 0;

	walker->gfn = real_gpa >> PAGE_SHIFT;

	if (!write_fault)
		FNAME(protect_clean_gpte)(mmu, &walker->pte_access, pte);
	else
		/*
		 * On a write fault, fold the dirty bit into accessed_dirty.
		 * For modes without A/D bits support accessed_dirty will be
		 * always clear.
		 */
		accessed_dirty &= pte >>
			(PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT);

	if (unlikely(!accessed_dirty)) {
		ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker,
							addr, write_fault);
		if (unlikely(ret < 0))
			goto error;
		else if (ret)
			goto retry_walk;
	}

	pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
		 __func__, (u64)pte, walker->pte_access,
		 walker->pt_access[walker->level - 1]);
	return 1;

error:
	errcode |= write_fault | user_fault;
	if (fetch_fault && (is_efer_nx(mmu) || is_cr4_smep(mmu)))
		errcode |= PFERR_FETCH_MASK;

	walker->fault.vector = PF_VECTOR;
	walker->fault.error_code_valid = true;
	walker->fault.error_code = errcode;

#if PTTYPE == PTTYPE_EPT
	/*
	 * Use PFERR_RSVD_MASK in error_code to tell whether an EPT
	 * misconfiguration needs to be injected.  The detection is
	 * done by is_rsvd_bits_set() above.
	 *
	 * We set up the value of exit_qualification to inject:
	 * [2:0] - Derive from the access bits.  The exit_qualification might
	 *         be out of date if it is serving an EPT misconfiguration.
	 * [5:3] - Calculated by the page walk of the guest EPT page tables
	 * [8:7] - Derived from [8:7] of the real exit_qualification
	 *
	 * The other bits are set to 0.
	 */
	if (!(errcode & PFERR_RSVD_MASK)) {
		vcpu->arch.exit_qualification &= (EPT_VIOLATION_GVA_IS_VALID |
						  EPT_VIOLATION_GVA_TRANSLATED);
		if (write_fault)
			vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_WRITE;
		if (user_fault)
			vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_READ;
		if (fetch_fault)
			vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_INSTR;

		/*
		 * Note, pte_access holds the raw RWX bits from the EPTE, not
		 * ACC_*_MASK flags!
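		 * (For EPT walks walk_nx_mask is 0, so the ANDs in the walk
		 * above accumulated the raw R/W/X bits of each level directly.)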
		 */
		vcpu->arch.exit_qualification |= (pte_access & VMX_EPT_RWX_MASK) <<
						 EPT_VIOLATION_RWX_SHIFT;
	}
#endif
	walker->fault.address = addr;
	walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
	walker->fault.async_page_fault = false;

	trace_kvm_mmu_walker_error(walker->fault.error_code);
	return 0;
}

static int FNAME(walk_addr)(struct guest_walker *walker,
			    struct kvm_vcpu *vcpu, gpa_t addr, u64 access)
{
	return FNAME(walk_addr_generic)(walker, vcpu, vcpu->arch.mmu, addr,
					access);
}

static bool
FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
		     u64 *spte, pt_element_t gpte, bool no_dirty_log)
{
	struct kvm_memory_slot *slot;
	unsigned pte_access;
	gfn_t gfn;
	kvm_pfn_t pfn;

	if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
		return false;

	pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);

	gfn = gpte_to_gfn(gpte);
	pte_access = sp->role.access & FNAME(gpte_access)(gpte);
	FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);

	slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn,
			no_dirty_log && (pte_access & ACC_WRITE_MASK));
	if (!slot)
		return false;

	pfn = gfn_to_pfn_memslot_atomic(slot, gfn);
	if (is_error_pfn(pfn))
		return false;

	mmu_set_spte(vcpu, slot, spte, pte_access, gfn, pfn, NULL);
	kvm_release_pfn_clean(pfn);
	return true;
}

static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
				struct guest_walker *gw, int level)
{
	pt_element_t curr_pte;
	gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1];
	u64 mask;
	int r, index;

	if (level == PG_LEVEL_4K) {
		mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1;
		base_gpa = pte_gpa & ~mask;
		index = (pte_gpa - base_gpa) / sizeof(pt_element_t);

		r = kvm_vcpu_read_guest_atomic(vcpu, base_gpa,
				gw->prefetch_ptes, sizeof(gw->prefetch_ptes));
		curr_pte = gw->prefetch_ptes[index];
	} else
		r = kvm_vcpu_read_guest_atomic(vcpu, pte_gpa,
				&curr_pte, sizeof(curr_pte));

	return r || curr_pte != gw->ptes[level - 1];
}

static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
				u64 *sptep)
{
	struct kvm_mmu_page *sp;
	pt_element_t *gptep = gw->prefetch_ptes;
	u64 *spte;
	int i;

	sp = sptep_to_sp(sptep);

	if (sp->role.level > PG_LEVEL_4K)
		return;

	/*
	 * If addresses are being invalidated, skip prefetching to avoid
	 * accidentally prefetching those addresses.
	 */
	if (unlikely(vcpu->kvm->mmu_notifier_count))
		return;

	if (sp->role.direct)
		return __direct_pte_prefetch(vcpu, sp, sptep);

	i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
	spte = sp->spt + i;

	for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
		if (spte == sptep)
			continue;

		if (is_shadow_present_pte(*spte))
			continue;

		if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i], true))
			break;
	}
}

/*
 * Fetch a shadow pte for a specific level in the paging hierarchy.
 * If the guest tries to write a write-protected page, we need to
 * emulate this operation; return 1 to indicate this case.
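 *
 * The first loop below mirrors the guest walk, creating indirect shadow
 * pages down to gw->level; the second loop installs direct shadow pages
 * below that when KVM maps the fault at a smaller page size than the guest.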
 */
static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
			struct guest_walker *gw)
{
	struct kvm_mmu_page *sp = NULL;
	struct kvm_shadow_walk_iterator it;
	unsigned int direct_access, access;
	int top_level, ret;
	gfn_t base_gfn = fault->gfn;

	WARN_ON_ONCE(gw->gfn != base_gfn);
	direct_access = gw->pte_access;

	top_level = vcpu->arch.mmu->cpu_role.base.level;
	if (top_level == PT32E_ROOT_LEVEL)
		top_level = PT32_ROOT_LEVEL;
	/*
	 * Verify that the top-level gpte is still there.  Since the page
	 * is a root page, it is either write protected (and cannot be
	 * changed from now on) or it is invalid (in which case, we don't
	 * really care if it changes underneath us after this point).
	 */
	if (FNAME(gpte_changed)(vcpu, gw, top_level))
		goto out_gpte_changed;

	if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
		goto out_gpte_changed;

	for (shadow_walk_init(&it, vcpu, fault->addr);
	     shadow_walk_okay(&it) && it.level > gw->level;
	     shadow_walk_next(&it)) {
		gfn_t table_gfn;

		clear_sp_write_flooding_count(it.sptep);
		drop_large_spte(vcpu, it.sptep);

		sp = NULL;
		if (!is_shadow_present_pte(*it.sptep)) {
			table_gfn = gw->table_gfn[it.level - 2];
			access = gw->pt_access[it.level - 2];
			sp = kvm_mmu_get_page(vcpu, table_gfn, fault->addr,
					      it.level-1, false, access);
			/*
			 * We must synchronize the pagetable before linking it
			 * because the guest doesn't need to flush tlb when
			 * the gpte is changed from non-present to present.
			 * Otherwise, the guest may use the wrong mapping.
			 *
			 * For PG_LEVEL_4K, kvm_mmu_get_page() has already
			 * synchronized it transiently via kvm_sync_page().
			 *
			 * For higher-level page tables, we synchronize them
			 * via the slower mmu_sync_children().  If it needs to
			 * break, some progress has been made; return
			 * RET_PF_RETRY and retry on the next #PF.
			 * KVM_REQ_MMU_SYNC is not necessary but it
			 * expedites the process.
			 */
			if (sp->unsync_children &&
			    mmu_sync_children(vcpu, sp, false))
				return RET_PF_RETRY;
		}

		/*
		 * Verify that the gpte in the page we've just write
		 * protected is still there.
		 */
		if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
			goto out_gpte_changed;

		if (sp)
			link_shadow_page(vcpu, it.sptep, sp);
	}

	kvm_mmu_hugepage_adjust(vcpu, fault);

	trace_kvm_mmu_spte_requested(fault);

	for (; shadow_walk_okay(&it); shadow_walk_next(&it)) {
		clear_sp_write_flooding_count(it.sptep);

		/*
		 * We cannot overwrite existing page tables with an NX
		 * large page, as the leaf could be executable.
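		 * In that case disallowed_hugepage_adjust() below drops
		 * fault->goal_level so the fault is mapped at a smaller page
		 * size instead of overwriting the existing page table.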
		 */
		if (fault->nx_huge_page_workaround_enabled)
			disallowed_hugepage_adjust(fault, *it.sptep, it.level);

		base_gfn = fault->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
		if (it.level == fault->goal_level)
			break;

		validate_direct_spte(vcpu, it.sptep, direct_access);

		drop_large_spte(vcpu, it.sptep);

		if (!is_shadow_present_pte(*it.sptep)) {
			sp = kvm_mmu_get_page(vcpu, base_gfn, fault->addr,
					      it.level - 1, true, direct_access);
			link_shadow_page(vcpu, it.sptep, sp);
			if (fault->huge_page_disallowed &&
			    fault->req_level >= it.level)
				account_huge_nx_page(vcpu->kvm, sp);
		}
	}

	if (WARN_ON_ONCE(it.level != fault->goal_level))
		return -EFAULT;

	ret = mmu_set_spte(vcpu, fault->slot, it.sptep, gw->pte_access,
			   base_gfn, fault->pfn, fault);
	if (ret == RET_PF_SPURIOUS)
		return ret;

	FNAME(pte_prefetch)(vcpu, gw, it.sptep);
	return ret;

out_gpte_changed:
	return RET_PF_RETRY;
}

/*
 * Check whether the mapped gfn can write its own page table through the
 * current mapping.
 *
 * This is a helper for FNAME(page_fault).  When the guest uses a large page
 * to map a writable gfn that is itself in use as a page table, KVM must map
 * it with a small page instead: establishing the shadow page table will
 * create a new shadow page that stops KVM from using a large mapping anyway,
 * and doing so early avoids unnecessary #PFs and emulation.
 *
 * @write_fault_to_shadow_pgtable will be set to true if the faulting gfn is
 * currently used as its own page table.
 *
 * Note: the PDPT page table is not checked for PAE-32 bit guests.  That is
 * fine, since the PDPT is always shadowed, which means a large page can
 * never be used to map the gfn that is used as the PDPT.
 */
static bool
FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
			      struct guest_walker *walker, bool user_fault,
			      bool *write_fault_to_shadow_pgtable)
{
	int level;
	gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1);
	bool self_changed = false;

	if (!(walker->pte_access & ACC_WRITE_MASK ||
	      (!is_cr0_wp(vcpu->arch.mmu) && !user_fault)))
		return false;

	for (level = walker->level; level <= walker->max_level; level++) {
		gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1];

		self_changed |= !(gfn & mask);
		*write_fault_to_shadow_pgtable |= !gfn;
	}

	return self_changed;
}

/*
 * Page fault handler.  There are several causes for a page fault:
 *   - there is no shadow pte for the guest pte
 *   - write access through a shadow pte marked read only so that we can set
 *     the dirty bit
 *   - write access to a shadow pte marked read only so we can update the page
 *     dirty bitmap, when userspace requests it
 *   - mmio access; in this case we will never install a present shadow pte
 *   - normal guest page fault due to the guest pte marked not present, not
 *     writable, or not executable
 *
 * Returns: 1 if we need to emulate the instruction, 0 otherwise, or
 *          a negative value on error.
 */
static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
	struct guest_walker walker;
	int r;
	unsigned long mmu_seq;
	bool is_self_change_mapping;

	pgprintk("%s: addr %lx err %x\n", __func__, fault->addr, fault->error_code);
	WARN_ON_ONCE(fault->is_tdp);

	/*
	 * Look up the guest pte for the faulting address.
	 * If PFEC.RSVD is set, this is a shadow page fault.
	 * The bit needs to be cleared before walking guest page tables.
	 */
	r = FNAME(walk_addr)(&walker, vcpu, fault->addr,
			     fault->error_code & ~PFERR_RSVD_MASK);

	/*
	 * The page is not mapped by the guest.  Let the guest handle it.
	 */
	if (!r) {
		pgprintk("%s: guest page fault\n", __func__);
		if (!fault->prefetch)
			kvm_inject_emulated_page_fault(vcpu, &walker.fault);

		return RET_PF_RETRY;
	}

	fault->gfn = walker.gfn;
	fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn);

	if (page_fault_handle_page_track(vcpu, fault)) {
		shadow_page_table_clear_flood(vcpu, fault->addr);
		return RET_PF_EMULATE;
	}

	r = mmu_topup_memory_caches(vcpu, true);
	if (r)
		return r;

	vcpu->arch.write_fault_to_shadow_pgtable = false;

	is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
	      &walker, fault->user, &vcpu->arch.write_fault_to_shadow_pgtable);

	if (is_self_change_mapping)
		fault->max_level = PG_LEVEL_4K;
	else
		fault->max_level = walker.level;

	mmu_seq = vcpu->kvm->mmu_notifier_seq;
	smp_rmb();

	r = kvm_faultin_pfn(vcpu, fault);
	if (r != RET_PF_CONTINUE)
		return r;

	r = handle_abnormal_pfn(vcpu, fault, walker.pte_access);
	if (r != RET_PF_CONTINUE)
		return r;

	/*
	 * Do not change pte_access if the pfn is an MMIO page, otherwise
	 * we will cache the incorrect access into the MMIO spte.
	 */
	if (fault->write && !(walker.pte_access & ACC_WRITE_MASK) &&
	    !is_cr0_wp(vcpu->arch.mmu) && !fault->user && fault->slot) {
		walker.pte_access |= ACC_WRITE_MASK;
		walker.pte_access &= ~ACC_USER_MASK;

		/*
		 * If we converted a user page to a kernel page,
		 * so that the kernel can write to it when cr0.wp=0,
		 * then we should prevent the kernel from executing it
		 * if SMEP is enabled.
		 */
		if (is_cr4_smep(vcpu->arch.mmu))
			walker.pte_access &= ~ACC_EXEC_MASK;
	}

	r = RET_PF_RETRY;
	write_lock(&vcpu->kvm->mmu_lock);

	if (is_page_fault_stale(vcpu, fault, mmu_seq))
		goto out_unlock;

	r = make_mmu_pages_available(vcpu);
	if (r)
		goto out_unlock;
	r = FNAME(fetch)(vcpu, fault, &walker);

out_unlock:
	write_unlock(&vcpu->kvm->mmu_lock);
	kvm_release_pfn_clean(fault->pfn);
	return r;
}

static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
{
	int offset = 0;

	WARN_ON(sp->role.level != PG_LEVEL_4K);

	if (PTTYPE == 32)
		offset = sp->role.quadrant << PT64_LEVEL_BITS;

	return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
}

static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
{
	struct kvm_shadow_walk_iterator iterator;
	struct kvm_mmu_page *sp;
	u64 old_spte;
	int level;
	u64 *sptep;

	vcpu_clear_mmio_info(vcpu, gva);

	/*
	 * No need to check return value here, rmap_can_add() can
	 * help us to skip pte prefetch later.
	 */
	mmu_topup_memory_caches(vcpu, true);

	if (!VALID_PAGE(root_hpa)) {
		WARN_ON(1);
		return;
	}

	write_lock(&vcpu->kvm->mmu_lock);
	for_each_shadow_entry_using_root(vcpu, root_hpa, gva, iterator) {
		level = iterator.level;
		sptep = iterator.sptep;

		sp = sptep_to_sp(sptep);
		old_spte = *sptep;
		if (is_last_spte(old_spte, level)) {
			pt_element_t gpte;
			gpa_t pte_gpa;

			if (!sp->unsync)
				break;

			pte_gpa = FNAME(get_level1_sp_gpa)(sp);
			pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);

			mmu_page_zap_pte(vcpu->kvm, sp, sptep, NULL);
			if (is_shadow_present_pte(old_spte))
				kvm_flush_remote_tlbs_with_address(vcpu->kvm,
					sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level));

			if (!rmap_can_add(vcpu))
				break;

			if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
						       sizeof(pt_element_t)))
				break;

			FNAME(prefetch_gpte)(vcpu, sp, sptep, gpte, false);
		}

		if (!sp->unsync_children)
			break;
	}
	write_unlock(&vcpu->kvm->mmu_lock);
}

/* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */
static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
			       gpa_t addr, u64 access,
			       struct x86_exception *exception)
{
	struct guest_walker walker;
	gpa_t gpa = UNMAPPED_GVA;
	int r;

#ifndef CONFIG_X86_64
	/* A 64-bit GVA should be impossible on 32-bit KVM. */
	WARN_ON_ONCE((addr >> 32) && mmu == vcpu->arch.walk_mmu);
#endif

	r = FNAME(walk_addr_generic)(&walker, vcpu, mmu, addr, access);

	if (r) {
		gpa = gfn_to_gpa(walker.gfn);
		gpa |= addr & ~PAGE_MASK;
	} else if (exception)
		*exception = walker.fault;

	return gpa;
}

/*
 * Using the cached information from sp->gfns is safe because:
 * - The spte has a reference to the struct page, so the pfn for a given gfn
 *   can't change unless all sptes pointing to it are nuked first.
 *
 * Returns
 * < 0: the sp should be zapped
 *   0: the sp is synced and no tlb flushing is required
 * > 0: the sp is synced and tlb flushing is required
 */
static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
	union kvm_mmu_page_role root_role = vcpu->arch.mmu->root_role;
	int i;
	bool host_writable;
	gpa_t first_pte_gpa;
	bool flush = false;

	/*
	 * Ignore various flags when verifying that it's safe to sync a shadow
	 * page using the current MMU context.
	 *
	 * - level: not part of the overall MMU role and will never match as
	 *   the MMU's level tracks the root level
	 * - access: updated based on the new guest PTE
	 * - quadrant: not part of the overall MMU role (similar to level)
	 */
	const union kvm_mmu_page_role sync_role_ign = {
		.level = 0xf,
		.access = 0x7,
		.quadrant = 0x3,
		.passthrough = 0x1,
	};

	/*
	 * Direct pages can never be unsync, and KVM should never attempt to
	 * sync a shadow page for a different MMU context, e.g. if the role
	 * differs then the memslot lookup (SMM vs. non-SMM) will be bogus, the
	 * reserved bits checks will be wrong, etc...
	 */
	if (WARN_ON_ONCE(sp->role.direct ||
			 (sp->role.word ^ root_role.word) & ~sync_role_ign.word))
		return -1;

	first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);

	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
		u64 *sptep, spte;
		struct kvm_memory_slot *slot;
		unsigned pte_access;
		pt_element_t gpte;
		gpa_t pte_gpa;
		gfn_t gfn;

		if (!sp->spt[i])
			continue;

		pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);

		if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
					       sizeof(pt_element_t)))
			return -1;

		if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
			flush = true;
			continue;
		}

		gfn = gpte_to_gfn(gpte);
		pte_access = sp->role.access;
		pte_access &= FNAME(gpte_access)(gpte);
		FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);

		if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access))
			continue;

		if (gfn != sp->gfns[i]) {
			drop_spte(vcpu->kvm, &sp->spt[i]);
			flush = true;
			continue;
		}

		sptep = &sp->spt[i];
		spte = *sptep;
		host_writable = spte & shadow_host_writable_mask;
		slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
		make_spte(vcpu, sp, slot, pte_access, gfn,
			  spte_to_pfn(spte), spte, true, false,
			  host_writable, &spte);

		flush |= mmu_spte_update(sptep, spte);
	}

	return flush;
}

#undef pt_element_t
#undef guest_walker
#undef FNAME
#undef PT_BASE_ADDR_MASK
#undef PT_INDEX
#undef PT_LVL_ADDR_MASK
#undef PT_LVL_OFFSET_MASK
#undef PT_LEVEL_BITS
#undef PT_MAX_FULL_LEVELS
#undef gpte_to_gfn
#undef gpte_to_gfn_lvl
#undef CMPXCHG
#undef PT_GUEST_ACCESSED_MASK
#undef PT_GUEST_DIRTY_MASK
#undef PT_GUEST_DIRTY_SHIFT
#undef PT_GUEST_ACCESSED_SHIFT
#undef PT_HAVE_ACCESSED_DIRTY