1 /* SPDX-License-Identifier: GPL-2.0-only */ 2 /* 3 * Copyright (C) 2012,2013 - ARM Ltd 4 * Author: Marc Zyngier <marc.zyngier@arm.com> 5 */ 6 7 #ifndef __ARM64_KVM_MMU_H__ 8 #define __ARM64_KVM_MMU_H__ 9 10 #include <asm/page.h> 11 #include <asm/memory.h> 12 #include <asm/cpufeature.h> 13 14 /* 15 * As ARMv8.0 only has the TTBR0_EL2 register, we cannot express 16 * "negative" addresses. This makes it impossible to directly share 17 * mappings with the kernel. 18 * 19 * Instead, give the HYP mode its own VA region at a fixed offset from 20 * the kernel by just masking the top bits (which are all ones for a 21 * kernel address). We need to find out how many bits to mask. 22 * 23 * We want to build a set of page tables that cover both parts of the 24 * idmap (the trampoline page used to initialize EL2), and our normal 25 * runtime VA space, at the same time. 26 * 27 * Given that the kernel uses VA_BITS for its entire address space, 28 * and that half of that space (VA_BITS - 1) is used for the linear 29 * mapping, we can also limit the EL2 space to (VA_BITS - 1). 30 * 31 * The main question is "Within the VA_BITS space, does EL2 use the 32 * top or the bottom half of that space to shadow the kernel's linear 33 * mapping?". As we need to idmap the trampoline page, this is 34 * determined by the range in which this page lives. 35 * 36 * If the page is in the bottom half, we have to use the top half. If 37 * the page is in the top half, we have to use the bottom half: 38 * 39 * T = __pa_symbol(__hyp_idmap_text_start) 40 * if (T & BIT(VA_BITS - 1)) 41 * HYP_VA_MIN = 0 //idmap in upper half 42 * else 43 * HYP_VA_MIN = 1 << (VA_BITS - 1) 44 * HYP_VA_MAX = HYP_VA_MIN + (1 << (VA_BITS - 1)) - 1 45 * 46 * This of course assumes that the trampoline page exists within the 47 * VA_BITS range. If it doesn't, then it means we're in the odd case 48 * where the kernel idmap (as well as HYP) uses more levels than the 49 * kernel runtime page tables (as seen when the kernel is configured 50 * for 4k pages, 39bits VA, and yet memory lives just above that 51 * limit, forcing the idmap to use 4 levels of page tables while the 52 * kernel itself only uses 3). In this particular case, it doesn't 53 * matter which side of VA_BITS we use, as we're guaranteed not to 54 * conflict with anything. 55 * 56 * When using VHE, there are no separate hyp mappings and all KVM 57 * functionality is already mapped as part of the main kernel 58 * mappings, and none of this applies in that case. 59 */ 60 61 #ifdef __ASSEMBLY__ 62 63 #include <asm/alternative.h> 64 65 /* 66 * Convert a kernel VA into a HYP VA. 67 * reg: VA to be converted. 68 * 69 * The actual code generation takes place in kvm_update_va_mask, and 70 * the instructions below are only there to reserve the space and 71 * perform the register allocation (kvm_update_va_mask uses the 72 * specific registers encoded in the instructions). 73 */ 74 .macro kern_hyp_va reg 75 alternative_cb kvm_update_va_mask 76 and \reg, \reg, #1 /* mask with va_mask */ 77 ror \reg, \reg, #1 /* rotate to the first tag bit */ 78 add \reg, \reg, #0 /* insert the low 12 bits of the tag */ 79 add \reg, \reg, #0, lsl 12 /* insert the top 12 bits of the tag */ 80 ror \reg, \reg, #63 /* rotate back */ 81 alternative_cb_end 82 .endm 83 84 #else 85 86 #include <asm/pgalloc.h> 87 #include <asm/cache.h> 88 #include <asm/cacheflush.h> 89 #include <asm/mmu_context.h> 90 #include <asm/pgtable.h> 91 92 void kvm_update_va_mask(struct alt_instr *alt, 93 __le32 *origptr, __le32 *updptr, int nr_inst); 94 95 static inline unsigned long __kern_hyp_va(unsigned long v) 96 { 97 asm volatile(ALTERNATIVE_CB("and %0, %0, #1\n" 98 "ror %0, %0, #1\n" 99 "add %0, %0, #0\n" 100 "add %0, %0, #0, lsl 12\n" 101 "ror %0, %0, #63\n", 102 kvm_update_va_mask) 103 : "+r" (v)); 104 return v; 105 } 106 107 #define kern_hyp_va(v) ((typeof(v))(__kern_hyp_va((unsigned long)(v)))) 108 109 /* 110 * Obtain the PC-relative address of a kernel symbol 111 * s: symbol 112 * 113 * The goal of this macro is to return a symbol's address based on a 114 * PC-relative computation, as opposed to a loading the VA from a 115 * constant pool or something similar. This works well for HYP, as an 116 * absolute VA is guaranteed to be wrong. Only use this if trying to 117 * obtain the address of a symbol (i.e. not something you obtained by 118 * following a pointer). 119 */ 120 #define hyp_symbol_addr(s) \ 121 ({ \ 122 typeof(s) *addr; \ 123 asm("adrp %0, %1\n" \ 124 "add %0, %0, :lo12:%1\n" \ 125 : "=r" (addr) : "S" (&s)); \ 126 addr; \ 127 }) 128 129 /* 130 * We currently support using a VM-specified IPA size. For backward 131 * compatibility, the default IPA size is fixed to 40bits. 132 */ 133 #define KVM_PHYS_SHIFT (40) 134 135 #define kvm_phys_shift(kvm) VTCR_EL2_IPA(kvm->arch.vtcr) 136 #define kvm_phys_size(kvm) (_AC(1, ULL) << kvm_phys_shift(kvm)) 137 #define kvm_phys_mask(kvm) (kvm_phys_size(kvm) - _AC(1, ULL)) 138 139 static inline bool kvm_page_empty(void *ptr) 140 { 141 struct page *ptr_page = virt_to_page(ptr); 142 return page_count(ptr_page) == 1; 143 } 144 145 #include <asm/stage2_pgtable.h> 146 147 int create_hyp_mappings(void *from, void *to, pgprot_t prot); 148 int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size, 149 void __iomem **kaddr, 150 void __iomem **haddr); 151 int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, 152 void **haddr); 153 void free_hyp_pgds(void); 154 155 void stage2_unmap_vm(struct kvm *kvm); 156 int kvm_alloc_stage2_pgd(struct kvm *kvm); 157 void kvm_free_stage2_pgd(struct kvm *kvm); 158 int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, 159 phys_addr_t pa, unsigned long size, bool writable); 160 161 int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run); 162 163 void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu); 164 165 phys_addr_t kvm_mmu_get_httbr(void); 166 phys_addr_t kvm_get_idmap_vector(void); 167 int kvm_mmu_init(void); 168 void kvm_clear_hyp_idmap(void); 169 170 #define kvm_mk_pmd(ptep) \ 171 __pmd(__phys_to_pmd_val(__pa(ptep)) | PMD_TYPE_TABLE) 172 #define kvm_mk_pud(pmdp) \ 173 __pud(__phys_to_pud_val(__pa(pmdp)) | PMD_TYPE_TABLE) 174 #define kvm_mk_pgd(pudp) \ 175 __pgd(__phys_to_pgd_val(__pa(pudp)) | PUD_TYPE_TABLE) 176 177 #define kvm_set_pud(pudp, pud) set_pud(pudp, pud) 178 179 #define kvm_pfn_pte(pfn, prot) pfn_pte(pfn, prot) 180 #define kvm_pfn_pmd(pfn, prot) pfn_pmd(pfn, prot) 181 #define kvm_pfn_pud(pfn, prot) pfn_pud(pfn, prot) 182 183 #define kvm_pud_pfn(pud) pud_pfn(pud) 184 185 #define kvm_pmd_mkhuge(pmd) pmd_mkhuge(pmd) 186 #define kvm_pud_mkhuge(pud) pud_mkhuge(pud) 187 188 static inline pte_t kvm_s2pte_mkwrite(pte_t pte) 189 { 190 pte_val(pte) |= PTE_S2_RDWR; 191 return pte; 192 } 193 194 static inline pmd_t kvm_s2pmd_mkwrite(pmd_t pmd) 195 { 196 pmd_val(pmd) |= PMD_S2_RDWR; 197 return pmd; 198 } 199 200 static inline pud_t kvm_s2pud_mkwrite(pud_t pud) 201 { 202 pud_val(pud) |= PUD_S2_RDWR; 203 return pud; 204 } 205 206 static inline pte_t kvm_s2pte_mkexec(pte_t pte) 207 { 208 pte_val(pte) &= ~PTE_S2_XN; 209 return pte; 210 } 211 212 static inline pmd_t kvm_s2pmd_mkexec(pmd_t pmd) 213 { 214 pmd_val(pmd) &= ~PMD_S2_XN; 215 return pmd; 216 } 217 218 static inline pud_t kvm_s2pud_mkexec(pud_t pud) 219 { 220 pud_val(pud) &= ~PUD_S2_XN; 221 return pud; 222 } 223 224 static inline void kvm_set_s2pte_readonly(pte_t *ptep) 225 { 226 pteval_t old_pteval, pteval; 227 228 pteval = READ_ONCE(pte_val(*ptep)); 229 do { 230 old_pteval = pteval; 231 pteval &= ~PTE_S2_RDWR; 232 pteval |= PTE_S2_RDONLY; 233 pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval, pteval); 234 } while (pteval != old_pteval); 235 } 236 237 static inline bool kvm_s2pte_readonly(pte_t *ptep) 238 { 239 return (READ_ONCE(pte_val(*ptep)) & PTE_S2_RDWR) == PTE_S2_RDONLY; 240 } 241 242 static inline bool kvm_s2pte_exec(pte_t *ptep) 243 { 244 return !(READ_ONCE(pte_val(*ptep)) & PTE_S2_XN); 245 } 246 247 static inline void kvm_set_s2pmd_readonly(pmd_t *pmdp) 248 { 249 kvm_set_s2pte_readonly((pte_t *)pmdp); 250 } 251 252 static inline bool kvm_s2pmd_readonly(pmd_t *pmdp) 253 { 254 return kvm_s2pte_readonly((pte_t *)pmdp); 255 } 256 257 static inline bool kvm_s2pmd_exec(pmd_t *pmdp) 258 { 259 return !(READ_ONCE(pmd_val(*pmdp)) & PMD_S2_XN); 260 } 261 262 static inline void kvm_set_s2pud_readonly(pud_t *pudp) 263 { 264 kvm_set_s2pte_readonly((pte_t *)pudp); 265 } 266 267 static inline bool kvm_s2pud_readonly(pud_t *pudp) 268 { 269 return kvm_s2pte_readonly((pte_t *)pudp); 270 } 271 272 static inline bool kvm_s2pud_exec(pud_t *pudp) 273 { 274 return !(READ_ONCE(pud_val(*pudp)) & PUD_S2_XN); 275 } 276 277 static inline pud_t kvm_s2pud_mkyoung(pud_t pud) 278 { 279 return pud_mkyoung(pud); 280 } 281 282 static inline bool kvm_s2pud_young(pud_t pud) 283 { 284 return pud_young(pud); 285 } 286 287 #define hyp_pte_table_empty(ptep) kvm_page_empty(ptep) 288 289 #ifdef __PAGETABLE_PMD_FOLDED 290 #define hyp_pmd_table_empty(pmdp) (0) 291 #else 292 #define hyp_pmd_table_empty(pmdp) kvm_page_empty(pmdp) 293 #endif 294 295 #ifdef __PAGETABLE_PUD_FOLDED 296 #define hyp_pud_table_empty(pudp) (0) 297 #else 298 #define hyp_pud_table_empty(pudp) kvm_page_empty(pudp) 299 #endif 300 301 struct kvm; 302 303 #define kvm_flush_dcache_to_poc(a,l) __flush_dcache_area((a), (l)) 304 305 static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu) 306 { 307 return (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101; 308 } 309 310 static inline void __clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size) 311 { 312 void *va = page_address(pfn_to_page(pfn)); 313 314 /* 315 * With FWB, we ensure that the guest always accesses memory using 316 * cacheable attributes, and we don't have to clean to PoC when 317 * faulting in pages. Furthermore, FWB implies IDC, so cleaning to 318 * PoU is not required either in this case. 319 */ 320 if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) 321 return; 322 323 kvm_flush_dcache_to_poc(va, size); 324 } 325 326 static inline void __invalidate_icache_guest_page(kvm_pfn_t pfn, 327 unsigned long size) 328 { 329 if (icache_is_aliasing()) { 330 /* any kind of VIPT cache */ 331 __flush_icache_all(); 332 } else if (is_kernel_in_hyp_mode() || !icache_is_vpipt()) { 333 /* PIPT or VPIPT at EL2 (see comment in __kvm_tlb_flush_vmid_ipa) */ 334 void *va = page_address(pfn_to_page(pfn)); 335 336 invalidate_icache_range((unsigned long)va, 337 (unsigned long)va + size); 338 } 339 } 340 341 static inline void __kvm_flush_dcache_pte(pte_t pte) 342 { 343 if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) { 344 struct page *page = pte_page(pte); 345 kvm_flush_dcache_to_poc(page_address(page), PAGE_SIZE); 346 } 347 } 348 349 static inline void __kvm_flush_dcache_pmd(pmd_t pmd) 350 { 351 if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) { 352 struct page *page = pmd_page(pmd); 353 kvm_flush_dcache_to_poc(page_address(page), PMD_SIZE); 354 } 355 } 356 357 static inline void __kvm_flush_dcache_pud(pud_t pud) 358 { 359 if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) { 360 struct page *page = pud_page(pud); 361 kvm_flush_dcache_to_poc(page_address(page), PUD_SIZE); 362 } 363 } 364 365 #define kvm_virt_to_phys(x) __pa_symbol(x) 366 367 void kvm_set_way_flush(struct kvm_vcpu *vcpu); 368 void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled); 369 370 static inline bool __kvm_cpu_uses_extended_idmap(void) 371 { 372 return __cpu_uses_extended_idmap_level(); 373 } 374 375 static inline unsigned long __kvm_idmap_ptrs_per_pgd(void) 376 { 377 return idmap_ptrs_per_pgd; 378 } 379 380 /* 381 * Can't use pgd_populate here, because the extended idmap adds an extra level 382 * above CONFIG_PGTABLE_LEVELS (which is 2 or 3 if we're using the extended 383 * idmap), and pgd_populate is only available if CONFIG_PGTABLE_LEVELS = 4. 384 */ 385 static inline void __kvm_extend_hypmap(pgd_t *boot_hyp_pgd, 386 pgd_t *hyp_pgd, 387 pgd_t *merged_hyp_pgd, 388 unsigned long hyp_idmap_start) 389 { 390 int idmap_idx; 391 u64 pgd_addr; 392 393 /* 394 * Use the first entry to access the HYP mappings. It is 395 * guaranteed to be free, otherwise we wouldn't use an 396 * extended idmap. 397 */ 398 VM_BUG_ON(pgd_val(merged_hyp_pgd[0])); 399 pgd_addr = __phys_to_pgd_val(__pa(hyp_pgd)); 400 merged_hyp_pgd[0] = __pgd(pgd_addr | PMD_TYPE_TABLE); 401 402 /* 403 * Create another extended level entry that points to the boot HYP map, 404 * which contains an ID mapping of the HYP init code. We essentially 405 * merge the boot and runtime HYP maps by doing so, but they don't 406 * overlap anyway, so this is fine. 407 */ 408 idmap_idx = hyp_idmap_start >> VA_BITS; 409 VM_BUG_ON(pgd_val(merged_hyp_pgd[idmap_idx])); 410 pgd_addr = __phys_to_pgd_val(__pa(boot_hyp_pgd)); 411 merged_hyp_pgd[idmap_idx] = __pgd(pgd_addr | PMD_TYPE_TABLE); 412 } 413 414 static inline unsigned int kvm_get_vmid_bits(void) 415 { 416 int reg = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); 417 418 return (cpuid_feature_extract_unsigned_field(reg, ID_AA64MMFR1_VMIDBITS_SHIFT) == 2) ? 16 : 8; 419 } 420 421 /* 422 * We are not in the kvm->srcu critical section most of the time, so we take 423 * the SRCU read lock here. Since we copy the data from the user page, we 424 * can immediately drop the lock again. 425 */ 426 static inline int kvm_read_guest_lock(struct kvm *kvm, 427 gpa_t gpa, void *data, unsigned long len) 428 { 429 int srcu_idx = srcu_read_lock(&kvm->srcu); 430 int ret = kvm_read_guest(kvm, gpa, data, len); 431 432 srcu_read_unlock(&kvm->srcu, srcu_idx); 433 434 return ret; 435 } 436 437 static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa, 438 const void *data, unsigned long len) 439 { 440 int srcu_idx = srcu_read_lock(&kvm->srcu); 441 int ret = kvm_write_guest(kvm, gpa, data, len); 442 443 srcu_read_unlock(&kvm->srcu, srcu_idx); 444 445 return ret; 446 } 447 448 #ifdef CONFIG_KVM_INDIRECT_VECTORS 449 /* 450 * EL2 vectors can be mapped and rerouted in a number of ways, 451 * depending on the kernel configuration and CPU present: 452 * 453 * - If the CPU has the ARM64_HARDEN_BRANCH_PREDICTOR cap, the 454 * hardening sequence is placed in one of the vector slots, which is 455 * executed before jumping to the real vectors. 456 * 457 * - If the CPU has both the ARM64_HARDEN_EL2_VECTORS cap and the 458 * ARM64_HARDEN_BRANCH_PREDICTOR cap, the slot containing the 459 * hardening sequence is mapped next to the idmap page, and executed 460 * before jumping to the real vectors. 461 * 462 * - If the CPU only has the ARM64_HARDEN_EL2_VECTORS cap, then an 463 * empty slot is selected, mapped next to the idmap page, and 464 * executed before jumping to the real vectors. 465 * 466 * Note that ARM64_HARDEN_EL2_VECTORS is somewhat incompatible with 467 * VHE, as we don't have hypervisor-specific mappings. If the system 468 * is VHE and yet selects this capability, it will be ignored. 469 */ 470 #include <asm/mmu.h> 471 472 extern void *__kvm_bp_vect_base; 473 extern int __kvm_harden_el2_vector_slot; 474 475 static inline void *kvm_get_hyp_vector(void) 476 { 477 struct bp_hardening_data *data = arm64_get_bp_hardening_data(); 478 void *vect = kern_hyp_va(kvm_ksym_ref(__kvm_hyp_vector)); 479 int slot = -1; 480 481 if (cpus_have_const_cap(ARM64_HARDEN_BRANCH_PREDICTOR) && data->fn) { 482 vect = kern_hyp_va(kvm_ksym_ref(__bp_harden_hyp_vecs_start)); 483 slot = data->hyp_vectors_slot; 484 } 485 486 if (this_cpu_has_cap(ARM64_HARDEN_EL2_VECTORS) && !has_vhe()) { 487 vect = __kvm_bp_vect_base; 488 if (slot == -1) 489 slot = __kvm_harden_el2_vector_slot; 490 } 491 492 if (slot != -1) 493 vect += slot * SZ_2K; 494 495 return vect; 496 } 497 498 /* This is only called on a !VHE system */ 499 static inline int kvm_map_vectors(void) 500 { 501 /* 502 * HBP = ARM64_HARDEN_BRANCH_PREDICTOR 503 * HEL2 = ARM64_HARDEN_EL2_VECTORS 504 * 505 * !HBP + !HEL2 -> use direct vectors 506 * HBP + !HEL2 -> use hardened vectors in place 507 * !HBP + HEL2 -> allocate one vector slot and use exec mapping 508 * HBP + HEL2 -> use hardened vertors and use exec mapping 509 */ 510 if (cpus_have_const_cap(ARM64_HARDEN_BRANCH_PREDICTOR)) { 511 __kvm_bp_vect_base = kvm_ksym_ref(__bp_harden_hyp_vecs_start); 512 __kvm_bp_vect_base = kern_hyp_va(__kvm_bp_vect_base); 513 } 514 515 if (cpus_have_const_cap(ARM64_HARDEN_EL2_VECTORS)) { 516 phys_addr_t vect_pa = __pa_symbol(__bp_harden_hyp_vecs_start); 517 unsigned long size = (__bp_harden_hyp_vecs_end - 518 __bp_harden_hyp_vecs_start); 519 520 /* 521 * Always allocate a spare vector slot, as we don't 522 * know yet which CPUs have a BP hardening slot that 523 * we can reuse. 524 */ 525 __kvm_harden_el2_vector_slot = atomic_inc_return(&arm64_el2_vector_last_slot); 526 BUG_ON(__kvm_harden_el2_vector_slot >= BP_HARDEN_EL2_SLOTS); 527 return create_hyp_exec_mappings(vect_pa, size, 528 &__kvm_bp_vect_base); 529 } 530 531 return 0; 532 } 533 #else 534 static inline void *kvm_get_hyp_vector(void) 535 { 536 return kern_hyp_va(kvm_ksym_ref(__kvm_hyp_vector)); 537 } 538 539 static inline int kvm_map_vectors(void) 540 { 541 return 0; 542 } 543 #endif 544 545 #ifdef CONFIG_ARM64_SSBD 546 DECLARE_PER_CPU_READ_MOSTLY(u64, arm64_ssbd_callback_required); 547 548 static inline int hyp_map_aux_data(void) 549 { 550 int cpu, err; 551 552 for_each_possible_cpu(cpu) { 553 u64 *ptr; 554 555 ptr = per_cpu_ptr(&arm64_ssbd_callback_required, cpu); 556 err = create_hyp_mappings(ptr, ptr + 1, PAGE_HYP); 557 if (err) 558 return err; 559 } 560 return 0; 561 } 562 #else 563 static inline int hyp_map_aux_data(void) 564 { 565 return 0; 566 } 567 #endif 568 569 #define kvm_phys_to_vttbr(addr) phys_to_ttbr(addr) 570 571 /* 572 * Get the magic number 'x' for VTTBR:BADDR of this KVM instance. 573 * With v8.2 LVA extensions, 'x' should be a minimum of 6 with 574 * 52bit IPS. 575 */ 576 static inline int arm64_vttbr_x(u32 ipa_shift, u32 levels) 577 { 578 int x = ARM64_VTTBR_X(ipa_shift, levels); 579 580 return (IS_ENABLED(CONFIG_ARM64_PA_BITS_52) && x < 6) ? 6 : x; 581 } 582 583 static inline u64 vttbr_baddr_mask(u32 ipa_shift, u32 levels) 584 { 585 unsigned int x = arm64_vttbr_x(ipa_shift, levels); 586 587 return GENMASK_ULL(PHYS_MASK_SHIFT - 1, x); 588 } 589 590 static inline u64 kvm_vttbr_baddr_mask(struct kvm *kvm) 591 { 592 return vttbr_baddr_mask(kvm_phys_shift(kvm), kvm_stage2_levels(kvm)); 593 } 594 595 static __always_inline u64 kvm_get_vttbr(struct kvm *kvm) 596 { 597 struct kvm_vmid *vmid = &kvm->arch.vmid; 598 u64 vmid_field, baddr; 599 u64 cnp = system_supports_cnp() ? VTTBR_CNP_BIT : 0; 600 601 baddr = kvm->arch.pgd_phys; 602 vmid_field = (u64)vmid->vmid << VTTBR_VMID_SHIFT; 603 return kvm_phys_to_vttbr(baddr) | vmid_field | cnp; 604 } 605 606 #endif /* __ASSEMBLY__ */ 607 #endif /* __ARM64_KVM_MMU_H__ */ 608