// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2020 Google LLC
 * Author: Quentin Perret <qperret@google.com>
 */

#include <linux/kvm_host.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_pgtable.h>
#include <asm/stage2_pgtable.h>

#include <hyp/switch.h>

#include <nvhe/gfp.h>
#include <nvhe/memory.h>
#include <nvhe/mem_protect.h>
#include <nvhe/mm.h>

#define KVM_HOST_S2_FLAGS (KVM_PGTABLE_S2_NOFWB | KVM_PGTABLE_S2_IDMAP)

extern unsigned long hyp_nr_cpus;
struct host_kvm host_kvm;

static struct hyp_pool host_s2_pool;

/*
 * Copies of the host's CPU feature registers holding sanitized values.
 */
u64 id_aa64mmfr0_el1_sys_val;
u64 id_aa64mmfr1_el1_sys_val;

const u8 pkvm_hyp_id = 1;

static void *host_s2_zalloc_pages_exact(size_t size)
{
	void *addr = hyp_alloc_pages(&host_s2_pool, get_order(size));

	hyp_split_page(hyp_virt_to_page(addr));

	/*
	 * The size of concatenated PGDs is always a power-of-two multiple of
	 * PAGE_SIZE, so there should be no need to free any of the tail pages
	 * to make the allocation exact.
	 */
	WARN_ON(size != (PAGE_SIZE << get_order(size)));

	return addr;
}

static void *host_s2_zalloc_page(void *pool)
{
	return hyp_alloc_pages(pool, 0);
}

static void host_s2_get_page(void *addr)
{
	hyp_get_page(&host_s2_pool, addr);
}

static void host_s2_put_page(void *addr)
{
	hyp_put_page(&host_s2_pool, addr);
}

static int prepare_s2_pool(void *pgt_pool_base)
{
	unsigned long nr_pages, pfn;
	int ret;

	pfn = hyp_virt_to_pfn(pgt_pool_base);
	nr_pages = host_s2_pgtable_pages();
	ret = hyp_pool_init(&host_s2_pool, pfn, nr_pages, 0);
	if (ret)
		return ret;

	host_kvm.mm_ops = (struct kvm_pgtable_mm_ops) {
		.zalloc_pages_exact = host_s2_zalloc_pages_exact,
		.zalloc_page = host_s2_zalloc_page,
		.phys_to_virt = hyp_phys_to_virt,
		.virt_to_phys = hyp_virt_to_phys,
		.page_count = hyp_page_count,
		.get_page = host_s2_get_page,
		.put_page = host_s2_put_page,
	};

	return 0;
}

static void prepare_host_vtcr(void)
{
	u32 parange, phys_shift;

	/* The host stage 2 is id-mapped, so use parange for T0SZ */
	parange = kvm_get_parange(id_aa64mmfr0_el1_sys_val);
	phys_shift = id_aa64mmfr0_parange_to_phys_shift(parange);

	host_kvm.arch.vtcr = kvm_get_vtcr(id_aa64mmfr0_el1_sys_val,
					  id_aa64mmfr1_el1_sys_val, phys_shift);
}

static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot);

int kvm_host_prepare_stage2(void *pgt_pool_base)
{
	struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu;
	int ret;

	prepare_host_vtcr();
	hyp_spin_lock_init(&host_kvm.lock);

	ret = prepare_s2_pool(pgt_pool_base);
	if (ret)
		return ret;

	ret = __kvm_pgtable_stage2_init(&host_kvm.pgt, &host_kvm.arch,
					&host_kvm.mm_ops, KVM_HOST_S2_FLAGS,
					host_stage2_force_pte_cb);
	if (ret)
		return ret;

	mmu->pgd_phys = __hyp_pa(host_kvm.pgt.pgd);
	mmu->arch = &host_kvm.arch;
	mmu->pgt = &host_kvm.pgt;
	WRITE_ONCE(mmu->vmid.vmid_gen, 0);
	WRITE_ONCE(mmu->vmid.vmid, 0);

	return 0;
}

int __pkvm_prot_finalize(void)
{
	struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu;
	struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params);

	params->vttbr = kvm_get_vttbr(mmu);
	params->vtcr = host_kvm.arch.vtcr;
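	/*
	 * HCR_EL2.VM enables stage-2 translation for the EL1&0 regime, so
	 * setting it here turns on the host stage-2 prepared above.
	 */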
	params->hcr_el2 |= HCR_VM;
	kvm_flush_dcache_to_poc(params, sizeof(*params));

	write_sysreg(params->hcr_el2, hcr_el2);
	__load_stage2(&host_kvm.arch.mmu, &host_kvm.arch);

	/*
	 * Make sure to have an ISB before the TLB maintenance below but only
	 * when __load_stage2() doesn't include one already.
	 */
	asm(ALTERNATIVE("isb", "nop", ARM64_WORKAROUND_SPECULATIVE_AT));

	/* Invalidate stale HCR bits that may be cached in TLBs */
	__tlbi(vmalls12e1);
	dsb(nsh);
	isb();

	return 0;
}

static int host_stage2_unmap_dev_all(void)
{
	struct kvm_pgtable *pgt = &host_kvm.pgt;
	struct memblock_region *reg;
	u64 addr = 0;
	int i, ret;

	/* Unmap all non-memory regions to recycle the pages */
	for (i = 0; i < hyp_memblock_nr; i++, addr = reg->base + reg->size) {
		reg = &hyp_memory[i];
		ret = kvm_pgtable_stage2_unmap(pgt, addr, reg->base - addr);
		if (ret)
			return ret;
	}
	return kvm_pgtable_stage2_unmap(pgt, addr, BIT(pgt->ia_bits) - addr);
}

struct kvm_mem_range {
	u64 start;
	u64 end;
};

static bool find_mem_range(phys_addr_t addr, struct kvm_mem_range *range)
{
	int cur, left = 0, right = hyp_memblock_nr;
	struct memblock_region *reg;
	phys_addr_t end;

	range->start = 0;
	range->end = ULONG_MAX;

	/* The list of memblock regions is sorted, binary search it */
	while (left < right) {
		cur = (left + right) >> 1;
		reg = &hyp_memory[cur];
		end = reg->base + reg->size;
		if (addr < reg->base) {
			right = cur;
			range->end = reg->base;
		} else if (addr >= end) {
			left = cur + 1;
			range->start = end;
		} else {
			range->start = reg->base;
			range->end = end;
			return true;
		}
	}

	return false;
}

bool addr_is_memory(phys_addr_t phys)
{
	struct kvm_mem_range range;

	return find_mem_range(phys, &range);
}

static bool is_in_mem_range(u64 addr, struct kvm_mem_range *range)
{
	return range->start <= addr && addr < range->end;
}

static bool range_is_memory(u64 start, u64 end)
{
	struct kvm_mem_range r;

	if (!find_mem_range(start, &r))
		return false;

	return is_in_mem_range(end - 1, &r);
}
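/*
 * Identity-map [start, end) in the host stage-2, i.e. with the intermediate
 * physical address equal to the host physical address, allocating any
 * page-table pages from host_s2_pool.
 */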
static inline int __host_stage2_idmap(u64 start, u64 end,
				      enum kvm_pgtable_prot prot)
{
	return kvm_pgtable_stage2_map(&host_kvm.pgt, start, end - start, start,
				      prot, &host_s2_pool);
}

/*
 * The pool has been provided with enough pages to cover all of memory with
 * page granularity, but it is difficult to know how much of the MMIO range
 * we will need to cover upfront, so we may need to 'recycle' the pages if we
 * run out.
 */
#define host_stage2_try(fn, ...)				\
	({							\
		int __ret;					\
		hyp_assert_lock_held(&host_kvm.lock);		\
		__ret = fn(__VA_ARGS__);			\
		if (__ret == -ENOMEM) {				\
			__ret = host_stage2_unmap_dev_all();	\
			if (!__ret)				\
				__ret = fn(__VA_ARGS__);	\
		}						\
		__ret;						\
	 })

static inline bool range_included(struct kvm_mem_range *child,
				  struct kvm_mem_range *parent)
{
	return parent->start <= child->start && child->end <= parent->end;
}

static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range)
{
	struct kvm_mem_range cur;
	kvm_pte_t pte;
	u32 level;
	int ret;

	hyp_assert_lock_held(&host_kvm.lock);
	ret = kvm_pgtable_get_leaf(&host_kvm.pgt, addr, &pte, &level);
	if (ret)
		return ret;

	if (kvm_pte_valid(pte))
		return -EAGAIN;

	if (pte)
		return -EPERM;

	do {
		u64 granule = kvm_granule_size(level);
		cur.start = ALIGN_DOWN(addr, granule);
		cur.end = cur.start + granule;
		level++;
	} while ((level < KVM_PGTABLE_MAX_LEVELS) &&
			!(kvm_level_supports_block_mapping(level) &&
			  range_included(&cur, range)));

	*range = cur;

	return 0;
}

int host_stage2_idmap_locked(phys_addr_t addr, u64 size,
			     enum kvm_pgtable_prot prot)
{
	hyp_assert_lock_held(&host_kvm.lock);

	return host_stage2_try(__host_stage2_idmap, addr, addr + size, prot);
}

int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id)
{
	hyp_assert_lock_held(&host_kvm.lock);

	return host_stage2_try(kvm_pgtable_stage2_set_owner, &host_kvm.pgt,
			       addr, size, &host_s2_pool, owner_id);
}

static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot)
{
	/*
	 * Block mappings must be used with care in the host stage-2 as a
	 * kvm_pgtable_stage2_map() operation targeting a page in the range of
	 * an existing block will delete the block under the assumption that
	 * mappings in the rest of the block range can always be rebuilt lazily.
	 * That assumption is correct for the host stage-2 with RWX mappings
	 * targeting memory or RW mappings targeting MMIO ranges (see
	 * host_stage2_idmap() below which implements some of the host memory
	 * abort logic). However, this is not safe for any other mappings where
	 * the host stage-2 page-table is in fact the only place where this
	 * state is stored. In all those cases, it is safer to use page-level
	 * mappings, hence avoiding losing the state because of side-effects in
	 * kvm_pgtable_stage2_map().
	 */
	if (range_is_memory(addr, end))
		return prot != PKVM_HOST_MEM_PROT;
	else
		return prot != PKVM_HOST_MMIO_PROT;
}
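/*
 * Lazily handle a host stage-2 fault: identity-map the faulting address,
 * using RWX permissions for memory and RW for MMIO, over the largest
 * block-aligned granule around the fault that fits within a single memory
 * or MMIO region.
 */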
static int host_stage2_idmap(u64 addr)
{
	struct kvm_mem_range range;
	bool is_memory = find_mem_range(addr, &range);
	enum kvm_pgtable_prot prot;
	int ret;

	prot = is_memory ? PKVM_HOST_MEM_PROT : PKVM_HOST_MMIO_PROT;

	hyp_spin_lock(&host_kvm.lock);
	ret = host_stage2_adjust_range(addr, &range);
	if (ret)
		goto unlock;

	ret = host_stage2_idmap_locked(range.start, range.end - range.start, prot);
unlock:
	hyp_spin_unlock(&host_kvm.lock);

	return ret;
}

static inline bool check_prot(enum kvm_pgtable_prot prot,
			      enum kvm_pgtable_prot required,
			      enum kvm_pgtable_prot denied)
{
	return (prot & (required | denied)) == required;
}

int __pkvm_host_share_hyp(u64 pfn)
{
	phys_addr_t addr = hyp_pfn_to_phys(pfn);
	enum kvm_pgtable_prot prot, cur;
	void *virt = __hyp_va(addr);
	enum pkvm_page_state state;
	kvm_pte_t pte;
	int ret;

	if (!addr_is_memory(addr))
		return -EINVAL;

	hyp_spin_lock(&host_kvm.lock);
	hyp_spin_lock(&pkvm_pgd_lock);

	ret = kvm_pgtable_get_leaf(&host_kvm.pgt, addr, &pte, NULL);
	if (ret)
		goto unlock;
	if (!pte)
		goto map_shared;

	/*
	 * Check attributes in the host stage-2 PTE. We need the page to be:
	 *  - mapped RWX as we're sharing memory;
	 *  - not borrowed, as that implies absence of ownership.
	 * Otherwise, we can't let it go through.
	 */
	cur = kvm_pgtable_stage2_pte_prot(pte);
	prot = pkvm_mkstate(0, PKVM_PAGE_SHARED_BORROWED);
	if (!check_prot(cur, PKVM_HOST_MEM_PROT, prot)) {
		ret = -EPERM;
		goto unlock;
	}

	state = pkvm_getstate(cur);
	if (state == PKVM_PAGE_OWNED)
		goto map_shared;

	/*
	 * Tolerate double-sharing the same page, but this requires
	 * cross-checking the hypervisor stage-1.
	 */
	if (state != PKVM_PAGE_SHARED_OWNED) {
		ret = -EPERM;
		goto unlock;
	}

	ret = kvm_pgtable_get_leaf(&pkvm_pgtable, (u64)virt, &pte, NULL);
	if (ret)
		goto unlock;

	/*
	 * If the page has been shared with the hypervisor, it must be
	 * already mapped as SHARED_BORROWED in its stage-1.
	 */
	cur = kvm_pgtable_hyp_pte_prot(pte);
	prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_SHARED_BORROWED);
	if (!check_prot(cur, prot, ~prot))
		ret = -EPERM;
	goto unlock;

map_shared:
	/*
	 * If the page is not yet shared, adjust mappings in both page-tables
	 * while both locks are held.
	 */
	prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_SHARED_BORROWED);
	ret = pkvm_create_mappings_locked(virt, virt + PAGE_SIZE, prot);
	BUG_ON(ret);

	prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_OWNED);
	ret = host_stage2_idmap_locked(addr, PAGE_SIZE, prot);
	BUG_ON(ret);

unlock:
	hyp_spin_unlock(&pkvm_pgd_lock);
	hyp_spin_unlock(&host_kvm.lock);

	return ret;
}

void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt)
{
	struct kvm_vcpu_fault_info fault;
	u64 esr, addr;
	int ret = 0;

	esr = read_sysreg_el2(SYS_ESR);
	BUG_ON(!__get_fault_info(esr, &fault));

	/* HPFAR_EL2 reports the faulting IPA; recover the page-aligned address. */
	addr = (fault.hpfar_el2 & HPFAR_MASK) << 8;
	ret = host_stage2_idmap(addr);
	BUG_ON(ret && ret != -EAGAIN);
}