// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
 */

#include <linux/mman.h>
#include <linux/kvm_host.h>
#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/sched/signal.h>
#include <trace/events/kvm.h>
#include <asm/pgalloc.h>
#include <asm/cacheflush.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_pgtable.h>
#include <asm/kvm_ras.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/virt.h>

#include "trace.h"

static struct kvm_pgtable *hyp_pgtable;
static DEFINE_MUTEX(kvm_hyp_pgd_mutex);

static unsigned long hyp_idmap_start;
static unsigned long hyp_idmap_end;
static phys_addr_t hyp_idmap_vector;

static unsigned long io_map_base;


/*
 * Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
 * we may see kernel panics with CONFIG_DETECT_HUNG_TASK,
 * CONFIG_LOCKUP_DETECTOR, CONFIG_LOCKDEP. Additionally, holding the lock too
 * long will also starve other vCPUs. We have to also make sure that the page
 * tables are not freed while we released the lock.
 */
static int stage2_apply_range(struct kvm *kvm, phys_addr_t addr,
			      phys_addr_t end,
			      int (*fn)(struct kvm_pgtable *, u64, u64),
			      bool resched)
{
	int ret;
	u64 next;

	do {
		struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
		if (!pgt)
			return -EINVAL;

		next = stage2_pgd_addr_end(kvm, addr, end);
		ret = fn(pgt, addr, next - addr);
		if (ret)
			break;

		if (resched && next != end)
			cond_resched_lock(&kvm->mmu_lock);
	} while (addr = next, addr != end);

	return ret;
}

#define stage2_apply_range_resched(kvm, addr, end, fn)			\
	stage2_apply_range(kvm, addr, end, fn, true)

static bool memslot_is_logging(struct kvm_memory_slot *memslot)
{
	return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
}

/**
 * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
 * @kvm:	pointer to kvm structure.
 *
 * Interface to HYP function to flush all VM TLB entries
 */
void kvm_flush_remote_tlbs(struct kvm *kvm)
{
	kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
}

static bool kvm_is_device_pfn(unsigned long pfn)
{
	return !pfn_is_map_memory(pfn);
}

static void *stage2_memcache_zalloc_page(void *arg)
{
	struct kvm_mmu_memory_cache *mc = arg;

	/* Allocated with __GFP_ZERO, so no need to zero */
	return kvm_mmu_memory_cache_alloc(mc);
}

static void *kvm_host_zalloc_pages_exact(size_t size)
{
	return alloc_pages_exact(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
}

static void kvm_host_get_page(void *addr)
{
	get_page(virt_to_page(addr));
}

static void kvm_host_put_page(void *addr)
{
	put_page(virt_to_page(addr));
}

static int kvm_host_page_count(void *addr)
{
	return page_count(virt_to_page(addr));
}

static phys_addr_t kvm_host_pa(void *addr)
{
	return __pa(addr);
}

static void *kvm_host_va(phys_addr_t phys)
{
	return __va(phys);
}

static void clean_dcache_guest_page(void *va, size_t size)
{
	__clean_dcache_guest_page(va, size);
}

static void invalidate_icache_guest_page(void *va, size_t size)
{
	__invalidate_icache_guest_page(va, size);
}

/*
 * Unmapping vs dcache management:
 *
 * If a guest maps certain memory pages as uncached, all writes will
 * bypass the data cache and go directly to RAM. However, the CPUs
 * can still speculate reads (not writes) and fill cache lines with
 * data.
 *
 * Those cache lines will be *clean* cache lines though, so a
 * clean+invalidate operation is equivalent to an invalidate
 * operation, because no cache lines are marked dirty.
 *
 * Those clean cache lines could be filled prior to an uncached write
 * by the guest, and the cache coherent IO subsystem would therefore
 * end up writing old data to disk.
 *
 * This is why right after unmapping a page/section and invalidating
 * the corresponding TLBs, we flush to make sure the IO subsystem will
 * never hit in the cache.
 *
 * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
 * we then fully enforce cacheability of RAM, no matter what the guest
 * does.
 */
/**
 * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
 * @mmu:   The KVM stage-2 MMU pointer
 * @start: The intermediate physical base address of the range to unmap
 * @size:  The size of the area to unmap
 * @may_block: Whether or not we are permitted to block
 *
 * Clear a range of stage-2 mappings, lowering the various ref-counts. Must
 * be called while holding mmu_lock (unless for freeing the stage2 pgd before
 * destroying the VM), otherwise another faulting VCPU may come in and mess
 * with things behind our backs.
 */
static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size,
				 bool may_block)
{
	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
	phys_addr_t end = start + size;

	assert_spin_locked(&kvm->mmu_lock);
	WARN_ON(size & ~PAGE_MASK);
	WARN_ON(stage2_apply_range(kvm, start, end, kvm_pgtable_stage2_unmap,
				   may_block));
}

static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
{
	__unmap_stage2_range(mmu, start, size, true);
}

static void stage2_flush_memslot(struct kvm *kvm,
				 struct kvm_memory_slot *memslot)
{
	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
	phys_addr_t end = addr + PAGE_SIZE * memslot->npages;

	stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_flush);
}

/**
 * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
 * @kvm: The struct kvm pointer
 *
 * Go through the stage 2 page tables and invalidate any cache lines
 * backing memory already mapped to the VM.
 */
static void stage2_flush_vm(struct kvm *kvm)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);

	slots = kvm_memslots(kvm);
	kvm_for_each_memslot(memslot, slots)
		stage2_flush_memslot(kvm, memslot);

	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
}

/**
 * free_hyp_pgds - free Hyp-mode page tables
 */
void free_hyp_pgds(void)
{
	mutex_lock(&kvm_hyp_pgd_mutex);
	if (hyp_pgtable) {
		kvm_pgtable_hyp_destroy(hyp_pgtable);
		kfree(hyp_pgtable);
		hyp_pgtable = NULL;
	}
	mutex_unlock(&kvm_hyp_pgd_mutex);
}

static bool kvm_host_owns_hyp_mappings(void)
{
	if (static_branch_likely(&kvm_protected_mode_initialized))
		return false;

	/*
	 * This can happen at boot time when __create_hyp_mappings() is called
	 * after the hyp protection has been enabled, but the static key has
	 * not been flipped yet.
	 */
	if (!hyp_pgtable && is_protected_kvm_enabled())
		return false;

	WARN_ON(!hyp_pgtable);

	return true;
}

static int __create_hyp_mappings(unsigned long start, unsigned long size,
				 unsigned long phys, enum kvm_pgtable_prot prot)
{
	int err;

	if (!kvm_host_owns_hyp_mappings()) {
		return kvm_call_hyp_nvhe(__pkvm_create_mappings,
					 start, size, phys, prot);
	}

	mutex_lock(&kvm_hyp_pgd_mutex);
	err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot);
	mutex_unlock(&kvm_hyp_pgd_mutex);

	return err;
}

static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
{
	if (!is_vmalloc_addr(kaddr)) {
		BUG_ON(!virt_addr_valid(kaddr));
		return __pa(kaddr);
	} else {
		return page_to_phys(vmalloc_to_page(kaddr)) +
		       offset_in_page(kaddr);
	}
}

/**
 * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
 * @from:	The virtual kernel start address of the range
 * @to:		The virtual kernel end address of the range (exclusive)
 * @prot:	The protection to be applied to this range
 *
 * The same virtual address as the kernel virtual address is also used
 * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
 * physical pages.
 */
int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
{
	phys_addr_t phys_addr;
	unsigned long virt_addr;
	unsigned long start = kern_hyp_va((unsigned long)from);
	unsigned long end = kern_hyp_va((unsigned long)to);

	if (is_kernel_in_hyp_mode())
		return 0;

	start = start & PAGE_MASK;
	end = PAGE_ALIGN(end);

	for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
		int err;

		phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
		err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr,
					    prot);
		if (err)
			return err;
	}

	return 0;
}

static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
					unsigned long *haddr,
					enum kvm_pgtable_prot prot)
{
	unsigned long base;
	int ret = 0;

	if (!kvm_host_owns_hyp_mappings()) {
		base = kvm_call_hyp_nvhe(__pkvm_create_private_mapping,
					 phys_addr, size, prot);
		if (IS_ERR_OR_NULL((void *)base))
			return PTR_ERR((void *)base);
		*haddr = base;

		return 0;
	}

	mutex_lock(&kvm_hyp_pgd_mutex);

	/*
	 * This assumes that we have enough space below the idmap
	 * page to allocate our VAs. If not, the check below will
	 * kick. A potential alternative would be to detect that
	 * overflow and switch to an allocation above the idmap.
	 *
	 * The allocated size is always a multiple of PAGE_SIZE.
	 */
	size = PAGE_ALIGN(size + offset_in_page(phys_addr));
	base = io_map_base - size;

	/*
	 * Verify that BIT(VA_BITS - 1) hasn't been flipped by
	 * allocating the new area, as it would indicate we've
	 * overflowed the idmap/IO address range.
	 */
	if ((base ^ io_map_base) & BIT(VA_BITS - 1))
		ret = -ENOMEM;
	else
		io_map_base = base;

	mutex_unlock(&kvm_hyp_pgd_mutex);

	if (ret)
		goto out;

	ret = __create_hyp_mappings(base, size, phys_addr, prot);
	if (ret)
		goto out;

	*haddr = base + offset_in_page(phys_addr);
out:
	return ret;
}

/**
 * create_hyp_io_mappings - Map IO into both kernel and HYP
 * @phys_addr:	The physical start address which gets mapped
 * @size:	Size of the region being mapped
 * @kaddr:	Kernel VA for this mapping
 * @haddr:	HYP VA for this mapping
 */
int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
			   void __iomem **kaddr,
			   void __iomem **haddr)
{
	unsigned long addr;
	int ret;

	*kaddr = ioremap(phys_addr, size);
	if (!*kaddr)
		return -ENOMEM;

	if (is_kernel_in_hyp_mode()) {
		*haddr = *kaddr;
		return 0;
	}

	ret = __create_hyp_private_mapping(phys_addr, size,
					   &addr, PAGE_HYP_DEVICE);
	if (ret) {
		iounmap(*kaddr);
		*kaddr = NULL;
		*haddr = NULL;
		return ret;
	}

	*haddr = (void __iomem *)addr;
	return 0;
}

/**
 * create_hyp_exec_mappings - Map an executable range into HYP
 * @phys_addr:	The physical start address which gets mapped
 * @size:	Size of the region being mapped
 * @haddr:	HYP VA for this mapping
 */
int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
			     void **haddr)
{
	unsigned long addr;
	int ret;

	BUG_ON(is_kernel_in_hyp_mode());

	ret = __create_hyp_private_mapping(phys_addr, size,
					   &addr, PAGE_HYP_EXEC);
	if (ret) {
		*haddr = NULL;
		return ret;
	}

	*haddr = (void *)addr;
	return 0;
}

static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
	.zalloc_page		= stage2_memcache_zalloc_page,
	.zalloc_pages_exact	= kvm_host_zalloc_pages_exact,
	.free_pages_exact	= free_pages_exact,
	.get_page		= kvm_host_get_page,
	.put_page		= kvm_host_put_page,
	.page_count		= kvm_host_page_count,
	.phys_to_virt		= kvm_host_va,
	.virt_to_phys		= kvm_host_pa,
	.dcache_clean_inval_poc	= clean_dcache_guest_page,
	.icache_inval_pou	= invalidate_icache_guest_page,
};

/**
 * kvm_init_stage2_mmu - Initialise a S2 MMU structure
 * @kvm:	The pointer to the KVM structure
 * @mmu:	The pointer to the s2 MMU structure
 *
 * Allocates only the stage-2 HW PGD level table(s).
 * Note we don't need locking here as this is only called when the VM is
 * created, which can only be done once.
 */
int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
{
	int cpu, err;
	struct kvm_pgtable *pgt;

	if (mmu->pgt != NULL) {
		kvm_err("kvm_arch already initialized?\n");
		return -EINVAL;
	}

	pgt = kzalloc(sizeof(*pgt), GFP_KERNEL);
	if (!pgt)
		return -ENOMEM;

	err = kvm_pgtable_stage2_init(pgt, &kvm->arch, &kvm_s2_mm_ops);
	if (err)
		goto out_free_pgtable;

	mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
	if (!mmu->last_vcpu_ran) {
		err = -ENOMEM;
		goto out_destroy_pgtable;
	}

	for_each_possible_cpu(cpu)
		*per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;

	mmu->arch = &kvm->arch;
	mmu->pgt = pgt;
	mmu->pgd_phys = __pa(pgt->pgd);
	mmu->vmid.vmid_gen = 0;
	return 0;

out_destroy_pgtable:
	kvm_pgtable_stage2_destroy(pgt);
out_free_pgtable:
	kfree(pgt);
	return err;
}

static void stage2_unmap_memslot(struct kvm *kvm,
				 struct kvm_memory_slot *memslot)
{
	hva_t hva = memslot->userspace_addr;
	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
	phys_addr_t size = PAGE_SIZE * memslot->npages;
	hva_t reg_end = hva + size;

	/*
	 * A memory region could potentially cover multiple VMAs, and any holes
	 * between them, so iterate over all of them to find out if we should
	 * unmap any of them.
	 *
	 *     +--------------------------------------------+
	 * +---------------+----------------+   +----------------+
	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
	 * +---------------+----------------+   +----------------+
	 *     |               memory region                |
	 *     +--------------------------------------------+
	 */
	do {
		struct vm_area_struct *vma;
		hva_t vm_start, vm_end;

		vma = find_vma_intersection(current->mm, hva, reg_end);
		if (!vma)
			break;

		/*
		 * Take the intersection of this VMA with the memory region
		 */
		vm_start = max(hva, vma->vm_start);
		vm_end = min(reg_end, vma->vm_end);

		if (!(vma->vm_flags & VM_PFNMAP)) {
			gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
			unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start);
		}
		hva = vm_end;
	} while (hva < reg_end);
}

/**
 * stage2_unmap_vm - Unmap Stage-2 RAM mappings
 * @kvm: The struct kvm pointer
 *
 * Go through the memregions and unmap any regular RAM
 * backing memory already mapped to the VM.
 */
void stage2_unmap_vm(struct kvm *kvm)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	mmap_read_lock(current->mm);
	spin_lock(&kvm->mmu_lock);

	slots = kvm_memslots(kvm);
	kvm_for_each_memslot(memslot, slots)
		stage2_unmap_memslot(kvm, memslot);

	spin_unlock(&kvm->mmu_lock);
	mmap_read_unlock(current->mm);
	srcu_read_unlock(&kvm->srcu, idx);
}

void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
{
	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
	struct kvm_pgtable *pgt = NULL;

	spin_lock(&kvm->mmu_lock);
	pgt = mmu->pgt;
	if (pgt) {
		mmu->pgd_phys = 0;
		mmu->pgt = NULL;
		free_percpu(mmu->last_vcpu_ran);
	}
	spin_unlock(&kvm->mmu_lock);

	if (pgt) {
		kvm_pgtable_stage2_destroy(pgt);
		kfree(pgt);
	}
}

/**
 * kvm_phys_addr_ioremap - map a device range to guest IPA
 *
 * @kvm:	The KVM pointer
 * @guest_ipa:	The IPA at which to insert the mapping
 * @pa:		The physical address of the device
 * @size:	The size of the mapping
 * @writable:	Whether or not to create a writable mapping
 */
int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
			  phys_addr_t pa, unsigned long size, bool writable)
{
	phys_addr_t addr;
	int ret = 0;
	struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, NULL, };
	struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE |
				     KVM_PGTABLE_PROT_R |
				     (writable ? KVM_PGTABLE_PROT_W : 0);

	size += offset_in_page(guest_ipa);
	guest_ipa &= PAGE_MASK;

	for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) {
		ret = kvm_mmu_topup_memory_cache(&cache,
						 kvm_mmu_cache_min_pages(kvm));
		if (ret)
			break;

		spin_lock(&kvm->mmu_lock);
		ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot,
					     &cache);
		spin_unlock(&kvm->mmu_lock);
		if (ret)
			break;

		pa += PAGE_SIZE;
	}

	kvm_mmu_free_memory_cache(&cache);
	return ret;
}

/**
 * stage2_wp_range() - write protect stage2 memory region range
 * @mmu:	The KVM stage-2 MMU pointer
 * @addr:	Start address of range
 * @end:	End address of range
 */
static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
{
	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
	stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_wrprotect);
}

/**
 * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
 * @kvm:	The KVM pointer
 * @slot:	The memory slot to write protect
 *
 * Called to start logging dirty pages after memory region
 * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns
 * all present PUD, PMD and PTEs are write protected in the memory region.
 * Afterwards read of dirty page log can be called.
 *
 * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
 * serializing operations for VM memory regions.
 */
static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
{
	struct kvm_memslots *slots = kvm_memslots(kvm);
	struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
	phys_addr_t start, end;

	if (WARN_ON_ONCE(!memslot))
		return;

	start = memslot->base_gfn << PAGE_SHIFT;
	end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;

	spin_lock(&kvm->mmu_lock);
	stage2_wp_range(&kvm->arch.mmu, start, end);
	spin_unlock(&kvm->mmu_lock);
	kvm_flush_remote_tlbs(kvm);
}

/**
 * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
 * @kvm:	The KVM pointer
 * @slot:	The memory slot associated with mask
 * @gfn_offset:	The gfn offset in memory slot
 * @mask:	The mask of dirty pages at offset 'gfn_offset' in this memory
 *		slot to be write protected
 *
 * Walks bits set in mask write protects the associated pte's. Caller must
 * acquire kvm_mmu_lock.
 */
static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
		struct kvm_memory_slot *slot,
		gfn_t gfn_offset, unsigned long mask)
{
	phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
	phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT;
	phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;

	stage2_wp_range(&kvm->arch.mmu, start, end);
}

/*
 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
 * dirty pages.
 *
 * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
 * enable dirty logging for them.
 */
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
		struct kvm_memory_slot *slot,
		gfn_t gfn_offset, unsigned long mask)
{
	kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
}

static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
{
	send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
}

static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
					       unsigned long hva,
					       unsigned long map_size)
{
	gpa_t gpa_start;
	hva_t uaddr_start, uaddr_end;
	size_t size;

	/* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */
	if (map_size == PAGE_SIZE)
		return true;

	size = memslot->npages * PAGE_SIZE;

	gpa_start = memslot->base_gfn << PAGE_SHIFT;

	uaddr_start = memslot->userspace_addr;
	uaddr_end = uaddr_start + size;

	/*
	 * Pages belonging to memslots that don't have the same alignment
	 * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
	 * PMD/PUD entries, because we'll end up mapping the wrong pages.
	 *
	 * Consider a layout like the following:
	 *
	 *    memslot->userspace_addr:
	 *    +-----+--------------------+--------------------+---+
	 *    |abcde|fgh  Stage-1 block  |    Stage-1 block tv|xyz|
	 *    +-----+--------------------+--------------------+---+
	 *
	 *    memslot->base_gfn << PAGE_SHIFT:
	 *      +---+--------------------+--------------------+-----+
	 *      |abc|def  Stage-2 block  |    Stage-2 block   |tvxyz|
	 *      +---+--------------------+--------------------+-----+
	 *
	 * If we create those stage-2 blocks, we'll end up with this incorrect
	 * mapping:
	 *   d -> f
	 *   e -> g
	 *   f -> h
	 */
	if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
		return false;

	/*
	 * Next, let's make sure we're not trying to map anything not covered
	 * by the memslot. This means we have to prohibit block size mappings
	 * for the beginning and end of a non-block aligned and non-block sized
	 * memory slot (illustrated by the head and tail parts of the
	 * userspace view above containing pages 'abcde' and 'xyz',
	 * respectively).
	 *
	 * Note that it doesn't matter if we do the check using the
	 * userspace_addr or the base_gfn, as both are equally aligned (per
	 * the check above) and equally sized.
	 */
	return (hva & ~(map_size - 1)) >= uaddr_start &&
	       (hva & ~(map_size - 1)) + map_size <= uaddr_end;
}

/*
 * Check if the given hva is backed by a transparent huge page (THP) and
 * whether it can be mapped using block mapping in stage2. If so, adjust
 * the stage2 PFN and IPA accordingly. Only PMD_SIZE THPs are currently
 * supported. This will need to be updated to support other THP sizes.
 *
 * Returns the size of the mapping.
 */
static unsigned long
transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
			    unsigned long hva, kvm_pfn_t *pfnp,
			    phys_addr_t *ipap)
{
	kvm_pfn_t pfn = *pfnp;

	/*
	 * Make sure the adjustment is done only for THP pages. Also make
	 * sure that the HVA and IPA are sufficiently aligned and that the
	 * block map is contained within the memslot.
	 */
	if (kvm_is_transparent_hugepage(pfn) &&
	    fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
		/*
		 * The address we faulted on is backed by a transparent huge
		 * page. However, because we map the compound huge page and
		 * not the individual tail page, we need to transfer the
		 * refcount to the head page. We have to be careful that the
		 * THP doesn't start to split while we are adjusting the
		 * refcounts.
		 *
		 * We are sure this doesn't happen, because mmu_notifier_retry
		 * was successful and we are holding the mmu_lock, so if this
		 * THP is trying to split, it will be blocked in the mmu
		 * notifier before touching any of the pages, specifically
		 * before being able to call __split_huge_page_refcount().
		 *
		 * We can therefore safely transfer the refcount from PG_tail
		 * to PG_head and switch the pfn from a tail page to the head
		 * page accordingly.
		 */
		*ipap &= PMD_MASK;
		kvm_release_pfn_clean(pfn);
		pfn &= ~(PTRS_PER_PMD - 1);
		kvm_get_pfn(pfn);
		*pfnp = pfn;

		return PMD_SIZE;
	}

	/* Use page mapping if we cannot use block mapping. */
	return PAGE_SIZE;
}

static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva)
{
	unsigned long pa;

	if (is_vm_hugetlb_page(vma) && !(vma->vm_flags & VM_PFNMAP))
		return huge_page_shift(hstate_vma(vma));

	if (!(vma->vm_flags & VM_PFNMAP))
		return PAGE_SHIFT;

	VM_BUG_ON(is_vm_hugetlb_page(vma));

	pa = (vma->vm_pgoff << PAGE_SHIFT) + (hva - vma->vm_start);

#ifndef __PAGETABLE_PMD_FOLDED
	if ((hva & (PUD_SIZE - 1)) == (pa & (PUD_SIZE - 1)) &&
	    ALIGN_DOWN(hva, PUD_SIZE) >= vma->vm_start &&
	    ALIGN(hva, PUD_SIZE) <= vma->vm_end)
		return PUD_SHIFT;
#endif

	if ((hva & (PMD_SIZE - 1)) == (pa & (PMD_SIZE - 1)) &&
	    ALIGN_DOWN(hva, PMD_SIZE) >= vma->vm_start &&
	    ALIGN(hva, PMD_SIZE) <= vma->vm_end)
		return PMD_SHIFT;

	return PAGE_SHIFT;
}

/*
 * The page will be mapped in stage 2 as Normal Cacheable, so the VM will be
 * able to see the page's tags and therefore they must be initialised first. If
 * PG_mte_tagged is set, tags have already been initialised.
 *
 * The race in the test/set of the PG_mte_tagged flag is handled by:
 * - preventing VM_SHARED mappings in a memslot with MTE preventing two VMs
 *   racing to sanitise the same page
 * - mmap_lock protects between a VM faulting a page in and the VMM performing
 *   an mprotect() to add VM_MTE
 */
static int sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
			     unsigned long size)
{
	unsigned long i, nr_pages = size >> PAGE_SHIFT;
	struct page *page;

	if (!kvm_has_mte(kvm))
		return 0;

	/*
	 * pfn_to_online_page() is used to reject ZONE_DEVICE pages
	 * that may not support tags.
	 */
	page = pfn_to_online_page(pfn);

	if (!page)
		return -EFAULT;

	for (i = 0; i < nr_pages; i++, page++) {
		if (!test_bit(PG_mte_tagged, &page->flags)) {
			mte_clear_page_tags(page_address(page));
			set_bit(PG_mte_tagged, &page->flags);
		}
	}

	return 0;
}

static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
			  struct kvm_memory_slot *memslot, unsigned long hva,
			  unsigned long fault_status)
{
	int ret = 0;
	bool write_fault, writable, force_pte = false;
	bool exec_fault;
	bool device = false;
	bool shared;
	unsigned long mmu_seq;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
	struct vm_area_struct *vma;
	short vma_shift;
	gfn_t gfn;
	kvm_pfn_t pfn;
	bool logging_active = memslot_is_logging(memslot);
	unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu);
	unsigned long vma_pagesize, fault_granule;
	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
	struct kvm_pgtable *pgt;

	fault_granule = 1UL << ARM64_HW_PGTABLE_LEVEL_SHIFT(fault_level);
	write_fault = kvm_is_write_fault(vcpu);
	exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
	VM_BUG_ON(write_fault && exec_fault);

	if (fault_status == FSC_PERM && !write_fault && !exec_fault) {
		kvm_err("Unexpected L2 read permission error\n");
		return -EFAULT;
	}

	/*
	 * Let's check if we will get back a huge page backed by hugetlbfs, or
	 * get block mapping for device MMIO region.
	 */
	mmap_read_lock(current->mm);
	vma = vma_lookup(current->mm, hva);
	if (unlikely(!vma)) {
		kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
		mmap_read_unlock(current->mm);
		return -EFAULT;
	}

	/*
	 * logging_active is guaranteed to never be true for VM_PFNMAP
	 * memslots.
	 */
	if (logging_active) {
		force_pte = true;
		vma_shift = PAGE_SHIFT;
	} else {
		vma_shift = get_vma_page_shift(vma, hva);
	}

	shared = (vma->vm_flags & VM_PFNMAP);

	switch (vma_shift) {
#ifndef __PAGETABLE_PMD_FOLDED
	case PUD_SHIFT:
		if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE))
			break;
		fallthrough;
#endif
	case CONT_PMD_SHIFT:
		vma_shift = PMD_SHIFT;
		fallthrough;
	case PMD_SHIFT:
		if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE))
			break;
		fallthrough;
	case CONT_PTE_SHIFT:
		vma_shift = PAGE_SHIFT;
		force_pte = true;
		fallthrough;
	case PAGE_SHIFT:
		break;
	default:
		WARN_ONCE(1, "Unknown vma_shift %d", vma_shift);
	}

	vma_pagesize = 1UL << vma_shift;
	if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE)
		fault_ipa &= ~(vma_pagesize - 1);

	gfn = fault_ipa >> PAGE_SHIFT;
	mmap_read_unlock(current->mm);

	/*
	 * Permission faults just need to update the existing leaf entry,
	 * and so normally don't require allocations from the memcache. The
	 * only exception to this is when dirty logging is enabled at runtime
	 * and a write fault needs to collapse a block entry into a table.
	 */
	if (fault_status != FSC_PERM || (logging_active && write_fault)) {
		ret = kvm_mmu_topup_memory_cache(memcache,
						 kvm_mmu_cache_min_pages(kvm));
		if (ret)
			return ret;
	}

	mmu_seq = vcpu->kvm->mmu_notifier_seq;
	/*
	 * Ensure the read of mmu_notifier_seq happens before we call
	 * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
	 * the page we just got a reference to gets unmapped before we have a
	 * chance to grab the mmu_lock, which ensures that if the page gets
	 * unmapped afterwards, the call to kvm_unmap_gfn will take it away
	 * from us again properly. This smp_rmb() interacts with the smp_wmb()
	 * in kvm_mmu_notifier_invalidate_<page|range_end>.
	 *
	 * Besides, __gfn_to_pfn_memslot() instead of gfn_to_pfn_prot() is
	 * used to avoid unnecessary overhead introduced to locate the memory
	 * slot because it's always fixed even if @gfn is adjusted for huge
	 * pages.
	 */
	smp_rmb();

	pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
				   write_fault, &writable, NULL);
	if (pfn == KVM_PFN_ERR_HWPOISON) {
		kvm_send_hwpoison_signal(hva, vma_shift);
		return 0;
	}
	if (is_error_noslot_pfn(pfn))
		return -EFAULT;

	if (kvm_is_device_pfn(pfn)) {
		/*
		 * If the page was identified as device early by looking at
		 * the VMA flags, vma_pagesize is already representing the
		 * largest quantity we can map. If instead it was mapped
		 * via gfn_to_pfn_prot(), vma_pagesize is set to PAGE_SIZE
		 * and must not be upgraded.
		 *
		 * In both cases, we don't let transparent_hugepage_adjust()
		 * change things at the last minute.
		 */
		device = true;
	} else if (logging_active && !write_fault) {
		/*
		 * Only actually map the page as writable if this was a write
		 * fault.
		 */
		writable = false;
	}

	if (exec_fault && device)
		return -ENOEXEC;

	spin_lock(&kvm->mmu_lock);
	pgt = vcpu->arch.hw_mmu->pgt;
	if (mmu_notifier_retry(kvm, mmu_seq))
		goto out_unlock;

	/*
	 * If we are not forced to use page mapping, check if we are
	 * backed by a THP and thus use block mapping if possible.
	 */
	if (vma_pagesize == PAGE_SIZE && !(force_pte || device))
		vma_pagesize = transparent_hugepage_adjust(memslot, hva,
							   &pfn, &fault_ipa);

	if (fault_status != FSC_PERM && !device && kvm_has_mte(kvm)) {
		/* Check the VMM hasn't introduced a new VM_SHARED VMA */
		if (!shared)
			ret = sanitise_mte_tags(kvm, pfn, vma_pagesize);
		else
			ret = -EFAULT;
		if (ret)
			goto out_unlock;
	}

	if (writable)
		prot |= KVM_PGTABLE_PROT_W;

	if (exec_fault)
		prot |= KVM_PGTABLE_PROT_X;

	if (device)
		prot |= KVM_PGTABLE_PROT_DEVICE;
	else if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC))
		prot |= KVM_PGTABLE_PROT_X;

	/*
	 * Under the premise of getting a FSC_PERM fault, we just need to relax
	 * permissions only if vma_pagesize equals fault_granule. Otherwise,
	 * kvm_pgtable_stage2_map() should be called to change block size.
	 */
	if (fault_status == FSC_PERM && vma_pagesize == fault_granule) {
		ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
	} else {
		ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
					     __pfn_to_phys(pfn), prot,
					     memcache);
	}

	/* Mark the page dirty only if the fault is handled successfully */
	if (writable && !ret) {
		kvm_set_pfn_dirty(pfn);
		mark_page_dirty_in_slot(kvm, memslot, gfn);
	}

out_unlock:
	spin_unlock(&kvm->mmu_lock);
	kvm_set_pfn_accessed(pfn);
	kvm_release_pfn_clean(pfn);
	return ret != -EAGAIN ? ret : 0;
}

/* Resolve the access fault by making the page young again. */
static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
{
	pte_t pte;
	kvm_pte_t kpte;
	struct kvm_s2_mmu *mmu;

	trace_kvm_access_fault(fault_ipa);

	spin_lock(&vcpu->kvm->mmu_lock);
	mmu = vcpu->arch.hw_mmu;
	kpte = kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa);
	spin_unlock(&vcpu->kvm->mmu_lock);

	pte = __pte(kpte);
	if (pte_valid(pte))
		kvm_set_pfn_accessed(pte_pfn(pte));
}

/**
 * kvm_handle_guest_abort - handles all 2nd stage aborts
 * @vcpu:	the VCPU pointer
 *
 * Any abort that gets to the host is almost guaranteed to be caused by a
 * missing second stage translation table entry, which can mean that either the
 * guest simply needs more memory and we must allocate an appropriate page or it
 * can mean that the guest tried to access I/O memory, which is emulated by user
 * space. The distinction is based on the IPA causing the fault and whether this
 * memory region has been registered as standard RAM by user space.
 */
int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
{
	unsigned long fault_status;
	phys_addr_t fault_ipa;
	struct kvm_memory_slot *memslot;
	unsigned long hva;
	bool is_iabt, write_fault, writable;
	gfn_t gfn;
	int ret, idx;

	fault_status = kvm_vcpu_trap_get_fault_type(vcpu);

	fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
	is_iabt = kvm_vcpu_trap_is_iabt(vcpu);

	/* Synchronous External Abort? */
	if (kvm_vcpu_abt_issea(vcpu)) {
		/*
		 * For RAS the host kernel may handle this abort.
		 * There is no need to pass the error into the guest.
		 */
		if (kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_esr(vcpu)))
			kvm_inject_vabt(vcpu);

		return 1;
	}

	trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu),
			      kvm_vcpu_get_hfar(vcpu), fault_ipa);

	/* Check the stage-2 fault is trans. fault or write fault */
	if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
	    fault_status != FSC_ACCESS) {
		kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
			kvm_vcpu_trap_get_class(vcpu),
			(unsigned long)kvm_vcpu_trap_get_fault(vcpu),
			(unsigned long)kvm_vcpu_get_esr(vcpu));
		return -EFAULT;
	}

	idx = srcu_read_lock(&vcpu->kvm->srcu);

	gfn = fault_ipa >> PAGE_SHIFT;
	memslot = gfn_to_memslot(vcpu->kvm, gfn);
	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
	write_fault = kvm_is_write_fault(vcpu);
	if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
		/*
		 * The guest has put either its instructions or its page-tables
		 * somewhere it shouldn't have. Userspace won't be able to do
		 * anything about this (there's no syndrome for a start), so
		 * re-inject the abort back into the guest.
		 */
		if (is_iabt) {
			ret = -ENOEXEC;
			goto out;
		}

		if (kvm_vcpu_abt_iss1tw(vcpu)) {
			kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
			ret = 1;
			goto out_unlock;
		}

		/*
		 * Check for a cache maintenance operation. Since we
		 * ended-up here, we know it is outside of any memory
		 * slot. But we can't find out if that is for a device,
		 * or if the guest is just being stupid. The only thing
		 * we know for sure is that this range cannot be cached.
		 *
		 * So let's assume that the guest is just being
		 * cautious, and skip the instruction.
		 */
		if (kvm_is_error_hva(hva) && kvm_vcpu_dabt_is_cm(vcpu)) {
			kvm_incr_pc(vcpu);
			ret = 1;
			goto out_unlock;
		}

		/*
		 * The IPA is reported as [MAX:12], so we need to
		 * complement it with the bottom 12 bits from the
		 * faulting VA. This is always 12 bits, irrespective
		 * of the page size.
		 */
		fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
		ret = io_mem_abort(vcpu, fault_ipa);
		goto out_unlock;
	}

	/* Userspace should not be able to register out-of-bounds IPAs */
	VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm));

	if (fault_status == FSC_ACCESS) {
		handle_access_fault(vcpu, fault_ipa);
		ret = 1;
		goto out_unlock;
	}

	ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
	if (ret == 0)
		ret = 1;
out:
	if (ret == -ENOEXEC) {
		kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
		ret = 1;
	}
out_unlock:
	srcu_read_unlock(&vcpu->kvm->srcu, idx);
	return ret;
}

bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	if (!kvm->arch.mmu.pgt)
		return false;

	__unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT,
			     (range->end - range->start) << PAGE_SHIFT,
			     range->may_block);

	return false;
}

bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	kvm_pfn_t pfn = pte_pfn(range->pte);
	int ret;

	if (!kvm->arch.mmu.pgt)
		return false;

	WARN_ON(range->end - range->start != 1);

	ret = sanitise_mte_tags(kvm, pfn, PAGE_SIZE);
	if (ret)
		return false;

	/*
	 * We've moved a page around, probably through CoW, so let's treat
	 * it just like a translation fault and the map handler will clean
	 * the cache to the PoC.
	 *
	 * The MMU notifiers will have unmapped a huge PMD before calling
	 * ->change_pte() (which in turn calls kvm_set_spte_gfn()) and
	 * therefore we never need to clear out a huge PMD through this
	 * calling path and a memcache is not required.
	 */
	kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT,
			       PAGE_SIZE, __pfn_to_phys(pfn),
			       KVM_PGTABLE_PROT_R, NULL);

	return false;
}

bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	u64 size = (range->end - range->start) << PAGE_SHIFT;
	kvm_pte_t kpte;
	pte_t pte;

	if (!kvm->arch.mmu.pgt)
		return false;

	WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);

	kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt,
					range->start << PAGE_SHIFT);
	pte = __pte(kpte);
	return pte_valid(pte) && pte_young(pte);
}

bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	if (!kvm->arch.mmu.pgt)
		return false;

	return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt,
					   range->start << PAGE_SHIFT);
}

phys_addr_t kvm_mmu_get_httbr(void)
{
	return __pa(hyp_pgtable->pgd);
}

phys_addr_t kvm_get_idmap_vector(void)
{
	return hyp_idmap_vector;
}

static int kvm_map_idmap_text(void)
{
	unsigned long size = hyp_idmap_end - hyp_idmap_start;
	int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start,
					PAGE_HYP_EXEC);
	if (err)
		kvm_err("Failed to idmap %lx-%lx\n",
			hyp_idmap_start, hyp_idmap_end);

	return err;
}

static void *kvm_hyp_zalloc_page(void *arg)
{
	return (void *)get_zeroed_page(GFP_KERNEL);
}

static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = {
	.zalloc_page		= kvm_hyp_zalloc_page,
	.get_page		= kvm_host_get_page,
	.put_page		= kvm_host_put_page,
	.phys_to_virt		= kvm_host_va,
	.virt_to_phys		= kvm_host_pa,
};

int kvm_mmu_init(u32 *hyp_va_bits)
{
	int err;

	hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
	hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
	hyp_idmap_end = __pa_symbol(__hyp_idmap_text_end);
	hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
	hyp_idmap_vector = __pa_symbol(__kvm_hyp_init);

	/*
	 * We rely on the linker script to ensure at build time that the HYP
	 * init code does not cross a page boundary.
	 */
	BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);

	*hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET);
	kvm_debug("Using %u-bit virtual addresses at EL2\n", *hyp_va_bits);
	kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
	kvm_debug("HYP VA range: %lx:%lx\n",
		  kern_hyp_va(PAGE_OFFSET),
		  kern_hyp_va((unsigned long)high_memory - 1));

	if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
	    hyp_idmap_start <  kern_hyp_va((unsigned long)high_memory - 1) &&
	    hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
		/*
		 * The idmap page is intersecting with the VA space,
		 * it is not safe to continue further.
		 */
		kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
		err = -EINVAL;
		goto out;
	}

	hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL);
	if (!hyp_pgtable) {
		kvm_err("Hyp mode page-table not allocated\n");
		err = -ENOMEM;
		goto out;
	}

	err = kvm_pgtable_hyp_init(hyp_pgtable, *hyp_va_bits, &kvm_hyp_mm_ops);
	if (err)
		goto out_free_pgtable;

	err = kvm_map_idmap_text();
	if (err)
		goto out_destroy_pgtable;

	io_map_base = hyp_idmap_start;
	return 0;

out_destroy_pgtable:
	kvm_pgtable_hyp_destroy(hyp_pgtable);
out_free_pgtable:
	kfree(hyp_pgtable);
	hyp_pgtable = NULL;
out:
	return err;
}

void kvm_arch_commit_memory_region(struct kvm *kvm,
				   const struct kvm_userspace_memory_region *mem,
				   struct kvm_memory_slot *old,
				   const struct kvm_memory_slot *new,
				   enum kvm_mr_change change)
{
	/*
	 * At this point memslot has been committed and there is an
	 * allocated dirty_bitmap[], dirty pages will be tracked while the
	 * memory slot is write protected.
	 */
	if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
		/*
		 * If we're with initial-all-set, we don't need to write
		 * protect any pages because they're all reported as dirty.
		 * Huge pages and normal pages will be write protected
		 * gradually.
		 */
		if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) {
			kvm_mmu_wp_memory_region(kvm, mem->slot);
		}
	}
}

int kvm_arch_prepare_memory_region(struct kvm *kvm,
				   struct kvm_memory_slot *memslot,
				   const struct kvm_userspace_memory_region *mem,
				   enum kvm_mr_change change)
{
	hva_t hva = mem->userspace_addr;
	hva_t reg_end = hva + mem->memory_size;
	int ret = 0;

	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
	    change != KVM_MR_FLAGS_ONLY)
		return 0;

	/*
	 * Prevent userspace from creating a memory region outside of the IPA
	 * space addressable by the KVM guest IPA space.
	 */
	if ((memslot->base_gfn + memslot->npages) > (kvm_phys_size(kvm) >> PAGE_SHIFT))
		return -EFAULT;

	mmap_read_lock(current->mm);
	/*
	 * A memory region could potentially cover multiple VMAs, and any holes
	 * between them, so iterate over all of them.
	 *
	 *     +--------------------------------------------+
	 * +---------------+----------------+   +----------------+
	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
	 * +---------------+----------------+   +----------------+
	 *     |               memory region                |
	 *     +--------------------------------------------+
	 */
	do {
		struct vm_area_struct *vma;

		vma = find_vma_intersection(current->mm, hva, reg_end);
		if (!vma)
			break;

		/*
		 * VM_SHARED mappings are not allowed with MTE to avoid races
		 * when updating the PG_mte_tagged page flag, see
		 * sanitise_mte_tags for more details.
		 */
		if (kvm_has_mte(kvm) && vma->vm_flags & VM_SHARED) {
			ret = -EINVAL;
			break;
		}

		if (vma->vm_flags & VM_PFNMAP) {
			/* IO region dirty page logging not allowed */
			if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) {
				ret = -EINVAL;
				break;
			}
		}
		hva = min(reg_end, vma->vm_end);
	} while (hva < reg_end);

	mmap_read_unlock(current->mm);
	return ret;
}

void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
}

void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
{
}

void kvm_arch_flush_shadow_all(struct kvm *kvm)
{
	kvm_free_stage2_pgd(&kvm->arch.mmu);
}

void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
				   struct kvm_memory_slot *slot)
{
	gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
	phys_addr_t size = slot->npages << PAGE_SHIFT;

	spin_lock(&kvm->mmu_lock);
	unmap_stage2_range(&kvm->arch.mmu, gpa, size);
	spin_unlock(&kvm->mmu_lock);
}

/*
 * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
 *
 * Main problems:
 * - S/W ops are local to a CPU (not broadcast)
 * - We have line migration behind our back (speculation)
 * - System caches don't support S/W at all (damn!)
 *
 * In the face of the above, the best we can do is to try and convert
 * S/W ops to VA ops. Because the guest is not allowed to infer the
 * S/W to PA mapping, it can only use S/W to nuke the whole cache,
 * which is a rather good thing for us.
 *
 * Also, it is only used when turning caches on/off ("The expected
 * usage of the cache maintenance instructions that operate by set/way
 * is associated with the cache maintenance instructions associated
 * with the powerdown and powerup of caches, if this is required by
 * the implementation.").
 *
 * We use the following policy:
 *
 * - If we trap a S/W operation, we enable VM trapping to detect
 *   caches being turned on/off, and do a full clean.
 *
 * - We flush the caches on both caches being turned on and off.
 *
 * - Once the caches are enabled, we stop trapping VM ops.
 */
void kvm_set_way_flush(struct kvm_vcpu *vcpu)
{
	unsigned long hcr = *vcpu_hcr(vcpu);

	/*
	 * If this is the first time we do a S/W operation
	 * (i.e. HCR_TVM not set) flush the whole memory, and set the
	 * VM trapping.
	 *
	 * Otherwise, rely on the VM trapping to wait for the MMU +
	 * Caches to be turned off. At that point, we'll be able to
	 * clean the caches again.
	 */
	if (!(hcr & HCR_TVM)) {
		trace_kvm_set_way_flush(*vcpu_pc(vcpu),
					vcpu_has_cache_enabled(vcpu));
		stage2_flush_vm(vcpu->kvm);
		*vcpu_hcr(vcpu) = hcr | HCR_TVM;
	}
}

void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
{
	bool now_enabled = vcpu_has_cache_enabled(vcpu);

	/*
	 * If switching the MMU+caches on, need to invalidate the caches.
	 * If switching it off, need to clean the caches.
	 * Clean + invalidate does the trick always.
	 */
	if (now_enabled != was_enabled)
		stage2_flush_vm(vcpu->kvm);

	/* Caches are now on, stop trapping VM ops (until a S/W op) */
	if (now_enabled)
		*vcpu_hcr(vcpu) &= ~HCR_TVM;

	trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
}