1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (C) 2012 - Virtual Open Systems and Columbia University 4 * Author: Christoffer Dall <c.dall@virtualopensystems.com> 5 */ 6 7 #include <linux/mman.h> 8 #include <linux/kvm_host.h> 9 #include <linux/io.h> 10 #include <linux/hugetlb.h> 11 #include <linux/sched/signal.h> 12 #include <trace/events/kvm.h> 13 #include <asm/pgalloc.h> 14 #include <asm/cacheflush.h> 15 #include <asm/kvm_arm.h> 16 #include <asm/kvm_mmu.h> 17 #include <asm/kvm_ras.h> 18 #include <asm/kvm_asm.h> 19 #include <asm/kvm_emulate.h> 20 #include <asm/virt.h> 21 22 #include "trace.h" 23 24 static pgd_t *boot_hyp_pgd; 25 static pgd_t *hyp_pgd; 26 static pgd_t *merged_hyp_pgd; 27 static DEFINE_MUTEX(kvm_hyp_pgd_mutex); 28 29 static unsigned long hyp_idmap_start; 30 static unsigned long hyp_idmap_end; 31 static phys_addr_t hyp_idmap_vector; 32 33 static unsigned long io_map_base; 34 35 #define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t)) 36 37 #define KVM_S2PTE_FLAG_IS_IOMAP (1UL << 0) 38 #define KVM_S2_FLAG_LOGGING_ACTIVE (1UL << 1) 39 40 static bool is_iomap(unsigned long flags) 41 { 42 return flags & KVM_S2PTE_FLAG_IS_IOMAP; 43 } 44 45 static bool memslot_is_logging(struct kvm_memory_slot *memslot) 46 { 47 return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY); 48 } 49 50 /** 51 * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8 52 * @kvm: pointer to kvm structure. 53 * 54 * Interface to HYP function to flush all VM TLB entries 55 */ 56 void kvm_flush_remote_tlbs(struct kvm *kvm) 57 { 58 kvm_call_hyp(__kvm_tlb_flush_vmid, kvm); 59 } 60 61 static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) 62 { 63 kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa); 64 } 65 66 /* 67 * D-Cache management functions. They take the page table entries by 68 * value, as they are flushing the cache using the kernel mapping (or 69 * kmap on 32bit). 70 */ 71 static void kvm_flush_dcache_pte(pte_t pte) 72 { 73 __kvm_flush_dcache_pte(pte); 74 } 75 76 static void kvm_flush_dcache_pmd(pmd_t pmd) 77 { 78 __kvm_flush_dcache_pmd(pmd); 79 } 80 81 static void kvm_flush_dcache_pud(pud_t pud) 82 { 83 __kvm_flush_dcache_pud(pud); 84 } 85 86 static bool kvm_is_device_pfn(unsigned long pfn) 87 { 88 return !pfn_valid(pfn); 89 } 90 91 /** 92 * stage2_dissolve_pmd() - clear and flush huge PMD entry 93 * @kvm: pointer to kvm structure. 94 * @addr: IPA 95 * @pmd: pmd pointer for IPA 96 * 97 * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. 98 */ 99 static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd) 100 { 101 if (!pmd_thp_or_huge(*pmd)) 102 return; 103 104 pmd_clear(pmd); 105 kvm_tlb_flush_vmid_ipa(kvm, addr); 106 put_page(virt_to_page(pmd)); 107 } 108 109 /** 110 * stage2_dissolve_pud() - clear and flush huge PUD entry 111 * @kvm: pointer to kvm structure. 112 * @addr: IPA 113 * @pud: pud pointer for IPA 114 * 115 * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs. 116 */ 117 static void stage2_dissolve_pud(struct kvm *kvm, phys_addr_t addr, pud_t *pudp) 118 { 119 if (!stage2_pud_huge(kvm, *pudp)) 120 return; 121 122 stage2_pud_clear(kvm, pudp); 123 kvm_tlb_flush_vmid_ipa(kvm, addr); 124 put_page(virt_to_page(pudp)); 125 } 126 127 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, 128 int min, int max) 129 { 130 void *page; 131 132 BUG_ON(max > KVM_NR_MEM_OBJS); 133 if (cache->nobjs >= min) 134 return 0; 135 while (cache->nobjs < max) { 136 page = (void *)__get_free_page(GFP_PGTABLE_USER); 137 if (!page) 138 return -ENOMEM; 139 cache->objects[cache->nobjs++] = page; 140 } 141 return 0; 142 } 143 144 static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) 145 { 146 while (mc->nobjs) 147 free_page((unsigned long)mc->objects[--mc->nobjs]); 148 } 149 150 static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) 151 { 152 void *p; 153 154 BUG_ON(!mc || !mc->nobjs); 155 p = mc->objects[--mc->nobjs]; 156 return p; 157 } 158 159 static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr) 160 { 161 p4d_t *p4d_table __maybe_unused = stage2_p4d_offset(kvm, pgd, 0UL); 162 stage2_pgd_clear(kvm, pgd); 163 kvm_tlb_flush_vmid_ipa(kvm, addr); 164 stage2_p4d_free(kvm, p4d_table); 165 put_page(virt_to_page(pgd)); 166 } 167 168 static void clear_stage2_p4d_entry(struct kvm *kvm, p4d_t *p4d, phys_addr_t addr) 169 { 170 pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, p4d, 0); 171 stage2_p4d_clear(kvm, p4d); 172 kvm_tlb_flush_vmid_ipa(kvm, addr); 173 stage2_pud_free(kvm, pud_table); 174 put_page(virt_to_page(p4d)); 175 } 176 177 static void clear_stage2_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr) 178 { 179 pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0); 180 VM_BUG_ON(stage2_pud_huge(kvm, *pud)); 181 stage2_pud_clear(kvm, pud); 182 kvm_tlb_flush_vmid_ipa(kvm, addr); 183 stage2_pmd_free(kvm, pmd_table); 184 put_page(virt_to_page(pud)); 185 } 186 187 static void clear_stage2_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr) 188 { 189 pte_t *pte_table = pte_offset_kernel(pmd, 0); 190 VM_BUG_ON(pmd_thp_or_huge(*pmd)); 191 pmd_clear(pmd); 192 kvm_tlb_flush_vmid_ipa(kvm, addr); 193 free_page((unsigned long)pte_table); 194 put_page(virt_to_page(pmd)); 195 } 196 197 static inline void kvm_set_pte(pte_t *ptep, pte_t new_pte) 198 { 199 WRITE_ONCE(*ptep, new_pte); 200 dsb(ishst); 201 } 202 203 static inline void kvm_set_pmd(pmd_t *pmdp, pmd_t new_pmd) 204 { 205 WRITE_ONCE(*pmdp, new_pmd); 206 dsb(ishst); 207 } 208 209 static inline void kvm_pmd_populate(pmd_t *pmdp, pte_t *ptep) 210 { 211 kvm_set_pmd(pmdp, kvm_mk_pmd(ptep)); 212 } 213 214 static inline void kvm_pud_populate(pud_t *pudp, pmd_t *pmdp) 215 { 216 WRITE_ONCE(*pudp, kvm_mk_pud(pmdp)); 217 dsb(ishst); 218 } 219 220 static inline void kvm_p4d_populate(p4d_t *p4dp, pud_t *pudp) 221 { 222 WRITE_ONCE(*p4dp, kvm_mk_p4d(pudp)); 223 dsb(ishst); 224 } 225 226 static inline void kvm_pgd_populate(pgd_t *pgdp, p4d_t *p4dp) 227 { 228 #ifndef __PAGETABLE_P4D_FOLDED 229 WRITE_ONCE(*pgdp, kvm_mk_pgd(p4dp)); 230 dsb(ishst); 231 #endif 232 } 233 234 /* 235 * Unmapping vs dcache management: 236 * 237 * If a guest maps certain memory pages as uncached, all writes will 238 * bypass the data cache and go directly to RAM. However, the CPUs 239 * can still speculate reads (not writes) and fill cache lines with 240 * data. 241 * 242 * Those cache lines will be *clean* cache lines though, so a 243 * clean+invalidate operation is equivalent to an invalidate 244 * operation, because no cache lines are marked dirty. 245 * 246 * Those clean cache lines could be filled prior to an uncached write 247 * by the guest, and the cache coherent IO subsystem would therefore 248 * end up writing old data to disk. 249 * 250 * This is why right after unmapping a page/section and invalidating 251 * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure 252 * the IO subsystem will never hit in the cache. 253 * 254 * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as 255 * we then fully enforce cacheability of RAM, no matter what the guest 256 * does. 257 */ 258 static void unmap_stage2_ptes(struct kvm *kvm, pmd_t *pmd, 259 phys_addr_t addr, phys_addr_t end) 260 { 261 phys_addr_t start_addr = addr; 262 pte_t *pte, *start_pte; 263 264 start_pte = pte = pte_offset_kernel(pmd, addr); 265 do { 266 if (!pte_none(*pte)) { 267 pte_t old_pte = *pte; 268 269 kvm_set_pte(pte, __pte(0)); 270 kvm_tlb_flush_vmid_ipa(kvm, addr); 271 272 /* No need to invalidate the cache for device mappings */ 273 if (!kvm_is_device_pfn(pte_pfn(old_pte))) 274 kvm_flush_dcache_pte(old_pte); 275 276 put_page(virt_to_page(pte)); 277 } 278 } while (pte++, addr += PAGE_SIZE, addr != end); 279 280 if (stage2_pte_table_empty(kvm, start_pte)) 281 clear_stage2_pmd_entry(kvm, pmd, start_addr); 282 } 283 284 static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud, 285 phys_addr_t addr, phys_addr_t end) 286 { 287 phys_addr_t next, start_addr = addr; 288 pmd_t *pmd, *start_pmd; 289 290 start_pmd = pmd = stage2_pmd_offset(kvm, pud, addr); 291 do { 292 next = stage2_pmd_addr_end(kvm, addr, end); 293 if (!pmd_none(*pmd)) { 294 if (pmd_thp_or_huge(*pmd)) { 295 pmd_t old_pmd = *pmd; 296 297 pmd_clear(pmd); 298 kvm_tlb_flush_vmid_ipa(kvm, addr); 299 300 kvm_flush_dcache_pmd(old_pmd); 301 302 put_page(virt_to_page(pmd)); 303 } else { 304 unmap_stage2_ptes(kvm, pmd, addr, next); 305 } 306 } 307 } while (pmd++, addr = next, addr != end); 308 309 if (stage2_pmd_table_empty(kvm, start_pmd)) 310 clear_stage2_pud_entry(kvm, pud, start_addr); 311 } 312 313 static void unmap_stage2_puds(struct kvm *kvm, p4d_t *p4d, 314 phys_addr_t addr, phys_addr_t end) 315 { 316 phys_addr_t next, start_addr = addr; 317 pud_t *pud, *start_pud; 318 319 start_pud = pud = stage2_pud_offset(kvm, p4d, addr); 320 do { 321 next = stage2_pud_addr_end(kvm, addr, end); 322 if (!stage2_pud_none(kvm, *pud)) { 323 if (stage2_pud_huge(kvm, *pud)) { 324 pud_t old_pud = *pud; 325 326 stage2_pud_clear(kvm, pud); 327 kvm_tlb_flush_vmid_ipa(kvm, addr); 328 kvm_flush_dcache_pud(old_pud); 329 put_page(virt_to_page(pud)); 330 } else { 331 unmap_stage2_pmds(kvm, pud, addr, next); 332 } 333 } 334 } while (pud++, addr = next, addr != end); 335 336 if (stage2_pud_table_empty(kvm, start_pud)) 337 clear_stage2_p4d_entry(kvm, p4d, start_addr); 338 } 339 340 static void unmap_stage2_p4ds(struct kvm *kvm, pgd_t *pgd, 341 phys_addr_t addr, phys_addr_t end) 342 { 343 phys_addr_t next, start_addr = addr; 344 p4d_t *p4d, *start_p4d; 345 346 start_p4d = p4d = stage2_p4d_offset(kvm, pgd, addr); 347 do { 348 next = stage2_p4d_addr_end(kvm, addr, end); 349 if (!stage2_p4d_none(kvm, *p4d)) 350 unmap_stage2_puds(kvm, p4d, addr, next); 351 } while (p4d++, addr = next, addr != end); 352 353 if (stage2_p4d_table_empty(kvm, start_p4d)) 354 clear_stage2_pgd_entry(kvm, pgd, start_addr); 355 } 356 357 /** 358 * unmap_stage2_range -- Clear stage2 page table entries to unmap a range 359 * @kvm: The VM pointer 360 * @start: The intermediate physical base address of the range to unmap 361 * @size: The size of the area to unmap 362 * 363 * Clear a range of stage-2 mappings, lowering the various ref-counts. Must 364 * be called while holding mmu_lock (unless for freeing the stage2 pgd before 365 * destroying the VM), otherwise another faulting VCPU may come in and mess 366 * with things behind our backs. 367 */ 368 static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size) 369 { 370 pgd_t *pgd; 371 phys_addr_t addr = start, end = start + size; 372 phys_addr_t next; 373 374 assert_spin_locked(&kvm->mmu_lock); 375 WARN_ON(size & ~PAGE_MASK); 376 377 pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr); 378 do { 379 /* 380 * Make sure the page table is still active, as another thread 381 * could have possibly freed the page table, while we released 382 * the lock. 383 */ 384 if (!READ_ONCE(kvm->arch.pgd)) 385 break; 386 next = stage2_pgd_addr_end(kvm, addr, end); 387 if (!stage2_pgd_none(kvm, *pgd)) 388 unmap_stage2_p4ds(kvm, pgd, addr, next); 389 /* 390 * If the range is too large, release the kvm->mmu_lock 391 * to prevent starvation and lockup detector warnings. 392 */ 393 if (next != end) 394 cond_resched_lock(&kvm->mmu_lock); 395 } while (pgd++, addr = next, addr != end); 396 } 397 398 static void stage2_flush_ptes(struct kvm *kvm, pmd_t *pmd, 399 phys_addr_t addr, phys_addr_t end) 400 { 401 pte_t *pte; 402 403 pte = pte_offset_kernel(pmd, addr); 404 do { 405 if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte))) 406 kvm_flush_dcache_pte(*pte); 407 } while (pte++, addr += PAGE_SIZE, addr != end); 408 } 409 410 static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud, 411 phys_addr_t addr, phys_addr_t end) 412 { 413 pmd_t *pmd; 414 phys_addr_t next; 415 416 pmd = stage2_pmd_offset(kvm, pud, addr); 417 do { 418 next = stage2_pmd_addr_end(kvm, addr, end); 419 if (!pmd_none(*pmd)) { 420 if (pmd_thp_or_huge(*pmd)) 421 kvm_flush_dcache_pmd(*pmd); 422 else 423 stage2_flush_ptes(kvm, pmd, addr, next); 424 } 425 } while (pmd++, addr = next, addr != end); 426 } 427 428 static void stage2_flush_puds(struct kvm *kvm, p4d_t *p4d, 429 phys_addr_t addr, phys_addr_t end) 430 { 431 pud_t *pud; 432 phys_addr_t next; 433 434 pud = stage2_pud_offset(kvm, p4d, addr); 435 do { 436 next = stage2_pud_addr_end(kvm, addr, end); 437 if (!stage2_pud_none(kvm, *pud)) { 438 if (stage2_pud_huge(kvm, *pud)) 439 kvm_flush_dcache_pud(*pud); 440 else 441 stage2_flush_pmds(kvm, pud, addr, next); 442 } 443 } while (pud++, addr = next, addr != end); 444 } 445 446 static void stage2_flush_p4ds(struct kvm *kvm, pgd_t *pgd, 447 phys_addr_t addr, phys_addr_t end) 448 { 449 p4d_t *p4d; 450 phys_addr_t next; 451 452 p4d = stage2_p4d_offset(kvm, pgd, addr); 453 do { 454 next = stage2_p4d_addr_end(kvm, addr, end); 455 if (!stage2_p4d_none(kvm, *p4d)) 456 stage2_flush_puds(kvm, p4d, addr, next); 457 } while (p4d++, addr = next, addr != end); 458 } 459 460 static void stage2_flush_memslot(struct kvm *kvm, 461 struct kvm_memory_slot *memslot) 462 { 463 phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; 464 phys_addr_t end = addr + PAGE_SIZE * memslot->npages; 465 phys_addr_t next; 466 pgd_t *pgd; 467 468 pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr); 469 do { 470 next = stage2_pgd_addr_end(kvm, addr, end); 471 if (!stage2_pgd_none(kvm, *pgd)) 472 stage2_flush_p4ds(kvm, pgd, addr, next); 473 474 if (next != end) 475 cond_resched_lock(&kvm->mmu_lock); 476 } while (pgd++, addr = next, addr != end); 477 } 478 479 /** 480 * stage2_flush_vm - Invalidate cache for pages mapped in stage 2 481 * @kvm: The struct kvm pointer 482 * 483 * Go through the stage 2 page tables and invalidate any cache lines 484 * backing memory already mapped to the VM. 485 */ 486 static void stage2_flush_vm(struct kvm *kvm) 487 { 488 struct kvm_memslots *slots; 489 struct kvm_memory_slot *memslot; 490 int idx; 491 492 idx = srcu_read_lock(&kvm->srcu); 493 spin_lock(&kvm->mmu_lock); 494 495 slots = kvm_memslots(kvm); 496 kvm_for_each_memslot(memslot, slots) 497 stage2_flush_memslot(kvm, memslot); 498 499 spin_unlock(&kvm->mmu_lock); 500 srcu_read_unlock(&kvm->srcu, idx); 501 } 502 503 static void clear_hyp_pgd_entry(pgd_t *pgd) 504 { 505 p4d_t *p4d_table __maybe_unused = p4d_offset(pgd, 0UL); 506 pgd_clear(pgd); 507 p4d_free(NULL, p4d_table); 508 put_page(virt_to_page(pgd)); 509 } 510 511 static void clear_hyp_p4d_entry(p4d_t *p4d) 512 { 513 pud_t *pud_table __maybe_unused = pud_offset(p4d, 0UL); 514 VM_BUG_ON(p4d_huge(*p4d)); 515 p4d_clear(p4d); 516 pud_free(NULL, pud_table); 517 put_page(virt_to_page(p4d)); 518 } 519 520 static void clear_hyp_pud_entry(pud_t *pud) 521 { 522 pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0); 523 VM_BUG_ON(pud_huge(*pud)); 524 pud_clear(pud); 525 pmd_free(NULL, pmd_table); 526 put_page(virt_to_page(pud)); 527 } 528 529 static void clear_hyp_pmd_entry(pmd_t *pmd) 530 { 531 pte_t *pte_table = pte_offset_kernel(pmd, 0); 532 VM_BUG_ON(pmd_thp_or_huge(*pmd)); 533 pmd_clear(pmd); 534 pte_free_kernel(NULL, pte_table); 535 put_page(virt_to_page(pmd)); 536 } 537 538 static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) 539 { 540 pte_t *pte, *start_pte; 541 542 start_pte = pte = pte_offset_kernel(pmd, addr); 543 do { 544 if (!pte_none(*pte)) { 545 kvm_set_pte(pte, __pte(0)); 546 put_page(virt_to_page(pte)); 547 } 548 } while (pte++, addr += PAGE_SIZE, addr != end); 549 550 if (hyp_pte_table_empty(start_pte)) 551 clear_hyp_pmd_entry(pmd); 552 } 553 554 static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end) 555 { 556 phys_addr_t next; 557 pmd_t *pmd, *start_pmd; 558 559 start_pmd = pmd = pmd_offset(pud, addr); 560 do { 561 next = pmd_addr_end(addr, end); 562 /* Hyp doesn't use huge pmds */ 563 if (!pmd_none(*pmd)) 564 unmap_hyp_ptes(pmd, addr, next); 565 } while (pmd++, addr = next, addr != end); 566 567 if (hyp_pmd_table_empty(start_pmd)) 568 clear_hyp_pud_entry(pud); 569 } 570 571 static void unmap_hyp_puds(p4d_t *p4d, phys_addr_t addr, phys_addr_t end) 572 { 573 phys_addr_t next; 574 pud_t *pud, *start_pud; 575 576 start_pud = pud = pud_offset(p4d, addr); 577 do { 578 next = pud_addr_end(addr, end); 579 /* Hyp doesn't use huge puds */ 580 if (!pud_none(*pud)) 581 unmap_hyp_pmds(pud, addr, next); 582 } while (pud++, addr = next, addr != end); 583 584 if (hyp_pud_table_empty(start_pud)) 585 clear_hyp_p4d_entry(p4d); 586 } 587 588 static void unmap_hyp_p4ds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end) 589 { 590 phys_addr_t next; 591 p4d_t *p4d, *start_p4d; 592 593 start_p4d = p4d = p4d_offset(pgd, addr); 594 do { 595 next = p4d_addr_end(addr, end); 596 /* Hyp doesn't use huge p4ds */ 597 if (!p4d_none(*p4d)) 598 unmap_hyp_puds(p4d, addr, next); 599 } while (p4d++, addr = next, addr != end); 600 601 if (hyp_p4d_table_empty(start_p4d)) 602 clear_hyp_pgd_entry(pgd); 603 } 604 605 static unsigned int kvm_pgd_index(unsigned long addr, unsigned int ptrs_per_pgd) 606 { 607 return (addr >> PGDIR_SHIFT) & (ptrs_per_pgd - 1); 608 } 609 610 static void __unmap_hyp_range(pgd_t *pgdp, unsigned long ptrs_per_pgd, 611 phys_addr_t start, u64 size) 612 { 613 pgd_t *pgd; 614 phys_addr_t addr = start, end = start + size; 615 phys_addr_t next; 616 617 /* 618 * We don't unmap anything from HYP, except at the hyp tear down. 619 * Hence, we don't have to invalidate the TLBs here. 620 */ 621 pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd); 622 do { 623 next = pgd_addr_end(addr, end); 624 if (!pgd_none(*pgd)) 625 unmap_hyp_p4ds(pgd, addr, next); 626 } while (pgd++, addr = next, addr != end); 627 } 628 629 static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size) 630 { 631 __unmap_hyp_range(pgdp, PTRS_PER_PGD, start, size); 632 } 633 634 static void unmap_hyp_idmap_range(pgd_t *pgdp, phys_addr_t start, u64 size) 635 { 636 __unmap_hyp_range(pgdp, __kvm_idmap_ptrs_per_pgd(), start, size); 637 } 638 639 /** 640 * free_hyp_pgds - free Hyp-mode page tables 641 * 642 * Assumes hyp_pgd is a page table used strictly in Hyp-mode and 643 * therefore contains either mappings in the kernel memory area (above 644 * PAGE_OFFSET), or device mappings in the idmap range. 645 * 646 * boot_hyp_pgd should only map the idmap range, and is only used in 647 * the extended idmap case. 648 */ 649 void free_hyp_pgds(void) 650 { 651 pgd_t *id_pgd; 652 653 mutex_lock(&kvm_hyp_pgd_mutex); 654 655 id_pgd = boot_hyp_pgd ? boot_hyp_pgd : hyp_pgd; 656 657 if (id_pgd) { 658 /* In case we never called hyp_mmu_init() */ 659 if (!io_map_base) 660 io_map_base = hyp_idmap_start; 661 unmap_hyp_idmap_range(id_pgd, io_map_base, 662 hyp_idmap_start + PAGE_SIZE - io_map_base); 663 } 664 665 if (boot_hyp_pgd) { 666 free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order); 667 boot_hyp_pgd = NULL; 668 } 669 670 if (hyp_pgd) { 671 unmap_hyp_range(hyp_pgd, kern_hyp_va(PAGE_OFFSET), 672 (uintptr_t)high_memory - PAGE_OFFSET); 673 674 free_pages((unsigned long)hyp_pgd, hyp_pgd_order); 675 hyp_pgd = NULL; 676 } 677 if (merged_hyp_pgd) { 678 clear_page(merged_hyp_pgd); 679 free_page((unsigned long)merged_hyp_pgd); 680 merged_hyp_pgd = NULL; 681 } 682 683 mutex_unlock(&kvm_hyp_pgd_mutex); 684 } 685 686 static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start, 687 unsigned long end, unsigned long pfn, 688 pgprot_t prot) 689 { 690 pte_t *pte; 691 unsigned long addr; 692 693 addr = start; 694 do { 695 pte = pte_offset_kernel(pmd, addr); 696 kvm_set_pte(pte, kvm_pfn_pte(pfn, prot)); 697 get_page(virt_to_page(pte)); 698 pfn++; 699 } while (addr += PAGE_SIZE, addr != end); 700 } 701 702 static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start, 703 unsigned long end, unsigned long pfn, 704 pgprot_t prot) 705 { 706 pmd_t *pmd; 707 pte_t *pte; 708 unsigned long addr, next; 709 710 addr = start; 711 do { 712 pmd = pmd_offset(pud, addr); 713 714 BUG_ON(pmd_sect(*pmd)); 715 716 if (pmd_none(*pmd)) { 717 pte = pte_alloc_one_kernel(NULL); 718 if (!pte) { 719 kvm_err("Cannot allocate Hyp pte\n"); 720 return -ENOMEM; 721 } 722 kvm_pmd_populate(pmd, pte); 723 get_page(virt_to_page(pmd)); 724 } 725 726 next = pmd_addr_end(addr, end); 727 728 create_hyp_pte_mappings(pmd, addr, next, pfn, prot); 729 pfn += (next - addr) >> PAGE_SHIFT; 730 } while (addr = next, addr != end); 731 732 return 0; 733 } 734 735 static int create_hyp_pud_mappings(p4d_t *p4d, unsigned long start, 736 unsigned long end, unsigned long pfn, 737 pgprot_t prot) 738 { 739 pud_t *pud; 740 pmd_t *pmd; 741 unsigned long addr, next; 742 int ret; 743 744 addr = start; 745 do { 746 pud = pud_offset(p4d, addr); 747 748 if (pud_none_or_clear_bad(pud)) { 749 pmd = pmd_alloc_one(NULL, addr); 750 if (!pmd) { 751 kvm_err("Cannot allocate Hyp pmd\n"); 752 return -ENOMEM; 753 } 754 kvm_pud_populate(pud, pmd); 755 get_page(virt_to_page(pud)); 756 } 757 758 next = pud_addr_end(addr, end); 759 ret = create_hyp_pmd_mappings(pud, addr, next, pfn, prot); 760 if (ret) 761 return ret; 762 pfn += (next - addr) >> PAGE_SHIFT; 763 } while (addr = next, addr != end); 764 765 return 0; 766 } 767 768 static int create_hyp_p4d_mappings(pgd_t *pgd, unsigned long start, 769 unsigned long end, unsigned long pfn, 770 pgprot_t prot) 771 { 772 p4d_t *p4d; 773 pud_t *pud; 774 unsigned long addr, next; 775 int ret; 776 777 addr = start; 778 do { 779 p4d = p4d_offset(pgd, addr); 780 781 if (p4d_none(*p4d)) { 782 pud = pud_alloc_one(NULL, addr); 783 if (!pud) { 784 kvm_err("Cannot allocate Hyp pud\n"); 785 return -ENOMEM; 786 } 787 kvm_p4d_populate(p4d, pud); 788 get_page(virt_to_page(p4d)); 789 } 790 791 next = p4d_addr_end(addr, end); 792 ret = create_hyp_pud_mappings(p4d, addr, next, pfn, prot); 793 if (ret) 794 return ret; 795 pfn += (next - addr) >> PAGE_SHIFT; 796 } while (addr = next, addr != end); 797 798 return 0; 799 } 800 801 static int __create_hyp_mappings(pgd_t *pgdp, unsigned long ptrs_per_pgd, 802 unsigned long start, unsigned long end, 803 unsigned long pfn, pgprot_t prot) 804 { 805 pgd_t *pgd; 806 p4d_t *p4d; 807 unsigned long addr, next; 808 int err = 0; 809 810 mutex_lock(&kvm_hyp_pgd_mutex); 811 addr = start & PAGE_MASK; 812 end = PAGE_ALIGN(end); 813 do { 814 pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd); 815 816 if (pgd_none(*pgd)) { 817 p4d = p4d_alloc_one(NULL, addr); 818 if (!p4d) { 819 kvm_err("Cannot allocate Hyp p4d\n"); 820 err = -ENOMEM; 821 goto out; 822 } 823 kvm_pgd_populate(pgd, p4d); 824 get_page(virt_to_page(pgd)); 825 } 826 827 next = pgd_addr_end(addr, end); 828 err = create_hyp_p4d_mappings(pgd, addr, next, pfn, prot); 829 if (err) 830 goto out; 831 pfn += (next - addr) >> PAGE_SHIFT; 832 } while (addr = next, addr != end); 833 out: 834 mutex_unlock(&kvm_hyp_pgd_mutex); 835 return err; 836 } 837 838 static phys_addr_t kvm_kaddr_to_phys(void *kaddr) 839 { 840 if (!is_vmalloc_addr(kaddr)) { 841 BUG_ON(!virt_addr_valid(kaddr)); 842 return __pa(kaddr); 843 } else { 844 return page_to_phys(vmalloc_to_page(kaddr)) + 845 offset_in_page(kaddr); 846 } 847 } 848 849 /** 850 * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode 851 * @from: The virtual kernel start address of the range 852 * @to: The virtual kernel end address of the range (exclusive) 853 * @prot: The protection to be applied to this range 854 * 855 * The same virtual address as the kernel virtual address is also used 856 * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying 857 * physical pages. 858 */ 859 int create_hyp_mappings(void *from, void *to, pgprot_t prot) 860 { 861 phys_addr_t phys_addr; 862 unsigned long virt_addr; 863 unsigned long start = kern_hyp_va((unsigned long)from); 864 unsigned long end = kern_hyp_va((unsigned long)to); 865 866 if (is_kernel_in_hyp_mode()) 867 return 0; 868 869 start = start & PAGE_MASK; 870 end = PAGE_ALIGN(end); 871 872 for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) { 873 int err; 874 875 phys_addr = kvm_kaddr_to_phys(from + virt_addr - start); 876 err = __create_hyp_mappings(hyp_pgd, PTRS_PER_PGD, 877 virt_addr, virt_addr + PAGE_SIZE, 878 __phys_to_pfn(phys_addr), 879 prot); 880 if (err) 881 return err; 882 } 883 884 return 0; 885 } 886 887 static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size, 888 unsigned long *haddr, pgprot_t prot) 889 { 890 pgd_t *pgd = hyp_pgd; 891 unsigned long base; 892 int ret = 0; 893 894 mutex_lock(&kvm_hyp_pgd_mutex); 895 896 /* 897 * This assumes that we have enough space below the idmap 898 * page to allocate our VAs. If not, the check below will 899 * kick. A potential alternative would be to detect that 900 * overflow and switch to an allocation above the idmap. 901 * 902 * The allocated size is always a multiple of PAGE_SIZE. 903 */ 904 size = PAGE_ALIGN(size + offset_in_page(phys_addr)); 905 base = io_map_base - size; 906 907 /* 908 * Verify that BIT(VA_BITS - 1) hasn't been flipped by 909 * allocating the new area, as it would indicate we've 910 * overflowed the idmap/IO address range. 911 */ 912 if ((base ^ io_map_base) & BIT(VA_BITS - 1)) 913 ret = -ENOMEM; 914 else 915 io_map_base = base; 916 917 mutex_unlock(&kvm_hyp_pgd_mutex); 918 919 if (ret) 920 goto out; 921 922 if (__kvm_cpu_uses_extended_idmap()) 923 pgd = boot_hyp_pgd; 924 925 ret = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(), 926 base, base + size, 927 __phys_to_pfn(phys_addr), prot); 928 if (ret) 929 goto out; 930 931 *haddr = base + offset_in_page(phys_addr); 932 933 out: 934 return ret; 935 } 936 937 /** 938 * create_hyp_io_mappings - Map IO into both kernel and HYP 939 * @phys_addr: The physical start address which gets mapped 940 * @size: Size of the region being mapped 941 * @kaddr: Kernel VA for this mapping 942 * @haddr: HYP VA for this mapping 943 */ 944 int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size, 945 void __iomem **kaddr, 946 void __iomem **haddr) 947 { 948 unsigned long addr; 949 int ret; 950 951 *kaddr = ioremap(phys_addr, size); 952 if (!*kaddr) 953 return -ENOMEM; 954 955 if (is_kernel_in_hyp_mode()) { 956 *haddr = *kaddr; 957 return 0; 958 } 959 960 ret = __create_hyp_private_mapping(phys_addr, size, 961 &addr, PAGE_HYP_DEVICE); 962 if (ret) { 963 iounmap(*kaddr); 964 *kaddr = NULL; 965 *haddr = NULL; 966 return ret; 967 } 968 969 *haddr = (void __iomem *)addr; 970 return 0; 971 } 972 973 /** 974 * create_hyp_exec_mappings - Map an executable range into HYP 975 * @phys_addr: The physical start address which gets mapped 976 * @size: Size of the region being mapped 977 * @haddr: HYP VA for this mapping 978 */ 979 int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, 980 void **haddr) 981 { 982 unsigned long addr; 983 int ret; 984 985 BUG_ON(is_kernel_in_hyp_mode()); 986 987 ret = __create_hyp_private_mapping(phys_addr, size, 988 &addr, PAGE_HYP_EXEC); 989 if (ret) { 990 *haddr = NULL; 991 return ret; 992 } 993 994 *haddr = (void *)addr; 995 return 0; 996 } 997 998 /** 999 * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation. 1000 * @kvm: The KVM struct pointer for the VM. 1001 * 1002 * Allocates only the stage-2 HW PGD level table(s) of size defined by 1003 * stage2_pgd_size(kvm). 1004 * 1005 * Note we don't need locking here as this is only called when the VM is 1006 * created, which can only be done once. 1007 */ 1008 int kvm_alloc_stage2_pgd(struct kvm *kvm) 1009 { 1010 phys_addr_t pgd_phys; 1011 pgd_t *pgd; 1012 1013 if (kvm->arch.pgd != NULL) { 1014 kvm_err("kvm_arch already initialized?\n"); 1015 return -EINVAL; 1016 } 1017 1018 /* Allocate the HW PGD, making sure that each page gets its own refcount */ 1019 pgd = alloc_pages_exact(stage2_pgd_size(kvm), GFP_KERNEL | __GFP_ZERO); 1020 if (!pgd) 1021 return -ENOMEM; 1022 1023 pgd_phys = virt_to_phys(pgd); 1024 if (WARN_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm))) 1025 return -EINVAL; 1026 1027 kvm->arch.pgd = pgd; 1028 kvm->arch.pgd_phys = pgd_phys; 1029 return 0; 1030 } 1031 1032 static void stage2_unmap_memslot(struct kvm *kvm, 1033 struct kvm_memory_slot *memslot) 1034 { 1035 hva_t hva = memslot->userspace_addr; 1036 phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; 1037 phys_addr_t size = PAGE_SIZE * memslot->npages; 1038 hva_t reg_end = hva + size; 1039 1040 /* 1041 * A memory region could potentially cover multiple VMAs, and any holes 1042 * between them, so iterate over all of them to find out if we should 1043 * unmap any of them. 1044 * 1045 * +--------------------------------------------+ 1046 * +---------------+----------------+ +----------------+ 1047 * | : VMA 1 | VMA 2 | | VMA 3 : | 1048 * +---------------+----------------+ +----------------+ 1049 * | memory region | 1050 * +--------------------------------------------+ 1051 */ 1052 do { 1053 struct vm_area_struct *vma = find_vma(current->mm, hva); 1054 hva_t vm_start, vm_end; 1055 1056 if (!vma || vma->vm_start >= reg_end) 1057 break; 1058 1059 /* 1060 * Take the intersection of this VMA with the memory region 1061 */ 1062 vm_start = max(hva, vma->vm_start); 1063 vm_end = min(reg_end, vma->vm_end); 1064 1065 if (!(vma->vm_flags & VM_PFNMAP)) { 1066 gpa_t gpa = addr + (vm_start - memslot->userspace_addr); 1067 unmap_stage2_range(kvm, gpa, vm_end - vm_start); 1068 } 1069 hva = vm_end; 1070 } while (hva < reg_end); 1071 } 1072 1073 /** 1074 * stage2_unmap_vm - Unmap Stage-2 RAM mappings 1075 * @kvm: The struct kvm pointer 1076 * 1077 * Go through the memregions and unmap any regular RAM 1078 * backing memory already mapped to the VM. 1079 */ 1080 void stage2_unmap_vm(struct kvm *kvm) 1081 { 1082 struct kvm_memslots *slots; 1083 struct kvm_memory_slot *memslot; 1084 int idx; 1085 1086 idx = srcu_read_lock(&kvm->srcu); 1087 mmap_read_lock(current->mm); 1088 spin_lock(&kvm->mmu_lock); 1089 1090 slots = kvm_memslots(kvm); 1091 kvm_for_each_memslot(memslot, slots) 1092 stage2_unmap_memslot(kvm, memslot); 1093 1094 spin_unlock(&kvm->mmu_lock); 1095 mmap_read_unlock(current->mm); 1096 srcu_read_unlock(&kvm->srcu, idx); 1097 } 1098 1099 /** 1100 * kvm_free_stage2_pgd - free all stage-2 tables 1101 * @kvm: The KVM struct pointer for the VM. 1102 * 1103 * Walks the level-1 page table pointed to by kvm->arch.pgd and frees all 1104 * underlying level-2 and level-3 tables before freeing the actual level-1 table 1105 * and setting the struct pointer to NULL. 1106 */ 1107 void kvm_free_stage2_pgd(struct kvm *kvm) 1108 { 1109 void *pgd = NULL; 1110 1111 spin_lock(&kvm->mmu_lock); 1112 if (kvm->arch.pgd) { 1113 unmap_stage2_range(kvm, 0, kvm_phys_size(kvm)); 1114 pgd = READ_ONCE(kvm->arch.pgd); 1115 kvm->arch.pgd = NULL; 1116 kvm->arch.pgd_phys = 0; 1117 } 1118 spin_unlock(&kvm->mmu_lock); 1119 1120 /* Free the HW pgd, one page at a time */ 1121 if (pgd) 1122 free_pages_exact(pgd, stage2_pgd_size(kvm)); 1123 } 1124 1125 static p4d_t *stage2_get_p4d(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, 1126 phys_addr_t addr) 1127 { 1128 pgd_t *pgd; 1129 p4d_t *p4d; 1130 1131 pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr); 1132 if (stage2_pgd_none(kvm, *pgd)) { 1133 if (!cache) 1134 return NULL; 1135 p4d = mmu_memory_cache_alloc(cache); 1136 stage2_pgd_populate(kvm, pgd, p4d); 1137 get_page(virt_to_page(pgd)); 1138 } 1139 1140 return stage2_p4d_offset(kvm, pgd, addr); 1141 } 1142 1143 static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, 1144 phys_addr_t addr) 1145 { 1146 p4d_t *p4d; 1147 pud_t *pud; 1148 1149 p4d = stage2_get_p4d(kvm, cache, addr); 1150 if (stage2_p4d_none(kvm, *p4d)) { 1151 if (!cache) 1152 return NULL; 1153 pud = mmu_memory_cache_alloc(cache); 1154 stage2_p4d_populate(kvm, p4d, pud); 1155 get_page(virt_to_page(p4d)); 1156 } 1157 1158 return stage2_pud_offset(kvm, p4d, addr); 1159 } 1160 1161 static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, 1162 phys_addr_t addr) 1163 { 1164 pud_t *pud; 1165 pmd_t *pmd; 1166 1167 pud = stage2_get_pud(kvm, cache, addr); 1168 if (!pud || stage2_pud_huge(kvm, *pud)) 1169 return NULL; 1170 1171 if (stage2_pud_none(kvm, *pud)) { 1172 if (!cache) 1173 return NULL; 1174 pmd = mmu_memory_cache_alloc(cache); 1175 stage2_pud_populate(kvm, pud, pmd); 1176 get_page(virt_to_page(pud)); 1177 } 1178 1179 return stage2_pmd_offset(kvm, pud, addr); 1180 } 1181 1182 static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache 1183 *cache, phys_addr_t addr, const pmd_t *new_pmd) 1184 { 1185 pmd_t *pmd, old_pmd; 1186 1187 retry: 1188 pmd = stage2_get_pmd(kvm, cache, addr); 1189 VM_BUG_ON(!pmd); 1190 1191 old_pmd = *pmd; 1192 /* 1193 * Multiple vcpus faulting on the same PMD entry, can 1194 * lead to them sequentially updating the PMD with the 1195 * same value. Following the break-before-make 1196 * (pmd_clear() followed by tlb_flush()) process can 1197 * hinder forward progress due to refaults generated 1198 * on missing translations. 1199 * 1200 * Skip updating the page table if the entry is 1201 * unchanged. 1202 */ 1203 if (pmd_val(old_pmd) == pmd_val(*new_pmd)) 1204 return 0; 1205 1206 if (pmd_present(old_pmd)) { 1207 /* 1208 * If we already have PTE level mapping for this block, 1209 * we must unmap it to avoid inconsistent TLB state and 1210 * leaking the table page. We could end up in this situation 1211 * if the memory slot was marked for dirty logging and was 1212 * reverted, leaving PTE level mappings for the pages accessed 1213 * during the period. So, unmap the PTE level mapping for this 1214 * block and retry, as we could have released the upper level 1215 * table in the process. 1216 * 1217 * Normal THP split/merge follows mmu_notifier callbacks and do 1218 * get handled accordingly. 1219 */ 1220 if (!pmd_thp_or_huge(old_pmd)) { 1221 unmap_stage2_range(kvm, addr & S2_PMD_MASK, S2_PMD_SIZE); 1222 goto retry; 1223 } 1224 /* 1225 * Mapping in huge pages should only happen through a 1226 * fault. If a page is merged into a transparent huge 1227 * page, the individual subpages of that huge page 1228 * should be unmapped through MMU notifiers before we 1229 * get here. 1230 * 1231 * Merging of CompoundPages is not supported; they 1232 * should become splitting first, unmapped, merged, 1233 * and mapped back in on-demand. 1234 */ 1235 WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd)); 1236 pmd_clear(pmd); 1237 kvm_tlb_flush_vmid_ipa(kvm, addr); 1238 } else { 1239 get_page(virt_to_page(pmd)); 1240 } 1241 1242 kvm_set_pmd(pmd, *new_pmd); 1243 return 0; 1244 } 1245 1246 static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, 1247 phys_addr_t addr, const pud_t *new_pudp) 1248 { 1249 pud_t *pudp, old_pud; 1250 1251 retry: 1252 pudp = stage2_get_pud(kvm, cache, addr); 1253 VM_BUG_ON(!pudp); 1254 1255 old_pud = *pudp; 1256 1257 /* 1258 * A large number of vcpus faulting on the same stage 2 entry, 1259 * can lead to a refault due to the stage2_pud_clear()/tlb_flush(). 1260 * Skip updating the page tables if there is no change. 1261 */ 1262 if (pud_val(old_pud) == pud_val(*new_pudp)) 1263 return 0; 1264 1265 if (stage2_pud_present(kvm, old_pud)) { 1266 /* 1267 * If we already have table level mapping for this block, unmap 1268 * the range for this block and retry. 1269 */ 1270 if (!stage2_pud_huge(kvm, old_pud)) { 1271 unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE); 1272 goto retry; 1273 } 1274 1275 WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp)); 1276 stage2_pud_clear(kvm, pudp); 1277 kvm_tlb_flush_vmid_ipa(kvm, addr); 1278 } else { 1279 get_page(virt_to_page(pudp)); 1280 } 1281 1282 kvm_set_pud(pudp, *new_pudp); 1283 return 0; 1284 } 1285 1286 /* 1287 * stage2_get_leaf_entry - walk the stage2 VM page tables and return 1288 * true if a valid and present leaf-entry is found. A pointer to the 1289 * leaf-entry is returned in the appropriate level variable - pudpp, 1290 * pmdpp, ptepp. 1291 */ 1292 static bool stage2_get_leaf_entry(struct kvm *kvm, phys_addr_t addr, 1293 pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp) 1294 { 1295 pud_t *pudp; 1296 pmd_t *pmdp; 1297 pte_t *ptep; 1298 1299 *pudpp = NULL; 1300 *pmdpp = NULL; 1301 *ptepp = NULL; 1302 1303 pudp = stage2_get_pud(kvm, NULL, addr); 1304 if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp)) 1305 return false; 1306 1307 if (stage2_pud_huge(kvm, *pudp)) { 1308 *pudpp = pudp; 1309 return true; 1310 } 1311 1312 pmdp = stage2_pmd_offset(kvm, pudp, addr); 1313 if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp)) 1314 return false; 1315 1316 if (pmd_thp_or_huge(*pmdp)) { 1317 *pmdpp = pmdp; 1318 return true; 1319 } 1320 1321 ptep = pte_offset_kernel(pmdp, addr); 1322 if (!ptep || pte_none(*ptep) || !pte_present(*ptep)) 1323 return false; 1324 1325 *ptepp = ptep; 1326 return true; 1327 } 1328 1329 static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr) 1330 { 1331 pud_t *pudp; 1332 pmd_t *pmdp; 1333 pte_t *ptep; 1334 bool found; 1335 1336 found = stage2_get_leaf_entry(kvm, addr, &pudp, &pmdp, &ptep); 1337 if (!found) 1338 return false; 1339 1340 if (pudp) 1341 return kvm_s2pud_exec(pudp); 1342 else if (pmdp) 1343 return kvm_s2pmd_exec(pmdp); 1344 else 1345 return kvm_s2pte_exec(ptep); 1346 } 1347 1348 static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, 1349 phys_addr_t addr, const pte_t *new_pte, 1350 unsigned long flags) 1351 { 1352 pud_t *pud; 1353 pmd_t *pmd; 1354 pte_t *pte, old_pte; 1355 bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP; 1356 bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE; 1357 1358 VM_BUG_ON(logging_active && !cache); 1359 1360 /* Create stage-2 page table mapping - Levels 0 and 1 */ 1361 pud = stage2_get_pud(kvm, cache, addr); 1362 if (!pud) { 1363 /* 1364 * Ignore calls from kvm_set_spte_hva for unallocated 1365 * address ranges. 1366 */ 1367 return 0; 1368 } 1369 1370 /* 1371 * While dirty page logging - dissolve huge PUD, then continue 1372 * on to allocate page. 1373 */ 1374 if (logging_active) 1375 stage2_dissolve_pud(kvm, addr, pud); 1376 1377 if (stage2_pud_none(kvm, *pud)) { 1378 if (!cache) 1379 return 0; /* ignore calls from kvm_set_spte_hva */ 1380 pmd = mmu_memory_cache_alloc(cache); 1381 stage2_pud_populate(kvm, pud, pmd); 1382 get_page(virt_to_page(pud)); 1383 } 1384 1385 pmd = stage2_pmd_offset(kvm, pud, addr); 1386 if (!pmd) { 1387 /* 1388 * Ignore calls from kvm_set_spte_hva for unallocated 1389 * address ranges. 1390 */ 1391 return 0; 1392 } 1393 1394 /* 1395 * While dirty page logging - dissolve huge PMD, then continue on to 1396 * allocate page. 1397 */ 1398 if (logging_active) 1399 stage2_dissolve_pmd(kvm, addr, pmd); 1400 1401 /* Create stage-2 page mappings - Level 2 */ 1402 if (pmd_none(*pmd)) { 1403 if (!cache) 1404 return 0; /* ignore calls from kvm_set_spte_hva */ 1405 pte = mmu_memory_cache_alloc(cache); 1406 kvm_pmd_populate(pmd, pte); 1407 get_page(virt_to_page(pmd)); 1408 } 1409 1410 pte = pte_offset_kernel(pmd, addr); 1411 1412 if (iomap && pte_present(*pte)) 1413 return -EFAULT; 1414 1415 /* Create 2nd stage page table mapping - Level 3 */ 1416 old_pte = *pte; 1417 if (pte_present(old_pte)) { 1418 /* Skip page table update if there is no change */ 1419 if (pte_val(old_pte) == pte_val(*new_pte)) 1420 return 0; 1421 1422 kvm_set_pte(pte, __pte(0)); 1423 kvm_tlb_flush_vmid_ipa(kvm, addr); 1424 } else { 1425 get_page(virt_to_page(pte)); 1426 } 1427 1428 kvm_set_pte(pte, *new_pte); 1429 return 0; 1430 } 1431 1432 #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG 1433 static int stage2_ptep_test_and_clear_young(pte_t *pte) 1434 { 1435 if (pte_young(*pte)) { 1436 *pte = pte_mkold(*pte); 1437 return 1; 1438 } 1439 return 0; 1440 } 1441 #else 1442 static int stage2_ptep_test_and_clear_young(pte_t *pte) 1443 { 1444 return __ptep_test_and_clear_young(pte); 1445 } 1446 #endif 1447 1448 static int stage2_pmdp_test_and_clear_young(pmd_t *pmd) 1449 { 1450 return stage2_ptep_test_and_clear_young((pte_t *)pmd); 1451 } 1452 1453 static int stage2_pudp_test_and_clear_young(pud_t *pud) 1454 { 1455 return stage2_ptep_test_and_clear_young((pte_t *)pud); 1456 } 1457 1458 /** 1459 * kvm_phys_addr_ioremap - map a device range to guest IPA 1460 * 1461 * @kvm: The KVM pointer 1462 * @guest_ipa: The IPA at which to insert the mapping 1463 * @pa: The physical address of the device 1464 * @size: The size of the mapping 1465 */ 1466 int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, 1467 phys_addr_t pa, unsigned long size, bool writable) 1468 { 1469 phys_addr_t addr, end; 1470 int ret = 0; 1471 unsigned long pfn; 1472 struct kvm_mmu_memory_cache cache = { 0, }; 1473 1474 end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK; 1475 pfn = __phys_to_pfn(pa); 1476 1477 for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) { 1478 pte_t pte = kvm_pfn_pte(pfn, PAGE_S2_DEVICE); 1479 1480 if (writable) 1481 pte = kvm_s2pte_mkwrite(pte); 1482 1483 ret = mmu_topup_memory_cache(&cache, 1484 kvm_mmu_cache_min_pages(kvm), 1485 KVM_NR_MEM_OBJS); 1486 if (ret) 1487 goto out; 1488 spin_lock(&kvm->mmu_lock); 1489 ret = stage2_set_pte(kvm, &cache, addr, &pte, 1490 KVM_S2PTE_FLAG_IS_IOMAP); 1491 spin_unlock(&kvm->mmu_lock); 1492 if (ret) 1493 goto out; 1494 1495 pfn++; 1496 } 1497 1498 out: 1499 mmu_free_memory_cache(&cache); 1500 return ret; 1501 } 1502 1503 /** 1504 * stage2_wp_ptes - write protect PMD range 1505 * @pmd: pointer to pmd entry 1506 * @addr: range start address 1507 * @end: range end address 1508 */ 1509 static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) 1510 { 1511 pte_t *pte; 1512 1513 pte = pte_offset_kernel(pmd, addr); 1514 do { 1515 if (!pte_none(*pte)) { 1516 if (!kvm_s2pte_readonly(pte)) 1517 kvm_set_s2pte_readonly(pte); 1518 } 1519 } while (pte++, addr += PAGE_SIZE, addr != end); 1520 } 1521 1522 /** 1523 * stage2_wp_pmds - write protect PUD range 1524 * kvm: kvm instance for the VM 1525 * @pud: pointer to pud entry 1526 * @addr: range start address 1527 * @end: range end address 1528 */ 1529 static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud, 1530 phys_addr_t addr, phys_addr_t end) 1531 { 1532 pmd_t *pmd; 1533 phys_addr_t next; 1534 1535 pmd = stage2_pmd_offset(kvm, pud, addr); 1536 1537 do { 1538 next = stage2_pmd_addr_end(kvm, addr, end); 1539 if (!pmd_none(*pmd)) { 1540 if (pmd_thp_or_huge(*pmd)) { 1541 if (!kvm_s2pmd_readonly(pmd)) 1542 kvm_set_s2pmd_readonly(pmd); 1543 } else { 1544 stage2_wp_ptes(pmd, addr, next); 1545 } 1546 } 1547 } while (pmd++, addr = next, addr != end); 1548 } 1549 1550 /** 1551 * stage2_wp_puds - write protect P4D range 1552 * @pgd: pointer to pgd entry 1553 * @addr: range start address 1554 * @end: range end address 1555 */ 1556 static void stage2_wp_puds(struct kvm *kvm, p4d_t *p4d, 1557 phys_addr_t addr, phys_addr_t end) 1558 { 1559 pud_t *pud; 1560 phys_addr_t next; 1561 1562 pud = stage2_pud_offset(kvm, p4d, addr); 1563 do { 1564 next = stage2_pud_addr_end(kvm, addr, end); 1565 if (!stage2_pud_none(kvm, *pud)) { 1566 if (stage2_pud_huge(kvm, *pud)) { 1567 if (!kvm_s2pud_readonly(pud)) 1568 kvm_set_s2pud_readonly(pud); 1569 } else { 1570 stage2_wp_pmds(kvm, pud, addr, next); 1571 } 1572 } 1573 } while (pud++, addr = next, addr != end); 1574 } 1575 1576 /** 1577 * stage2_wp_p4ds - write protect PGD range 1578 * @pgd: pointer to pgd entry 1579 * @addr: range start address 1580 * @end: range end address 1581 */ 1582 static void stage2_wp_p4ds(struct kvm *kvm, pgd_t *pgd, 1583 phys_addr_t addr, phys_addr_t end) 1584 { 1585 p4d_t *p4d; 1586 phys_addr_t next; 1587 1588 p4d = stage2_p4d_offset(kvm, pgd, addr); 1589 do { 1590 next = stage2_p4d_addr_end(kvm, addr, end); 1591 if (!stage2_p4d_none(kvm, *p4d)) 1592 stage2_wp_puds(kvm, p4d, addr, next); 1593 } while (p4d++, addr = next, addr != end); 1594 } 1595 1596 /** 1597 * stage2_wp_range() - write protect stage2 memory region range 1598 * @kvm: The KVM pointer 1599 * @addr: Start address of range 1600 * @end: End address of range 1601 */ 1602 static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) 1603 { 1604 pgd_t *pgd; 1605 phys_addr_t next; 1606 1607 pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr); 1608 do { 1609 /* 1610 * Release kvm_mmu_lock periodically if the memory region is 1611 * large. Otherwise, we may see kernel panics with 1612 * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR, 1613 * CONFIG_LOCKDEP. Additionally, holding the lock too long 1614 * will also starve other vCPUs. We have to also make sure 1615 * that the page tables are not freed while we released 1616 * the lock. 1617 */ 1618 cond_resched_lock(&kvm->mmu_lock); 1619 if (!READ_ONCE(kvm->arch.pgd)) 1620 break; 1621 next = stage2_pgd_addr_end(kvm, addr, end); 1622 if (stage2_pgd_present(kvm, *pgd)) 1623 stage2_wp_p4ds(kvm, pgd, addr, next); 1624 } while (pgd++, addr = next, addr != end); 1625 } 1626 1627 /** 1628 * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot 1629 * @kvm: The KVM pointer 1630 * @slot: The memory slot to write protect 1631 * 1632 * Called to start logging dirty pages after memory region 1633 * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns 1634 * all present PUD, PMD and PTEs are write protected in the memory region. 1635 * Afterwards read of dirty page log can be called. 1636 * 1637 * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired, 1638 * serializing operations for VM memory regions. 1639 */ 1640 void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) 1641 { 1642 struct kvm_memslots *slots = kvm_memslots(kvm); 1643 struct kvm_memory_slot *memslot = id_to_memslot(slots, slot); 1644 phys_addr_t start, end; 1645 1646 if (WARN_ON_ONCE(!memslot)) 1647 return; 1648 1649 start = memslot->base_gfn << PAGE_SHIFT; 1650 end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; 1651 1652 spin_lock(&kvm->mmu_lock); 1653 stage2_wp_range(kvm, start, end); 1654 spin_unlock(&kvm->mmu_lock); 1655 kvm_flush_remote_tlbs(kvm); 1656 } 1657 1658 /** 1659 * kvm_mmu_write_protect_pt_masked() - write protect dirty pages 1660 * @kvm: The KVM pointer 1661 * @slot: The memory slot associated with mask 1662 * @gfn_offset: The gfn offset in memory slot 1663 * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory 1664 * slot to be write protected 1665 * 1666 * Walks bits set in mask write protects the associated pte's. Caller must 1667 * acquire kvm_mmu_lock. 1668 */ 1669 static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, 1670 struct kvm_memory_slot *slot, 1671 gfn_t gfn_offset, unsigned long mask) 1672 { 1673 phys_addr_t base_gfn = slot->base_gfn + gfn_offset; 1674 phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT; 1675 phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT; 1676 1677 stage2_wp_range(kvm, start, end); 1678 } 1679 1680 /* 1681 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected 1682 * dirty pages. 1683 * 1684 * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to 1685 * enable dirty logging for them. 1686 */ 1687 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, 1688 struct kvm_memory_slot *slot, 1689 gfn_t gfn_offset, unsigned long mask) 1690 { 1691 kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); 1692 } 1693 1694 static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size) 1695 { 1696 __clean_dcache_guest_page(pfn, size); 1697 } 1698 1699 static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size) 1700 { 1701 __invalidate_icache_guest_page(pfn, size); 1702 } 1703 1704 static void kvm_send_hwpoison_signal(unsigned long address, short lsb) 1705 { 1706 send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current); 1707 } 1708 1709 static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot, 1710 unsigned long hva, 1711 unsigned long map_size) 1712 { 1713 gpa_t gpa_start; 1714 hva_t uaddr_start, uaddr_end; 1715 size_t size; 1716 1717 /* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */ 1718 if (map_size == PAGE_SIZE) 1719 return true; 1720 1721 size = memslot->npages * PAGE_SIZE; 1722 1723 gpa_start = memslot->base_gfn << PAGE_SHIFT; 1724 1725 uaddr_start = memslot->userspace_addr; 1726 uaddr_end = uaddr_start + size; 1727 1728 /* 1729 * Pages belonging to memslots that don't have the same alignment 1730 * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2 1731 * PMD/PUD entries, because we'll end up mapping the wrong pages. 1732 * 1733 * Consider a layout like the following: 1734 * 1735 * memslot->userspace_addr: 1736 * +-----+--------------------+--------------------+---+ 1737 * |abcde|fgh Stage-1 block | Stage-1 block tv|xyz| 1738 * +-----+--------------------+--------------------+---+ 1739 * 1740 * memslot->base_gfn << PAGE_SHIFT: 1741 * +---+--------------------+--------------------+-----+ 1742 * |abc|def Stage-2 block | Stage-2 block |tvxyz| 1743 * +---+--------------------+--------------------+-----+ 1744 * 1745 * If we create those stage-2 blocks, we'll end up with this incorrect 1746 * mapping: 1747 * d -> f 1748 * e -> g 1749 * f -> h 1750 */ 1751 if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1))) 1752 return false; 1753 1754 /* 1755 * Next, let's make sure we're not trying to map anything not covered 1756 * by the memslot. This means we have to prohibit block size mappings 1757 * for the beginning and end of a non-block aligned and non-block sized 1758 * memory slot (illustrated by the head and tail parts of the 1759 * userspace view above containing pages 'abcde' and 'xyz', 1760 * respectively). 1761 * 1762 * Note that it doesn't matter if we do the check using the 1763 * userspace_addr or the base_gfn, as both are equally aligned (per 1764 * the check above) and equally sized. 1765 */ 1766 return (hva & ~(map_size - 1)) >= uaddr_start && 1767 (hva & ~(map_size - 1)) + map_size <= uaddr_end; 1768 } 1769 1770 /* 1771 * Check if the given hva is backed by a transparent huge page (THP) and 1772 * whether it can be mapped using block mapping in stage2. If so, adjust 1773 * the stage2 PFN and IPA accordingly. Only PMD_SIZE THPs are currently 1774 * supported. This will need to be updated to support other THP sizes. 1775 * 1776 * Returns the size of the mapping. 1777 */ 1778 static unsigned long 1779 transparent_hugepage_adjust(struct kvm_memory_slot *memslot, 1780 unsigned long hva, kvm_pfn_t *pfnp, 1781 phys_addr_t *ipap) 1782 { 1783 kvm_pfn_t pfn = *pfnp; 1784 1785 /* 1786 * Make sure the adjustment is done only for THP pages. Also make 1787 * sure that the HVA and IPA are sufficiently aligned and that the 1788 * block map is contained within the memslot. 1789 */ 1790 if (kvm_is_transparent_hugepage(pfn) && 1791 fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) { 1792 /* 1793 * The address we faulted on is backed by a transparent huge 1794 * page. However, because we map the compound huge page and 1795 * not the individual tail page, we need to transfer the 1796 * refcount to the head page. We have to be careful that the 1797 * THP doesn't start to split while we are adjusting the 1798 * refcounts. 1799 * 1800 * We are sure this doesn't happen, because mmu_notifier_retry 1801 * was successful and we are holding the mmu_lock, so if this 1802 * THP is trying to split, it will be blocked in the mmu 1803 * notifier before touching any of the pages, specifically 1804 * before being able to call __split_huge_page_refcount(). 1805 * 1806 * We can therefore safely transfer the refcount from PG_tail 1807 * to PG_head and switch the pfn from a tail page to the head 1808 * page accordingly. 1809 */ 1810 *ipap &= PMD_MASK; 1811 kvm_release_pfn_clean(pfn); 1812 pfn &= ~(PTRS_PER_PMD - 1); 1813 kvm_get_pfn(pfn); 1814 *pfnp = pfn; 1815 1816 return PMD_SIZE; 1817 } 1818 1819 /* Use page mapping if we cannot use block mapping. */ 1820 return PAGE_SIZE; 1821 } 1822 1823 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, 1824 struct kvm_memory_slot *memslot, unsigned long hva, 1825 unsigned long fault_status) 1826 { 1827 int ret; 1828 bool write_fault, writable, force_pte = false; 1829 bool exec_fault, needs_exec; 1830 unsigned long mmu_seq; 1831 gfn_t gfn = fault_ipa >> PAGE_SHIFT; 1832 struct kvm *kvm = vcpu->kvm; 1833 struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; 1834 struct vm_area_struct *vma; 1835 short vma_shift; 1836 kvm_pfn_t pfn; 1837 pgprot_t mem_type = PAGE_S2; 1838 bool logging_active = memslot_is_logging(memslot); 1839 unsigned long vma_pagesize, flags = 0; 1840 1841 write_fault = kvm_is_write_fault(vcpu); 1842 exec_fault = kvm_vcpu_trap_is_iabt(vcpu); 1843 VM_BUG_ON(write_fault && exec_fault); 1844 1845 if (fault_status == FSC_PERM && !write_fault && !exec_fault) { 1846 kvm_err("Unexpected L2 read permission error\n"); 1847 return -EFAULT; 1848 } 1849 1850 /* Let's check if we will get back a huge page backed by hugetlbfs */ 1851 mmap_read_lock(current->mm); 1852 vma = find_vma_intersection(current->mm, hva, hva + 1); 1853 if (unlikely(!vma)) { 1854 kvm_err("Failed to find VMA for hva 0x%lx\n", hva); 1855 mmap_read_unlock(current->mm); 1856 return -EFAULT; 1857 } 1858 1859 if (is_vm_hugetlb_page(vma)) 1860 vma_shift = huge_page_shift(hstate_vma(vma)); 1861 else 1862 vma_shift = PAGE_SHIFT; 1863 1864 vma_pagesize = 1ULL << vma_shift; 1865 if (logging_active || 1866 (vma->vm_flags & VM_PFNMAP) || 1867 !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) { 1868 force_pte = true; 1869 vma_pagesize = PAGE_SIZE; 1870 } 1871 1872 /* 1873 * The stage2 has a minimum of 2 level table (For arm64 see 1874 * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can 1875 * use PMD_SIZE huge mappings (even when the PMD is folded into PGD). 1876 * As for PUD huge maps, we must make sure that we have at least 1877 * 3 levels, i.e, PMD is not folded. 1878 */ 1879 if (vma_pagesize == PMD_SIZE || 1880 (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm))) 1881 gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT; 1882 mmap_read_unlock(current->mm); 1883 1884 /* We need minimum second+third level pages */ 1885 ret = mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm), 1886 KVM_NR_MEM_OBJS); 1887 if (ret) 1888 return ret; 1889 1890 mmu_seq = vcpu->kvm->mmu_notifier_seq; 1891 /* 1892 * Ensure the read of mmu_notifier_seq happens before we call 1893 * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk 1894 * the page we just got a reference to gets unmapped before we have a 1895 * chance to grab the mmu_lock, which ensure that if the page gets 1896 * unmapped afterwards, the call to kvm_unmap_hva will take it away 1897 * from us again properly. This smp_rmb() interacts with the smp_wmb() 1898 * in kvm_mmu_notifier_invalidate_<page|range_end>. 1899 */ 1900 smp_rmb(); 1901 1902 pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable); 1903 if (pfn == KVM_PFN_ERR_HWPOISON) { 1904 kvm_send_hwpoison_signal(hva, vma_shift); 1905 return 0; 1906 } 1907 if (is_error_noslot_pfn(pfn)) 1908 return -EFAULT; 1909 1910 if (kvm_is_device_pfn(pfn)) { 1911 mem_type = PAGE_S2_DEVICE; 1912 flags |= KVM_S2PTE_FLAG_IS_IOMAP; 1913 } else if (logging_active) { 1914 /* 1915 * Faults on pages in a memslot with logging enabled 1916 * should not be mapped with huge pages (it introduces churn 1917 * and performance degradation), so force a pte mapping. 1918 */ 1919 flags |= KVM_S2_FLAG_LOGGING_ACTIVE; 1920 1921 /* 1922 * Only actually map the page as writable if this was a write 1923 * fault. 1924 */ 1925 if (!write_fault) 1926 writable = false; 1927 } 1928 1929 if (exec_fault && is_iomap(flags)) 1930 return -ENOEXEC; 1931 1932 spin_lock(&kvm->mmu_lock); 1933 if (mmu_notifier_retry(kvm, mmu_seq)) 1934 goto out_unlock; 1935 1936 /* 1937 * If we are not forced to use page mapping, check if we are 1938 * backed by a THP and thus use block mapping if possible. 1939 */ 1940 if (vma_pagesize == PAGE_SIZE && !force_pte) 1941 vma_pagesize = transparent_hugepage_adjust(memslot, hva, 1942 &pfn, &fault_ipa); 1943 if (writable) 1944 kvm_set_pfn_dirty(pfn); 1945 1946 if (fault_status != FSC_PERM && !is_iomap(flags)) 1947 clean_dcache_guest_page(pfn, vma_pagesize); 1948 1949 if (exec_fault) 1950 invalidate_icache_guest_page(pfn, vma_pagesize); 1951 1952 /* 1953 * If we took an execution fault we have made the 1954 * icache/dcache coherent above and should now let the s2 1955 * mapping be executable. 1956 * 1957 * Write faults (!exec_fault && FSC_PERM) are orthogonal to 1958 * execute permissions, and we preserve whatever we have. 1959 */ 1960 needs_exec = exec_fault || 1961 (fault_status == FSC_PERM && stage2_is_exec(kvm, fault_ipa)); 1962 1963 if (vma_pagesize == PUD_SIZE) { 1964 pud_t new_pud = kvm_pfn_pud(pfn, mem_type); 1965 1966 new_pud = kvm_pud_mkhuge(new_pud); 1967 if (writable) 1968 new_pud = kvm_s2pud_mkwrite(new_pud); 1969 1970 if (needs_exec) 1971 new_pud = kvm_s2pud_mkexec(new_pud); 1972 1973 ret = stage2_set_pud_huge(kvm, memcache, fault_ipa, &new_pud); 1974 } else if (vma_pagesize == PMD_SIZE) { 1975 pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type); 1976 1977 new_pmd = kvm_pmd_mkhuge(new_pmd); 1978 1979 if (writable) 1980 new_pmd = kvm_s2pmd_mkwrite(new_pmd); 1981 1982 if (needs_exec) 1983 new_pmd = kvm_s2pmd_mkexec(new_pmd); 1984 1985 ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd); 1986 } else { 1987 pte_t new_pte = kvm_pfn_pte(pfn, mem_type); 1988 1989 if (writable) { 1990 new_pte = kvm_s2pte_mkwrite(new_pte); 1991 mark_page_dirty(kvm, gfn); 1992 } 1993 1994 if (needs_exec) 1995 new_pte = kvm_s2pte_mkexec(new_pte); 1996 1997 ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags); 1998 } 1999 2000 out_unlock: 2001 spin_unlock(&kvm->mmu_lock); 2002 kvm_set_pfn_accessed(pfn); 2003 kvm_release_pfn_clean(pfn); 2004 return ret; 2005 } 2006 2007 /* 2008 * Resolve the access fault by making the page young again. 2009 * Note that because the faulting entry is guaranteed not to be 2010 * cached in the TLB, we don't need to invalidate anything. 2011 * Only the HW Access Flag updates are supported for Stage 2 (no DBM), 2012 * so there is no need for atomic (pte|pmd)_mkyoung operations. 2013 */ 2014 static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) 2015 { 2016 pud_t *pud; 2017 pmd_t *pmd; 2018 pte_t *pte; 2019 kvm_pfn_t pfn; 2020 bool pfn_valid = false; 2021 2022 trace_kvm_access_fault(fault_ipa); 2023 2024 spin_lock(&vcpu->kvm->mmu_lock); 2025 2026 if (!stage2_get_leaf_entry(vcpu->kvm, fault_ipa, &pud, &pmd, &pte)) 2027 goto out; 2028 2029 if (pud) { /* HugeTLB */ 2030 *pud = kvm_s2pud_mkyoung(*pud); 2031 pfn = kvm_pud_pfn(*pud); 2032 pfn_valid = true; 2033 } else if (pmd) { /* THP, HugeTLB */ 2034 *pmd = pmd_mkyoung(*pmd); 2035 pfn = pmd_pfn(*pmd); 2036 pfn_valid = true; 2037 } else { 2038 *pte = pte_mkyoung(*pte); /* Just a page... */ 2039 pfn = pte_pfn(*pte); 2040 pfn_valid = true; 2041 } 2042 2043 out: 2044 spin_unlock(&vcpu->kvm->mmu_lock); 2045 if (pfn_valid) 2046 kvm_set_pfn_accessed(pfn); 2047 } 2048 2049 /** 2050 * kvm_handle_guest_abort - handles all 2nd stage aborts 2051 * @vcpu: the VCPU pointer 2052 * @run: the kvm_run structure 2053 * 2054 * Any abort that gets to the host is almost guaranteed to be caused by a 2055 * missing second stage translation table entry, which can mean that either the 2056 * guest simply needs more memory and we must allocate an appropriate page or it 2057 * can mean that the guest tried to access I/O memory, which is emulated by user 2058 * space. The distinction is based on the IPA causing the fault and whether this 2059 * memory region has been registered as standard RAM by user space. 2060 */ 2061 int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) 2062 { 2063 unsigned long fault_status; 2064 phys_addr_t fault_ipa; 2065 struct kvm_memory_slot *memslot; 2066 unsigned long hva; 2067 bool is_iabt, write_fault, writable; 2068 gfn_t gfn; 2069 int ret, idx; 2070 2071 fault_status = kvm_vcpu_trap_get_fault_type(vcpu); 2072 2073 fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); 2074 is_iabt = kvm_vcpu_trap_is_iabt(vcpu); 2075 2076 /* Synchronous External Abort? */ 2077 if (kvm_vcpu_dabt_isextabt(vcpu)) { 2078 /* 2079 * For RAS the host kernel may handle this abort. 2080 * There is no need to pass the error into the guest. 2081 */ 2082 if (!kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_hsr(vcpu))) 2083 return 1; 2084 2085 if (unlikely(!is_iabt)) { 2086 kvm_inject_vabt(vcpu); 2087 return 1; 2088 } 2089 } 2090 2091 trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu), 2092 kvm_vcpu_get_hfar(vcpu), fault_ipa); 2093 2094 /* Check the stage-2 fault is trans. fault or write fault */ 2095 if (fault_status != FSC_FAULT && fault_status != FSC_PERM && 2096 fault_status != FSC_ACCESS) { 2097 kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n", 2098 kvm_vcpu_trap_get_class(vcpu), 2099 (unsigned long)kvm_vcpu_trap_get_fault(vcpu), 2100 (unsigned long)kvm_vcpu_get_hsr(vcpu)); 2101 return -EFAULT; 2102 } 2103 2104 idx = srcu_read_lock(&vcpu->kvm->srcu); 2105 2106 gfn = fault_ipa >> PAGE_SHIFT; 2107 memslot = gfn_to_memslot(vcpu->kvm, gfn); 2108 hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable); 2109 write_fault = kvm_is_write_fault(vcpu); 2110 if (kvm_is_error_hva(hva) || (write_fault && !writable)) { 2111 if (is_iabt) { 2112 /* Prefetch Abort on I/O address */ 2113 ret = -ENOEXEC; 2114 goto out; 2115 } 2116 2117 /* 2118 * Check for a cache maintenance operation. Since we 2119 * ended-up here, we know it is outside of any memory 2120 * slot. But we can't find out if that is for a device, 2121 * or if the guest is just being stupid. The only thing 2122 * we know for sure is that this range cannot be cached. 2123 * 2124 * So let's assume that the guest is just being 2125 * cautious, and skip the instruction. 2126 */ 2127 if (kvm_vcpu_dabt_is_cm(vcpu)) { 2128 kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); 2129 ret = 1; 2130 goto out_unlock; 2131 } 2132 2133 /* 2134 * The IPA is reported as [MAX:12], so we need to 2135 * complement it with the bottom 12 bits from the 2136 * faulting VA. This is always 12 bits, irrespective 2137 * of the page size. 2138 */ 2139 fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1); 2140 ret = io_mem_abort(vcpu, run, fault_ipa); 2141 goto out_unlock; 2142 } 2143 2144 /* Userspace should not be able to register out-of-bounds IPAs */ 2145 VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm)); 2146 2147 if (fault_status == FSC_ACCESS) { 2148 handle_access_fault(vcpu, fault_ipa); 2149 ret = 1; 2150 goto out_unlock; 2151 } 2152 2153 ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status); 2154 if (ret == 0) 2155 ret = 1; 2156 out: 2157 if (ret == -ENOEXEC) { 2158 kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu)); 2159 ret = 1; 2160 } 2161 out_unlock: 2162 srcu_read_unlock(&vcpu->kvm->srcu, idx); 2163 return ret; 2164 } 2165 2166 static int handle_hva_to_gpa(struct kvm *kvm, 2167 unsigned long start, 2168 unsigned long end, 2169 int (*handler)(struct kvm *kvm, 2170 gpa_t gpa, u64 size, 2171 void *data), 2172 void *data) 2173 { 2174 struct kvm_memslots *slots; 2175 struct kvm_memory_slot *memslot; 2176 int ret = 0; 2177 2178 slots = kvm_memslots(kvm); 2179 2180 /* we only care about the pages that the guest sees */ 2181 kvm_for_each_memslot(memslot, slots) { 2182 unsigned long hva_start, hva_end; 2183 gfn_t gpa; 2184 2185 hva_start = max(start, memslot->userspace_addr); 2186 hva_end = min(end, memslot->userspace_addr + 2187 (memslot->npages << PAGE_SHIFT)); 2188 if (hva_start >= hva_end) 2189 continue; 2190 2191 gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT; 2192 ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data); 2193 } 2194 2195 return ret; 2196 } 2197 2198 static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) 2199 { 2200 unmap_stage2_range(kvm, gpa, size); 2201 return 0; 2202 } 2203 2204 int kvm_unmap_hva_range(struct kvm *kvm, 2205 unsigned long start, unsigned long end) 2206 { 2207 if (!kvm->arch.pgd) 2208 return 0; 2209 2210 trace_kvm_unmap_hva_range(start, end); 2211 handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL); 2212 return 0; 2213 } 2214 2215 static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) 2216 { 2217 pte_t *pte = (pte_t *)data; 2218 2219 WARN_ON(size != PAGE_SIZE); 2220 /* 2221 * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE 2222 * flag clear because MMU notifiers will have unmapped a huge PMD before 2223 * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and 2224 * therefore stage2_set_pte() never needs to clear out a huge PMD 2225 * through this calling path. 2226 */ 2227 stage2_set_pte(kvm, NULL, gpa, pte, 0); 2228 return 0; 2229 } 2230 2231 2232 int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) 2233 { 2234 unsigned long end = hva + PAGE_SIZE; 2235 kvm_pfn_t pfn = pte_pfn(pte); 2236 pte_t stage2_pte; 2237 2238 if (!kvm->arch.pgd) 2239 return 0; 2240 2241 trace_kvm_set_spte_hva(hva); 2242 2243 /* 2244 * We've moved a page around, probably through CoW, so let's treat it 2245 * just like a translation fault and clean the cache to the PoC. 2246 */ 2247 clean_dcache_guest_page(pfn, PAGE_SIZE); 2248 stage2_pte = kvm_pfn_pte(pfn, PAGE_S2); 2249 handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte); 2250 2251 return 0; 2252 } 2253 2254 static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) 2255 { 2256 pud_t *pud; 2257 pmd_t *pmd; 2258 pte_t *pte; 2259 2260 WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); 2261 if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte)) 2262 return 0; 2263 2264 if (pud) 2265 return stage2_pudp_test_and_clear_young(pud); 2266 else if (pmd) 2267 return stage2_pmdp_test_and_clear_young(pmd); 2268 else 2269 return stage2_ptep_test_and_clear_young(pte); 2270 } 2271 2272 static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) 2273 { 2274 pud_t *pud; 2275 pmd_t *pmd; 2276 pte_t *pte; 2277 2278 WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); 2279 if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte)) 2280 return 0; 2281 2282 if (pud) 2283 return kvm_s2pud_young(*pud); 2284 else if (pmd) 2285 return pmd_young(*pmd); 2286 else 2287 return pte_young(*pte); 2288 } 2289 2290 int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) 2291 { 2292 if (!kvm->arch.pgd) 2293 return 0; 2294 trace_kvm_age_hva(start, end); 2295 return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL); 2296 } 2297 2298 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) 2299 { 2300 if (!kvm->arch.pgd) 2301 return 0; 2302 trace_kvm_test_age_hva(hva); 2303 return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE, 2304 kvm_test_age_hva_handler, NULL); 2305 } 2306 2307 void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu) 2308 { 2309 mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); 2310 } 2311 2312 phys_addr_t kvm_mmu_get_httbr(void) 2313 { 2314 if (__kvm_cpu_uses_extended_idmap()) 2315 return virt_to_phys(merged_hyp_pgd); 2316 else 2317 return virt_to_phys(hyp_pgd); 2318 } 2319 2320 phys_addr_t kvm_get_idmap_vector(void) 2321 { 2322 return hyp_idmap_vector; 2323 } 2324 2325 static int kvm_map_idmap_text(pgd_t *pgd) 2326 { 2327 int err; 2328 2329 /* Create the idmap in the boot page tables */ 2330 err = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(), 2331 hyp_idmap_start, hyp_idmap_end, 2332 __phys_to_pfn(hyp_idmap_start), 2333 PAGE_HYP_EXEC); 2334 if (err) 2335 kvm_err("Failed to idmap %lx-%lx\n", 2336 hyp_idmap_start, hyp_idmap_end); 2337 2338 return err; 2339 } 2340 2341 int kvm_mmu_init(void) 2342 { 2343 int err; 2344 2345 hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start); 2346 hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE); 2347 hyp_idmap_end = __pa_symbol(__hyp_idmap_text_end); 2348 hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE); 2349 hyp_idmap_vector = __pa_symbol(__kvm_hyp_init); 2350 2351 /* 2352 * We rely on the linker script to ensure at build time that the HYP 2353 * init code does not cross a page boundary. 2354 */ 2355 BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK); 2356 2357 kvm_debug("IDMAP page: %lx\n", hyp_idmap_start); 2358 kvm_debug("HYP VA range: %lx:%lx\n", 2359 kern_hyp_va(PAGE_OFFSET), 2360 kern_hyp_va((unsigned long)high_memory - 1)); 2361 2362 if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) && 2363 hyp_idmap_start < kern_hyp_va((unsigned long)high_memory - 1) && 2364 hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) { 2365 /* 2366 * The idmap page is intersecting with the VA space, 2367 * it is not safe to continue further. 2368 */ 2369 kvm_err("IDMAP intersecting with HYP VA, unable to continue\n"); 2370 err = -EINVAL; 2371 goto out; 2372 } 2373 2374 hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order); 2375 if (!hyp_pgd) { 2376 kvm_err("Hyp mode PGD not allocated\n"); 2377 err = -ENOMEM; 2378 goto out; 2379 } 2380 2381 if (__kvm_cpu_uses_extended_idmap()) { 2382 boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 2383 hyp_pgd_order); 2384 if (!boot_hyp_pgd) { 2385 kvm_err("Hyp boot PGD not allocated\n"); 2386 err = -ENOMEM; 2387 goto out; 2388 } 2389 2390 err = kvm_map_idmap_text(boot_hyp_pgd); 2391 if (err) 2392 goto out; 2393 2394 merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); 2395 if (!merged_hyp_pgd) { 2396 kvm_err("Failed to allocate extra HYP pgd\n"); 2397 goto out; 2398 } 2399 __kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd, 2400 hyp_idmap_start); 2401 } else { 2402 err = kvm_map_idmap_text(hyp_pgd); 2403 if (err) 2404 goto out; 2405 } 2406 2407 io_map_base = hyp_idmap_start; 2408 return 0; 2409 out: 2410 free_hyp_pgds(); 2411 return err; 2412 } 2413 2414 void kvm_arch_commit_memory_region(struct kvm *kvm, 2415 const struct kvm_userspace_memory_region *mem, 2416 struct kvm_memory_slot *old, 2417 const struct kvm_memory_slot *new, 2418 enum kvm_mr_change change) 2419 { 2420 /* 2421 * At this point memslot has been committed and there is an 2422 * allocated dirty_bitmap[], dirty pages will be tracked while the 2423 * memory slot is write protected. 2424 */ 2425 if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) { 2426 /* 2427 * If we're with initial-all-set, we don't need to write 2428 * protect any pages because they're all reported as dirty. 2429 * Huge pages and normal pages will be write protect gradually. 2430 */ 2431 if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) { 2432 kvm_mmu_wp_memory_region(kvm, mem->slot); 2433 } 2434 } 2435 } 2436 2437 int kvm_arch_prepare_memory_region(struct kvm *kvm, 2438 struct kvm_memory_slot *memslot, 2439 const struct kvm_userspace_memory_region *mem, 2440 enum kvm_mr_change change) 2441 { 2442 hva_t hva = mem->userspace_addr; 2443 hva_t reg_end = hva + mem->memory_size; 2444 bool writable = !(mem->flags & KVM_MEM_READONLY); 2445 int ret = 0; 2446 2447 if (change != KVM_MR_CREATE && change != KVM_MR_MOVE && 2448 change != KVM_MR_FLAGS_ONLY) 2449 return 0; 2450 2451 /* 2452 * Prevent userspace from creating a memory region outside of the IPA 2453 * space addressable by the KVM guest IPA space. 2454 */ 2455 if (memslot->base_gfn + memslot->npages >= 2456 (kvm_phys_size(kvm) >> PAGE_SHIFT)) 2457 return -EFAULT; 2458 2459 mmap_read_lock(current->mm); 2460 /* 2461 * A memory region could potentially cover multiple VMAs, and any holes 2462 * between them, so iterate over all of them to find out if we can map 2463 * any of them right now. 2464 * 2465 * +--------------------------------------------+ 2466 * +---------------+----------------+ +----------------+ 2467 * | : VMA 1 | VMA 2 | | VMA 3 : | 2468 * +---------------+----------------+ +----------------+ 2469 * | memory region | 2470 * +--------------------------------------------+ 2471 */ 2472 do { 2473 struct vm_area_struct *vma = find_vma(current->mm, hva); 2474 hva_t vm_start, vm_end; 2475 2476 if (!vma || vma->vm_start >= reg_end) 2477 break; 2478 2479 /* 2480 * Take the intersection of this VMA with the memory region 2481 */ 2482 vm_start = max(hva, vma->vm_start); 2483 vm_end = min(reg_end, vma->vm_end); 2484 2485 if (vma->vm_flags & VM_PFNMAP) { 2486 gpa_t gpa = mem->guest_phys_addr + 2487 (vm_start - mem->userspace_addr); 2488 phys_addr_t pa; 2489 2490 pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT; 2491 pa += vm_start - vma->vm_start; 2492 2493 /* IO region dirty page logging not allowed */ 2494 if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) { 2495 ret = -EINVAL; 2496 goto out; 2497 } 2498 2499 ret = kvm_phys_addr_ioremap(kvm, gpa, pa, 2500 vm_end - vm_start, 2501 writable); 2502 if (ret) 2503 break; 2504 } 2505 hva = vm_end; 2506 } while (hva < reg_end); 2507 2508 if (change == KVM_MR_FLAGS_ONLY) 2509 goto out; 2510 2511 spin_lock(&kvm->mmu_lock); 2512 if (ret) 2513 unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size); 2514 else 2515 stage2_flush_memslot(kvm, memslot); 2516 spin_unlock(&kvm->mmu_lock); 2517 out: 2518 mmap_read_unlock(current->mm); 2519 return ret; 2520 } 2521 2522 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) 2523 { 2524 } 2525 2526 void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) 2527 { 2528 } 2529 2530 void kvm_arch_flush_shadow_all(struct kvm *kvm) 2531 { 2532 kvm_free_stage2_pgd(kvm); 2533 } 2534 2535 void kvm_arch_flush_shadow_memslot(struct kvm *kvm, 2536 struct kvm_memory_slot *slot) 2537 { 2538 gpa_t gpa = slot->base_gfn << PAGE_SHIFT; 2539 phys_addr_t size = slot->npages << PAGE_SHIFT; 2540 2541 spin_lock(&kvm->mmu_lock); 2542 unmap_stage2_range(kvm, gpa, size); 2543 spin_unlock(&kvm->mmu_lock); 2544 } 2545 2546 /* 2547 * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized). 2548 * 2549 * Main problems: 2550 * - S/W ops are local to a CPU (not broadcast) 2551 * - We have line migration behind our back (speculation) 2552 * - System caches don't support S/W at all (damn!) 2553 * 2554 * In the face of the above, the best we can do is to try and convert 2555 * S/W ops to VA ops. Because the guest is not allowed to infer the 2556 * S/W to PA mapping, it can only use S/W to nuke the whole cache, 2557 * which is a rather good thing for us. 2558 * 2559 * Also, it is only used when turning caches on/off ("The expected 2560 * usage of the cache maintenance instructions that operate by set/way 2561 * is associated with the cache maintenance instructions associated 2562 * with the powerdown and powerup of caches, if this is required by 2563 * the implementation."). 2564 * 2565 * We use the following policy: 2566 * 2567 * - If we trap a S/W operation, we enable VM trapping to detect 2568 * caches being turned on/off, and do a full clean. 2569 * 2570 * - We flush the caches on both caches being turned on and off. 2571 * 2572 * - Once the caches are enabled, we stop trapping VM ops. 2573 */ 2574 void kvm_set_way_flush(struct kvm_vcpu *vcpu) 2575 { 2576 unsigned long hcr = *vcpu_hcr(vcpu); 2577 2578 /* 2579 * If this is the first time we do a S/W operation 2580 * (i.e. HCR_TVM not set) flush the whole memory, and set the 2581 * VM trapping. 2582 * 2583 * Otherwise, rely on the VM trapping to wait for the MMU + 2584 * Caches to be turned off. At that point, we'll be able to 2585 * clean the caches again. 2586 */ 2587 if (!(hcr & HCR_TVM)) { 2588 trace_kvm_set_way_flush(*vcpu_pc(vcpu), 2589 vcpu_has_cache_enabled(vcpu)); 2590 stage2_flush_vm(vcpu->kvm); 2591 *vcpu_hcr(vcpu) = hcr | HCR_TVM; 2592 } 2593 } 2594 2595 void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled) 2596 { 2597 bool now_enabled = vcpu_has_cache_enabled(vcpu); 2598 2599 /* 2600 * If switching the MMU+caches on, need to invalidate the caches. 2601 * If switching it off, need to clean the caches. 2602 * Clean + invalidate does the trick always. 2603 */ 2604 if (now_enabled != was_enabled) 2605 stage2_flush_vm(vcpu->kvm); 2606 2607 /* Caches are now on, stop trapping VM ops (until a S/W op) */ 2608 if (now_enabled) 2609 *vcpu_hcr(vcpu) &= ~HCR_TVM; 2610 2611 trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled); 2612 } 2613