// SPDX-License-Identifier: GPL-2.0-only
/*
 *
 * Copyright 2016 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 */

#include <linux/types.h>
#include <linux/string.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/debugfs.h>
#include <linux/pgtable.h>

#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include <asm/page.h>
#include <asm/mmu.h>
#include <asm/pgalloc.h>
#include <asm/pte-walk.h>
#include <asm/ultravisor.h>
#include <asm/kvm_book3s_uvmem.h>
#include <asm/plpar_wrappers.h>

/*
 * Supported radix tree geometry.
 * Like p9, we support either 5 or 9 bits at the first (lowest) level,
 * for a page size of 64k or 4k.
 */
static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 };

unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
                                              gva_t eaddr, void *to, void *from,
                                              unsigned long n)
{
        int old_pid, old_lpid;
        unsigned long quadrant, ret = n;
        bool is_load = !!to;

        /* Can't access quadrants 1 or 2 in non-HV mode, call the HV to do it */
        if (kvmhv_on_pseries())
                return plpar_hcall_norets(H_COPY_TOFROM_GUEST, lpid, pid, eaddr,
                                          (to != NULL) ? __pa(to) : 0,
                                          (from != NULL) ? __pa(from) : 0, n);

        quadrant = 1;
        if (!pid)
                quadrant = 2;
        if (is_load)
                from = (void *) (eaddr | (quadrant << 62));
        else
                to = (void *) (eaddr | (quadrant << 62));

        preempt_disable();

        /* switch the lpid first to avoid running host with unallocated pid */
        old_lpid = mfspr(SPRN_LPID);
        if (old_lpid != lpid)
                mtspr(SPRN_LPID, lpid);
        if (quadrant == 1) {
                old_pid = mfspr(SPRN_PID);
                if (old_pid != pid)
                        mtspr(SPRN_PID, pid);
        }
        isync();

        if (is_load)
                ret = copy_from_user_nofault(to, (const void __user *)from, n);
        else
                ret = copy_to_user_nofault((void __user *)to, from, n);

        /* switch the pid first to avoid running host with unallocated pid */
        if (quadrant == 1 && pid != old_pid)
                mtspr(SPRN_PID, old_pid);
        if (lpid != old_lpid)
                mtspr(SPRN_LPID, old_lpid);
        isync();

        preempt_enable();

        return ret;
}
EXPORT_SYMBOL_GPL(__kvmhv_copy_tofrom_guest_radix);

static long kvmhv_copy_tofrom_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr,
                                          void *to, void *from, unsigned long n)
{
        int lpid = vcpu->kvm->arch.lpid;
        int pid = vcpu->arch.pid;

        /* This would cause a data segment intr so don't allow the access */
        if (eaddr & (0x3FFUL << 52))
                return -EINVAL;

        /* Should we be using the nested lpid */
        if (vcpu->arch.nested)
                lpid = vcpu->arch.nested->shadow_lpid;

        /* If accessing quadrant 3 then pid is expected to be 0 */
        if (((eaddr >> 62) & 0x3) == 0x3)
                pid = 0;

        eaddr &= ~(0xFFFUL << 52);

        return __kvmhv_copy_tofrom_guest_radix(lpid, pid, eaddr, to, from, n);
}

long kvmhv_copy_from_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *to,
                                 unsigned long n)
{
        long ret;

        ret = kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, to, NULL, n);
        if (ret > 0)
                memset(to + (n - ret), 0, ret);

        return ret;
}
EXPORT_SYMBOL_GPL(kvmhv_copy_from_guest_radix);

long kvmhv_copy_to_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *from,
                               unsigned long n)
{
        return kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, NULL, from, n);
}
EXPORT_SYMBOL_GPL(kvmhv_copy_to_guest_radix);
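
/*
 * Walk the guest radix tree rooted at @root (the doubleword taken from a
 * partition- or process-table entry) for effective address @eaddr, and
 * fill in @gpte with the translated real address, page size/shift and
 * permission bits.  Returns 0 on success, -EINVAL for an unsupported
 * geometry, -ENOENT if no valid leaf PTE is found, or the error from
 * kvm_read_guest(); on a read error *pte_ret_p (if non-NULL) is set to
 * the guest address of the entry that could not be read, and on success
 * it is set to the raw leaf PTE.
 */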
int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
                               struct kvmppc_pte *gpte, u64 root,
                               u64 *pte_ret_p)
{
        struct kvm *kvm = vcpu->kvm;
        int ret, level, ps;
        unsigned long rts, bits, offset, index;
        u64 pte, base, gpa;
        __be64 rpte;

        rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) |
                ((root & RTS2_MASK) >> RTS2_SHIFT);
        bits = root & RPDS_MASK;
        base = root & RPDB_MASK;

        offset = rts + 31;

        /* Current implementations only support 52-bit space */
        if (offset != 52)
                return -EINVAL;

        /* Walk each level of the radix tree */
        for (level = 3; level >= 0; --level) {
                u64 addr;
                /* Check a valid size */
                if (level && bits != p9_supported_radix_bits[level])
                        return -EINVAL;
                if (level == 0 && !(bits == 5 || bits == 9))
                        return -EINVAL;
                offset -= bits;
                index = (eaddr >> offset) & ((1UL << bits) - 1);
                /* Check that low bits of page table base are zero */
                if (base & ((1UL << (bits + 3)) - 1))
                        return -EINVAL;
                /* Read the entry from guest memory */
                addr = base + (index * sizeof(rpte));
                vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
                ret = kvm_read_guest(kvm, addr, &rpte, sizeof(rpte));
                srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
                if (ret) {
                        if (pte_ret_p)
                                *pte_ret_p = addr;
                        return ret;
                }
                pte = __be64_to_cpu(rpte);
                if (!(pte & _PAGE_PRESENT))
                        return -ENOENT;
                /* Check if a leaf entry */
                if (pte & _PAGE_PTE)
                        break;
                /* Get ready to walk the next level */
                base = pte & RPDB_MASK;
                bits = pte & RPDS_MASK;
        }

        /* Need a leaf at lowest level; 512GB pages not supported */
        if (level < 0 || level == 3)
                return -EINVAL;

        /* We found a valid leaf PTE */
        /* Offset is now log base 2 of the page size */
        gpa = pte & 0x01fffffffffff000ul;
        if (gpa & ((1ul << offset) - 1))
                return -EINVAL;
        gpa |= eaddr & ((1ul << offset) - 1);
        for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps)
                if (offset == mmu_psize_defs[ps].shift)
                        break;
        gpte->page_size = ps;
        gpte->page_shift = offset;

        gpte->eaddr = eaddr;
        gpte->raddr = gpa;

        /* Work out permissions */
        gpte->may_read = !!(pte & _PAGE_READ);
        gpte->may_write = !!(pte & _PAGE_WRITE);
        gpte->may_execute = !!(pte & _PAGE_EXEC);

        gpte->rc = pte & (_PAGE_ACCESSED | _PAGE_DIRTY);

        if (pte_ret_p)
                *pte_ret_p = pte;

        return 0;
}

/*
 * Used to walk a partition or process table radix tree in guest memory
 * Note: We exploit the fact that a partition table and a process
 * table have the same layout, a partition-scoped page table and a
 * process-scoped page table have the same layout, and the 2nd
 * doubleword of a partition table entry has the same layout as
 * the PTCR register.
 */
int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
                                     struct kvmppc_pte *gpte, u64 table,
                                     int table_index, u64 *pte_ret_p)
{
        struct kvm *kvm = vcpu->kvm;
        int ret;
        unsigned long size, ptbl, root;
        struct prtb_entry entry;

        if ((table & PRTS_MASK) > 24)
                return -EINVAL;
        size = 1ul << ((table & PRTS_MASK) + 12);

        /* Is the table big enough to contain this entry? */
        if ((table_index * sizeof(entry)) >= size)
                return -EINVAL;

        /* Read the table to find the root of the radix tree */
        ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry));
        vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
        ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry));
        srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
        if (ret)
                return ret;

        /* Root is stored in the first double word */
        root = be64_to_cpu(entry.prtb0);

        return kvmppc_mmu_walk_radix_tree(vcpu, eaddr, gpte, root, pte_ret_p);
}

int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
                           struct kvmppc_pte *gpte, bool data, bool iswrite)
{
        u32 pid;
        u64 pte;
        int ret;

        /* Work out effective PID */
        switch (eaddr >> 62) {
        case 0:
                pid = vcpu->arch.pid;
                break;
        case 3:
                pid = 0;
                break;
        default:
                return -EINVAL;
        }

        ret = kvmppc_mmu_radix_translate_table(vcpu, eaddr, gpte,
                                vcpu->kvm->arch.process_table, pid, &pte);
        if (ret)
                return ret;

        /* Check privilege (applies only to process scoped translations) */
        if (kvmppc_get_msr(vcpu) & MSR_PR) {
                if (pte & _PAGE_PRIVILEGED) {
                        gpte->may_read = 0;
                        gpte->may_write = 0;
                        gpte->may_execute = 0;
                }
        } else {
                if (!(pte & _PAGE_PRIVILEGED)) {
                        /* Check AMR/IAMR to see if strict mode is in force */
                        if (vcpu->arch.amr & (1ul << 62))
                                gpte->may_read = 0;
                        if (vcpu->arch.amr & (1ul << 63))
                                gpte->may_write = 0;
                        if (vcpu->arch.iamr & (1ul << 62))
                                gpte->may_execute = 0;
                }
        }

        return 0;
}

void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
                             unsigned int pshift, unsigned int lpid)
{
        unsigned long psize = PAGE_SIZE;
        int psi;
        long rc;
        unsigned long rb;

        if (pshift)
                psize = 1UL << pshift;
        else
                pshift = PAGE_SHIFT;

        addr &= ~(psize - 1);

        if (!kvmhv_on_pseries()) {
                radix__flush_tlb_lpid_page(lpid, addr, psize);
                return;
        }

        psi = shift_to_mmu_psize(pshift);

        if (!firmware_has_feature(FW_FEATURE_RPT_INVALIDATE)) {
                rb = addr | (mmu_get_ap(psi) << PPC_BITLSHIFT(58));
                rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 1),
                                        lpid, rb);
        } else {
                rc = pseries_rpt_invalidate(lpid, H_RPTI_TARGET_CMMU,
                                            H_RPTI_TYPE_NESTED |
                                            H_RPTI_TYPE_TLB,
                                            psize_to_rpti_pgsize(psi),
                                            addr, addr + psize);
        }

        if (rc)
                pr_err("KVM: TLB page invalidation hcall failed, rc=%ld\n", rc);
}

static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned int lpid)
{
        long rc;

        if (!kvmhv_on_pseries()) {
                radix__flush_pwc_lpid(lpid);
                return;
        }

        if (!firmware_has_feature(FW_FEATURE_RPT_INVALIDATE))
                rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(1, 0, 1),
                                        lpid, TLBIEL_INVAL_SET_LPID);
        else
                rc = pseries_rpt_invalidate(lpid, H_RPTI_TARGET_CMMU,
                                            H_RPTI_TYPE_NESTED |
                                            H_RPTI_TYPE_PWC, H_RPTI_PAGE_ALL,
                                            0, -1UL);
        if (rc)
                pr_err("KVM: TLB PWC invalidation hcall failed, rc=%ld\n", rc);
}

static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
                                             unsigned long clr, unsigned long set,
                                             unsigned long addr, unsigned int shift)
{
        return __radix_pte_update(ptep, clr, set);
}

static void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr,
                                    pte_t *ptep, pte_t pte)
{
        radix__set_pte_at(kvm->mm, addr, ptep, pte, 0);
}
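
/*
 * Partition-scoped PTE and PMD pages are carved from dedicated kmem
 * caches (created in kvmppc_radix_init()) rather than the host's normal
 * page table allocators, sized by RADIX_PTE_INDEX_SIZE and
 * RADIX_PMD_INDEX_SIZE; the PUD and PGD levels still come from
 * pud_alloc_one() and pgd_alloc() respectively.
 */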
static struct kmem_cache *kvm_pte_cache;
static struct kmem_cache *kvm_pmd_cache;

static pte_t *kvmppc_pte_alloc(void)
{
        pte_t *pte;

        pte = kmem_cache_alloc(kvm_pte_cache, GFP_KERNEL);
        /* pmd_populate() will only reference _pa(pte). */
        kmemleak_ignore(pte);

        return pte;
}

static void kvmppc_pte_free(pte_t *ptep)
{
        kmem_cache_free(kvm_pte_cache, ptep);
}

static pmd_t *kvmppc_pmd_alloc(void)
{
        pmd_t *pmd;

        pmd = kmem_cache_alloc(kvm_pmd_cache, GFP_KERNEL);
        /* pud_populate() will only reference _pa(pmd). */
        kmemleak_ignore(pmd);

        return pmd;
}

static void kvmppc_pmd_free(pmd_t *pmdp)
{
        kmem_cache_free(kvm_pmd_cache, pmdp);
}

/* Called with kvm->mmu_lock held */
void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
                      unsigned int shift,
                      const struct kvm_memory_slot *memslot,
                      unsigned int lpid)
{
        unsigned long old;
        unsigned long gfn = gpa >> PAGE_SHIFT;
        unsigned long page_size = PAGE_SIZE;
        unsigned long hpa;

        old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift);
        kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid);

        /* The following only applies to L1 entries */
        if (lpid != kvm->arch.lpid)
                return;

        if (!memslot) {
                memslot = gfn_to_memslot(kvm, gfn);
                if (!memslot)
                        return;
        }
        if (shift) {    /* 1GB or 2MB page */
                page_size = 1ul << shift;
                if (shift == PMD_SHIFT)
                        kvm->stat.num_2M_pages--;
                else if (shift == PUD_SHIFT)
                        kvm->stat.num_1G_pages--;
        }

        gpa &= ~(page_size - 1);
        hpa = old & PTE_RPN_MASK;
        kvmhv_remove_nest_rmap_range(kvm, memslot, gpa, hpa, page_size);

        if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap)
                kvmppc_update_dirty_map(memslot, gfn, page_size);
}

/*
 * The kvmppc_unmap_free_p?d functions below free existing page tables,
 * recursively descending to clear and free child tables.
 * Callers are responsible for flushing the PWC.
 *
 * When page tables are being unmapped/freed as part of the page fault path
 * (full == false), valid ptes are generally not expected; however, there
 * is one situation where they arise, which is when dirty page logging is
 * turned off for a memslot while the VM is running.  The new memslot
 * becomes visible to page faults before the memslot commit function
 * gets to flush the memslot, which can lead to a 2MB page mapping being
 * installed for a guest physical address where there are already 64kB
 * (or 4kB) mappings (of sub-pages of the same 2MB page).
 */
static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full,
                                  unsigned int lpid)
{
        if (full) {
                memset(pte, 0, sizeof(long) << RADIX_PTE_INDEX_SIZE);
        } else {
                pte_t *p = pte;
                unsigned long it;

                for (it = 0; it < PTRS_PER_PTE; ++it, ++p) {
                        if (pte_val(*p) == 0)
                                continue;
                        kvmppc_unmap_pte(kvm, p,
                                         pte_pfn(*p) << PAGE_SHIFT,
                                         PAGE_SHIFT, NULL, lpid);
                }
        }

        kvmppc_pte_free(pte);
}

static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full,
                                  unsigned int lpid)
{
        unsigned long im;
        pmd_t *p = pmd;

        for (im = 0; im < PTRS_PER_PMD; ++im, ++p) {
                if (!pmd_present(*p))
                        continue;
                if (pmd_is_leaf(*p)) {
                        if (full) {
                                pmd_clear(p);
                        } else {
                                WARN_ON_ONCE(1);
                                kvmppc_unmap_pte(kvm, (pte_t *)p,
                                                 pte_pfn(*(pte_t *)p) << PAGE_SHIFT,
                                                 PMD_SHIFT, NULL, lpid);
                        }
                } else {
                        pte_t *pte;

                        pte = pte_offset_map(p, 0);
                        kvmppc_unmap_free_pte(kvm, pte, full, lpid);
                        pmd_clear(p);
                }
        }
        kvmppc_pmd_free(pmd);
}

static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud,
                                  unsigned int lpid)
{
        unsigned long iu;
        pud_t *p = pud;

        for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++p) {
                if (!pud_present(*p))
                        continue;
                if (pud_is_leaf(*p)) {
                        pud_clear(p);
                } else {
                        pmd_t *pmd;

                        pmd = pmd_offset(p, 0);
                        kvmppc_unmap_free_pmd(kvm, pmd, true, lpid);
                        pud_clear(p);
                }
        }
        pud_free(kvm->mm, pud);
}

void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid)
{
        unsigned long ig;

        for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
                p4d_t *p4d = p4d_offset(pgd, 0);
                pud_t *pud;

                if (!p4d_present(*p4d))
                        continue;
                pud = pud_offset(p4d, 0);
                kvmppc_unmap_free_pud(kvm, pud, lpid);
                p4d_clear(p4d);
        }
}

void kvmppc_free_radix(struct kvm *kvm)
{
        if (kvm->arch.pgtable) {
                kvmppc_free_pgtable_radix(kvm, kvm->arch.pgtable,
                                          kvm->arch.lpid);
                pgd_free(kvm->mm, kvm->arch.pgtable);
                kvm->arch.pgtable = NULL;
        }
}

static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
                                unsigned long gpa, unsigned int lpid)
{
        pte_t *pte = pte_offset_kernel(pmd, 0);

        /*
         * Clearing the pmd entry then flushing the PWC ensures that the pte
         * page will no longer be cached by the MMU, so it can be freed
         * without flushing the PWC again.
         */
        pmd_clear(pmd);
        kvmppc_radix_flush_pwc(kvm, lpid);

        kvmppc_unmap_free_pte(kvm, pte, false, lpid);
}

static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
                                unsigned long gpa, unsigned int lpid)
{
        pmd_t *pmd = pmd_offset(pud, 0);

        /*
         * Clearing the pud entry then flushing the PWC ensures that the pmd
         * page and any child pte pages will no longer be cached by the MMU,
         * so they can be freed without flushing the PWC again.
         */
        pud_clear(pud);
        kvmppc_radix_flush_pwc(kvm, lpid);

        kvmppc_unmap_free_pmd(kvm, pmd, false, lpid);
}

/*
 * There are a number of bits which may differ between different faults to
 * the same partition-scoped entry: the RC bits, in the course of cleaning
 * and aging, and the write bit, which can change either because the access
 * was upgraded or because a read fault happened concurrently with a write
 * fault that set those bits first.
 */
#define PTE_BITS_MUST_MATCH (~(_PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED))
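
/*
 * Insert @pte into the partition-scoped tree for @lpid at @gpa.
 * @level selects the target page size: 0 for a PAGE_SIZE pte, 1 for a
 * 2MB (PMD-level) mapping, 2 for a 1GB (PUD-level) mapping.  Missing
 * intermediate levels are allocated before taking the mmu_lock and are
 * freed at the end if they turn out not to be needed.  Returns -EAGAIN
 * if an MMU notifier invalidation or a racing large-page insertion
 * means the caller should retry, and -ENOMEM if a needed level could
 * not be allocated.
 */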
int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
                      unsigned long gpa, unsigned int level,
                      unsigned long mmu_seq, unsigned int lpid,
                      unsigned long *rmapp, struct rmap_nested **n_rmap)
{
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud, *new_pud = NULL;
        pmd_t *pmd, *new_pmd = NULL;
        pte_t *ptep, *new_ptep = NULL;
        int ret;

        /* Traverse the guest's 2nd-level tree, allocate new levels needed */
        pgd = pgtable + pgd_index(gpa);
        p4d = p4d_offset(pgd, gpa);

        pud = NULL;
        if (p4d_present(*p4d))
                pud = pud_offset(p4d, gpa);
        else
                new_pud = pud_alloc_one(kvm->mm, gpa);

        pmd = NULL;
        if (pud && pud_present(*pud) && !pud_is_leaf(*pud))
                pmd = pmd_offset(pud, gpa);
        else if (level <= 1)
                new_pmd = kvmppc_pmd_alloc();

        if (level == 0 && !(pmd && pmd_present(*pmd) && !pmd_is_leaf(*pmd)))
                new_ptep = kvmppc_pte_alloc();

        /* Check if we might have been invalidated; let the guest retry if so */
        spin_lock(&kvm->mmu_lock);
        ret = -EAGAIN;
        if (mmu_notifier_retry(kvm, mmu_seq))
                goto out_unlock;

        /* Now traverse again under the lock and change the tree */
        ret = -ENOMEM;
        if (p4d_none(*p4d)) {
                if (!new_pud)
                        goto out_unlock;
                p4d_populate(kvm->mm, p4d, new_pud);
                new_pud = NULL;
        }
        pud = pud_offset(p4d, gpa);
        if (pud_is_leaf(*pud)) {
                unsigned long hgpa = gpa & PUD_MASK;

                /* Check if we raced and someone else has set the same thing */
                if (level == 2) {
                        if (pud_raw(*pud) == pte_raw(pte)) {
                                ret = 0;
                                goto out_unlock;
                        }
                        /* Valid 1GB page here already, add our extra bits */
                        WARN_ON_ONCE((pud_val(*pud) ^ pte_val(pte)) &
                                     PTE_BITS_MUST_MATCH);
                        kvmppc_radix_update_pte(kvm, (pte_t *)pud,
                                        0, pte_val(pte), hgpa, PUD_SHIFT);
                        ret = 0;
                        goto out_unlock;
                }
                /*
                 * If we raced with another CPU which has just put
                 * a 1GB pte in after we saw a pmd page, try again.
                 */
                if (!new_pmd) {
                        ret = -EAGAIN;
                        goto out_unlock;
                }
                /* Valid 1GB page here already, remove it */
                kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT, NULL,
                                 lpid);
        }
        if (level == 2) {
                if (!pud_none(*pud)) {
                        /*
                         * There's a page table page here, but we wanted to
                         * install a large page, so remove and free the page
                         * table page.
                         */
                        kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa, lpid);
                }
                kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
                if (rmapp && n_rmap)
                        kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
                ret = 0;
                goto out_unlock;
        }
        if (pud_none(*pud)) {
                if (!new_pmd)
                        goto out_unlock;
                pud_populate(kvm->mm, pud, new_pmd);
                new_pmd = NULL;
        }
        pmd = pmd_offset(pud, gpa);
        if (pmd_is_leaf(*pmd)) {
                unsigned long lgpa = gpa & PMD_MASK;

                /* Check if we raced and someone else has set the same thing */
                if (level == 1) {
                        if (pmd_raw(*pmd) == pte_raw(pte)) {
                                ret = 0;
                                goto out_unlock;
                        }
                        /* Valid 2MB page here already, add our extra bits */
                        WARN_ON_ONCE((pmd_val(*pmd) ^ pte_val(pte)) &
                                     PTE_BITS_MUST_MATCH);
                        kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd),
                                        0, pte_val(pte), lgpa, PMD_SHIFT);
                        ret = 0;
                        goto out_unlock;
                }

                /*
                 * If we raced with another CPU which has just put
                 * a 2MB pte in after we saw a pte page, try again.
                 */
                if (!new_ptep) {
                        ret = -EAGAIN;
                        goto out_unlock;
                }
                /* Valid 2MB page here already, remove it */
                kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT, NULL,
                                 lpid);
        }
        if (level == 1) {
                if (!pmd_none(*pmd)) {
                        /*
                         * There's a page table page here, but we wanted to
                         * install a large page, so remove and free the page
                         * table page.
                         */
                        kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa, lpid);
                }
                kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
                if (rmapp && n_rmap)
                        kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
                ret = 0;
                goto out_unlock;
        }
        if (pmd_none(*pmd)) {
                if (!new_ptep)
                        goto out_unlock;
                pmd_populate(kvm->mm, pmd, new_ptep);
                new_ptep = NULL;
        }
        ptep = pte_offset_kernel(pmd, gpa);
        if (pte_present(*ptep)) {
                /* Check if someone else set the same thing */
                if (pte_raw(*ptep) == pte_raw(pte)) {
                        ret = 0;
                        goto out_unlock;
                }
                /* Valid page here already, add our extra bits */
                WARN_ON_ONCE((pte_val(*ptep) ^ pte_val(pte)) &
                             PTE_BITS_MUST_MATCH);
                kvmppc_radix_update_pte(kvm, ptep, 0, pte_val(pte), gpa, 0);
                ret = 0;
                goto out_unlock;
        }
        kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
        if (rmapp && n_rmap)
                kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
        ret = 0;

 out_unlock:
        spin_unlock(&kvm->mmu_lock);
        if (new_pud)
                pud_free(kvm->mm, new_pud);
        if (new_pmd)
                kvmppc_pmd_free(new_pmd);
        if (new_ptep)
                kvmppc_pte_free(new_ptep);
        return ret;
}

bool kvmppc_hv_handle_set_rc(struct kvm *kvm, bool nested, bool writing,
                             unsigned long gpa, unsigned int lpid)
{
        unsigned long pgflags;
        unsigned int shift;
        pte_t *ptep;

        /*
         * Need to set an R or C bit in the 2nd-level tables;
         * since we are just helping out the hardware here,
         * it is sufficient to do what the hardware does.
         */
        pgflags = _PAGE_ACCESSED;
        if (writing)
                pgflags |= _PAGE_DIRTY;

        if (nested)
                ptep = find_kvm_nested_guest_pte(kvm, lpid, gpa, &shift);
        else
                ptep = find_kvm_secondary_pte(kvm, gpa, &shift);

        if (ptep && pte_present(*ptep) && (!writing || pte_write(*ptep))) {
                kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, gpa, shift);
                return true;
        }
        return false;
}
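
/*
 * Find or fault in the host page backing @gpa in @memslot and install a
 * partition-scoped mapping for it, using a 2MB or 1GB page when the
 * backing host PTE is that large, the guest physical and host virtual
 * addresses are congruent, and dirty logging is off for the memslot.
 * The PTE value used and its level are passed back through
 * @inserted_pte and @levelp when those pointers are non-NULL.
 */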
int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
                                   unsigned long gpa,
                                   struct kvm_memory_slot *memslot,
                                   bool writing, bool kvm_ro,
                                   pte_t *inserted_pte, unsigned int *levelp)
{
        struct kvm *kvm = vcpu->kvm;
        struct page *page = NULL;
        unsigned long mmu_seq;
        unsigned long hva, gfn = gpa >> PAGE_SHIFT;
        bool upgrade_write = false;
        bool *upgrade_p = &upgrade_write;
        pte_t pte, *ptep;
        unsigned int shift, level;
        int ret;
        bool large_enable;

        /* used to check for invalidations in progress */
        mmu_seq = kvm->mmu_notifier_seq;
        smp_rmb();

        /*
         * Do a fast check first, since __gfn_to_pfn_memslot doesn't
         * do it with !atomic && !async, which is how we call it.
         * We always ask for write permission since the common case
         * is that the page is writable.
         */
        hva = gfn_to_hva_memslot(memslot, gfn);
        if (!kvm_ro && get_user_page_fast_only(hva, FOLL_WRITE, &page)) {
                upgrade_write = true;
        } else {
                unsigned long pfn;

                /* Call KVM generic code to do the slow-path check */
                pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
                                           writing, upgrade_p, NULL);
                if (is_error_noslot_pfn(pfn))
                        return -EFAULT;
                page = NULL;
                if (pfn_valid(pfn)) {
                        page = pfn_to_page(pfn);
                        if (PageReserved(page))
                                page = NULL;
                }
        }

        /*
         * Read the PTE from the process' radix tree and use that
         * so we get the shift and attribute bits.
         */
        spin_lock(&kvm->mmu_lock);
        ptep = find_kvm_host_pte(kvm, mmu_seq, hva, &shift);
        pte = __pte(0);
        if (ptep)
                pte = READ_ONCE(*ptep);
        spin_unlock(&kvm->mmu_lock);
        /*
         * If the PTE disappeared temporarily due to a THP
         * collapse, just return and let the guest try again.
         */
        if (!pte_present(pte)) {
                if (page)
                        put_page(page);
                return RESUME_GUEST;
        }

        /* If we're logging dirty pages, always map single pages */
        large_enable = !(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES);

        /* Get pte level from shift/size */
        if (large_enable && shift == PUD_SHIFT &&
            (gpa & (PUD_SIZE - PAGE_SIZE)) ==
            (hva & (PUD_SIZE - PAGE_SIZE))) {
                level = 2;
        } else if (large_enable && shift == PMD_SHIFT &&
                   (gpa & (PMD_SIZE - PAGE_SIZE)) ==
                   (hva & (PMD_SIZE - PAGE_SIZE))) {
                level = 1;
        } else {
                level = 0;
                if (shift > PAGE_SHIFT) {
                        /*
                         * If the pte maps more than one page, bring over
                         * bits from the virtual address to get the real
                         * address of the specific single page we want.
                         */
                        unsigned long rpnmask = (1ul << shift) - PAGE_SIZE;
                        pte = __pte(pte_val(pte) | (hva & rpnmask));
                }
        }

        pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED);
        if (writing || upgrade_write) {
                if (pte_val(pte) & _PAGE_WRITE)
                        pte = __pte(pte_val(pte) | _PAGE_DIRTY);
        } else {
                pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY));
        }

        /* Allocate space in the tree and write the PTE */
        ret = kvmppc_create_pte(kvm, kvm->arch.pgtable, pte, gpa, level,
                                mmu_seq, kvm->arch.lpid, NULL, NULL);
        if (inserted_pte)
                *inserted_pte = pte;
        if (levelp)
                *levelp = level;

        if (page) {
                if (!ret && (pte_val(pte) & _PAGE_WRITE))
                        set_page_dirty_lock(page);
                put_page(page);
        }

        /* Increment number of large pages if we (successfully) inserted one */
        if (!ret) {
                if (level == 1)
                        kvm->stat.num_2M_pages++;
                else if (level == 2)
                        kvm->stat.num_1G_pages++;
        }

        return ret;
}
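
/*
 * Handle a hypervisor data storage interrupt for a radix guest: bad
 * accesses are reflected back to the guest as DSIs, faults on emulated
 * MMIO regions go to the MMIO emulation path, RC-update faults are
 * fixed up in place, and everything else is resolved by instantiating a
 * partition-scoped mapping.  Returns RESUME_GUEST or another resume
 * code / error for the run loop.
 */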
int kvmppc_book3s_radix_page_fault(struct kvm_vcpu *vcpu,
                                   unsigned long ea, unsigned long dsisr)
{
        struct kvm *kvm = vcpu->kvm;
        unsigned long gpa, gfn;
        struct kvm_memory_slot *memslot;
        long ret;
        bool writing = !!(dsisr & DSISR_ISSTORE);
        bool kvm_ro = false;

        /* Check for unusual errors */
        if (dsisr & DSISR_UNSUPP_MMU) {
                pr_err("KVM: Got unsupported MMU fault\n");
                return -EFAULT;
        }
        if (dsisr & DSISR_BADACCESS) {
                /* Reflect to the guest as DSI */
                pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
                kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
                return RESUME_GUEST;
        }

        /* Translate the logical address */
        gpa = vcpu->arch.fault_gpa & ~0xfffUL;
        gpa &= ~0xF000000000000000ul;
        gfn = gpa >> PAGE_SHIFT;
        if (!(dsisr & DSISR_PRTABLE_FAULT))
                gpa |= ea & 0xfff;

        if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
                return kvmppc_send_page_to_uv(kvm, gfn);

        /* Get the corresponding memslot */
        memslot = gfn_to_memslot(kvm, gfn);

        /* No memslot means it's an emulated MMIO region */
        if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
                if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS |
                             DSISR_SET_RC)) {
                        /*
                         * Bad address in guest page table tree, or other
                         * unusual error - reflect it to the guest as DSI.
                         */
                        kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
                        return RESUME_GUEST;
                }
                return kvmppc_hv_emulate_mmio(vcpu, gpa, ea, writing);
        }

        if (memslot->flags & KVM_MEM_READONLY) {
                if (writing) {
                        /* give the guest a DSI */
                        kvmppc_core_queue_data_storage(vcpu, ea, DSISR_ISSTORE |
                                                       DSISR_PROTFAULT);
                        return RESUME_GUEST;
                }
                kvm_ro = true;
        }

        /* Failed to set the reference/change bits */
        if (dsisr & DSISR_SET_RC) {
                spin_lock(&kvm->mmu_lock);
                if (kvmppc_hv_handle_set_rc(kvm, false, writing,
                                            gpa, kvm->arch.lpid))
                        dsisr &= ~DSISR_SET_RC;
                spin_unlock(&kvm->mmu_lock);

                if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
                               DSISR_PROTFAULT | DSISR_SET_RC)))
                        return RESUME_GUEST;
        }

        /* Try to insert a pte */
        ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot, writing,
                                             kvm_ro, NULL, NULL);

        if (ret == 0 || ret == -EAGAIN)
                ret = RESUME_GUEST;
        return ret;
}

/* Called with kvm->mmu_lock held */
void kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
                     unsigned long gfn)
{
        pte_t *ptep;
        unsigned long gpa = gfn << PAGE_SHIFT;
        unsigned int shift;

        if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) {
                uv_page_inval(kvm->arch.lpid, gpa, PAGE_SHIFT);
                return;
        }

        ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
        if (ptep && pte_present(*ptep))
                kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
                                 kvm->arch.lpid);
}

/* Called with kvm->mmu_lock held */
bool kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
                   unsigned long gfn)
{
        pte_t *ptep;
        unsigned long gpa = gfn << PAGE_SHIFT;
        unsigned int shift;
        bool ref = false;
        unsigned long old, *rmapp;

        if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
                return ref;

        ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
        if (ptep && pte_present(*ptep) && pte_young(*ptep)) {
                old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0,
                                              gpa, shift);
                /* XXX need to flush tlb here? */
                /* Also clear bit in ptes in shadow pgtable for nested guests */
                rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
                kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_ACCESSED, 0,
                                               old & PTE_RPN_MASK,
                                               1UL << shift);
                ref = true;
        }
        return ref;
}

/* Called with kvm->mmu_lock held */
bool kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
                        unsigned long gfn)
{
        pte_t *ptep;
        unsigned long gpa = gfn << PAGE_SHIFT;
        unsigned int shift;
        bool ref = false;

        if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
                return ref;

        ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
        if (ptep && pte_present(*ptep) && pte_young(*ptep))
                ref = true;
        return ref;
}

/* Returns the number of PAGE_SIZE pages that are dirty */
static int kvm_radix_test_clear_dirty(struct kvm *kvm,
                                struct kvm_memory_slot *memslot, int pagenum)
{
        unsigned long gfn = memslot->base_gfn + pagenum;
        unsigned long gpa = gfn << PAGE_SHIFT;
        pte_t *ptep, pte;
        unsigned int shift;
        int ret = 0;
        unsigned long old, *rmapp;

        if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
                return ret;

        /*
         * For performance reasons we don't hold kvm->mmu_lock while walking
         * the partition scoped table.
         */
        ptep = find_kvm_secondary_pte_unlocked(kvm, gpa, &shift);
        if (!ptep)
                return 0;

        pte = READ_ONCE(*ptep);
        if (pte_present(pte) && pte_dirty(pte)) {
                spin_lock(&kvm->mmu_lock);
                /*
                 * Recheck the pte under the lock
                 */
                if (pte_val(pte) != pte_val(*ptep)) {
                        /*
                         * We have KVM_MEM_LOG_DIRTY_PAGES enabled. Hence we can
                         * only find PAGE_SIZE pte entries here. We can continue
                         * to use the pte addr returned by the above page table
                         * walk.
                         */
                        if (!pte_present(*ptep) || !pte_dirty(*ptep)) {
                                spin_unlock(&kvm->mmu_lock);
                                return 0;
                        }
                }

                ret = 1;
                VM_BUG_ON(shift);
                old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0,
                                              gpa, shift);
                kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid);
                /* Also clear bit in ptes in shadow pgtable for nested guests */
                rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
                kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_DIRTY, 0,
                                               old & PTE_RPN_MASK,
                                               1UL << shift);
                spin_unlock(&kvm->mmu_lock);
        }
        return ret;
}
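
/*
 * Harvest the dirty bits for @memslot into @map, one bit per PAGE_SIZE
 * page.  Dirty logging forces PAGE_SIZE mappings (see
 * kvmppc_book3s_instantiate_page()), so kvm_radix_test_clear_dirty() is
 * expected to report at most a single small page per iteration.
 */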
long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
                        struct kvm_memory_slot *memslot, unsigned long *map)
{
        unsigned long i, j;
        int npages;

        for (i = 0; i < memslot->npages; i = j) {
                npages = kvm_radix_test_clear_dirty(kvm, memslot, i);

                /*
                 * Note that if npages > 0 then i must be a multiple of npages,
                 * since huge pages are only used to back the guest at guest
                 * real addresses that are a multiple of their size.
                 * Since we have at most one PTE covering any given guest
                 * real address, if npages > 1 we can skip to i + npages.
                 */
                j = i + 1;
                if (npages) {
                        set_dirty_bits(map, i, npages);
                        j = i + npages;
                }
        }
        return 0;
}

void kvmppc_radix_flush_memslot(struct kvm *kvm,
                                const struct kvm_memory_slot *memslot)
{
        unsigned long n;
        pte_t *ptep;
        unsigned long gpa;
        unsigned int shift;

        if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START)
                kvmppc_uvmem_drop_pages(memslot, kvm, true);

        if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
                return;

        gpa = memslot->base_gfn << PAGE_SHIFT;
        spin_lock(&kvm->mmu_lock);
        for (n = memslot->npages; n; --n) {
                ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
                if (ptep && pte_present(*ptep))
                        kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
                                         kvm->arch.lpid);
                gpa += PAGE_SIZE;
        }
        /*
         * Increase the mmu notifier sequence number to prevent any page
         * fault that read the memslot earlier from writing a PTE.
         */
        kvm->mmu_notifier_seq++;
        spin_unlock(&kvm->mmu_lock);
}

static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info,
                                 int psize, int *indexp)
{
        if (!mmu_psize_defs[psize].shift)
                return;
        info->ap_encodings[*indexp] = mmu_psize_defs[psize].shift |
                (mmu_psize_defs[psize].ap << 29);
        ++(*indexp);
}

int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info)
{
        int i;

        if (!radix_enabled())
                return -EINVAL;
        memset(info, 0, sizeof(*info));

        /* 4k page size */
        info->geometries[0].page_shift = 12;
        info->geometries[0].level_bits[0] = 9;
        for (i = 1; i < 4; ++i)
                info->geometries[0].level_bits[i] = p9_supported_radix_bits[i];
        /* 64k page size */
        info->geometries[1].page_shift = 16;
        for (i = 0; i < 4; ++i)
                info->geometries[1].level_bits[i] = p9_supported_radix_bits[i];

        i = 0;
        add_rmmu_ap_encoding(info, MMU_PAGE_4K, &i);
        add_rmmu_ap_encoding(info, MMU_PAGE_64K, &i);
        add_rmmu_ap_encoding(info, MMU_PAGE_2M, &i);
        add_rmmu_ap_encoding(info, MMU_PAGE_1G, &i);

        return 0;
}

int kvmppc_init_vm_radix(struct kvm *kvm)
{
        kvm->arch.pgtable = pgd_alloc(kvm->mm);
        if (!kvm->arch.pgtable)
                return -ENOMEM;
        return 0;
}

static void pte_ctor(void *addr)
{
        memset(addr, 0, RADIX_PTE_TABLE_SIZE);
}

static void pmd_ctor(void *addr)
{
        memset(addr, 0, RADIX_PMD_TABLE_SIZE);
}

struct debugfs_radix_state {
        struct kvm *kvm;
        struct mutex mutex;
        unsigned long gpa;
        int lpid;
        int chars_left;
        int buf_index;
        char buf[128];
        u8 hdr;
};

static int debugfs_radix_open(struct inode *inode, struct file *file)
{
        struct kvm *kvm = inode->i_private;
        struct debugfs_radix_state *p;

        p = kzalloc(sizeof(*p), GFP_KERNEL);
        if (!p)
                return -ENOMEM;

        kvm_get_kvm(kvm);
        p->kvm = kvm;
        mutex_init(&p->mutex);
        file->private_data = p;

        return nonseekable_open(inode, file);
}

static int debugfs_radix_release(struct inode *inode, struct file *file)
{
        struct debugfs_radix_state *p = file->private_data;

        kvm_put_kvm(p->kvm);
        kfree(p);
        return 0;
}
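
/*
 * debugfs read handler: dumps the L1 partition-scoped tree followed by
 * the shadow trees of any nested guests, one "pgdir:" header per tree
 * and one " <gpa>: <pte> <shift>" line per valid leaf mapping.  State
 * is kept in debugfs_radix_state so a dump can be read in chunks.
 */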
static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
                                  size_t len, loff_t *ppos)
{
        struct debugfs_radix_state *p = file->private_data;
        ssize_t ret, r;
        unsigned long n;
        struct kvm *kvm;
        unsigned long gpa;
        pgd_t *pgt;
        struct kvm_nested_guest *nested;
        pgd_t *pgdp;
        p4d_t p4d, *p4dp;
        pud_t pud, *pudp;
        pmd_t pmd, *pmdp;
        pte_t *ptep;
        int shift;
        unsigned long pte;

        kvm = p->kvm;
        if (!kvm_is_radix(kvm))
                return 0;

        ret = mutex_lock_interruptible(&p->mutex);
        if (ret)
                return ret;

        if (p->chars_left) {
                n = p->chars_left;
                if (n > len)
                        n = len;
                r = copy_to_user(buf, p->buf + p->buf_index, n);
                n -= r;
                p->chars_left -= n;
                p->buf_index += n;
                buf += n;
                len -= n;
                ret = n;
                if (r) {
                        if (!n)
                                ret = -EFAULT;
                        goto out;
                }
        }

        gpa = p->gpa;
        nested = NULL;
        pgt = NULL;
        while (len != 0 && p->lpid >= 0) {
                if (gpa >= RADIX_PGTABLE_RANGE) {
                        gpa = 0;
                        pgt = NULL;
                        if (nested) {
                                kvmhv_put_nested(nested);
                                nested = NULL;
                        }
                        p->lpid = kvmhv_nested_next_lpid(kvm, p->lpid);
                        p->hdr = 0;
                        if (p->lpid < 0)
                                break;
                }
                if (!pgt) {
                        if (p->lpid == 0) {
                                pgt = kvm->arch.pgtable;
                        } else {
                                nested = kvmhv_get_nested(kvm, p->lpid, false);
                                if (!nested) {
                                        gpa = RADIX_PGTABLE_RANGE;
                                        continue;
                                }
                                pgt = nested->shadow_pgtable;
                        }
                }
                n = 0;
                if (!p->hdr) {
                        if (p->lpid > 0)
                                n = scnprintf(p->buf, sizeof(p->buf),
                                              "\nNested LPID %d: ", p->lpid);
                        n += scnprintf(p->buf + n, sizeof(p->buf) - n,
                                       "pgdir: %lx\n", (unsigned long)pgt);
                        p->hdr = 1;
                        goto copy;
                }

                pgdp = pgt + pgd_index(gpa);
                p4dp = p4d_offset(pgdp, gpa);
                p4d = READ_ONCE(*p4dp);
                if (!(p4d_val(p4d) & _PAGE_PRESENT)) {
                        gpa = (gpa & P4D_MASK) + P4D_SIZE;
                        continue;
                }

                pudp = pud_offset(&p4d, gpa);
                pud = READ_ONCE(*pudp);
                if (!(pud_val(pud) & _PAGE_PRESENT)) {
                        gpa = (gpa & PUD_MASK) + PUD_SIZE;
                        continue;
                }
                if (pud_val(pud) & _PAGE_PTE) {
                        pte = pud_val(pud);
                        shift = PUD_SHIFT;
                        goto leaf;
                }

                pmdp = pmd_offset(&pud, gpa);
                pmd = READ_ONCE(*pmdp);
                if (!(pmd_val(pmd) & _PAGE_PRESENT)) {
                        gpa = (gpa & PMD_MASK) + PMD_SIZE;
                        continue;
                }
                if (pmd_val(pmd) & _PAGE_PTE) {
                        pte = pmd_val(pmd);
                        shift = PMD_SHIFT;
                        goto leaf;
                }

                ptep = pte_offset_kernel(&pmd, gpa);
                pte = pte_val(READ_ONCE(*ptep));
                if (!(pte & _PAGE_PRESENT)) {
                        gpa += PAGE_SIZE;
                        continue;
                }
                shift = PAGE_SHIFT;
        leaf:
                n = scnprintf(p->buf, sizeof(p->buf),
                              " %lx: %lx %d\n", gpa, pte, shift);
                gpa += 1ul << shift;
        copy:
                p->chars_left = n;
                if (n > len)
                        n = len;
                r = copy_to_user(buf, p->buf, n);
                n -= r;
                p->chars_left -= n;
                p->buf_index = n;
                buf += n;
                len -= n;
                ret += n;
                if (r) {
                        if (!ret)
                                ret = -EFAULT;
                        break;
                }
        }
        p->gpa = gpa;
        if (nested)
                kvmhv_put_nested(nested);

 out:
        mutex_unlock(&p->mutex);
        return ret;
}

static ssize_t debugfs_radix_write(struct file *file, const char __user *buf,
                                   size_t len, loff_t *ppos)
{
        return -EACCES;
}

static const struct file_operations debugfs_radix_fops = {
        .owner   = THIS_MODULE,
        .open    = debugfs_radix_open,
        .release = debugfs_radix_release,
        .read    = debugfs_radix_read,
        .write   = debugfs_radix_write,
        .llseek  = generic_file_llseek,
};

void kvmhv_radix_debugfs_init(struct kvm *kvm)
{
        debugfs_create_file("radix", 0400, kvm->arch.debugfs_dir, kvm,
                            &debugfs_radix_fops);
}

int kvmppc_radix_init(void)
{
        unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE;

        kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, 0, pte_ctor);
        if (!kvm_pte_cache)
                return -ENOMEM;

        size = sizeof(void *) << RADIX_PMD_INDEX_SIZE;

        kvm_pmd_cache = kmem_cache_create("kvm-pmd", size, size, 0, pmd_ctor);
        if (!kvm_pmd_cache) {
                kmem_cache_destroy(kvm_pte_cache);
                return -ENOMEM;
        }

        return 0;
}

void kvmppc_radix_exit(void)
{
        kmem_cache_destroy(kvm_pte_cache);
        kmem_cache_destroy(kvm_pmd_cache);
}