// SPDX-License-Identifier: GPL-2.0-only
/*
 *
 * Copyright 2016 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 */

#include <linux/types.h>
#include <linux/string.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/debugfs.h>
#include <linux/pgtable.h>

#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include <asm/page.h>
#include <asm/mmu.h>
#include <asm/pgalloc.h>
#include <asm/pte-walk.h>
#include <asm/ultravisor.h>
#include <asm/kvm_book3s_uvmem.h>

/*
 * Supported radix tree geometry.
 * Like p9, we support either 5 or 9 bits at the first (lowest) level,
 * for a page size of 64k or 4k.
 */
static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 };

unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
					      gva_t eaddr, void *to, void *from,
					      unsigned long n)
{
	int uninitialized_var(old_pid), old_lpid;
	unsigned long quadrant, ret = n;
	bool is_load = !!to;

	/* Can't access quadrants 1 or 2 in non-HV mode, call the HV to do it */
	if (kvmhv_on_pseries())
		return plpar_hcall_norets(H_COPY_TOFROM_GUEST, lpid, pid, eaddr,
					  (to != NULL) ? __pa(to): 0,
					  (from != NULL) ? __pa(from): 0, n);

	quadrant = 1;
	if (!pid)
		quadrant = 2;
	if (is_load)
		from = (void *) (eaddr | (quadrant << 62));
	else
		to = (void *) (eaddr | (quadrant << 62));

	preempt_disable();

	/* switch the lpid first to avoid running host with unallocated pid */
	old_lpid = mfspr(SPRN_LPID);
	if (old_lpid != lpid)
		mtspr(SPRN_LPID, lpid);
	if (quadrant == 1) {
		old_pid = mfspr(SPRN_PID);
		if (old_pid != pid)
			mtspr(SPRN_PID, pid);
	}
	isync();

	if (is_load)
		ret = copy_from_user_nofault(to, (const void __user *)from, n);
	else
		ret = copy_to_user_nofault((void __user *)to, from, n);

	/* switch the pid first to avoid running host with unallocated pid */
	if (quadrant == 1 && pid != old_pid)
		mtspr(SPRN_PID, old_pid);
	if (lpid != old_lpid)
		mtspr(SPRN_LPID, old_lpid);
	isync();

	preempt_enable();

	return ret;
}
EXPORT_SYMBOL_GPL(__kvmhv_copy_tofrom_guest_radix);

static long kvmhv_copy_tofrom_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr,
					  void *to, void *from, unsigned long n)
{
	int lpid = vcpu->kvm->arch.lpid;
	int pid = vcpu->arch.pid;

	/* This would cause a data segment intr so don't allow the access */
	if (eaddr & (0x3FFUL << 52))
		return -EINVAL;

	/* Should we be using the nested lpid */
	if (vcpu->arch.nested)
		lpid = vcpu->arch.nested->shadow_lpid;

	/* If accessing quadrant 3 then pid is expected to be 0 */
	if (((eaddr >> 62) & 0x3) == 0x3)
		pid = 0;

	eaddr &= ~(0xFFFUL << 52);

	return __kvmhv_copy_tofrom_guest_radix(lpid, pid, eaddr, to, from, n);
}

long kvmhv_copy_from_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *to,
				 unsigned long n)
{
	long ret;

	ret = kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, to, NULL, n);
	if (ret > 0)
		memset(to + (n - ret), 0, ret);

	return ret;
}
EXPORT_SYMBOL_GPL(kvmhv_copy_from_guest_radix);

long kvmhv_copy_to_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *from,
			       unsigned long n)
{
	return kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, NULL, from, n);
}
EXPORT_SYMBOL_GPL(kvmhv_copy_to_guest_radix);

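/*
 * Walk a radix tree that lives in guest memory (partition- or
 * process-scoped) for effective address eaddr and fill in *gpte with the
 * result.  The root doubleword supplies the tree geometry: RTS gives the
 * size of the translated address space (2^(RTS+31); only 2^52 is accepted
 * here), RPDB is the real address of the top-level page directory and RPDS
 * the number of index bits used at that level.  Returns 0 on success,
 * -EINVAL for an unsupported geometry, -ENOENT if an entry is not present,
 * or the error from kvm_read_guest() if a table entry cannot be read.
 */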
int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
			       struct kvmppc_pte *gpte, u64 root,
			       u64 *pte_ret_p)
{
	struct kvm *kvm = vcpu->kvm;
	int ret, level, ps;
	unsigned long rts, bits, offset, index;
	u64 pte, base, gpa;
	__be64 rpte;

	rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) |
		((root & RTS2_MASK) >> RTS2_SHIFT);
	bits = root & RPDS_MASK;
	base = root & RPDB_MASK;

	offset = rts + 31;

	/* Current implementations only support 52-bit space */
	if (offset != 52)
		return -EINVAL;

	/* Walk each level of the radix tree */
	for (level = 3; level >= 0; --level) {
		u64 addr;
		/* Check a valid size */
		if (level && bits != p9_supported_radix_bits[level])
			return -EINVAL;
		if (level == 0 && !(bits == 5 || bits == 9))
			return -EINVAL;
		offset -= bits;
		index = (eaddr >> offset) & ((1UL << bits) - 1);
		/* Check that low bits of page table base are zero */
		if (base & ((1UL << (bits + 3)) - 1))
			return -EINVAL;
		/* Read the entry from guest memory */
		addr = base + (index * sizeof(rpte));
		ret = kvm_read_guest(kvm, addr, &rpte, sizeof(rpte));
		if (ret) {
			if (pte_ret_p)
				*pte_ret_p = addr;
			return ret;
		}
		pte = __be64_to_cpu(rpte);
		if (!(pte & _PAGE_PRESENT))
			return -ENOENT;
		/* Check if a leaf entry */
		if (pte & _PAGE_PTE)
			break;
		/* Get ready to walk the next level */
		base = pte & RPDB_MASK;
		bits = pte & RPDS_MASK;
	}

	/* Need a leaf at lowest level; 512GB pages not supported */
	if (level < 0 || level == 3)
		return -EINVAL;

	/* We found a valid leaf PTE */
	/* Offset is now log base 2 of the page size */
	gpa = pte & 0x01fffffffffff000ul;
	if (gpa & ((1ul << offset) - 1))
		return -EINVAL;
	gpa |= eaddr & ((1ul << offset) - 1);
	for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps)
		if (offset == mmu_psize_defs[ps].shift)
			break;
	gpte->page_size = ps;
	gpte->page_shift = offset;

	gpte->eaddr = eaddr;
	gpte->raddr = gpa;

	/* Work out permissions */
	gpte->may_read = !!(pte & _PAGE_READ);
	gpte->may_write = !!(pte & _PAGE_WRITE);
	gpte->may_execute = !!(pte & _PAGE_EXEC);

	gpte->rc = pte & (_PAGE_ACCESSED | _PAGE_DIRTY);

	if (pte_ret_p)
		*pte_ret_p = pte;

	return 0;
}

/*
 * Used to walk a partition or process table radix tree in guest memory
 * Note: We exploit the fact that a partition table and a process
 * table have the same layout, a partition-scoped page table and a
 * process-scoped page table have the same layout, and the 2nd
 * doubleword of a partition table entry has the same layout as
 * the PTCR register.
 */
int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
				     struct kvmppc_pte *gpte, u64 table,
				     int table_index, u64 *pte_ret_p)
{
	struct kvm *kvm = vcpu->kvm;
	int ret;
	unsigned long size, ptbl, root;
	struct prtb_entry entry;

	if ((table & PRTS_MASK) > 24)
		return -EINVAL;
	size = 1ul << ((table & PRTS_MASK) + 12);

	/* Is the table big enough to contain this entry? */
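	/*
	 * Each prtb_entry is two doublewords (16 bytes), so a PRTS value
	 * of n gives a table of 2^(n + 12) bytes, i.e. 2^(n + 8) entries;
	 * e.g. PRTS = 0 means a 4kB table with 256 entries.
	 */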
	if ((table_index * sizeof(entry)) >= size)
		return -EINVAL;

	/* Read the table to find the root of the radix tree */
	ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry));
	ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry));
	if (ret)
		return ret;

	/* Root is stored in the first double word */
	root = be64_to_cpu(entry.prtb0);

	return kvmppc_mmu_walk_radix_tree(vcpu, eaddr, gpte, root, pte_ret_p);
}

int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
			   struct kvmppc_pte *gpte, bool data, bool iswrite)
{
	u32 pid;
	u64 pte;
	int ret;

	/* Work out effective PID */
	switch (eaddr >> 62) {
	case 0:
		pid = vcpu->arch.pid;
		break;
	case 3:
		pid = 0;
		break;
	default:
		return -EINVAL;
	}

	ret = kvmppc_mmu_radix_translate_table(vcpu, eaddr, gpte,
				vcpu->kvm->arch.process_table, pid, &pte);
	if (ret)
		return ret;

	/* Check privilege (applies only to process scoped translations) */
	if (kvmppc_get_msr(vcpu) & MSR_PR) {
		if (pte & _PAGE_PRIVILEGED) {
			gpte->may_read = 0;
			gpte->may_write = 0;
			gpte->may_execute = 0;
		}
	} else {
		if (!(pte & _PAGE_PRIVILEGED)) {
			/* Check AMR/IAMR to see if strict mode is in force */
			if (vcpu->arch.amr & (1ul << 62))
				gpte->may_read = 0;
			if (vcpu->arch.amr & (1ul << 63))
				gpte->may_write = 0;
			if (vcpu->arch.iamr & (1ul << 62))
				gpte->may_execute = 0;
		}
	}

	return 0;
}

void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
			     unsigned int pshift, unsigned int lpid)
{
	unsigned long psize = PAGE_SIZE;
	int psi;
	long rc;
	unsigned long rb;

	if (pshift)
		psize = 1UL << pshift;
	else
		pshift = PAGE_SHIFT;

	addr &= ~(psize - 1);

	if (!kvmhv_on_pseries()) {
		radix__flush_tlb_lpid_page(lpid, addr, psize);
		return;
	}

	psi = shift_to_mmu_psize(pshift);
	rb = addr | (mmu_get_ap(psi) << PPC_BITLSHIFT(58));
	rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 1),
				lpid, rb);
	if (rc)
		pr_err("KVM: TLB page invalidation hcall failed, rc=%ld\n", rc);
}

static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned int lpid)
{
	long rc;

	if (!kvmhv_on_pseries()) {
		radix__flush_pwc_lpid(lpid);
		return;
	}

	rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(1, 0, 1),
				lpid, TLBIEL_INVAL_SET_LPID);
	if (rc)
		pr_err("KVM: TLB PWC invalidation hcall failed, rc=%ld\n", rc);
}

static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
					     unsigned long clr, unsigned long set,
					     unsigned long addr, unsigned int shift)
{
	return __radix_pte_update(ptep, clr, set);
}

void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr,
			     pte_t *ptep, pte_t pte)
{
	radix__set_pte_at(kvm->mm, addr, ptep, pte, 0);
}

static struct kmem_cache *kvm_pte_cache;
static struct kmem_cache *kvm_pmd_cache;

static pte_t *kvmppc_pte_alloc(void)
{
	pte_t *pte;

	pte = kmem_cache_alloc(kvm_pte_cache, GFP_KERNEL);
	/* pmd_populate() will only reference _pa(pte). */
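	/*
	 * Since the only reference kept to this page is its physical
	 * address stored in the pmd, kmemleak cannot see the object and
	 * would report a false positive, so tell it to ignore the page.
	 */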
	kmemleak_ignore(pte);

	return pte;
}

static void kvmppc_pte_free(pte_t *ptep)
{
	kmem_cache_free(kvm_pte_cache, ptep);
}

static pmd_t *kvmppc_pmd_alloc(void)
{
	pmd_t *pmd;

	pmd = kmem_cache_alloc(kvm_pmd_cache, GFP_KERNEL);
	/* pud_populate() will only reference _pa(pmd). */
	kmemleak_ignore(pmd);

	return pmd;
}

static void kvmppc_pmd_free(pmd_t *pmdp)
{
	kmem_cache_free(kvm_pmd_cache, pmdp);
}

/* Called with kvm->mmu_lock held */
void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
		      unsigned int shift,
		      const struct kvm_memory_slot *memslot,
		      unsigned int lpid)

{
	unsigned long old;
	unsigned long gfn = gpa >> PAGE_SHIFT;
	unsigned long page_size = PAGE_SIZE;
	unsigned long hpa;

	old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift);
	kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid);

	/* The following only applies to L1 entries */
	if (lpid != kvm->arch.lpid)
		return;

	if (!memslot) {
		memslot = gfn_to_memslot(kvm, gfn);
		if (!memslot)
			return;
	}
	if (shift) { /* 1GB or 2MB page */
		page_size = 1ul << shift;
		if (shift == PMD_SHIFT)
			kvm->stat.num_2M_pages--;
		else if (shift == PUD_SHIFT)
			kvm->stat.num_1G_pages--;
	}

	gpa &= ~(page_size - 1);
	hpa = old & PTE_RPN_MASK;
	kvmhv_remove_nest_rmap_range(kvm, memslot, gpa, hpa, page_size);

	if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap)
		kvmppc_update_dirty_map(memslot, gfn, page_size);
}

/*
 * kvmppc_free_p?d are used to free existing page tables, and recursively
 * descend and clear and free children.
 * Callers are responsible for flushing the PWC.
 *
 * When page tables are being unmapped/freed as part of page fault path
 * (full == false), valid ptes are generally not expected; however, there
 * is one situation where they arise, which is when dirty page logging is
 * turned off for a memslot while the VM is running.  The new memslot
 * becomes visible to page faults before the memslot commit function
 * gets to flush the memslot, which can lead to a 2MB page mapping being
 * installed for a guest physical address where there are already 64kB
 * (or 4kB) mappings (of sub-pages of the same 2MB page).
 */
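/*
 * In that case kvmppc_create_pte() tears the existing PTE page down with
 * full == false, so each valid small PTE goes through kvmppc_unmap_pte():
 * its dirty state is transferred to the memslot dirty map and any nested
 * shadow mappings of it are removed before the page table page is freed.
 */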
static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full,
				  unsigned int lpid)
{
	if (full) {
		memset(pte, 0, sizeof(long) << RADIX_PTE_INDEX_SIZE);
	} else {
		pte_t *p = pte;
		unsigned long it;

		for (it = 0; it < PTRS_PER_PTE; ++it, ++p) {
			if (pte_val(*p) == 0)
				continue;
			kvmppc_unmap_pte(kvm, p,
					 pte_pfn(*p) << PAGE_SHIFT,
					 PAGE_SHIFT, NULL, lpid);
		}
	}

	kvmppc_pte_free(pte);
}

static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full,
				  unsigned int lpid)
{
	unsigned long im;
	pmd_t *p = pmd;

	for (im = 0; im < PTRS_PER_PMD; ++im, ++p) {
		if (!pmd_present(*p))
			continue;
		if (pmd_is_leaf(*p)) {
			if (full) {
				pmd_clear(p);
			} else {
				WARN_ON_ONCE(1);
				kvmppc_unmap_pte(kvm, (pte_t *)p,
					 pte_pfn(*(pte_t *)p) << PAGE_SHIFT,
					 PMD_SHIFT, NULL, lpid);
			}
		} else {
			pte_t *pte;

			pte = pte_offset_map(p, 0);
			kvmppc_unmap_free_pte(kvm, pte, full, lpid);
			pmd_clear(p);
		}
	}
	kvmppc_pmd_free(pmd);
}

static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud,
				  unsigned int lpid)
{
	unsigned long iu;
	pud_t *p = pud;

	for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++p) {
		if (!pud_present(*p))
			continue;
		if (pud_is_leaf(*p)) {
			pud_clear(p);
		} else {
			pmd_t *pmd;

			pmd = pmd_offset(p, 0);
			kvmppc_unmap_free_pmd(kvm, pmd, true, lpid);
			pud_clear(p);
		}
	}
	pud_free(kvm->mm, pud);
}

void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid)
{
	unsigned long ig;

	for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
		p4d_t *p4d = p4d_offset(pgd, 0);
		pud_t *pud;

		if (!p4d_present(*p4d))
			continue;
		pud = pud_offset(p4d, 0);
		kvmppc_unmap_free_pud(kvm, pud, lpid);
		p4d_clear(p4d);
	}
}

void kvmppc_free_radix(struct kvm *kvm)
{
	if (kvm->arch.pgtable) {
		kvmppc_free_pgtable_radix(kvm, kvm->arch.pgtable,
					  kvm->arch.lpid);
		pgd_free(kvm->mm, kvm->arch.pgtable);
		kvm->arch.pgtable = NULL;
	}
}

static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
					unsigned long gpa, unsigned int lpid)
{
	pte_t *pte = pte_offset_kernel(pmd, 0);

	/*
	 * Clearing the pmd entry then flushing the PWC ensures that the pte
	 * page will no longer be cached by the MMU, so it can be freed
	 * without flushing the PWC again.
	 */
	pmd_clear(pmd);
	kvmppc_radix_flush_pwc(kvm, lpid);

	kvmppc_unmap_free_pte(kvm, pte, false, lpid);
}

static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
					unsigned long gpa, unsigned int lpid)
{
	pmd_t *pmd = pmd_offset(pud, 0);

	/*
	 * Clearing the pud entry then flushing the PWC ensures that the pmd
	 * page and any children pte pages will no longer be cached by the MMU,
	 * so can be freed without flushing the PWC again.
	 */
	pud_clear(pud);
	kvmppc_radix_flush_pwc(kvm, lpid);

	kvmppc_unmap_free_pmd(kvm, pmd, false, lpid);
}

/*
 * There are a number of bits which may differ between different faults to
 * the same partition scope entry. RC bits, in the course of cleaning and
 * aging. And the write bit can change, either the access could have been
 * upgraded, or a read fault could happen concurrently with a write fault
 * that sets those bits first.
 */
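/*
 * PTE_BITS_MUST_MATCH therefore masks out only _PAGE_WRITE, _PAGE_DIRTY
 * and _PAGE_ACCESSED; when a fault finds a valid entry already installed,
 * every other bit (the real page number, the remaining permissions and
 * the attribute bits) is expected to be identical, and the WARN_ON_ONCE
 * checks below catch any disagreement before the extra bits are OR'd in.
 */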
#define PTE_BITS_MUST_MATCH (~(_PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED))

int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
		      unsigned long gpa, unsigned int level,
		      unsigned long mmu_seq, unsigned int lpid,
		      unsigned long *rmapp, struct rmap_nested **n_rmap)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud, *new_pud = NULL;
	pmd_t *pmd, *new_pmd = NULL;
	pte_t *ptep, *new_ptep = NULL;
	int ret;

	/* Traverse the guest's 2nd-level tree, allocate new levels needed */
	pgd = pgtable + pgd_index(gpa);
	p4d = p4d_offset(pgd, gpa);

	pud = NULL;
	if (p4d_present(*p4d))
		pud = pud_offset(p4d, gpa);
	else
		new_pud = pud_alloc_one(kvm->mm, gpa);

	pmd = NULL;
	if (pud && pud_present(*pud) && !pud_is_leaf(*pud))
		pmd = pmd_offset(pud, gpa);
	else if (level <= 1)
		new_pmd = kvmppc_pmd_alloc();

	if (level == 0 && !(pmd && pmd_present(*pmd) && !pmd_is_leaf(*pmd)))
		new_ptep = kvmppc_pte_alloc();

	/* Check if we might have been invalidated; let the guest retry if so */
	spin_lock(&kvm->mmu_lock);
	ret = -EAGAIN;
	if (mmu_notifier_retry(kvm, mmu_seq))
		goto out_unlock;

	/* Now traverse again under the lock and change the tree */
	ret = -ENOMEM;
	if (p4d_none(*p4d)) {
		if (!new_pud)
			goto out_unlock;
		p4d_populate(kvm->mm, p4d, new_pud);
		new_pud = NULL;
	}
	pud = pud_offset(p4d, gpa);
	if (pud_is_leaf(*pud)) {
		unsigned long hgpa = gpa & PUD_MASK;

		/* Check if we raced and someone else has set the same thing */
		if (level == 2) {
			if (pud_raw(*pud) == pte_raw(pte)) {
				ret = 0;
				goto out_unlock;
			}
			/* Valid 1GB page here already, add our extra bits */
			WARN_ON_ONCE((pud_val(*pud) ^ pte_val(pte)) &
							PTE_BITS_MUST_MATCH);
			kvmppc_radix_update_pte(kvm, (pte_t *)pud,
					      0, pte_val(pte), hgpa, PUD_SHIFT);
			ret = 0;
			goto out_unlock;
		}
		/*
		 * If we raced with another CPU which has just put
		 * a 1GB pte in after we saw a pmd page, try again.
		 */
		if (!new_pmd) {
			ret = -EAGAIN;
			goto out_unlock;
		}
		/* Valid 1GB page here already, remove it */
		kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT, NULL,
				 lpid);
	}
	if (level == 2) {
		if (!pud_none(*pud)) {
			/*
			 * There's a page table page here, but we wanted to
			 * install a large page, so remove and free the page
			 * table page.
			 */
			kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa, lpid);
		}
		kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
		if (rmapp && n_rmap)
			kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
		ret = 0;
		goto out_unlock;
	}
	if (pud_none(*pud)) {
		if (!new_pmd)
			goto out_unlock;
		pud_populate(kvm->mm, pud, new_pmd);
		new_pmd = NULL;
	}
	pmd = pmd_offset(pud, gpa);
	if (pmd_is_leaf(*pmd)) {
		unsigned long lgpa = gpa & PMD_MASK;

		/* Check if we raced and someone else has set the same thing */
		if (level == 1) {
			if (pmd_raw(*pmd) == pte_raw(pte)) {
				ret = 0;
				goto out_unlock;
			}
			/* Valid 2MB page here already, add our extra bits */
			WARN_ON_ONCE((pmd_val(*pmd) ^ pte_val(pte)) &
							PTE_BITS_MUST_MATCH);
			kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd),
					0, pte_val(pte), lgpa, PMD_SHIFT);
			ret = 0;
			goto out_unlock;
		}

		/*
		 * If we raced with another CPU which has just put
		 * a 2MB pte in after we saw a pte page, try again.
		 */
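		/*
		 * -EAGAIN is treated like success by the page-fault paths:
		 * the vcpu re-executes the faulting instruction and, if the
		 * mapping the other CPU installed covers the access, no
		 * further fault is taken.
		 */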
		if (!new_ptep) {
			ret = -EAGAIN;
			goto out_unlock;
		}
		/* Valid 2MB page here already, remove it */
		kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT, NULL,
				 lpid);
	}
	if (level == 1) {
		if (!pmd_none(*pmd)) {
			/*
			 * There's a page table page here, but we wanted to
			 * install a large page, so remove and free the page
			 * table page.
			 */
			kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa, lpid);
		}
		kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
		if (rmapp && n_rmap)
			kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
		ret = 0;
		goto out_unlock;
	}
	if (pmd_none(*pmd)) {
		if (!new_ptep)
			goto out_unlock;
		pmd_populate(kvm->mm, pmd, new_ptep);
		new_ptep = NULL;
	}
	ptep = pte_offset_kernel(pmd, gpa);
	if (pte_present(*ptep)) {
		/* Check if someone else set the same thing */
		if (pte_raw(*ptep) == pte_raw(pte)) {
			ret = 0;
			goto out_unlock;
		}
		/* Valid page here already, add our extra bits */
		WARN_ON_ONCE((pte_val(*ptep) ^ pte_val(pte)) &
							PTE_BITS_MUST_MATCH);
		kvmppc_radix_update_pte(kvm, ptep, 0, pte_val(pte), gpa, 0);
		ret = 0;
		goto out_unlock;
	}
	kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
	if (rmapp && n_rmap)
		kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
	ret = 0;

 out_unlock:
	spin_unlock(&kvm->mmu_lock);
	if (new_pud)
		pud_free(kvm->mm, new_pud);
	if (new_pmd)
		kvmppc_pmd_free(new_pmd);
	if (new_ptep)
		kvmppc_pte_free(new_ptep);
	return ret;
}

bool kvmppc_hv_handle_set_rc(struct kvm *kvm, bool nested, bool writing,
			     unsigned long gpa, unsigned int lpid)
{
	unsigned long pgflags;
	unsigned int shift;
	pte_t *ptep;

	/*
	 * Need to set an R or C bit in the 2nd-level tables;
	 * since we are just helping out the hardware here,
	 * it is sufficient to do what the hardware does.
	 */
	pgflags = _PAGE_ACCESSED;
	if (writing)
		pgflags |= _PAGE_DIRTY;

	if (nested)
		ptep = find_kvm_nested_guest_pte(kvm, lpid, gpa, &shift);
	else
		ptep = find_kvm_secondary_pte(kvm, gpa, &shift);

	if (ptep && pte_present(*ptep) && (!writing || pte_write(*ptep))) {
		kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, gpa, shift);
		return true;
	}
	return false;
}

int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
				   unsigned long gpa,
				   struct kvm_memory_slot *memslot,
				   bool writing, bool kvm_ro,
				   pte_t *inserted_pte, unsigned int *levelp)
{
	struct kvm *kvm = vcpu->kvm;
	struct page *page = NULL;
	unsigned long mmu_seq;
	unsigned long hva, gfn = gpa >> PAGE_SHIFT;
	bool upgrade_write = false;
	bool *upgrade_p = &upgrade_write;
	pte_t pte, *ptep;
	unsigned int shift, level;
	int ret;
	bool large_enable;

	/* used to check for invalidations in progress */
	mmu_seq = kvm->mmu_notifier_seq;
	smp_rmb();

	/*
	 * Do a fast check first, since __gfn_to_pfn_memslot doesn't
	 * do it with !atomic && !async, which is how we call it.
	 * We always ask for write permission since the common case
	 * is that the page is writable.
	 */
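	/*
	 * If the fast GUP below succeeds we know the page is writable in
	 * the host page tables, so upgrade_write lets the mapping be
	 * installed with write permission even on a read fault, avoiding
	 * a second fault when the guest later stores to the page.
	 */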
	hva = gfn_to_hva_memslot(memslot, gfn);
	if (!kvm_ro && get_user_page_fast_only(hva, FOLL_WRITE, &page)) {
		upgrade_write = true;
	} else {
		unsigned long pfn;

		/* Call KVM generic code to do the slow-path check */
		pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
					   writing, upgrade_p);
		if (is_error_noslot_pfn(pfn))
			return -EFAULT;
		page = NULL;
		if (pfn_valid(pfn)) {
			page = pfn_to_page(pfn);
			if (PageReserved(page))
				page = NULL;
		}
	}

	/*
	 * Read the PTE from the process' radix tree and use that
	 * so we get the shift and attribute bits.
	 */
	spin_lock(&kvm->mmu_lock);
	ptep = find_kvm_host_pte(kvm, mmu_seq, hva, &shift);
	pte = __pte(0);
	if (ptep)
		pte = READ_ONCE(*ptep);
	spin_unlock(&kvm->mmu_lock);
	/*
	 * If the PTE disappeared temporarily due to a THP
	 * collapse, just return and let the guest try again.
	 */
	if (!pte_present(pte)) {
		if (page)
			put_page(page);
		return RESUME_GUEST;
	}

	/* If we're logging dirty pages, always map single pages */
	large_enable = !(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES);

	/* Get pte level from shift/size */
	if (large_enable && shift == PUD_SHIFT &&
	    (gpa & (PUD_SIZE - PAGE_SIZE)) ==
	    (hva & (PUD_SIZE - PAGE_SIZE))) {
		level = 2;
	} else if (large_enable && shift == PMD_SHIFT &&
		   (gpa & (PMD_SIZE - PAGE_SIZE)) ==
		   (hva & (PMD_SIZE - PAGE_SIZE))) {
		level = 1;
	} else {
		level = 0;
		if (shift > PAGE_SHIFT) {
			/*
			 * If the pte maps more than one page, bring over
			 * bits from the virtual address to get the real
			 * address of the specific single page we want.
			 */
			unsigned long rpnmask = (1ul << shift) - PAGE_SIZE;
			pte = __pte(pte_val(pte) | (hva & rpnmask));
		}
	}

	pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED);
	if (writing || upgrade_write) {
		if (pte_val(pte) & _PAGE_WRITE)
			pte = __pte(pte_val(pte) | _PAGE_DIRTY);
	} else {
		pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY));
	}

	/* Allocate space in the tree and write the PTE */
	ret = kvmppc_create_pte(kvm, kvm->arch.pgtable, pte, gpa, level,
				mmu_seq, kvm->arch.lpid, NULL, NULL);
	if (inserted_pte)
		*inserted_pte = pte;
	if (levelp)
		*levelp = level;

	if (page) {
		if (!ret && (pte_val(pte) & _PAGE_WRITE))
			set_page_dirty_lock(page);
		put_page(page);
	}

	/* Increment number of large pages if we (successfully) inserted one */
	if (!ret) {
		if (level == 1)
			kvm->stat.num_2M_pages++;
		else if (level == 2)
			kvm->stat.num_1G_pages++;
	}

	return ret;
}

int kvmppc_book3s_radix_page_fault(struct kvm_vcpu *vcpu,
				   unsigned long ea, unsigned long dsisr)
{
	struct kvm *kvm = vcpu->kvm;
	unsigned long gpa, gfn;
	struct kvm_memory_slot *memslot;
	long ret;
	bool writing = !!(dsisr & DSISR_ISSTORE);
	bool kvm_ro = false;

	/* Check for unusual errors */
	if (dsisr & DSISR_UNSUPP_MMU) {
		pr_err("KVM: Got unsupported MMU fault\n");
		return -EFAULT;
	}
	if (dsisr & DSISR_BADACCESS) {
		/* Reflect to the guest as DSI */
		pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
		kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
		return RESUME_GUEST;
	}

	/* Translate the logical address */
	gpa = vcpu->arch.fault_gpa & ~0xfffUL;
	gpa &= ~0xF000000000000000ul;
	gfn = gpa >> PAGE_SHIFT;
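	/*
	 * fault_gpa only supplies the page address.  For an ordinary fault
	 * the byte offset within the page comes from ea; for a fault taken
	 * on the guest's process-table walk (DSISR_PRTABLE_FAULT) the
	 * faulting address is that of the process-table entry itself, so
	 * the offset bits of ea do not apply.
	 */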
	if (!(dsisr & DSISR_PRTABLE_FAULT))
		gpa |= ea & 0xfff;

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
		return kvmppc_send_page_to_uv(kvm, gfn);

	/* Get the corresponding memslot */
	memslot = gfn_to_memslot(kvm, gfn);

	/* No memslot means it's an emulated MMIO region */
	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
		if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS |
			     DSISR_SET_RC)) {
			/*
			 * Bad address in guest page table tree, or other
			 * unusual error - reflect it to the guest as DSI.
			 */
			kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
			return RESUME_GUEST;
		}
		return kvmppc_hv_emulate_mmio(vcpu, gpa, ea, writing);
	}

	if (memslot->flags & KVM_MEM_READONLY) {
		if (writing) {
			/* give the guest a DSI */
			kvmppc_core_queue_data_storage(vcpu, ea, DSISR_ISSTORE |
						       DSISR_PROTFAULT);
			return RESUME_GUEST;
		}
		kvm_ro = true;
	}

	/* Failed to set the reference/change bits */
	if (dsisr & DSISR_SET_RC) {
		spin_lock(&kvm->mmu_lock);
		if (kvmppc_hv_handle_set_rc(kvm, false, writing,
					    gpa, kvm->arch.lpid))
			dsisr &= ~DSISR_SET_RC;
		spin_unlock(&kvm->mmu_lock);

		if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
			       DSISR_PROTFAULT | DSISR_SET_RC)))
			return RESUME_GUEST;
	}

	/* Try to insert a pte */
	ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot, writing,
					     kvm_ro, NULL, NULL);

	if (ret == 0 || ret == -EAGAIN)
		ret = RESUME_GUEST;
	return ret;
}

/* Called with kvm->mmu_lock held */
int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
		    unsigned long gfn)
{
	pte_t *ptep;
	unsigned long gpa = gfn << PAGE_SHIFT;
	unsigned int shift;

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) {
		uv_page_inval(kvm->arch.lpid, gpa, PAGE_SHIFT);
		return 0;
	}

	ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
	if (ptep && pte_present(*ptep))
		kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
				 kvm->arch.lpid);
	return 0;
}

/* Called with kvm->mmu_lock held */
int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
		  unsigned long gfn)
{
	pte_t *ptep;
	unsigned long gpa = gfn << PAGE_SHIFT;
	unsigned int shift;
	int ref = 0;
	unsigned long old, *rmapp;

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
		return ref;

	ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
	if (ptep && pte_present(*ptep) && pte_young(*ptep)) {
		old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0,
					      gpa, shift);
		/* XXX need to flush tlb here? */
		/* Also clear bit in ptes in shadow pgtable for nested guests */
		rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
		kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_ACCESSED, 0,
					       old & PTE_RPN_MASK,
					       1UL << shift);
		ref = 1;
	}
	return ref;
}

/* Called with kvm->mmu_lock held */
int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
		       unsigned long gfn)
{
	pte_t *ptep;
	unsigned long gpa = gfn << PAGE_SHIFT;
	unsigned int shift;
	int ref = 0;

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
		return ref;

	ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
	if (ptep && pte_present(*ptep) && pte_young(*ptep))
		ref = 1;
	return ref;
}

/* Returns the number of PAGE_SIZE pages that are dirty */
static int kvm_radix_test_clear_dirty(struct kvm *kvm,
				struct kvm_memory_slot *memslot, int pagenum)
{
	unsigned long gfn = memslot->base_gfn + pagenum;
	unsigned long gpa = gfn << PAGE_SHIFT;
	pte_t *ptep, pte;
	unsigned int shift;
	int ret = 0;
	unsigned long old, *rmapp;

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
		return ret;

	/*
	 * For performance reasons we don't hold kvm->mmu_lock while walking the
	 * partition scoped table.
	 */
	ptep = find_kvm_secondary_pte_unlocked(kvm, gpa, &shift);
	if (!ptep)
		return 0;

	pte = READ_ONCE(*ptep);
	if (pte_present(pte) && pte_dirty(pte)) {
		spin_lock(&kvm->mmu_lock);
		/*
		 * Recheck the pte again
		 */
		if (pte_val(pte) != pte_val(*ptep)) {
			/*
			 * We have KVM_MEM_LOG_DIRTY_PAGES enabled. Hence we can
			 * only find PAGE_SIZE pte entries here. We can continue
			 * to use the pte addr returned by above page table
			 * walk.
			 */
			if (!pte_present(*ptep) || !pte_dirty(*ptep)) {
				spin_unlock(&kvm->mmu_lock);
				return 0;
			}
		}

		ret = 1;
		VM_BUG_ON(shift);
		old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0,
					      gpa, shift);
		kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid);
		/* Also clear bit in ptes in shadow pgtable for nested guests */
		rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
		kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_DIRTY, 0,
					       old & PTE_RPN_MASK,
					       1UL << shift);
		spin_unlock(&kvm->mmu_lock);
	}
	return ret;
}

long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
			struct kvm_memory_slot *memslot, unsigned long *map)
{
	unsigned long i, j;
	int npages;

	for (i = 0; i < memslot->npages; i = j) {
		npages = kvm_radix_test_clear_dirty(kvm, memslot, i);

		/*
		 * Note that if npages > 0 then i must be a multiple of npages,
		 * since huge pages are only used to back the guest at guest
		 * real addresses that are a multiple of their size.
		 * Since we have at most one PTE covering any given guest
		 * real address, if npages > 1 we can skip to i + npages.
		 */
		j = i + 1;
		if (npages) {
			set_dirty_bits(map, i, npages);
			j = i + npages;
		}
	}
	return 0;
}

void kvmppc_radix_flush_memslot(struct kvm *kvm,
				const struct kvm_memory_slot *memslot)
{
	unsigned long n;
	pte_t *ptep;
	unsigned long gpa;
	unsigned int shift;

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START)
		kvmppc_uvmem_drop_pages(memslot, kvm, true);

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
		return;

	gpa = memslot->base_gfn << PAGE_SHIFT;
	spin_lock(&kvm->mmu_lock);
	for (n = memslot->npages; n; --n) {
		ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
		if (ptep && pte_present(*ptep))
			kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
					 kvm->arch.lpid);
		gpa += PAGE_SIZE;
	}
	/*
	 * Increase the mmu notifier sequence number to prevent any page
	 * fault that read the memslot earlier from writing a PTE.
	 */
	kvm->mmu_notifier_seq++;
	spin_unlock(&kvm->mmu_lock);
}

static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info,
				 int psize, int *indexp)
{
	if (!mmu_psize_defs[psize].shift)
		return;
	info->ap_encodings[*indexp] = mmu_psize_defs[psize].shift |
		(mmu_psize_defs[psize].ap << 29);
	++(*indexp);
}

int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info)
{
	int i;

	if (!radix_enabled())
		return -EINVAL;
	memset(info, 0, sizeof(*info));

	/* 4k page size */
	info->geometries[0].page_shift = 12;
	info->geometries[0].level_bits[0] = 9;
	for (i = 1; i < 4; ++i)
		info->geometries[0].level_bits[i] = p9_supported_radix_bits[i];
	/* 64k page size */
	info->geometries[1].page_shift = 16;
	for (i = 0; i < 4; ++i)
		info->geometries[1].level_bits[i] = p9_supported_radix_bits[i];

	i = 0;
	add_rmmu_ap_encoding(info, MMU_PAGE_4K, &i);
	add_rmmu_ap_encoding(info, MMU_PAGE_64K, &i);
	add_rmmu_ap_encoding(info, MMU_PAGE_2M, &i);
	add_rmmu_ap_encoding(info, MMU_PAGE_1G, &i);

	return 0;
}

int kvmppc_init_vm_radix(struct kvm *kvm)
{
	kvm->arch.pgtable = pgd_alloc(kvm->mm);
	if (!kvm->arch.pgtable)
		return -ENOMEM;
	return 0;
}

static void pte_ctor(void *addr)
{
	memset(addr, 0, RADIX_PTE_TABLE_SIZE);
}

static void pmd_ctor(void *addr)
{
	memset(addr, 0, RADIX_PMD_TABLE_SIZE);
}

struct debugfs_radix_state {
	struct kvm	*kvm;
	struct mutex	mutex;
	unsigned long	gpa;
	int		lpid;
	int		chars_left;
	int		buf_index;
	char		buf[128];
	u8		hdr;
};

static int debugfs_radix_open(struct inode *inode, struct file *file)
{
	struct kvm *kvm = inode->i_private;
	struct debugfs_radix_state *p;

	p = kzalloc(sizeof(*p), GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	kvm_get_kvm(kvm);
	p->kvm = kvm;
	mutex_init(&p->mutex);
	file->private_data = p;

	return nonseekable_open(inode, file);
}

static int debugfs_radix_release(struct inode *inode, struct file *file)
{
	struct debugfs_radix_state *p = file->private_data;

	kvm_put_kvm(p->kvm);
	kfree(p);
	return 0;
}

static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
				  size_t len, loff_t *ppos)
{
	struct debugfs_radix_state *p = file->private_data;
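	/*
	 * The output is one "pgdir: <address>" header per partition-scoped
	 * tree (the VM's own tree first, then "Nested LPID <n>:" for each
	 * nested guest's shadow tree), followed by a " <gpa>: <pte> <shift>"
	 * line for every valid leaf PTE found in that tree.
	 */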
	ssize_t ret, r;
	unsigned long n;
	struct kvm *kvm;
	unsigned long gpa;
	pgd_t *pgt;
	struct kvm_nested_guest *nested;
	pgd_t *pgdp;
	p4d_t p4d, *p4dp;
	pud_t pud, *pudp;
	pmd_t pmd, *pmdp;
	pte_t *ptep;
	int shift;
	unsigned long pte;

	kvm = p->kvm;
	if (!kvm_is_radix(kvm))
		return 0;

	ret = mutex_lock_interruptible(&p->mutex);
	if (ret)
		return ret;

	if (p->chars_left) {
		n = p->chars_left;
		if (n > len)
			n = len;
		r = copy_to_user(buf, p->buf + p->buf_index, n);
		n -= r;
		p->chars_left -= n;
		p->buf_index += n;
		buf += n;
		len -= n;
		ret = n;
		if (r) {
			if (!n)
				ret = -EFAULT;
			goto out;
		}
	}

	gpa = p->gpa;
	nested = NULL;
	pgt = NULL;
	while (len != 0 && p->lpid >= 0) {
		if (gpa >= RADIX_PGTABLE_RANGE) {
			gpa = 0;
			pgt = NULL;
			if (nested) {
				kvmhv_put_nested(nested);
				nested = NULL;
			}
			p->lpid = kvmhv_nested_next_lpid(kvm, p->lpid);
			p->hdr = 0;
			if (p->lpid < 0)
				break;
		}
		if (!pgt) {
			if (p->lpid == 0) {
				pgt = kvm->arch.pgtable;
			} else {
				nested = kvmhv_get_nested(kvm, p->lpid, false);
				if (!nested) {
					gpa = RADIX_PGTABLE_RANGE;
					continue;
				}
				pgt = nested->shadow_pgtable;
			}
		}
		n = 0;
		if (!p->hdr) {
			if (p->lpid > 0)
				n = scnprintf(p->buf, sizeof(p->buf),
					      "\nNested LPID %d: ", p->lpid);
			n += scnprintf(p->buf + n, sizeof(p->buf) - n,
				       "pgdir: %lx\n", (unsigned long)pgt);
			p->hdr = 1;
			goto copy;
		}

		pgdp = pgt + pgd_index(gpa);
		p4dp = p4d_offset(pgdp, gpa);
		p4d = READ_ONCE(*p4dp);
		if (!(p4d_val(p4d) & _PAGE_PRESENT)) {
			gpa = (gpa & P4D_MASK) + P4D_SIZE;
			continue;
		}

		pudp = pud_offset(&p4d, gpa);
		pud = READ_ONCE(*pudp);
		if (!(pud_val(pud) & _PAGE_PRESENT)) {
			gpa = (gpa & PUD_MASK) + PUD_SIZE;
			continue;
		}
		if (pud_val(pud) & _PAGE_PTE) {
			pte = pud_val(pud);
			shift = PUD_SHIFT;
			goto leaf;
		}

		pmdp = pmd_offset(&pud, gpa);
		pmd = READ_ONCE(*pmdp);
		if (!(pmd_val(pmd) & _PAGE_PRESENT)) {
			gpa = (gpa & PMD_MASK) + PMD_SIZE;
			continue;
		}
		if (pmd_val(pmd) & _PAGE_PTE) {
			pte = pmd_val(pmd);
			shift = PMD_SHIFT;
			goto leaf;
		}

		ptep = pte_offset_kernel(&pmd, gpa);
		pte = pte_val(READ_ONCE(*ptep));
		if (!(pte & _PAGE_PRESENT)) {
			gpa += PAGE_SIZE;
			continue;
		}
		shift = PAGE_SHIFT;
	leaf:
		n = scnprintf(p->buf, sizeof(p->buf),
			      " %lx: %lx %d\n", gpa, pte, shift);
		gpa += 1ul << shift;
	copy:
		p->chars_left = n;
		if (n > len)
			n = len;
		r = copy_to_user(buf, p->buf, n);
		n -= r;
		p->chars_left -= n;
		p->buf_index = n;
		buf += n;
		len -= n;
		ret += n;
		if (r) {
			if (!ret)
				ret = -EFAULT;
			break;
		}
	}
	p->gpa = gpa;
	if (nested)
		kvmhv_put_nested(nested);

 out:
	mutex_unlock(&p->mutex);
	return ret;
}

static ssize_t debugfs_radix_write(struct file *file, const char __user *buf,
				   size_t len, loff_t *ppos)
{
	return -EACCES;
}

static const struct file_operations debugfs_radix_fops = {
	.owner	 = THIS_MODULE,
	.open	 = debugfs_radix_open,
	.release = debugfs_radix_release,
	.read	 = debugfs_radix_read,
	.write	 = debugfs_radix_write,
	.llseek	 = generic_file_llseek,
};

void kvmhv_radix_debugfs_init(struct kvm *kvm)
{
	debugfs_create_file("radix", 0400, kvm->arch.debugfs_dir, kvm,
			    &debugfs_radix_fops);
}

int kvmppc_radix_init(void)
{
	unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE;

	kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, 0, pte_ctor);
	if (!kvm_pte_cache)
		return -ENOMEM;

	size = sizeof(void *) << RADIX_PMD_INDEX_SIZE;

	kvm_pmd_cache = kmem_cache_create("kvm-pmd", size, size, 0, pmd_ctor);
	if (!kvm_pmd_cache) {
		kmem_cache_destroy(kvm_pte_cache);
		return -ENOMEM;
	}

	return 0;
}

void kvmppc_radix_exit(void)
{
	kmem_cache_destroy(kvm_pte_cache);
	kmem_cache_destroy(kvm_pmd_cache);
}