/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
 * published by the Free Software Foundation.
 *
 * Copyright 2016 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 */

#include <linux/types.h>
#include <linux/string.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/debugfs.h>

#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include <asm/page.h>
#include <asm/mmu.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/pte-walk.h>

/*
 * Supported radix tree geometry.
 * Like p9, we support either 5 or 9 bits at the first (lowest) level,
 * for a page size of 64k or 4k.
 */
static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 };

int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
			       struct kvmppc_pte *gpte, u64 root,
			       u64 *pte_ret_p)
{
	struct kvm *kvm = vcpu->kvm;
	int ret, level, ps;
	unsigned long rts, bits, offset, index;
	u64 pte, base, gpa;
	__be64 rpte;

	rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) |
		((root & RTS2_MASK) >> RTS2_SHIFT);
	bits = root & RPDS_MASK;
	base = root & RPDB_MASK;

	offset = rts + 31;

	/* Current implementations only support 52-bit space */
	if (offset != 52)
		return -EINVAL;

	/* Walk each level of the radix tree */
	for (level = 3; level >= 0; --level) {
		u64 addr;
		/* Check a valid size */
		if (level && bits != p9_supported_radix_bits[level])
			return -EINVAL;
		if (level == 0 && !(bits == 5 || bits == 9))
			return -EINVAL;
		offset -= bits;
		index = (eaddr >> offset) & ((1UL << bits) - 1);
		/* Check that low bits of page table base are zero */
		if (base & ((1UL << (bits + 3)) - 1))
			return -EINVAL;
		/* Read the entry from guest memory */
		addr = base + (index * sizeof(rpte));
		ret = kvm_read_guest(kvm, addr, &rpte, sizeof(rpte));
		if (ret) {
			if (pte_ret_p)
				*pte_ret_p = addr;
			return ret;
		}
		pte = __be64_to_cpu(rpte);
		if (!(pte & _PAGE_PRESENT))
			return -ENOENT;
		/* Check if a leaf entry */
		if (pte & _PAGE_PTE)
			break;
		/* Get ready to walk the next level */
		base = pte & RPDB_MASK;
		bits = pte & RPDS_MASK;
	}

	/* Need a leaf at lowest level; 512GB pages not supported */
	if (level < 0 || level == 3)
		return -EINVAL;

	/* We found a valid leaf PTE */
	/* Offset is now log base 2 of the page size */
	gpa = pte & 0x01fffffffffff000ul;
	if (gpa & ((1ul << offset) - 1))
		return -EINVAL;
	gpa |= eaddr & ((1ul << offset) - 1);
	for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps)
		if (offset == mmu_psize_defs[ps].shift)
			break;
	gpte->page_size = ps;
	gpte->page_shift = offset;

	gpte->eaddr = eaddr;
	gpte->raddr = gpa;

	/* Work out permissions */
	gpte->may_read = !!(pte & _PAGE_READ);
	gpte->may_write = !!(pte & _PAGE_WRITE);
	gpte->may_execute = !!(pte & _PAGE_EXEC);

	gpte->rc = pte & (_PAGE_ACCESSED | _PAGE_DIRTY);

	if (pte_ret_p)
		*pte_ret_p = pte;

	return 0;
}
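
/*
 * A worked example of the walk above, assuming the standard POWER9
 * geometry (RTS decodes to a 52-bit space, so offset starts at 52):
 *
 *	level 3: 13 bits -> offset 39	(a leaf here would be 512GB: rejected)
 *	level 2:  9 bits -> offset 30	(leaf => 1GB page)
 *	level 1:  9 bits -> offset 21	(leaf => 2MB page)
 *	level 0:  9 or 5 bits -> offset 12 or 16	(4k or 64k page)
 *
 * When the loop breaks on a leaf, offset is thus log2 of the page size,
 * which is how gpte->page_shift is derived.
 */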

/*
 * Used to walk a partition or process table radix tree in guest memory
 * Note: We exploit the fact that a partition table and a process
 * table have the same layout, a partition-scoped page table and a
 * process-scoped page table have the same layout, and the 2nd
 * doubleword of a partition table entry has the same layout as
 * the PTCR register.
 */
int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
				     struct kvmppc_pte *gpte, u64 table,
				     int table_index, u64 *pte_ret_p)
{
	struct kvm *kvm = vcpu->kvm;
	int ret;
	unsigned long size, ptbl, root;
	struct prtb_entry entry;

	if ((table & PRTS_MASK) > 24)
		return -EINVAL;
	size = 1ul << ((table & PRTS_MASK) + 12);

	/* Is the table big enough to contain this entry? */
	if ((table_index * sizeof(entry)) >= size)
		return -EINVAL;

	/* Read the table to find the root of the radix tree */
	ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry));
	ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry));
	if (ret)
		return ret;

	/* Root is stored in the first double word */
	root = be64_to_cpu(entry.prtb0);

	return kvmppc_mmu_walk_radix_tree(vcpu, eaddr, gpte, root, pte_ret_p);
}

int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
			   struct kvmppc_pte *gpte, bool data, bool iswrite)
{
	u32 pid;
	u64 pte;
	int ret;

	/* Work out effective PID */
	switch (eaddr >> 62) {
	case 0:
		pid = vcpu->arch.pid;
		break;
	case 3:
		pid = 0;
		break;
	default:
		return -EINVAL;
	}

	ret = kvmppc_mmu_radix_translate_table(vcpu, eaddr, gpte,
				vcpu->kvm->arch.process_table, pid, &pte);
	if (ret)
		return ret;

	/* Check privilege (applies only to process scoped translations) */
	if (kvmppc_get_msr(vcpu) & MSR_PR) {
		if (pte & _PAGE_PRIVILEGED) {
			gpte->may_read = 0;
			gpte->may_write = 0;
			gpte->may_execute = 0;
		}
	} else {
		if (!(pte & _PAGE_PRIVILEGED)) {
			/* Check AMR/IAMR to see if strict mode is in force */
			if (vcpu->arch.amr & (1ul << 62))
				gpte->may_read = 0;
			if (vcpu->arch.amr & (1ul << 63))
				gpte->may_write = 0;
			if (vcpu->arch.iamr & (1ul << 62))
				gpte->may_execute = 0;
		}
	}

	return 0;
}

static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
				    unsigned int pshift, unsigned int lpid)
{
	unsigned long psize = PAGE_SIZE;
	int psi;
	long rc;
	unsigned long rb;

	if (pshift)
		psize = 1UL << pshift;
	else
		pshift = PAGE_SHIFT;

	addr &= ~(psize - 1);

	if (!kvmhv_on_pseries()) {
		radix__flush_tlb_lpid_page(lpid, addr, psize);
		return;
	}

	psi = shift_to_mmu_psize(pshift);
	rb = addr | (mmu_get_ap(psi) << PPC_BITLSHIFT(58));
	rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 1),
				lpid, rb);
	if (rc)
		pr_err("KVM: TLB page invalidation hcall failed, rc=%ld\n", rc);
}
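
/*
 * Note on the pseries (nested-HV) path above: we cannot execute tlbie
 * ourselves, so the flush is requested from the hypervisor below us via
 * the H_TLB_INVALIDATE hcall. The address and actual page size (AP) are
 * packed into rb in the same format as the RB operand of tlbie, and
 * H_TLBIE_P1_ENC(0, 0, 1) requests RIC=0 (invalidate TLB), PRS=0
 * (partition-scoped), R=1 (radix) for the given LPID.
 */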

static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned int lpid)
{
	long rc;

	if (!kvmhv_on_pseries()) {
		radix__flush_pwc_lpid(lpid);
		return;
	}

	rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(1, 0, 1),
				lpid, TLBIEL_INVAL_SET_LPID);
	if (rc)
		pr_err("KVM: TLB PWC invalidation hcall failed, rc=%ld\n", rc);
}

static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
				unsigned long clr, unsigned long set,
				unsigned long addr, unsigned int shift)
{
	return __radix_pte_update(ptep, clr, set);
}

void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr,
			     pte_t *ptep, pte_t pte)
{
	radix__set_pte_at(kvm->mm, addr, ptep, pte, 0);
}

static struct kmem_cache *kvm_pte_cache;
static struct kmem_cache *kvm_pmd_cache;

static pte_t *kvmppc_pte_alloc(void)
{
	return kmem_cache_alloc(kvm_pte_cache, GFP_KERNEL);
}

static void kvmppc_pte_free(pte_t *ptep)
{
	kmem_cache_free(kvm_pte_cache, ptep);
}

/* Like pmd_huge() and pmd_large(), but works regardless of config options */
static inline int pmd_is_leaf(pmd_t pmd)
{
	return !!(pmd_val(pmd) & _PAGE_PTE);
}

static pmd_t *kvmppc_pmd_alloc(void)
{
	return kmem_cache_alloc(kvm_pmd_cache, GFP_KERNEL);
}

static void kvmppc_pmd_free(pmd_t *pmdp)
{
	kmem_cache_free(kvm_pmd_cache, pmdp);
}

/* Called with kvm->mmu_lock held */
void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
		      unsigned int shift, struct kvm_memory_slot *memslot,
		      unsigned int lpid)
{
	unsigned long old;
	unsigned long gfn = gpa >> PAGE_SHIFT;
	unsigned long page_size = PAGE_SIZE;
	unsigned long hpa;

	old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift);
	kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid);

	/* The following only applies to L1 entries */
	if (lpid != kvm->arch.lpid)
		return;

	if (!memslot) {
		memslot = gfn_to_memslot(kvm, gfn);
		if (!memslot)
			return;
	}
	if (shift)
		page_size = 1ul << shift;

	gpa &= ~(page_size - 1);
	hpa = old & PTE_RPN_MASK;
	kvmhv_remove_nest_rmap_range(kvm, memslot, gpa, hpa, page_size);

	if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap)
		kvmppc_update_dirty_map(memslot, gfn, page_size);
}
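
/*
 * Note the order above: the PTE is invalidated and the TLB entry flushed
 * before the nested rmap and dirty bitmap are updated, so the harvested
 * _PAGE_DIRTY bit accounts for any store the guest made through the old
 * translation before it disappeared.
 */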

/*
 * kvmppc_free_p?d are used to free existing page tables, and recursively
 * descend and clear and free children.
 * Callers are responsible for flushing the PWC.
 *
 * When page tables are being unmapped/freed as part of the page fault path
 * (full == false), ptes are not expected. There is code to unmap them
 * and emit a warning if encountered, but there may already be data
 * corruption due to the unexpected mappings.
 */
static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full,
				  unsigned int lpid)
{
	if (full) {
		memset(pte, 0, sizeof(long) << PTE_INDEX_SIZE);
	} else {
		pte_t *p = pte;
		unsigned long it;

		for (it = 0; it < PTRS_PER_PTE; ++it, ++p) {
			if (pte_val(*p) == 0)
				continue;
			WARN_ON_ONCE(1);
			kvmppc_unmap_pte(kvm, p,
					 pte_pfn(*p) << PAGE_SHIFT,
					 PAGE_SHIFT, NULL, lpid);
		}
	}

	kvmppc_pte_free(pte);
}

static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full,
				  unsigned int lpid)
{
	unsigned long im;
	pmd_t *p = pmd;

	for (im = 0; im < PTRS_PER_PMD; ++im, ++p) {
		if (!pmd_present(*p))
			continue;
		if (pmd_is_leaf(*p)) {
			if (full) {
				pmd_clear(p);
			} else {
				WARN_ON_ONCE(1);
				kvmppc_unmap_pte(kvm, (pte_t *)p,
					pte_pfn(*(pte_t *)p) << PAGE_SHIFT,
					PMD_SHIFT, NULL, lpid);
			}
		} else {
			pte_t *pte;

			pte = pte_offset_map(p, 0);
			kvmppc_unmap_free_pte(kvm, pte, full, lpid);
			pmd_clear(p);
		}
	}
	kvmppc_pmd_free(pmd);
}

static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud,
				  unsigned int lpid)
{
	unsigned long iu;
	pud_t *p = pud;

	for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++p) {
		if (!pud_present(*p))
			continue;
		if (pud_huge(*p)) {
			pud_clear(p);
		} else {
			pmd_t *pmd;

			pmd = pmd_offset(p, 0);
			kvmppc_unmap_free_pmd(kvm, pmd, true, lpid);
			pud_clear(p);
		}
	}
	pud_free(kvm->mm, pud);
}

void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid)
{
	unsigned long ig;

	for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
		pud_t *pud;

		if (!pgd_present(*pgd))
			continue;
		pud = pud_offset(pgd, 0);
		kvmppc_unmap_free_pud(kvm, pud, lpid);
		pgd_clear(pgd);
	}
}

void kvmppc_free_radix(struct kvm *kvm)
{
	if (kvm->arch.pgtable) {
		kvmppc_free_pgtable_radix(kvm, kvm->arch.pgtable,
					  kvm->arch.lpid);
		pgd_free(kvm->mm, kvm->arch.pgtable);
		kvm->arch.pgtable = NULL;
	}
}

static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
					unsigned long gpa, unsigned int lpid)
{
	pte_t *pte = pte_offset_kernel(pmd, 0);

	/*
	 * Clearing the pmd entry then flushing the PWC ensures that the pte
	 * page can no longer be cached by the MMU, so it can be freed without
	 * flushing the PWC again.
	 */
	pmd_clear(pmd);
	kvmppc_radix_flush_pwc(kvm, lpid);

	kvmppc_unmap_free_pte(kvm, pte, false, lpid);
}

static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
					unsigned long gpa, unsigned int lpid)
{
	pmd_t *pmd = pmd_offset(pud, 0);

	/*
	 * Clearing the pud entry then flushing the PWC ensures that the pmd
	 * page and any children pte pages will no longer be cached by the
	 * MMU, so they can be freed without flushing the PWC again.
	 */
	pud_clear(pud);
	kvmppc_radix_flush_pwc(kvm, lpid);

	kvmppc_unmap_free_pmd(kvm, pmd, false, lpid);
}

/*
 * There are a number of bits which may differ between different faults to
 * the same partition scope entry. RC bits, in the course of cleaning and
 * aging. And the write bit can change, either the access could have been
 * upgraded, or a read fault could happen concurrently with a write fault
 * that sets those bits first.
 */
#define PTE_BITS_MUST_MATCH (~(_PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED))
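
/*
 * For example, if one vcpu takes a read fault and another a write fault on
 * the same 2MB page, whichever installs its PTE first wins; the loser's
 * raw compare in kvmppc_create_pte() below then mismatches only in bits
 * outside PTE_BITS_MUST_MATCH (_PAGE_WRITE/_PAGE_DIRTY/_PAGE_ACCESSED),
 * so it just ORs its bits into the existing entry rather than replacing it.
 */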

int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
		      unsigned long gpa, unsigned int level,
		      unsigned long mmu_seq, unsigned int lpid,
		      unsigned long *rmapp, struct rmap_nested **n_rmap)
{
	pgd_t *pgd;
	pud_t *pud, *new_pud = NULL;
	pmd_t *pmd, *new_pmd = NULL;
	pte_t *ptep, *new_ptep = NULL;
	int ret;

	/* Traverse the guest's 2nd-level tree, allocate new levels needed */
	pgd = pgtable + pgd_index(gpa);
	pud = NULL;
	if (pgd_present(*pgd))
		pud = pud_offset(pgd, gpa);
	else
		new_pud = pud_alloc_one(kvm->mm, gpa);

	pmd = NULL;
	if (pud && pud_present(*pud) && !pud_huge(*pud))
		pmd = pmd_offset(pud, gpa);
	else if (level <= 1)
		new_pmd = kvmppc_pmd_alloc();

	if (level == 0 && !(pmd && pmd_present(*pmd) && !pmd_is_leaf(*pmd)))
		new_ptep = kvmppc_pte_alloc();

	/* Check if we might have been invalidated; let the guest retry if so */
	spin_lock(&kvm->mmu_lock);
	ret = -EAGAIN;
	if (mmu_notifier_retry(kvm, mmu_seq))
		goto out_unlock;

	/* Now traverse again under the lock and change the tree */
	ret = -ENOMEM;
	if (pgd_none(*pgd)) {
		if (!new_pud)
			goto out_unlock;
		pgd_populate(kvm->mm, pgd, new_pud);
		new_pud = NULL;
	}
	pud = pud_offset(pgd, gpa);
	if (pud_huge(*pud)) {
		unsigned long hgpa = gpa & PUD_MASK;

		/* Check if we raced and someone else has set the same thing */
		if (level == 2) {
			if (pud_raw(*pud) == pte_raw(pte)) {
				ret = 0;
				goto out_unlock;
			}
			/* Valid 1GB page here already, add our extra bits */
			WARN_ON_ONCE((pud_val(*pud) ^ pte_val(pte)) &
				     PTE_BITS_MUST_MATCH);
			kvmppc_radix_update_pte(kvm, (pte_t *)pud,
					0, pte_val(pte), hgpa, PUD_SHIFT);
			ret = 0;
			goto out_unlock;
		}
		/*
		 * If we raced with another CPU which has just put
		 * a 1GB pte in after we saw a pmd page, try again.
		 */
		if (!new_pmd) {
			ret = -EAGAIN;
			goto out_unlock;
		}
		/* Valid 1GB page here already, remove it */
		kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT, NULL,
				 lpid);
	}
	if (level == 2) {
		if (!pud_none(*pud)) {
			/*
			 * There's a page table page here, but we wanted to
			 * install a large page, so remove and free the page
			 * table page.
			 */
			kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa, lpid);
		}
		kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
		if (rmapp && n_rmap)
			kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
		ret = 0;
		goto out_unlock;
	}
	if (pud_none(*pud)) {
		if (!new_pmd)
			goto out_unlock;
		pud_populate(kvm->mm, pud, new_pmd);
		new_pmd = NULL;
	}
	pmd = pmd_offset(pud, gpa);
	if (pmd_is_leaf(*pmd)) {
		unsigned long lgpa = gpa & PMD_MASK;

		/* Check if we raced and someone else has set the same thing */
		if (level == 1) {
			if (pmd_raw(*pmd) == pte_raw(pte)) {
				ret = 0;
				goto out_unlock;
			}
			/* Valid 2MB page here already, add our extra bits */
			WARN_ON_ONCE((pmd_val(*pmd) ^ pte_val(pte)) &
				     PTE_BITS_MUST_MATCH);
			kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd),
					0, pte_val(pte), lgpa, PMD_SHIFT);
			ret = 0;
			goto out_unlock;
		}

		/*
		 * If we raced with another CPU which has just put
		 * a 2MB pte in after we saw a pte page, try again.
		 */
		if (!new_ptep) {
			ret = -EAGAIN;
			goto out_unlock;
		}
		/* Valid 2MB page here already, remove it */
		kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT, NULL,
				 lpid);
	}
	if (level == 1) {
		if (!pmd_none(*pmd)) {
			/*
			 * There's a page table page here, but we wanted to
			 * install a large page, so remove and free the page
			 * table page.
			 */
			kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa, lpid);
		}
		kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
		if (rmapp && n_rmap)
			kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
		ret = 0;
		goto out_unlock;
	}
	if (pmd_none(*pmd)) {
		if (!new_ptep)
			goto out_unlock;
		pmd_populate(kvm->mm, pmd, new_ptep);
		new_ptep = NULL;
	}
	ptep = pte_offset_kernel(pmd, gpa);
	if (pte_present(*ptep)) {
		/* Check if someone else set the same thing */
		if (pte_raw(*ptep) == pte_raw(pte)) {
			ret = 0;
			goto out_unlock;
		}
		/* Valid page here already, add our extra bits */
		WARN_ON_ONCE((pte_val(*ptep) ^ pte_val(pte)) &
			     PTE_BITS_MUST_MATCH);
		kvmppc_radix_update_pte(kvm, ptep, 0, pte_val(pte), gpa, 0);
		ret = 0;
		goto out_unlock;
	}
	kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
	if (rmapp && n_rmap)
		kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
	ret = 0;

 out_unlock:
	spin_unlock(&kvm->mmu_lock);
	if (new_pud)
		pud_free(kvm->mm, new_pud);
	if (new_pmd)
		kvmppc_pmd_free(new_pmd);
	if (new_ptep)
		kvmppc_pte_free(new_ptep);
	return ret;
}
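
/*
 * kvmppc_create_pte() relies on the usual KVM mmu_notifier dance: the
 * caller samples kvm->mmu_notifier_seq (with a read barrier) before
 * translating the address, and mmu_notifier_retry() is re-checked under
 * kvm->mmu_lock before the tree is touched. If an invalidation ran in
 * between, we return -EAGAIN and simply let the guest retry the access.
 */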

bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable, bool writing,
			     unsigned long gpa, unsigned int lpid)
{
	unsigned long pgflags;
	unsigned int shift;
	pte_t *ptep;

	/*
	 * Need to set an R or C bit in the 2nd-level tables;
	 * since we are just helping out the hardware here,
	 * it is sufficient to do what the hardware does.
	 */
	pgflags = _PAGE_ACCESSED;
	if (writing)
		pgflags |= _PAGE_DIRTY;
	/*
	 * We are walking the secondary (partition-scoped) page table here.
	 * We can do this without disabling irq because the Linux MM
	 * subsystem doesn't do THP splits and collapses on this tree.
	 */
	ptep = __find_linux_pte(pgtable, gpa, NULL, &shift);
	if (ptep && pte_present(*ptep) && (!writing || pte_write(*ptep))) {
		kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, gpa, shift);
		return true;
	}
	return false;
}

int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
				   unsigned long gpa,
				   struct kvm_memory_slot *memslot,
				   bool writing, bool kvm_ro,
				   pte_t *inserted_pte, unsigned int *levelp)
{
	struct kvm *kvm = vcpu->kvm;
	struct page *page = NULL;
	unsigned long mmu_seq;
	unsigned long hva, gfn = gpa >> PAGE_SHIFT;
	bool upgrade_write = false;
	bool *upgrade_p = &upgrade_write;
	pte_t pte, *ptep;
	unsigned int shift, level;
	int ret;

	/* used to check for invalidations in progress */
	mmu_seq = kvm->mmu_notifier_seq;
	smp_rmb();

	/*
	 * Do a fast check first, since __gfn_to_pfn_memslot doesn't
	 * do it with !atomic && !async, which is how we call it.
	 * We always ask for write permission since the common case
	 * is that the page is writable.
	 */
	hva = gfn_to_hva_memslot(memslot, gfn);
	if (!kvm_ro && __get_user_pages_fast(hva, 1, 1, &page) == 1) {
		upgrade_write = true;
	} else {
		unsigned long pfn;

		/* Call KVM generic code to do the slow-path check */
		pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
					   writing, upgrade_p);
		if (is_error_noslot_pfn(pfn))
			return -EFAULT;
		page = NULL;
		if (pfn_valid(pfn)) {
			page = pfn_to_page(pfn);
			if (PageReserved(page))
				page = NULL;
		}
	}

	/*
	 * Read the PTE from the process' radix tree and use that
	 * so we get the shift and attribute bits.
	 */
	local_irq_disable();
	ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift);
	/*
	 * If the PTE disappeared temporarily due to a THP
	 * collapse, just return and let the guest try again.
	 */
	if (!ptep) {
		local_irq_enable();
		if (page)
			put_page(page);
		return RESUME_GUEST;
	}
	pte = *ptep;
	local_irq_enable();

	/* Get pte level from shift/size */
	if (shift == PUD_SHIFT &&
	    (gpa & (PUD_SIZE - PAGE_SIZE)) ==
	    (hva & (PUD_SIZE - PAGE_SIZE))) {
		level = 2;
	} else if (shift == PMD_SHIFT &&
		   (gpa & (PMD_SIZE - PAGE_SIZE)) ==
		   (hva & (PMD_SIZE - PAGE_SIZE))) {
		level = 1;
	} else {
		level = 0;
		if (shift > PAGE_SHIFT) {
			/*
			 * If the pte maps more than one page, bring over
			 * bits from the virtual address to get the real
			 * address of the specific single page we want.
			 */
			unsigned long rpnmask = (1ul << shift) - PAGE_SIZE;
			pte = __pte(pte_val(pte) | (hva & rpnmask));
		}
	}

	pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED);
	if (writing || upgrade_write) {
		if (pte_val(pte) & _PAGE_WRITE)
			pte = __pte(pte_val(pte) | _PAGE_DIRTY);
	} else {
		pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY));
	}

	/* Allocate space in the tree and write the PTE */
	ret = kvmppc_create_pte(kvm, kvm->arch.pgtable, pte, gpa, level,
				mmu_seq, kvm->arch.lpid, NULL, NULL);
	if (inserted_pte)
		*inserted_pte = pte;
	if (levelp)
		*levelp = level;

	if (page) {
		if (!ret && (pte_val(pte) & _PAGE_WRITE))
			set_page_dirty_lock(page);
		put_page(page);
	}

	return ret;
}
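
/*
 * Note that kvmppc_book3s_instantiate_page() derives the partition-scoped
 * PTE from the Linux PTE backing the host virtual address, so the guest
 * mapping can never be larger than the host-side mapping it shadows;
 * _PAGE_EXEC and _PAGE_ACCESSED are set unconditionally, while _PAGE_WRITE
 * (and hence _PAGE_DIRTY) is kept only if the fault was a write or the
 * page was successfully pinned writable.
 */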

int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
				   unsigned long ea, unsigned long dsisr)
{
	struct kvm *kvm = vcpu->kvm;
	unsigned long gpa, gfn;
	struct kvm_memory_slot *memslot;
	long ret;
	bool writing = !!(dsisr & DSISR_ISSTORE);
	bool kvm_ro = false;

	/* Check for unusual errors */
	if (dsisr & DSISR_UNSUPP_MMU) {
		pr_err("KVM: Got unsupported MMU fault\n");
		return -EFAULT;
	}
	if (dsisr & DSISR_BADACCESS) {
		/* Reflect to the guest as DSI */
		pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
		kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
		return RESUME_GUEST;
	}

	/* Translate the logical address */
	gpa = vcpu->arch.fault_gpa & ~0xfffUL;
	gpa &= ~0xF000000000000000ul;
	gfn = gpa >> PAGE_SHIFT;
	if (!(dsisr & DSISR_PRTABLE_FAULT))
		gpa |= ea & 0xfff;

	/* Get the corresponding memslot */
	memslot = gfn_to_memslot(kvm, gfn);

	/* No memslot means it's an emulated MMIO region */
	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
		if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS |
			     DSISR_SET_RC)) {
			/*
			 * Bad address in guest page table tree, or other
			 * unusual error - reflect it to the guest as DSI.
			 */
			kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
			return RESUME_GUEST;
		}
		return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, writing);
	}

	if (memslot->flags & KVM_MEM_READONLY) {
		if (writing) {
			/* give the guest a DSI */
			kvmppc_core_queue_data_storage(vcpu, ea, DSISR_ISSTORE |
						       DSISR_PROTFAULT);
			return RESUME_GUEST;
		}
		kvm_ro = true;
	}

	/* Failed to set the reference/change bits */
	if (dsisr & DSISR_SET_RC) {
		spin_lock(&kvm->mmu_lock);
		if (kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable,
					    writing, gpa, kvm->arch.lpid))
			dsisr &= ~DSISR_SET_RC;
		spin_unlock(&kvm->mmu_lock);

		if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
			       DSISR_PROTFAULT | DSISR_SET_RC)))
			return RESUME_GUEST;
	}

	/* Try to insert a pte */
	ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot, writing,
					     kvm_ro, NULL, NULL);

	if (ret == 0 || ret == -EAGAIN)
		ret = RESUME_GUEST;
	return ret;
}
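
/*
 * Note the triage order above: unsupported-MMU and bad-access faults are
 * reflected straight back to the guest, a missing memslot means emulated
 * MMIO, read-only memslots turn stores into a DSI, and a DSISR_SET_RC
 * fault may be satisfied just by setting R/C in the existing PTE; only a
 * genuinely missing translation reaches kvmppc_book3s_instantiate_page().
 */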

/* Called with kvm->mmu_lock held */
int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
		    unsigned long gfn)
{
	pte_t *ptep;
	unsigned long gpa = gfn << PAGE_SHIFT;
	unsigned int shift;

	ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
	if (ptep && pte_present(*ptep))
		kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
				 kvm->arch.lpid);
	return 0;
}

/* Called with kvm->mmu_lock held */
int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
		  unsigned long gfn)
{
	pte_t *ptep;
	unsigned long gpa = gfn << PAGE_SHIFT;
	unsigned int shift;
	int ref = 0;

	ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
	if (ptep && pte_present(*ptep) && pte_young(*ptep)) {
		kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0,
					gpa, shift);
		/* XXX need to flush tlb here? */
		ref = 1;
	}
	return ref;
}

/* Called with kvm->mmu_lock held */
int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
		       unsigned long gfn)
{
	pte_t *ptep;
	unsigned long gpa = gfn << PAGE_SHIFT;
	unsigned int shift;
	int ref = 0;

	ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
	if (ptep && pte_present(*ptep) && pte_young(*ptep))
		ref = 1;
	return ref;
}

/* Returns the number of PAGE_SIZE pages that are dirty */
static int kvm_radix_test_clear_dirty(struct kvm *kvm,
				struct kvm_memory_slot *memslot, int pagenum)
{
	unsigned long gfn = memslot->base_gfn + pagenum;
	unsigned long gpa = gfn << PAGE_SHIFT;
	pte_t *ptep;
	unsigned int shift;
	int ret = 0;

	ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
	if (ptep && pte_present(*ptep) && pte_dirty(*ptep)) {
		ret = 1;
		if (shift)
			ret = 1 << (shift - PAGE_SHIFT);
		kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0,
					gpa, shift);
		kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid);
	}
	return ret;
}
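
/*
 * For example, a dirty 2MB leaf (shift == 21, with a 4k base page size)
 * makes kvm_radix_test_clear_dirty() return 1 << (21 - 12) = 512, so the
 * loop below sets 512 bits in the map and skips ahead by 512 pages.
 */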

long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
			struct kvm_memory_slot *memslot, unsigned long *map)
{
	unsigned long i, j;
	int npages;

	for (i = 0; i < memslot->npages; i = j) {
		npages = kvm_radix_test_clear_dirty(kvm, memslot, i);

		/*
		 * Note that if npages > 0 then i must be a multiple of npages,
		 * since huge pages are only used to back the guest at guest
		 * real addresses that are a multiple of their size.
		 * Since we have at most one PTE covering any given guest
		 * real address, if npages > 1 we can skip to i + npages.
		 */
		j = i + 1;
		if (npages) {
			set_dirty_bits(map, i, npages);
			j = i + npages;
		}
	}
	return 0;
}

static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info,
				 int psize, int *indexp)
{
	if (!mmu_psize_defs[psize].shift)
		return;
	info->ap_encodings[*indexp] = mmu_psize_defs[psize].shift |
		(mmu_psize_defs[psize].ap << 29);
	++(*indexp);
}

int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info)
{
	int i;

	if (!radix_enabled())
		return -EINVAL;
	memset(info, 0, sizeof(*info));

	/* 4k page size */
	info->geometries[0].page_shift = 12;
	info->geometries[0].level_bits[0] = 9;
	for (i = 1; i < 4; ++i)
		info->geometries[0].level_bits[i] = p9_supported_radix_bits[i];
	/* 64k page size */
	info->geometries[1].page_shift = 16;
	for (i = 0; i < 4; ++i)
		info->geometries[1].level_bits[i] = p9_supported_radix_bits[i];

	i = 0;
	add_rmmu_ap_encoding(info, MMU_PAGE_4K, &i);
	add_rmmu_ap_encoding(info, MMU_PAGE_64K, &i);
	add_rmmu_ap_encoding(info, MMU_PAGE_2M, &i);
	add_rmmu_ap_encoding(info, MMU_PAGE_1G, &i);

	return 0;
}
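
/*
 * Each ap_encodings[] entry above packs the page shift into the low bits
 * and the "actual page size" (AP) field at bit 29, which is the layout
 * userspace expects in struct kvm_ppc_rmmu_info; the two geometries
 * describe the radix level sizes we accept for 4k and 64k base pages.
 */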

int kvmppc_init_vm_radix(struct kvm *kvm)
{
	kvm->arch.pgtable = pgd_alloc(kvm->mm);
	if (!kvm->arch.pgtable)
		return -ENOMEM;
	return 0;
}

static void pte_ctor(void *addr)
{
	memset(addr, 0, RADIX_PTE_TABLE_SIZE);
}

static void pmd_ctor(void *addr)
{
	memset(addr, 0, RADIX_PMD_TABLE_SIZE);
}

struct debugfs_radix_state {
	struct kvm *kvm;
	struct mutex mutex;
	unsigned long gpa;
	int lpid;
	int chars_left;
	int buf_index;
	char buf[128];
	u8 hdr;
};

static int debugfs_radix_open(struct inode *inode, struct file *file)
{
	struct kvm *kvm = inode->i_private;
	struct debugfs_radix_state *p;

	p = kzalloc(sizeof(*p), GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	kvm_get_kvm(kvm);
	p->kvm = kvm;
	mutex_init(&p->mutex);
	file->private_data = p;

	return nonseekable_open(inode, file);
}

static int debugfs_radix_release(struct inode *inode, struct file *file)
{
	struct debugfs_radix_state *p = file->private_data;

	kvm_put_kvm(p->kvm);
	kfree(p);
	return 0;
}
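
/*
 * The read side below dumps one " gpa: pte shift" line per valid leaf,
 * preceded by a "pgdir:" header for the VM's own partition-scoped table
 * and a "Nested LPID n:" header for each nested guest's shadow table.
 * Output is staged through p->buf, so a line that does not fit in the
 * user buffer is carried over (via chars_left/buf_index) into the next
 * read() call.
 */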

static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
				  size_t len, loff_t *ppos)
{
	struct debugfs_radix_state *p = file->private_data;
	ssize_t ret, r;
	unsigned long n;
	struct kvm *kvm;
	unsigned long gpa;
	pgd_t *pgt;
	struct kvm_nested_guest *nested;
	pgd_t pgd, *pgdp;
	pud_t pud, *pudp;
	pmd_t pmd, *pmdp;
	pte_t *ptep;
	int shift;
	unsigned long pte;

	kvm = p->kvm;
	if (!kvm_is_radix(kvm))
		return 0;

	ret = mutex_lock_interruptible(&p->mutex);
	if (ret)
		return ret;

	if (p->chars_left) {
		n = p->chars_left;
		if (n > len)
			n = len;
		r = copy_to_user(buf, p->buf + p->buf_index, n);
		n -= r;
		p->chars_left -= n;
		p->buf_index += n;
		buf += n;
		len -= n;
		ret = n;
		if (r) {
			if (!n)
				ret = -EFAULT;
			goto out;
		}
	}

	gpa = p->gpa;
	nested = NULL;
	pgt = NULL;
	while (len != 0 && p->lpid >= 0) {
		if (gpa >= RADIX_PGTABLE_RANGE) {
			gpa = 0;
			pgt = NULL;
			if (nested) {
				kvmhv_put_nested(nested);
				nested = NULL;
			}
			p->lpid = kvmhv_nested_next_lpid(kvm, p->lpid);
			p->hdr = 0;
			if (p->lpid < 0)
				break;
		}
		if (!pgt) {
			if (p->lpid == 0) {
				pgt = kvm->arch.pgtable;
			} else {
				nested = kvmhv_get_nested(kvm, p->lpid, false);
				if (!nested) {
					gpa = RADIX_PGTABLE_RANGE;
					continue;
				}
				pgt = nested->shadow_pgtable;
			}
		}
		n = 0;
		if (!p->hdr) {
			if (p->lpid > 0)
				n = scnprintf(p->buf, sizeof(p->buf),
					      "\nNested LPID %d: ", p->lpid);
			n += scnprintf(p->buf + n, sizeof(p->buf) - n,
				       "pgdir: %lx\n", (unsigned long)pgt);
			p->hdr = 1;
			goto copy;
		}

		pgdp = pgt + pgd_index(gpa);
		pgd = READ_ONCE(*pgdp);
		if (!(pgd_val(pgd) & _PAGE_PRESENT)) {
			gpa = (gpa & PGDIR_MASK) + PGDIR_SIZE;
			continue;
		}

		pudp = pud_offset(&pgd, gpa);
		pud = READ_ONCE(*pudp);
		if (!(pud_val(pud) & _PAGE_PRESENT)) {
			gpa = (gpa & PUD_MASK) + PUD_SIZE;
			continue;
		}
		if (pud_val(pud) & _PAGE_PTE) {
			pte = pud_val(pud);
			shift = PUD_SHIFT;
			goto leaf;
		}

		pmdp = pmd_offset(&pud, gpa);
		pmd = READ_ONCE(*pmdp);
		if (!(pmd_val(pmd) & _PAGE_PRESENT)) {
			gpa = (gpa & PMD_MASK) + PMD_SIZE;
			continue;
		}
		if (pmd_val(pmd) & _PAGE_PTE) {
			pte = pmd_val(pmd);
			shift = PMD_SHIFT;
			goto leaf;
		}

		ptep = pte_offset_kernel(&pmd, gpa);
		pte = pte_val(READ_ONCE(*ptep));
		if (!(pte & _PAGE_PRESENT)) {
			gpa += PAGE_SIZE;
			continue;
		}
		shift = PAGE_SHIFT;
 leaf:
		n = scnprintf(p->buf, sizeof(p->buf),
			      " %lx: %lx %d\n", gpa, pte, shift);
		gpa += 1ul << shift;
 copy:
		p->chars_left = n;
		if (n > len)
			n = len;
		r = copy_to_user(buf, p->buf, n);
		n -= r;
		p->chars_left -= n;
		p->buf_index = n;
		buf += n;
		len -= n;
		ret += n;
		if (r) {
			if (!ret)
				ret = -EFAULT;
			break;
		}
	}
	p->gpa = gpa;
	if (nested)
		kvmhv_put_nested(nested);

 out:
	mutex_unlock(&p->mutex);
	return ret;
}

static ssize_t debugfs_radix_write(struct file *file, const char __user *buf,
				   size_t len, loff_t *ppos)
{
	return -EACCES;
}

static const struct file_operations debugfs_radix_fops = {
	.owner	 = THIS_MODULE,
	.open	 = debugfs_radix_open,
	.release = debugfs_radix_release,
	.read	 = debugfs_radix_read,
	.write	 = debugfs_radix_write,
	.llseek	 = generic_file_llseek,
};

void kvmhv_radix_debugfs_init(struct kvm *kvm)
{
	kvm->arch.radix_dentry = debugfs_create_file("radix", 0400,
						     kvm->arch.debugfs_dir, kvm,
						     &debugfs_radix_fops);
}

int kvmppc_radix_init(void)
{
	unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE;

	kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, 0, pte_ctor);
	if (!kvm_pte_cache)
		return -ENOMEM;

	size = sizeof(void *) << RADIX_PMD_INDEX_SIZE;

	kvm_pmd_cache = kmem_cache_create("kvm-pmd", size, size, 0, pmd_ctor);
	if (!kvm_pmd_cache) {
		kmem_cache_destroy(kvm_pte_cache);
		return -ENOMEM;
	}

	return 0;
}

void kvmppc_radix_exit(void)
{
	kmem_cache_destroy(kvm_pte_cache);
	kmem_cache_destroy(kvm_pmd_cache);
}