/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 *
 * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 */

#include <linux/types.h>
#include <linux/string.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/highmem.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
#include <linux/srcu.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/debugfs.h>

#include <asm/tlbflush.h>
#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include <asm/book3s/64/mmu-hash.h>
#include <asm/hvcall.h>
#include <asm/synch.h>
#include <asm/ppc-opcode.h>
#include <asm/cputable.h>

#include "trace_hv.h"

/* Power architecture requires HPT is at least 256kB */
#define PPC_MIN_HPT_ORDER	18

static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
				long pte_index, unsigned long pteh,
				unsigned long ptel, unsigned long *pte_idx_ret);
static void kvmppc_rmap_reset(struct kvm *kvm);

long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
{
	unsigned long hpt = 0;
	struct revmap_entry *rev;
	struct page *page = NULL;
	long order = KVM_DEFAULT_HPT_ORDER;

	if (htab_orderp) {
		order = *htab_orderp;
		if (order < PPC_MIN_HPT_ORDER)
			order = PPC_MIN_HPT_ORDER;
	}

	kvm->arch.hpt_cma_alloc = 0;
	page = kvm_alloc_hpt(1ul << (order - PAGE_SHIFT));
	if (page) {
		hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
		memset((void *)hpt, 0, (1ul << order));
		kvm->arch.hpt_cma_alloc = 1;
	}

	/* Lastly try successively smaller sizes from the page allocator */
	/* Only do this if userspace didn't specify a size via ioctl */
	while (!hpt && order > PPC_MIN_HPT_ORDER && !htab_orderp) {
		hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|
				       __GFP_NOWARN, order - PAGE_SHIFT);
		if (!hpt)
			--order;
	}

	if (!hpt)
		return -ENOMEM;

	kvm->arch.hpt_virt = hpt;
	kvm->arch.hpt_order = order;
	/* HPTEs are 2**4 bytes long */
	kvm->arch.hpt_npte = 1ul << (order - 4);
	/* 128 (2**7) bytes in each HPTEG */
	kvm->arch.hpt_mask = (1ul << (order - 7)) - 1;

	atomic64_set(&kvm->arch.mmio_update, 0);

	/* Allocate reverse map array */
	rev = vmalloc(sizeof(struct revmap_entry) * kvm->arch.hpt_npte);
	if (!rev) {
		pr_err("kvmppc_alloc_hpt: Couldn't alloc reverse map array\n");
		goto out_freehpt;
	}
	kvm->arch.revmap = rev;
	kvm->arch.sdr1 = __pa(hpt) | (order - 18);

	pr_info("KVM guest htab at %lx (order %ld), LPID %x\n",
		hpt, order, kvm->arch.lpid);

	if (htab_orderp)
		*htab_orderp = order;
	return 0;

 out_freehpt:
	if (kvm->arch.hpt_cma_alloc)
		kvm_release_hpt(page, 1 << (order - PAGE_SHIFT));
	else
		free_pages(hpt, order - PAGE_SHIFT);
	return -ENOMEM;
}

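/*
 * For example, with the minimum order of 18 (a 256 kB HPT), the fields
 * set up above work out to hpt_npte = 1ul << (18 - 4) = 16384 HPTEs of
 * 16 bytes each, and hpt_mask = (1ul << (18 - 7)) - 1 = 2047, i.e. 2048
 * HPTEGs of 8 HPTEs (128 bytes) apiece.  The low bits of sdr1 hold
 * order - 18, which encodes the hash table size relative to this minimum.
 */
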
long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp)
{
	long err = -EBUSY;
	long order;

	mutex_lock(&kvm->lock);
	if (kvm->arch.hpte_setup_done) {
		kvm->arch.hpte_setup_done = 0;
		/* order hpte_setup_done vs. vcpus_running */
		smp_mb();
		if (atomic_read(&kvm->arch.vcpus_running)) {
			kvm->arch.hpte_setup_done = 1;
			goto out;
		}
	}
	if (kvm->arch.hpt_virt) {
		order = kvm->arch.hpt_order;
		/* Set the entire HPT to 0, i.e. invalid HPTEs */
		memset((void *)kvm->arch.hpt_virt, 0, 1ul << order);
		/*
		 * Reset all the reverse-mapping chains for all memslots
		 */
		kvmppc_rmap_reset(kvm);
		/* Ensure that each vcpu will flush its TLB on next entry. */
		cpumask_setall(&kvm->arch.need_tlb_flush);
		*htab_orderp = order;
		err = 0;
	} else {
		err = kvmppc_alloc_hpt(kvm, htab_orderp);
		order = *htab_orderp;
	}
 out:
	mutex_unlock(&kvm->lock);
	return err;
}

void kvmppc_free_hpt(struct kvm *kvm)
{
	kvmppc_free_lpid(kvm->arch.lpid);
	vfree(kvm->arch.revmap);
	if (kvm->arch.hpt_cma_alloc)
		kvm_release_hpt(virt_to_page(kvm->arch.hpt_virt),
				1 << (kvm->arch.hpt_order - PAGE_SHIFT));
	else
		free_pages(kvm->arch.hpt_virt,
			   kvm->arch.hpt_order - PAGE_SHIFT);
}

/* Bits in first HPTE dword for pagesize 4k, 64k or 16M */
static inline unsigned long hpte0_pgsize_encoding(unsigned long pgsize)
{
	return (pgsize > 0x1000) ? HPTE_V_LARGE : 0;
}

/* Bits in second HPTE dword for pagesize 4k, 64k or 16M */
static inline unsigned long hpte1_pgsize_encoding(unsigned long pgsize)
{
	return (pgsize == 0x10000) ? 0x1000 : 0;
}

void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
		     unsigned long porder)
{
	unsigned long i;
	unsigned long npages;
	unsigned long hp_v, hp_r;
	unsigned long addr, hash;
	unsigned long psize;
	unsigned long hp0, hp1;
	unsigned long idx_ret;
	long ret;
	struct kvm *kvm = vcpu->kvm;

	psize = 1ul << porder;
	npages = memslot->npages >> (porder - PAGE_SHIFT);

	/* VRMA can't be > 1TB */
	if (npages > 1ul << (40 - porder))
		npages = 1ul << (40 - porder);
	/* Can't use more than 1 HPTE per HPTEG */
	if (npages > kvm->arch.hpt_mask + 1)
		npages = kvm->arch.hpt_mask + 1;

	hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
		HPTE_V_BOLTED | hpte0_pgsize_encoding(psize);
	hp1 = hpte1_pgsize_encoding(psize) |
		HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX;

	for (i = 0; i < npages; ++i) {
		addr = i << porder;
		/* can't use hpt_hash since va > 64 bits */
		hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & kvm->arch.hpt_mask;
		/*
		 * We assume that the hash table is empty and no
		 * vcpus are using it at this stage.  Since we create
		 * at most one HPTE per HPTEG, we just assume entry 7
		 * is available and use it.
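		 * (Each HPTEG holds 8 HPTEs, so the HPTEG index from the
		 * hash is converted to an HPTE index below by multiplying
		 * by 8 and taking slot 7, the last slot in the group.)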
		 */
		hash = (hash << 3) + 7;
		hp_v = hp0 | ((addr >> 16) & ~0x7fUL);
		hp_r = hp1 | addr;
		ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, hash, hp_v, hp_r,
						 &idx_ret);
		if (ret != H_SUCCESS) {
			pr_err("KVM: map_vrma at %lx failed, ret=%ld\n",
			       addr, ret);
			break;
		}
	}
}

int kvmppc_mmu_hv_init(void)
{
	unsigned long host_lpid, rsvd_lpid;

	if (!cpu_has_feature(CPU_FTR_HVMODE))
		return -EINVAL;

	/* POWER7 has 10-bit LPIDs (12-bit in POWER8) */
	host_lpid = mfspr(SPRN_LPID);
	rsvd_lpid = LPID_RSVD;

	kvmppc_init_lpid(rsvd_lpid + 1);

	kvmppc_claim_lpid(host_lpid);
	/* rsvd_lpid is reserved for use in partition switching */
	kvmppc_claim_lpid(rsvd_lpid);

	return 0;
}

static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu)
{
	unsigned long msr = vcpu->arch.intr_msr;

	/* If transactional, change to suspend mode on IRQ delivery */
	if (MSR_TM_TRANSACTIONAL(vcpu->arch.shregs.msr))
		msr |= MSR_TS_S;
	else
		msr |= vcpu->arch.shregs.msr & MSR_TS_MASK;
	kvmppc_set_msr(vcpu, msr);
}

static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
				long pte_index, unsigned long pteh,
				unsigned long ptel, unsigned long *pte_idx_ret)
{
	long ret;

	/* Protect linux PTE lookup from page table destruction */
	rcu_read_lock_sched();	/* this disables preemption too */
	ret = kvmppc_do_h_enter(kvm, flags, pte_index, pteh, ptel,
				current->mm->pgd, false, pte_idx_ret);
	rcu_read_unlock_sched();
	if (ret == H_TOO_HARD) {
		/* this can't happen */
		pr_err("KVM: Oops, kvmppc_h_enter returned too hard!\n");
		ret = H_RESOURCE;	/* or something */
	}
	return ret;
}

static struct kvmppc_slb *kvmppc_mmu_book3s_hv_find_slbe(struct kvm_vcpu *vcpu,
							 gva_t eaddr)
{
	u64 mask;
	int i;

	for (i = 0; i < vcpu->arch.slb_nr; i++) {
		if (!(vcpu->arch.slb[i].orige & SLB_ESID_V))
			continue;

		if (vcpu->arch.slb[i].origv & SLB_VSID_B_1T)
			mask = ESID_MASK_1T;
		else
			mask = ESID_MASK;

		if (((vcpu->arch.slb[i].orige ^ eaddr) & mask) == 0)
			return &vcpu->arch.slb[i];
	}
	return NULL;
}

static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r,
			unsigned long ea)
{
	unsigned long ra_mask;

	ra_mask = hpte_page_size(v, r) - 1;
	return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask);
}

static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
			struct kvmppc_pte *gpte, bool data, bool iswrite)
{
	struct kvm *kvm = vcpu->kvm;
	struct kvmppc_slb *slbe;
	unsigned long slb_v;
	unsigned long pp, key;
	unsigned long v, orig_v, gr;
	__be64 *hptep;
	int index;
	int virtmode = vcpu->arch.shregs.msr & (data ? MSR_DR : MSR_IR);

	/* Get SLB entry */
	if (virtmode) {
		slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr);
		if (!slbe)
			return -EINVAL;
		slb_v = slbe->origv;
	} else {
		/* real mode access */
		slb_v = vcpu->kvm->arch.vrma_slb_v;
	}

	preempt_disable();
	/* Find the HPTE in the hash table */
	index = kvmppc_hv_find_lock_hpte(kvm, eaddr, slb_v,
					 HPTE_V_VALID | HPTE_V_ABSENT);
	if (index < 0) {
		preempt_enable();
		return -ENOENT;
	}
	hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4));
	v = orig_v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
	if (cpu_has_feature(CPU_FTR_ARCH_300))
		v = hpte_new_to_old_v(v, be64_to_cpu(hptep[1]));
	gr = kvm->arch.revmap[index].guest_rpte;

	unlock_hpte(hptep, orig_v);
	preempt_enable();

	gpte->eaddr = eaddr;
	gpte->vpage = ((v & HPTE_V_AVPN) << 4) | ((eaddr >> 12) & 0xfff);

	/* Get PP bits and key for permission check */
	pp = gr & (HPTE_R_PP0 | HPTE_R_PP);
	key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS;
	key &= slb_v;

	/* Calculate permissions */
	gpte->may_read = hpte_read_permission(pp, key);
	gpte->may_write = hpte_write_permission(pp, key);
	gpte->may_execute = gpte->may_read && !(gr & (HPTE_R_N | HPTE_R_G));

	/* Storage key permission check for POWER7 */
	if (data && virtmode) {
		int amrfield = hpte_get_skey_perm(gr, vcpu->arch.amr);
		if (amrfield & 1)
			gpte->may_read = 0;
		if (amrfield & 2)
			gpte->may_write = 0;
	}

	/* Get the guest physical address */
	gpte->raddr = kvmppc_mmu_get_real_addr(v, gr, eaddr);
	return 0;
}

/*
 * Quick test for whether an instruction is a load or a store.
 * If the instruction is a load or a store, then this will indicate
 * which it is, at least on server processors.  (Embedded processors
 * have some external PID instructions that don't follow the rule
 * embodied here.)  If the instruction isn't a load or store, then
 * this doesn't return anything useful.
 */
static int instruction_is_store(unsigned int instr)
{
	unsigned int mask;

	mask = 0x10000000;
	if ((instr & 0xfc000000) == 0x7c000000)
		mask = 0x100;		/* major opcode 31 */
	return (instr & mask) != 0;
}

static int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu,
				  unsigned long gpa, gva_t ea, int is_store)
{
	u32 last_inst;

	/*
	 * If we fail, we just return to the guest and try executing it again.
	 */
	if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst) !=
	    EMULATE_DONE)
		return RESUME_GUEST;

	/*
	 * WARNING: We do not know for sure whether the instruction we just
	 * read from memory is the same that caused the fault in the first
	 * place.  If the instruction we read is neither a load nor a store,
	 * then it can't access memory, so we don't need to worry about
	 * enforcing access permissions.  So, assuming it is a load or
	 * store, we just check that its direction (load or store) is
	 * consistent with the original fault, since that's what we
	 * checked the access permissions against.  If there is a mismatch
	 * we just return and retry the instruction.
	 */

	if (instruction_is_store(last_inst) != !!is_store)
		return RESUME_GUEST;

	/*
	 * Emulated accesses are emulated by looking at the hash for
	 * translation once, then performing the access later.
	 * The translation could be invalidated in the meantime, at
	 * which point performing the subsequent memory access on the
	 * old physical address could possibly be a security hole for
	 * the guest (but not the host).
	 *
	 * This is less of an issue for MMIO stores since they aren't
	 * globally visible.  It could be an issue for MMIO loads to
	 * a certain extent but we'll ignore it for now.
	 */

	vcpu->arch.paddr_accessed = gpa;
	vcpu->arch.vaddr_accessed = ea;
	return kvmppc_emulate_mmio(run, vcpu);
}

int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
				unsigned long ea, unsigned long dsisr)
{
	struct kvm *kvm = vcpu->kvm;
	unsigned long hpte[3], r;
	unsigned long hnow_v, hnow_r;
	__be64 *hptep;
	unsigned long mmu_seq, psize, pte_size;
	unsigned long gpa_base, gfn_base;
	unsigned long gpa, gfn, hva, pfn;
	struct kvm_memory_slot *memslot;
	unsigned long *rmap;
	struct revmap_entry *rev;
	struct page *page, *pages[1];
	long index, ret, npages;
	bool is_ci;
	unsigned int writing, write_ok;
	struct vm_area_struct *vma;
	unsigned long rcbits;
	long mmio_update;

	/*
	 * Real-mode code has already searched the HPT and found the
	 * entry we're interested in.  Lock the entry and check that
	 * it hasn't changed.  If it has, just return and re-execute the
	 * instruction.
	 */
	if (ea != vcpu->arch.pgfault_addr)
		return RESUME_GUEST;

	if (vcpu->arch.pgfault_cache) {
		mmio_update = atomic64_read(&kvm->arch.mmio_update);
		if (mmio_update == vcpu->arch.pgfault_cache->mmio_update) {
			r = vcpu->arch.pgfault_cache->rpte;
			psize = hpte_page_size(vcpu->arch.pgfault_hpte[0], r);
			gpa_base = r & HPTE_R_RPN & ~(psize - 1);
			gfn_base = gpa_base >> PAGE_SHIFT;
			gpa = gpa_base | (ea & (psize - 1));
			return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
						      dsisr & DSISR_ISSTORE);
		}
	}
	index = vcpu->arch.pgfault_index;
	hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4));
	rev = &kvm->arch.revmap[index];
	preempt_disable();
	while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
		cpu_relax();
	hpte[0] = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
	hpte[1] = be64_to_cpu(hptep[1]);
	hpte[2] = r = rev->guest_rpte;
	unlock_hpte(hptep, hpte[0]);
	preempt_enable();

	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
		hpte[0] = hpte_new_to_old_v(hpte[0], hpte[1]);
		hpte[1] = hpte_new_to_old_r(hpte[1]);
	}
	if (hpte[0] != vcpu->arch.pgfault_hpte[0] ||
	    hpte[1] != vcpu->arch.pgfault_hpte[1])
		return RESUME_GUEST;

	/* Translate the logical address and get the page */
	psize = hpte_page_size(hpte[0], r);
	gpa_base = r & HPTE_R_RPN & ~(psize - 1);
	gfn_base = gpa_base >> PAGE_SHIFT;
	gpa = gpa_base | (ea & (psize - 1));
	gfn = gpa >> PAGE_SHIFT;
	memslot = gfn_to_memslot(kvm, gfn);

	trace_kvm_page_fault_enter(vcpu, hpte, memslot, ea, dsisr);

	/* No memslot means it's an emulated MMIO region */
	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
		return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
					      dsisr & DSISR_ISSTORE);

	/*
	 * This should never happen, because of the slot_is_aligned()
	 * check in kvmppc_do_h_enter().
	 */
	if (gfn_base < memslot->base_gfn)
		return -EFAULT;

	/* used to check for invalidations in progress */
	mmu_seq = kvm->mmu_notifier_seq;
	smp_rmb();

	ret = -EFAULT;
	is_ci = false;
	pfn = 0;
	page = NULL;
	pte_size = PAGE_SIZE;
	writing = (dsisr & DSISR_ISSTORE) != 0;
	/* If writing != 0, then the HPTE must allow writing, if we get here */
	write_ok = writing;
	hva = gfn_to_hva_memslot(memslot, gfn);
	npages = get_user_pages_fast(hva, 1, writing, pages);
	if (npages < 1) {
		/* Check if it's an I/O mapping */
		down_read(&current->mm->mmap_sem);
		vma = find_vma(current->mm, hva);
		if (vma && vma->vm_start <= hva && hva + psize <= vma->vm_end &&
		    (vma->vm_flags & VM_PFNMAP)) {
			pfn = vma->vm_pgoff +
				((hva - vma->vm_start) >> PAGE_SHIFT);
			pte_size = psize;
			is_ci = pte_ci(__pte((pgprot_val(vma->vm_page_prot))));
			write_ok = vma->vm_flags & VM_WRITE;
		}
		up_read(&current->mm->mmap_sem);
		if (!pfn)
			goto out_put;
	} else {
		page = pages[0];
		pfn = page_to_pfn(page);
		if (PageHuge(page)) {
			page = compound_head(page);
			pte_size <<= compound_order(page);
		}
		/* if the guest wants write access, see if that is OK */
		if (!writing && hpte_is_writable(r)) {
			pte_t *ptep, pte;
			unsigned long flags;
			/*
			 * We need to protect against page table destruction,
			 * hugepage split and collapse.
			 */
			local_irq_save(flags);
			ptep = find_linux_pte_or_hugepte(current->mm->pgd,
							 hva, NULL, NULL);
			if (ptep) {
				pte = kvmppc_read_update_linux_pte(ptep, 1);
				if (pte_write(pte))
					write_ok = 1;
			}
			local_irq_restore(flags);
		}
	}

	if (psize > pte_size)
		goto out_put;

	/* Check WIMG vs. the actual page we're accessing */
	if (!hpte_cache_flags_ok(r, is_ci)) {
		if (is_ci)
			goto out_put;
		/*
		 * Allow guest to map emulated device memory as
		 * uncacheable, but actually make it cacheable.
		 */
		r = (r & ~(HPTE_R_W|HPTE_R_I|HPTE_R_G)) | HPTE_R_M;
	}

	/*
	 * Set the HPTE to point to pfn.
	 * Since the pfn is at PAGE_SIZE granularity, make sure we
	 * don't mask out lower-order bits if psize < PAGE_SIZE.
	 */
	if (psize < PAGE_SIZE)
		psize = PAGE_SIZE;
	r = (r & HPTE_R_KEY_HI) | (r & ~(HPTE_R_PP0 - psize)) |
					((pfn << PAGE_SHIFT) & ~(psize - 1));
	if (hpte_is_writable(r) && !write_ok)
		r = hpte_make_readonly(r);
	ret = RESUME_GUEST;
	preempt_disable();
	while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
		cpu_relax();
	hnow_v = be64_to_cpu(hptep[0]);
	hnow_r = be64_to_cpu(hptep[1]);
	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
		hnow_v = hpte_new_to_old_v(hnow_v, hnow_r);
		hnow_r = hpte_new_to_old_r(hnow_r);
	}
	if ((hnow_v & ~HPTE_V_HVLOCK) != hpte[0] || hnow_r != hpte[1] ||
	    rev->guest_rpte != hpte[2])
		/* HPTE has been changed under us; let the guest retry */
		goto out_unlock;
	hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;

	/* Always put the HPTE in the rmap chain for the page base address */
	rmap = &memslot->arch.rmap[gfn_base - memslot->base_gfn];
	lock_rmap(rmap);

	/* Check if we might have been invalidated; let the guest retry if so */
	ret = RESUME_GUEST;
	if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) {
		unlock_rmap(rmap);
		goto out_unlock;
	}

	/* Only set R/C in real HPTE if set in both *rmap and guest_rpte */
	rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT;
	r &= rcbits | ~(HPTE_R_R | HPTE_R_C);

	if (be64_to_cpu(hptep[0]) & HPTE_V_VALID) {
		/* HPTE was previously valid, so we need to invalidate it */
		unlock_rmap(rmap);
		hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
		kvmppc_invalidate_hpte(kvm, hptep, index);
		/* don't lose previous R and C bits */
		r |= be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
	} else {
		kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0);
	}

	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
		r = hpte_old_to_new_r(hpte[0], r);
		hpte[0] = hpte_old_to_new_v(hpte[0]);
	}
	hptep[1] = cpu_to_be64(r);
	eieio();
	__unlock_hpte(hptep, hpte[0]);
	asm volatile("ptesync" : : : "memory");
	preempt_enable();
	if (page && hpte_is_writable(r))
		SetPageDirty(page);

 out_put:
	trace_kvm_page_fault_exit(vcpu, hpte, ret);

	if (page) {
		/*
		 * We drop pages[0] here, not page, because page might
		 * have been set to the head page of a compound, but
		 * we have to drop the reference on the correct tail
		 * page to match the get inside gup()
		 */
		put_page(pages[0]);
	}
	return ret;

 out_unlock:
	__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
	preempt_enable();
	goto out_put;
}

static void kvmppc_rmap_reset(struct kvm *kvm)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int srcu_idx;

	srcu_idx = srcu_read_lock(&kvm->srcu);
	slots = kvm_memslots(kvm);
	kvm_for_each_memslot(memslot, slots) {
		/*
		 * This assumes it is acceptable to lose reference and
		 * change bits across a reset.
		 */
		memset(memslot->arch.rmap, 0,
		       memslot->npages * sizeof(*memslot->arch.rmap));
	}
	srcu_read_unlock(&kvm->srcu, srcu_idx);
}

static int kvm_handle_hva_range(struct kvm *kvm,
				unsigned long start,
				unsigned long end,
				int (*handler)(struct kvm *kvm,
					       unsigned long *rmapp,
					       unsigned long gfn))
{
	int ret;
	int retval = 0;
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;

	slots = kvm_memslots(kvm);
	kvm_for_each_memslot(memslot, slots) {
		unsigned long hva_start, hva_end;
		gfn_t gfn, gfn_end;

		hva_start = max(start, memslot->userspace_addr);
		hva_end = min(end, memslot->userspace_addr +
					(memslot->npages << PAGE_SHIFT));
		if (hva_start >= hva_end)
			continue;
		/*
		 * {gfn(page) | page intersects with [hva_start, hva_end)} =
		 * {gfn, gfn+1, ..., gfn_end-1}.
		 */
		gfn = hva_to_gfn_memslot(hva_start, memslot);
		gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);

		for (; gfn < gfn_end; ++gfn) {
			gfn_t gfn_offset = gfn - memslot->base_gfn;

			ret = handler(kvm, &memslot->arch.rmap[gfn_offset], gfn);
			retval |= ret;
		}
	}

	return retval;
}

static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
			  int (*handler)(struct kvm *kvm, unsigned long *rmapp,
					 unsigned long gfn))
{
	return kvm_handle_hva_range(kvm, hva, hva + 1, handler);
}

static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
			   unsigned long gfn)
{
	struct revmap_entry *rev = kvm->arch.revmap;
	unsigned long h, i, j;
	__be64 *hptep;
	unsigned long ptel, psize, rcbits;

	for (;;) {
		lock_rmap(rmapp);
		if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
			unlock_rmap(rmapp);
			break;
		}

		/*
		 * To avoid an ABBA deadlock with the HPTE lock bit,
		 * we can't spin on the HPTE lock while holding the
		 * rmap chain lock.
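		 * Instead we use try_lock_hpte() below; if that fails we
		 * drop the rmap lock, wait for the HPTE lock holder to
		 * finish, and retry the whole sequence.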
		 */
		i = *rmapp & KVMPPC_RMAP_INDEX;
		hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4));
		if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
			/* unlock rmap before spinning on the HPTE lock */
			unlock_rmap(rmapp);
			while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK)
				cpu_relax();
			continue;
		}
		j = rev[i].forw;
		if (j == i) {
			/* chain is now empty */
			*rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX);
		} else {
			/* remove i from chain */
			h = rev[i].back;
			rev[h].forw = j;
			rev[j].back = h;
			rev[i].forw = rev[i].back = i;
			*rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j;
		}

		/* Now check and modify the HPTE */
		ptel = rev[i].guest_rpte;
		psize = hpte_page_size(be64_to_cpu(hptep[0]), ptel);
		if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) &&
		    hpte_rpn(ptel, psize) == gfn) {
			hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
			kvmppc_invalidate_hpte(kvm, hptep, i);
			hptep[1] &= ~cpu_to_be64(HPTE_R_KEY_HI | HPTE_R_KEY_LO);
			/* Harvest R and C */
			rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
			*rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
			if (rcbits & HPTE_R_C)
				kvmppc_update_rmap_change(rmapp, psize);
			if (rcbits & ~rev[i].guest_rpte) {
				rev[i].guest_rpte = ptel | rcbits;
				note_hpte_modification(kvm, &rev[i]);
			}
		}
		unlock_rmap(rmapp);
		__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
	}
	return 0;
}

int kvm_unmap_hva_hv(struct kvm *kvm, unsigned long hva)
{
	kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
	return 0;
}

int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end)
{
	kvm_handle_hva_range(kvm, start, end, kvm_unmap_rmapp);
	return 0;
}

void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
				  struct kvm_memory_slot *memslot)
{
	unsigned long *rmapp;
	unsigned long gfn;
	unsigned long n;

	rmapp = memslot->arch.rmap;
	gfn = memslot->base_gfn;
	for (n = memslot->npages; n; --n) {
		/*
		 * Testing the present bit without locking is OK because
		 * the memslot has been marked invalid already, and hence
		 * no new HPTEs referencing this page can be created,
		 * thus the present bit can't go from 0 to 1.
		 */
		if (*rmapp & KVMPPC_RMAP_PRESENT)
			kvm_unmap_rmapp(kvm, rmapp, gfn);
		++rmapp;
		++gfn;
	}
}

static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
			 unsigned long gfn)
{
	struct revmap_entry *rev = kvm->arch.revmap;
	unsigned long head, i, j;
	__be64 *hptep;
	int ret = 0;

 retry:
	lock_rmap(rmapp);
	if (*rmapp & KVMPPC_RMAP_REFERENCED) {
		*rmapp &= ~KVMPPC_RMAP_REFERENCED;
		ret = 1;
	}
	if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
		unlock_rmap(rmapp);
		return ret;
	}

	i = head = *rmapp & KVMPPC_RMAP_INDEX;
	do {
		hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4));
		j = rev[i].forw;

		/* If this HPTE isn't referenced, ignore it */
		if (!(be64_to_cpu(hptep[1]) & HPTE_R_R))
			continue;

		if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
			/* unlock rmap before spinning on the HPTE lock */
			unlock_rmap(rmapp);
			while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK)
				cpu_relax();
			goto retry;
		}

		/* Now check and modify the HPTE */
		if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) &&
		    (be64_to_cpu(hptep[1]) & HPTE_R_R)) {
			kvmppc_clear_ref_hpte(kvm, hptep, i);
			if (!(rev[i].guest_rpte & HPTE_R_R)) {
				rev[i].guest_rpte |= HPTE_R_R;
				note_hpte_modification(kvm, &rev[i]);
			}
			ret = 1;
		}
		__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
	} while ((i = j) != head);

	unlock_rmap(rmapp);
	return ret;
}

int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, unsigned long end)
{
	return kvm_handle_hva_range(kvm, start, end, kvm_age_rmapp);
}

static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
			      unsigned long gfn)
{
	struct revmap_entry *rev = kvm->arch.revmap;
	unsigned long head, i, j;
	unsigned long *hp;
	int ret = 1;

	if (*rmapp & KVMPPC_RMAP_REFERENCED)
		return 1;

	lock_rmap(rmapp);
	if (*rmapp & KVMPPC_RMAP_REFERENCED)
		goto out;

	if (*rmapp & KVMPPC_RMAP_PRESENT) {
		i = head = *rmapp & KVMPPC_RMAP_INDEX;
		do {
			hp = (unsigned long *)(kvm->arch.hpt_virt + (i << 4));
			j = rev[i].forw;
			if (be64_to_cpu(hp[1]) & HPTE_R_R)
				goto out;
		} while ((i = j) != head);
	}
	ret = 0;

 out:
	unlock_rmap(rmapp);
	return ret;
}

int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva)
{
	return kvm_handle_hva(kvm, hva, kvm_test_age_rmapp);
}

void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte)
{
	kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
}

static int vcpus_running(struct kvm *kvm)
{
	return atomic_read(&kvm->arch.vcpus_running) != 0;
}

/*
 * Returns the number of system pages that are dirty.
 * This can be more than 1 if we find a huge-page HPTE.
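 * (For example, with a 64 kB base page size a single 16 MB huge-page
 * HPTE covers 256 system pages, so 256 would be returned here.)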
 */
static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
{
	struct revmap_entry *rev = kvm->arch.revmap;
	unsigned long head, i, j;
	unsigned long n;
	unsigned long v, r;
	__be64 *hptep;
	int npages_dirty = 0;

 retry:
	lock_rmap(rmapp);
	if (*rmapp & KVMPPC_RMAP_CHANGED) {
		long change_order = (*rmapp & KVMPPC_RMAP_CHG_ORDER)
			>> KVMPPC_RMAP_CHG_SHIFT;
		*rmapp &= ~(KVMPPC_RMAP_CHANGED | KVMPPC_RMAP_CHG_ORDER);
		npages_dirty = 1;
		if (change_order > PAGE_SHIFT)
			npages_dirty = 1ul << (change_order - PAGE_SHIFT);
	}
	if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
		unlock_rmap(rmapp);
		return npages_dirty;
	}

	i = head = *rmapp & KVMPPC_RMAP_INDEX;
	do {
		unsigned long hptep1;
		hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4));
		j = rev[i].forw;

		/*
		 * Checking the C (changed) bit here is racy since there
		 * is no guarantee about when the hardware writes it back.
		 * If the HPTE is not writable then it is stable since the
		 * page can't be written to, and we would have done a tlbie
		 * (which forces the hardware to complete any writeback)
		 * when making the HPTE read-only.
		 * If vcpus are running then this call is racy anyway
		 * since the page could get dirtied subsequently, so we
		 * expect there to be a further call which would pick up
		 * any delayed C bit writeback.
		 * Otherwise we need to do the tlbie even if C==0 in
		 * order to pick up any delayed writeback of C.
		 */
		hptep1 = be64_to_cpu(hptep[1]);
		if (!(hptep1 & HPTE_R_C) &&
		    (!hpte_is_writable(hptep1) || vcpus_running(kvm)))
			continue;

		if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
			/* unlock rmap before spinning on the HPTE lock */
			unlock_rmap(rmapp);
			while (hptep[0] & cpu_to_be64(HPTE_V_HVLOCK))
				cpu_relax();
			goto retry;
		}

		/* Now check and modify the HPTE */
		if (!(hptep[0] & cpu_to_be64(HPTE_V_VALID))) {
			__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
			continue;
		}

		/* need to make it temporarily absent so C is stable */
		hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
		kvmppc_invalidate_hpte(kvm, hptep, i);
		v = be64_to_cpu(hptep[0]);
		r = be64_to_cpu(hptep[1]);
		if (r & HPTE_R_C) {
			hptep[1] = cpu_to_be64(r & ~HPTE_R_C);
			if (!(rev[i].guest_rpte & HPTE_R_C)) {
				rev[i].guest_rpte |= HPTE_R_C;
				note_hpte_modification(kvm, &rev[i]);
			}
			n = hpte_page_size(v, r);
			n = (n + PAGE_SIZE - 1) >> PAGE_SHIFT;
			if (n > npages_dirty)
				npages_dirty = n;
			eieio();
		}
		v &= ~HPTE_V_ABSENT;
		v |= HPTE_V_VALID;
		__unlock_hpte(hptep, v);
	} while ((i = j) != head);

	unlock_rmap(rmapp);
	return npages_dirty;
}

static void harvest_vpa_dirty(struct kvmppc_vpa *vpa,
			      struct kvm_memory_slot *memslot,
			      unsigned long *map)
{
	unsigned long gfn;

	if (!vpa->dirty || !vpa->pinned_addr)
		return;
	gfn = vpa->gpa >> PAGE_SHIFT;
	if (gfn < memslot->base_gfn ||
	    gfn >= memslot->base_gfn + memslot->npages)
		return;

	vpa->dirty = false;
	if (map)
		__set_bit_le(gfn - memslot->base_gfn, map);
}

long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot,
			     unsigned long *map)
{
	unsigned long i, j;
	unsigned long *rmapp;
	struct kvm_vcpu *vcpu;

	preempt_disable();
	rmapp = memslot->arch.rmap;
	for (i = 0; i < memslot->npages; ++i) {
		int npages = kvm_test_clear_dirty_npages(kvm, rmapp);
		/*
		 * Note that if npages > 0 then i must be a multiple of npages,
		 * since we always put huge-page HPTEs in the rmap chain
		 * corresponding to their page base address.
		 */
		if (npages && map)
			for (j = i; npages; ++j, --npages)
				__set_bit_le(j, map);
		++rmapp;
	}

	/* Harvest dirty bits from VPA and DTL updates */
	/* Note: we never modify the SLB shadow buffer areas */
	kvm_for_each_vcpu(i, vcpu, kvm) {
		spin_lock(&vcpu->arch.vpa_update_lock);
		harvest_vpa_dirty(&vcpu->arch.vpa, memslot, map);
		harvest_vpa_dirty(&vcpu->arch.dtl, memslot, map);
		spin_unlock(&vcpu->arch.vpa_update_lock);
	}
	preempt_enable();
	return 0;
}

void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
			    unsigned long *nb_ret)
{
	struct kvm_memory_slot *memslot;
	unsigned long gfn = gpa >> PAGE_SHIFT;
	struct page *page, *pages[1];
	int npages;
	unsigned long hva, offset;
	int srcu_idx;

	srcu_idx = srcu_read_lock(&kvm->srcu);
	memslot = gfn_to_memslot(kvm, gfn);
	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
		goto err;
	hva = gfn_to_hva_memslot(memslot, gfn);
	npages = get_user_pages_fast(hva, 1, 1, pages);
	if (npages < 1)
		goto err;
	page = pages[0];
	srcu_read_unlock(&kvm->srcu, srcu_idx);

	offset = gpa & (PAGE_SIZE - 1);
	if (nb_ret)
		*nb_ret = PAGE_SIZE - offset;
	return page_address(page) + offset;

 err:
	srcu_read_unlock(&kvm->srcu, srcu_idx);
	return NULL;
}

void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa,
			     bool dirty)
{
	struct page *page = virt_to_page(va);
	struct kvm_memory_slot *memslot;
	unsigned long gfn;
	unsigned long *rmap;
	int srcu_idx;

	put_page(page);

	if (!dirty)
		return;

	/* We need to mark this page dirty in the rmap chain */
	gfn = gpa >> PAGE_SHIFT;
	srcu_idx = srcu_read_lock(&kvm->srcu);
	memslot = gfn_to_memslot(kvm, gfn);
	if (memslot) {
		rmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
		lock_rmap(rmap);
		*rmap |= KVMPPC_RMAP_CHANGED;
		unlock_rmap(rmap);
	}
	srcu_read_unlock(&kvm->srcu, srcu_idx);
}

/*
 * Functions for reading and writing the hash table via reads and
 * writes on a file descriptor.
 *
 * Reads return the guest view of the hash table, which has to be
 * pieced together from the real hash table and the guest_rpte
 * values in the revmap array.
 *
 * On writes, each HPTE written is considered in turn, and if it
 * is valid, it is written to the HPT as if an H_ENTER with the
 * exact flag set was done.  When the invalid count is non-zero
 * in the header written to the stream, the kernel will make
 * sure that that many HPTEs are invalid, and invalidate them
 * if not.
 */

struct kvm_htab_ctx {
	unsigned long index;
	unsigned long flags;
	struct kvm *kvm;
	int first_pass;
};

#define HPTE_SIZE	(2 * sizeof(unsigned long))

/*
 * Returns 1 if this HPT entry has been modified or has pending
 * R/C bit changes.
 */
static int hpte_dirty(struct revmap_entry *revp, __be64 *hptp)
{
	unsigned long rcbits_unset;

	if (revp->guest_rpte & HPTE_GR_MODIFIED)
		return 1;

	/* Also need to consider changes in reference and changed bits */
	rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C);
	if ((be64_to_cpu(hptp[0]) & HPTE_V_VALID) &&
	    (be64_to_cpu(hptp[1]) & rcbits_unset))
		return 1;

	return 0;
}

static long record_hpte(unsigned long flags, __be64 *hptp,
			unsigned long *hpte, struct revmap_entry *revp,
			int want_valid, int first_pass)
{
	unsigned long v, r, hr;
	unsigned long rcbits_unset;
	int ok = 1;
	int valid, dirty;

	/* Unmodified entries are uninteresting except on the first pass */
	dirty = hpte_dirty(revp, hptp);
	if (!first_pass && !dirty)
		return 0;

	valid = 0;
	if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) {
		valid = 1;
		if ((flags & KVM_GET_HTAB_BOLTED_ONLY) &&
		    !(be64_to_cpu(hptp[0]) & HPTE_V_BOLTED))
			valid = 0;
	}
	if (valid != want_valid)
		return 0;

	v = r = 0;
	if (valid || dirty) {
		/* lock the HPTE so it's stable and read it */
		preempt_disable();
		while (!try_lock_hpte(hptp, HPTE_V_HVLOCK))
			cpu_relax();
		v = be64_to_cpu(hptp[0]);
		hr = be64_to_cpu(hptp[1]);
		if (cpu_has_feature(CPU_FTR_ARCH_300)) {
			v = hpte_new_to_old_v(v, hr);
			hr = hpte_new_to_old_r(hr);
		}

		/* re-evaluate valid and dirty from synchronized HPTE value */
		valid = !!(v & HPTE_V_VALID);
		dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED);

		/* Harvest R and C into guest view if necessary */
		rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C);
		if (valid && (rcbits_unset & hr)) {
			revp->guest_rpte |= (hr &
				(HPTE_R_R | HPTE_R_C)) | HPTE_GR_MODIFIED;
			dirty = 1;
		}

		if (v & HPTE_V_ABSENT) {
			v &= ~HPTE_V_ABSENT;
			v |= HPTE_V_VALID;
			valid = 1;
		}
		if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && !(v & HPTE_V_BOLTED))
			valid = 0;

		r = revp->guest_rpte;
		/* only clear modified if this is the right sort of entry */
		if (valid == want_valid && dirty) {
			r &= ~HPTE_GR_MODIFIED;
			revp->guest_rpte = r;
		}
		unlock_hpte(hptp, be64_to_cpu(hptp[0]));
		preempt_enable();
		if (!(valid == want_valid && (first_pass || dirty)))
			ok = 0;
	}
	hpte[0] = cpu_to_be64(v);
	hpte[1] = cpu_to_be64(r);
	return ok;
}

static ssize_t kvm_htab_read(struct file *file, char __user *buf,
			     size_t count, loff_t *ppos)
{
	struct kvm_htab_ctx *ctx = file->private_data;
	struct kvm *kvm = ctx->kvm;
	struct kvm_get_htab_header hdr;
	__be64 *hptp;
	struct revmap_entry *revp;
	unsigned long i, nb, nw;
	unsigned long __user *lbuf;
	struct kvm_get_htab_header __user *hptr;
	unsigned long flags;
	int first_pass;
	unsigned long hpte[2];

	if (!access_ok(VERIFY_WRITE, buf, count))
		return -EFAULT;

	first_pass = ctx->first_pass;
	flags = ctx->flags;

	i = ctx->index;
	hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
	revp = kvm->arch.revmap + i;
	lbuf = (unsigned long __user *)buf;

	nb = 0;
	while (nb + sizeof(hdr) + HPTE_SIZE < count) {
		/* Initialize header */
		hptr = (struct kvm_get_htab_header __user *)buf;
		hdr.n_valid = 0;
		hdr.n_invalid = 0;
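		/* remember where this header starts so an empty one can be dropped */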
		nw = nb;
		nb += sizeof(hdr);
		lbuf = (unsigned long __user *)(buf + sizeof(hdr));

		/* Skip uninteresting entries, i.e. clean on not-first pass */
		if (!first_pass) {
			while (i < kvm->arch.hpt_npte &&
			       !hpte_dirty(revp, hptp)) {
				++i;
				hptp += 2;
				++revp;
			}
		}
		hdr.index = i;

		/* Grab a series of valid entries */
		while (i < kvm->arch.hpt_npte &&
		       hdr.n_valid < 0xffff &&
		       nb + HPTE_SIZE < count &&
		       record_hpte(flags, hptp, hpte, revp, 1, first_pass)) {
			/* valid entry, write it out */
			++hdr.n_valid;
			if (__put_user(hpte[0], lbuf) ||
			    __put_user(hpte[1], lbuf + 1))
				return -EFAULT;
			nb += HPTE_SIZE;
			lbuf += 2;
			++i;
			hptp += 2;
			++revp;
		}
		/* Now skip invalid entries while we can */
		while (i < kvm->arch.hpt_npte &&
		       hdr.n_invalid < 0xffff &&
		       record_hpte(flags, hptp, hpte, revp, 0, first_pass)) {
			/* found an invalid entry */
			++hdr.n_invalid;
			++i;
			hptp += 2;
			++revp;
		}

		if (hdr.n_valid || hdr.n_invalid) {
			/* write back the header */
			if (__copy_to_user(hptr, &hdr, sizeof(hdr)))
				return -EFAULT;
			nw = nb;
			buf = (char __user *)lbuf;
		} else {
			nb = nw;
		}

		/* Check if we've wrapped around the hash table */
		if (i >= kvm->arch.hpt_npte) {
			i = 0;
			ctx->first_pass = 0;
			break;
		}
	}

	ctx->index = i;

	return nb;
}

static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
			      size_t count, loff_t *ppos)
{
	struct kvm_htab_ctx *ctx = file->private_data;
	struct kvm *kvm = ctx->kvm;
	struct kvm_get_htab_header hdr;
	unsigned long i, j;
	unsigned long v, r;
	unsigned long __user *lbuf;
	__be64 *hptp;
	unsigned long tmp[2];
	ssize_t nb;
	long int err, ret;
	int hpte_setup;

	if (!access_ok(VERIFY_READ, buf, count))
		return -EFAULT;

	/* lock out vcpus from running while we're doing this */
	mutex_lock(&kvm->lock);
	hpte_setup = kvm->arch.hpte_setup_done;
	if (hpte_setup) {
		kvm->arch.hpte_setup_done = 0;	/* temporarily */
		/* order hpte_setup_done vs. vcpus_running */
		smp_mb();
		if (atomic_read(&kvm->arch.vcpus_running)) {
			kvm->arch.hpte_setup_done = 1;
			mutex_unlock(&kvm->lock);
			return -EBUSY;
		}
	}

	err = 0;
	for (nb = 0; nb + sizeof(hdr) <= count; ) {
		err = -EFAULT;
		if (__copy_from_user(&hdr, buf, sizeof(hdr)))
			break;

		err = 0;
		if (nb + hdr.n_valid * HPTE_SIZE > count)
			break;

		nb += sizeof(hdr);
		buf += sizeof(hdr);

		err = -EINVAL;
		i = hdr.index;
		if (i >= kvm->arch.hpt_npte ||
		    i + hdr.n_valid + hdr.n_invalid > kvm->arch.hpt_npte)
			break;

		hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
		lbuf = (unsigned long __user *)buf;
		for (j = 0; j < hdr.n_valid; ++j) {
			__be64 hpte_v;
			__be64 hpte_r;

			err = -EFAULT;
			if (__get_user(hpte_v, lbuf) ||
			    __get_user(hpte_r, lbuf + 1))
				goto out;
			v = be64_to_cpu(hpte_v);
			r = be64_to_cpu(hpte_r);
			err = -EINVAL;
			if (!(v & HPTE_V_VALID))
				goto out;
			lbuf += 2;
			nb += HPTE_SIZE;

			if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT))
				kvmppc_do_h_remove(kvm, 0, i, 0, tmp);
			err = -EIO;
			ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, i, v, r,
							 tmp);
			if (ret != H_SUCCESS) {
				pr_err("kvm_htab_write ret %ld i=%ld v=%lx "
				       "r=%lx\n", ret, i, v, r);
				goto out;
			}
			if (!hpte_setup && is_vrma_hpte(v)) {
				unsigned long psize = hpte_base_page_size(v, r);
				unsigned long senc = slb_pgsize_encoding(psize);
				unsigned long lpcr;

				kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T |
					(VRMA_VSID << SLB_VSID_SHIFT_1T);
				lpcr = senc << (LPCR_VRMASD_SH - 4);
				kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
				hpte_setup = 1;
			}
			++i;
			hptp += 2;
		}

		for (j = 0; j < hdr.n_invalid; ++j) {
			if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT))
				kvmppc_do_h_remove(kvm, 0, i, 0, tmp);
			++i;
			hptp += 2;
		}
		err = 0;
	}

 out:
	/* Order HPTE updates vs. hpte_setup_done */
	smp_wmb();
	kvm->arch.hpte_setup_done = hpte_setup;
	mutex_unlock(&kvm->lock);

	if (err)
		return err;
	return nb;
}

static int kvm_htab_release(struct inode *inode, struct file *filp)
{
	struct kvm_htab_ctx *ctx = filp->private_data;

	filp->private_data = NULL;
	if (!(ctx->flags & KVM_GET_HTAB_WRITE))
		atomic_dec(&ctx->kvm->arch.hpte_mod_interest);
	kvm_put_kvm(ctx->kvm);
	kfree(ctx);
	return 0;
}

static const struct file_operations kvm_htab_fops = {
	.read		= kvm_htab_read,
	.write		= kvm_htab_write,
	.llseek		= default_llseek,
	.release	= kvm_htab_release,
};

int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf)
{
	int ret;
	struct kvm_htab_ctx *ctx;
	int rwflag;

	/* reject flags we don't recognize */
	if (ghf->flags & ~(KVM_GET_HTAB_BOLTED_ONLY | KVM_GET_HTAB_WRITE))
		return -EINVAL;
	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;
	kvm_get_kvm(kvm);
	ctx->kvm = kvm;
	ctx->index = ghf->start_index;
	ctx->flags = ghf->flags;
	ctx->first_pass = 1;

	rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? O_WRONLY : O_RDONLY;
	ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag | O_CLOEXEC);
	if (ret < 0) {
		kvm_put_kvm(kvm);
		return ret;
	}

	if (rwflag == O_RDONLY) {
		mutex_lock(&kvm->slots_lock);
		atomic_inc(&kvm->arch.hpte_mod_interest);
		/* make sure kvmppc_do_h_enter etc. see the increment */
		synchronize_srcu_expedited(&kvm->srcu);
		mutex_unlock(&kvm->slots_lock);
	}

	return ret;
}

struct debugfs_htab_state {
	struct kvm	*kvm;
	struct mutex	mutex;
	unsigned long	hpt_index;
	int		chars_left;
	int		buf_index;
	char		buf[64];
};

static int debugfs_htab_open(struct inode *inode, struct file *file)
{
	struct kvm *kvm = inode->i_private;
	struct debugfs_htab_state *p;

	p = kzalloc(sizeof(*p), GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	kvm_get_kvm(kvm);
	p->kvm = kvm;
	mutex_init(&p->mutex);
	file->private_data = p;

	return nonseekable_open(inode, file);
}

static int debugfs_htab_release(struct inode *inode, struct file *file)
{
	struct debugfs_htab_state *p = file->private_data;

	kvm_put_kvm(p->kvm);
	kfree(p);
	return 0;
}

static ssize_t debugfs_htab_read(struct file *file, char __user *buf,
				 size_t len, loff_t *ppos)
{
	struct debugfs_htab_state *p = file->private_data;
	ssize_t ret, r;
	unsigned long i, n;
	unsigned long v, hr, gr;
	struct kvm *kvm;
	__be64 *hptp;

	ret = mutex_lock_interruptible(&p->mutex);
	if (ret)
		return ret;

	if (p->chars_left) {
		n = p->chars_left;
		if (n > len)
			n = len;
		r = copy_to_user(buf, p->buf + p->buf_index, n);
		n -= r;
		p->chars_left -= n;
		p->buf_index += n;
		buf += n;
		len -= n;
		ret = n;
		if (r) {
			if (!n)
				ret = -EFAULT;
			goto out;
		}
	}

	kvm = p->kvm;
	i = p->hpt_index;
	hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
	for (; len != 0 && i < kvm->arch.hpt_npte; ++i, hptp += 2) {
		if (!(be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)))
			continue;

		/* lock the HPTE so it's stable and read it */
		preempt_disable();
		while (!try_lock_hpte(hptp, HPTE_V_HVLOCK))
			cpu_relax();
		v = be64_to_cpu(hptp[0]) & ~HPTE_V_HVLOCK;
		hr = be64_to_cpu(hptp[1]);
		gr = kvm->arch.revmap[i].guest_rpte;
		unlock_hpte(hptp, v);
		preempt_enable();

		if (!(v & (HPTE_V_VALID | HPTE_V_ABSENT)))
			continue;

		n = scnprintf(p->buf, sizeof(p->buf),
			      "%6lx %.16lx %.16lx %.16lx\n",
			      i, v, hr, gr);
		p->chars_left = n;
		if (n > len)
			n = len;
		r = copy_to_user(buf, p->buf, n);
		n -= r;
		p->chars_left -= n;
		p->buf_index = n;
		buf += n;
		len -= n;
		ret += n;
		if (r) {
			if (!ret)
				ret = -EFAULT;
			goto out;
		}
	}
	p->hpt_index = i;

 out:
	mutex_unlock(&p->mutex);
	return ret;
}

static ssize_t debugfs_htab_write(struct file *file, const char __user *buf,
				  size_t len, loff_t *ppos)
{
	return -EACCES;
}

static const struct file_operations debugfs_htab_fops = {
	.owner	 = THIS_MODULE,
	.open	 = debugfs_htab_open,
	.release = debugfs_htab_release,
	.read	 = debugfs_htab_read,
	.write	 = debugfs_htab_write,
	.llseek	 = generic_file_llseek,
};

void kvmppc_mmu_debugfs_init(struct kvm *kvm)
{
	kvm->arch.htab_dentry = debugfs_create_file("htab", 0400,
						    kvm->arch.debugfs_dir, kvm,
						    &debugfs_htab_fops);
}

void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
{
	struct kvmppc_mmu *mmu = &vcpu->arch.mmu;

	vcpu->arch.slb_nr = 32;		/* POWER7/POWER8 */

	mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
	mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr;

	vcpu->arch.hflags |= BOOK3S_HFLAG_SLB;
}