1 /* 2 * This program is free software; you can redistribute it and/or modify 3 * it under the terms of the GNU General Public License, version 2, as 4 * published by the Free Software Foundation. 5 * 6 * This program is distributed in the hope that it will be useful, 7 * but WITHOUT ANY WARRANTY; without even the implied warranty of 8 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 9 * GNU General Public License for more details. 10 * 11 * You should have received a copy of the GNU General Public License 12 * along with this program; if not, write to the Free Software 13 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 14 * 15 * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> 16 */ 17 18 #include <linux/types.h> 19 #include <linux/string.h> 20 #include <linux/kvm.h> 21 #include <linux/kvm_host.h> 22 #include <linux/highmem.h> 23 #include <linux/gfp.h> 24 #include <linux/slab.h> 25 #include <linux/hugetlb.h> 26 #include <linux/vmalloc.h> 27 #include <linux/srcu.h> 28 #include <linux/anon_inodes.h> 29 #include <linux/file.h> 30 #include <linux/debugfs.h> 31 32 #include <asm/tlbflush.h> 33 #include <asm/kvm_ppc.h> 34 #include <asm/kvm_book3s.h> 35 #include <asm/book3s/64/mmu-hash.h> 36 #include <asm/hvcall.h> 37 #include <asm/synch.h> 38 #include <asm/ppc-opcode.h> 39 #include <asm/cputable.h> 40 41 #include "trace_hv.h" 42 43 //#define DEBUG_RESIZE_HPT 1 44 45 #ifdef DEBUG_RESIZE_HPT 46 #define resize_hpt_debug(resize, ...) \ 47 do { \ 48 printk(KERN_DEBUG "RESIZE HPT %p: ", resize); \ 49 printk(__VA_ARGS__); \ 50 } while (0) 51 #else 52 #define resize_hpt_debug(resize, ...) \ 53 do { } while (0) 54 #endif 55 56 static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags, 57 long pte_index, unsigned long pteh, 58 unsigned long ptel, unsigned long *pte_idx_ret); 59 60 struct kvm_resize_hpt { 61 /* These fields read-only after init */ 62 struct kvm *kvm; 63 struct work_struct work; 64 u32 order; 65 66 /* These fields protected by kvm->lock */ 67 int error; 68 bool prepare_done; 69 70 /* Private to the work thread, until prepare_done is true, 71 * then protected by kvm->resize_hpt_sem */ 72 struct kvm_hpt_info hpt; 73 }; 74 75 static void kvmppc_rmap_reset(struct kvm *kvm); 76 77 int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order) 78 { 79 unsigned long hpt = 0; 80 int cma = 0; 81 struct page *page = NULL; 82 struct revmap_entry *rev; 83 unsigned long npte; 84 85 if ((order < PPC_MIN_HPT_ORDER) || (order > PPC_MAX_HPT_ORDER)) 86 return -EINVAL; 87 88 page = kvm_alloc_hpt_cma(1ul << (order - PAGE_SHIFT)); 89 if (page) { 90 hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); 91 memset((void *)hpt, 0, (1ul << order)); 92 cma = 1; 93 } 94 95 if (!hpt) 96 hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT 97 |__GFP_NOWARN, order - PAGE_SHIFT); 98 99 if (!hpt) 100 return -ENOMEM; 101 102 /* HPTEs are 2**4 bytes long */ 103 npte = 1ul << (order - 4); 104 105 /* Allocate reverse map array */ 106 rev = vmalloc(sizeof(struct revmap_entry) * npte); 107 if (!rev) { 108 pr_err("kvmppc_allocate_hpt: Couldn't alloc reverse map array\n"); 109 if (cma) 110 kvm_free_hpt_cma(page, 1 << (order - PAGE_SHIFT)); 111 else 112 free_pages(hpt, order - PAGE_SHIFT); 113 return -ENOMEM; 114 } 115 116 info->order = order; 117 info->virt = hpt; 118 info->cma = cma; 119 info->rev = rev; 120 121 return 0; 122 } 123 124 void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info) 125 { 126 atomic64_set(&kvm->arch.mmio_update, 0); 127 
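/* mmio_update is compared against vcpu->arch.pgfault_cache->mmio_update in kvmppc_book3s_hv_page_fault(); clear the count here along with the rest of the state for the newly installed HPT. */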
kvm->arch.hpt = *info; 128 kvm->arch.sdr1 = __pa(info->virt) | (info->order - 18); 129 130 pr_debug("KVM guest htab at %lx (order %ld), LPID %x\n", 131 info->virt, (long)info->order, kvm->arch.lpid); 132 } 133 134 long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order) 135 { 136 long err = -EBUSY; 137 struct kvm_hpt_info info; 138 139 if (kvm_is_radix(kvm)) 140 return -EINVAL; 141 142 mutex_lock(&kvm->lock); 143 if (kvm->arch.hpte_setup_done) { 144 kvm->arch.hpte_setup_done = 0; 145 /* order hpte_setup_done vs. vcpus_running */ 146 smp_mb(); 147 if (atomic_read(&kvm->arch.vcpus_running)) { 148 kvm->arch.hpte_setup_done = 1; 149 goto out; 150 } 151 } 152 if (kvm->arch.hpt.order == order) { 153 /* We already have a suitable HPT */ 154 155 /* Set the entire HPT to 0, i.e. invalid HPTEs */ 156 memset((void *)kvm->arch.hpt.virt, 0, 1ul << order); 157 /* 158 * Reset all the reverse-mapping chains for all memslots 159 */ 160 kvmppc_rmap_reset(kvm); 161 /* Ensure that each vcpu will flush its TLB on next entry. */ 162 cpumask_setall(&kvm->arch.need_tlb_flush); 163 err = 0; 164 goto out; 165 } 166 167 if (kvm->arch.hpt.virt) 168 kvmppc_free_hpt(&kvm->arch.hpt); 169 170 err = kvmppc_allocate_hpt(&info, order); 171 if (err < 0) 172 goto out; 173 kvmppc_set_hpt(kvm, &info); 174 175 out: 176 mutex_unlock(&kvm->lock); 177 return err; 178 } 179 180 void kvmppc_free_hpt(struct kvm_hpt_info *info) 181 { 182 vfree(info->rev); 183 if (info->cma) 184 kvm_free_hpt_cma(virt_to_page(info->virt), 185 1 << (info->order - PAGE_SHIFT)); 186 else if (info->virt) 187 free_pages(info->virt, info->order - PAGE_SHIFT); 188 info->virt = 0; 189 info->order = 0; 190 } 191 192 /* Bits in first HPTE dword for pagesize 4k, 64k or 16M */ 193 static inline unsigned long hpte0_pgsize_encoding(unsigned long pgsize) 194 { 195 return (pgsize > 0x1000) ? HPTE_V_LARGE : 0; 196 } 197 198 /* Bits in second HPTE dword for pagesize 4k, 64k or 16M */ 199 static inline unsigned long hpte1_pgsize_encoding(unsigned long pgsize) 200 { 201 return (pgsize == 0x10000) ? 0x1000 : 0; 202 } 203 204 void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, 205 unsigned long porder) 206 { 207 unsigned long i; 208 unsigned long npages; 209 unsigned long hp_v, hp_r; 210 unsigned long addr, hash; 211 unsigned long psize; 212 unsigned long hp0, hp1; 213 unsigned long idx_ret; 214 long ret; 215 struct kvm *kvm = vcpu->kvm; 216 217 psize = 1ul << porder; 218 npages = memslot->npages >> (porder - PAGE_SHIFT); 219 220 /* VRMA can't be > 1TB */ 221 if (npages > 1ul << (40 - porder)) 222 npages = 1ul << (40 - porder); 223 /* Can't use more than 1 HPTE per HPTEG */ 224 if (npages > kvmppc_hpt_mask(&kvm->arch.hpt) + 1) 225 npages = kvmppc_hpt_mask(&kvm->arch.hpt) + 1; 226 227 hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) | 228 HPTE_V_BOLTED | hpte0_pgsize_encoding(psize); 229 hp1 = hpte1_pgsize_encoding(psize) | 230 HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX; 231 232 for (i = 0; i < npages; ++i) { 233 addr = i << porder; 234 /* can't use hpt_hash since va > 64 bits */ 235 hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) 236 & kvmppc_hpt_mask(&kvm->arch.hpt); 237 /* 238 * We assume that the hash table is empty and no 239 * vcpus are using it at this stage. Since we create 240 * at most one HPTE per HPTEG, we just assume entry 7 241 * is available and use it. 
242 */ 243 hash = (hash << 3) + 7; 244 hp_v = hp0 | ((addr >> 16) & ~0x7fUL); 245 hp_r = hp1 | addr; 246 ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, hash, hp_v, hp_r, 247 &idx_ret); 248 if (ret != H_SUCCESS) { 249 pr_err("KVM: map_vrma at %lx failed, ret=%ld\n", 250 addr, ret); 251 break; 252 } 253 } 254 } 255 256 int kvmppc_mmu_hv_init(void) 257 { 258 unsigned long host_lpid, rsvd_lpid; 259 260 if (!cpu_has_feature(CPU_FTR_HVMODE)) 261 return -EINVAL; 262 263 /* POWER7 has 10-bit LPIDs (12-bit in POWER8) */ 264 host_lpid = mfspr(SPRN_LPID); 265 rsvd_lpid = LPID_RSVD; 266 267 kvmppc_init_lpid(rsvd_lpid + 1); 268 269 kvmppc_claim_lpid(host_lpid); 270 /* rsvd_lpid is reserved for use in partition switching */ 271 kvmppc_claim_lpid(rsvd_lpid); 272 273 return 0; 274 } 275 276 static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu) 277 { 278 unsigned long msr = vcpu->arch.intr_msr; 279 280 /* If transactional, change to suspend mode on IRQ delivery */ 281 if (MSR_TM_TRANSACTIONAL(vcpu->arch.shregs.msr)) 282 msr |= MSR_TS_S; 283 else 284 msr |= vcpu->arch.shregs.msr & MSR_TS_MASK; 285 kvmppc_set_msr(vcpu, msr); 286 } 287 288 static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags, 289 long pte_index, unsigned long pteh, 290 unsigned long ptel, unsigned long *pte_idx_ret) 291 { 292 long ret; 293 294 /* Protect linux PTE lookup from page table destruction */ 295 rcu_read_lock_sched(); /* this disables preemption too */ 296 ret = kvmppc_do_h_enter(kvm, flags, pte_index, pteh, ptel, 297 current->mm->pgd, false, pte_idx_ret); 298 rcu_read_unlock_sched(); 299 if (ret == H_TOO_HARD) { 300 /* this can't happen */ 301 pr_err("KVM: Oops, kvmppc_h_enter returned too hard!\n"); 302 ret = H_RESOURCE; /* or something */ 303 } 304 return ret; 305 306 } 307 308 static struct kvmppc_slb *kvmppc_mmu_book3s_hv_find_slbe(struct kvm_vcpu *vcpu, 309 gva_t eaddr) 310 { 311 u64 mask; 312 int i; 313 314 for (i = 0; i < vcpu->arch.slb_nr; i++) { 315 if (!(vcpu->arch.slb[i].orige & SLB_ESID_V)) 316 continue; 317 318 if (vcpu->arch.slb[i].origv & SLB_VSID_B_1T) 319 mask = ESID_MASK_1T; 320 else 321 mask = ESID_MASK; 322 323 if (((vcpu->arch.slb[i].orige ^ eaddr) & mask) == 0) 324 return &vcpu->arch.slb[i]; 325 } 326 return NULL; 327 } 328 329 static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r, 330 unsigned long ea) 331 { 332 unsigned long ra_mask; 333 334 ra_mask = hpte_page_size(v, r) - 1; 335 return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask); 336 } 337 338 static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, 339 struct kvmppc_pte *gpte, bool data, bool iswrite) 340 { 341 struct kvm *kvm = vcpu->kvm; 342 struct kvmppc_slb *slbe; 343 unsigned long slb_v; 344 unsigned long pp, key; 345 unsigned long v, orig_v, gr; 346 __be64 *hptep; 347 int index; 348 int virtmode = vcpu->arch.shregs.msr & (data ? 
MSR_DR : MSR_IR); 349 350 /* Get SLB entry */ 351 if (virtmode) { 352 slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr); 353 if (!slbe) 354 return -EINVAL; 355 slb_v = slbe->origv; 356 } else { 357 /* real mode access */ 358 slb_v = vcpu->kvm->arch.vrma_slb_v; 359 } 360 361 preempt_disable(); 362 /* Find the HPTE in the hash table */ 363 index = kvmppc_hv_find_lock_hpte(kvm, eaddr, slb_v, 364 HPTE_V_VALID | HPTE_V_ABSENT); 365 if (index < 0) { 366 preempt_enable(); 367 return -ENOENT; 368 } 369 hptep = (__be64 *)(kvm->arch.hpt.virt + (index << 4)); 370 v = orig_v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK; 371 if (cpu_has_feature(CPU_FTR_ARCH_300)) 372 v = hpte_new_to_old_v(v, be64_to_cpu(hptep[1])); 373 gr = kvm->arch.hpt.rev[index].guest_rpte; 374 375 unlock_hpte(hptep, orig_v); 376 preempt_enable(); 377 378 gpte->eaddr = eaddr; 379 gpte->vpage = ((v & HPTE_V_AVPN) << 4) | ((eaddr >> 12) & 0xfff); 380 381 /* Get PP bits and key for permission check */ 382 pp = gr & (HPTE_R_PP0 | HPTE_R_PP); 383 key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS; 384 key &= slb_v; 385 386 /* Calculate permissions */ 387 gpte->may_read = hpte_read_permission(pp, key); 388 gpte->may_write = hpte_write_permission(pp, key); 389 gpte->may_execute = gpte->may_read && !(gr & (HPTE_R_N | HPTE_R_G)); 390 391 /* Storage key permission check for POWER7 */ 392 if (data && virtmode) { 393 int amrfield = hpte_get_skey_perm(gr, vcpu->arch.amr); 394 if (amrfield & 1) 395 gpte->may_read = 0; 396 if (amrfield & 2) 397 gpte->may_write = 0; 398 } 399 400 /* Get the guest physical address */ 401 gpte->raddr = kvmppc_mmu_get_real_addr(v, gr, eaddr); 402 return 0; 403 } 404 405 /* 406 * Quick test for whether an instruction is a load or a store. 407 * If the instruction is a load or a store, then this will indicate 408 * which it is, at least on server processors. (Embedded processors 409 * have some external PID instructions that don't follow the rule 410 * embodied here.) If the instruction isn't a load or store, then 411 * this doesn't return anything useful. 412 */ 413 static int instruction_is_store(unsigned int instr) 414 { 415 unsigned int mask; 416 417 mask = 0x10000000; 418 if ((instr & 0xfc000000) == 0x7c000000) 419 mask = 0x100; /* major opcode 31 */ 420 return (instr & mask) != 0; 421 } 422 423 int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu, 424 unsigned long gpa, gva_t ea, int is_store) 425 { 426 u32 last_inst; 427 428 /* 429 * If we fail, we just return to the guest and try executing it again. 430 */ 431 if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst) != 432 EMULATE_DONE) 433 return RESUME_GUEST; 434 435 /* 436 * WARNING: We do not know for sure whether the instruction we just 437 * read from memory is the same that caused the fault in the first 438 * place. If the instruction we read is neither a load nor a store, 439 * then it can't access memory, so we don't need to worry about 440 * enforcing access permissions. So, assuming it is a load or 441 * store, we just check that its direction (load or store) is 442 * consistent with the original fault, since that's what we 443 * checked the access permissions against. If there is a mismatch 444 * we just return and retry the instruction. 445 */ 446 447 if (instruction_is_store(last_inst) != !!is_store) 448 return RESUME_GUEST; 449 450 /* 451 * Emulated accesses are emulated by looking at the hash for 452 * translation once, then performing the access later.
The 453 * translation could be invalidated in the meantime, at which 454 * point performing the subsequent memory access on the old 455 * physical address could possibly be a security hole for the 456 * guest (but not the host). 457 * 458 * This is less of an issue for MMIO stores since they aren't 459 * globally visible. It could be an issue for MMIO loads to 460 * a certain extent but we'll ignore it for now. 461 */ 462 463 vcpu->arch.paddr_accessed = gpa; 464 vcpu->arch.vaddr_accessed = ea; 465 return kvmppc_emulate_mmio(run, vcpu); 466 } 467 468 int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, 469 unsigned long ea, unsigned long dsisr) 470 { 471 struct kvm *kvm = vcpu->kvm; 472 unsigned long hpte[3], r; 473 unsigned long hnow_v, hnow_r; 474 __be64 *hptep; 475 unsigned long mmu_seq, psize, pte_size; 476 unsigned long gpa_base, gfn_base; 477 unsigned long gpa, gfn, hva, pfn; 478 struct kvm_memory_slot *memslot; 479 unsigned long *rmap; 480 struct revmap_entry *rev; 481 struct page *page, *pages[1]; 482 long index, ret, npages; 483 bool is_ci; 484 unsigned int writing, write_ok; 485 struct vm_area_struct *vma; 486 unsigned long rcbits; 487 long mmio_update; 488 489 if (kvm_is_radix(kvm)) 490 return kvmppc_book3s_radix_page_fault(run, vcpu, ea, dsisr); 491 492 /* 493 * Real-mode code has already searched the HPT and found the 494 * entry we're interested in. Lock the entry and check that 495 * it hasn't changed. If it has, just return and re-execute the 496 * instruction. 497 */ 498 if (ea != vcpu->arch.pgfault_addr) 499 return RESUME_GUEST; 500 501 if (vcpu->arch.pgfault_cache) { 502 mmio_update = atomic64_read(&kvm->arch.mmio_update); 503 if (mmio_update == vcpu->arch.pgfault_cache->mmio_update) { 504 r = vcpu->arch.pgfault_cache->rpte; 505 psize = hpte_page_size(vcpu->arch.pgfault_hpte[0], r); 506 gpa_base = r & HPTE_R_RPN & ~(psize - 1); 507 gfn_base = gpa_base >> PAGE_SHIFT; 508 gpa = gpa_base | (ea & (psize - 1)); 509 return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, 510 dsisr & DSISR_ISSTORE); 511 } 512 } 513 index = vcpu->arch.pgfault_index; 514 hptep = (__be64 *)(kvm->arch.hpt.virt + (index << 4)); 515 rev = &kvm->arch.hpt.rev[index]; 516 preempt_disable(); 517 while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) 518 cpu_relax(); 519 hpte[0] = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK; 520 hpte[1] = be64_to_cpu(hptep[1]); 521 hpte[2] = r = rev->guest_rpte; 522 unlock_hpte(hptep, hpte[0]); 523 preempt_enable(); 524 525 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 526 hpte[0] = hpte_new_to_old_v(hpte[0], hpte[1]); 527 hpte[1] = hpte_new_to_old_r(hpte[1]); 528 } 529 if (hpte[0] != vcpu->arch.pgfault_hpte[0] || 530 hpte[1] != vcpu->arch.pgfault_hpte[1]) 531 return RESUME_GUEST; 532 533 /* Translate the logical address and get the page */ 534 psize = hpte_page_size(hpte[0], r); 535 gpa_base = r & HPTE_R_RPN & ~(psize - 1); 536 gfn_base = gpa_base >> PAGE_SHIFT; 537 gpa = gpa_base | (ea & (psize - 1)); 538 gfn = gpa >> PAGE_SHIFT; 539 memslot = gfn_to_memslot(kvm, gfn); 540 541 trace_kvm_page_fault_enter(vcpu, hpte, memslot, ea, dsisr); 542 543 /* No memslot means it's an emulated MMIO region */ 544 if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) 545 return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, 546 dsisr & DSISR_ISSTORE); 547 548 /* 549 * This should never happen, because of the slot_is_aligned() 550 * check in kvmppc_do_h_enter().
551 */ 552 if (gfn_base < memslot->base_gfn) 553 return -EFAULT; 554 555 /* used to check for invalidations in progress */ 556 mmu_seq = kvm->mmu_notifier_seq; 557 smp_rmb(); 558 559 ret = -EFAULT; 560 is_ci = false; 561 pfn = 0; 562 page = NULL; 563 pte_size = PAGE_SIZE; 564 writing = (dsisr & DSISR_ISSTORE) != 0; 565 /* If writing != 0, then the HPTE must allow writing, if we get here */ 566 write_ok = writing; 567 hva = gfn_to_hva_memslot(memslot, gfn); 568 npages = get_user_pages_fast(hva, 1, writing, pages); 569 if (npages < 1) { 570 /* Check if it's an I/O mapping */ 571 down_read(&current->mm->mmap_sem); 572 vma = find_vma(current->mm, hva); 573 if (vma && vma->vm_start <= hva && hva + psize <= vma->vm_end && 574 (vma->vm_flags & VM_PFNMAP)) { 575 pfn = vma->vm_pgoff + 576 ((hva - vma->vm_start) >> PAGE_SHIFT); 577 pte_size = psize; 578 is_ci = pte_ci(__pte((pgprot_val(vma->vm_page_prot)))); 579 write_ok = vma->vm_flags & VM_WRITE; 580 } 581 up_read(&current->mm->mmap_sem); 582 if (!pfn) 583 goto out_put; 584 } else { 585 page = pages[0]; 586 pfn = page_to_pfn(page); 587 if (PageHuge(page)) { 588 page = compound_head(page); 589 pte_size <<= compound_order(page); 590 } 591 /* if the guest wants write access, see if that is OK */ 592 if (!writing && hpte_is_writable(r)) { 593 pte_t *ptep, pte; 594 unsigned long flags; 595 /* 596 * We need to protect against page table destruction, 597 * hugepage split and collapse. 598 */ 599 local_irq_save(flags); 600 ptep = find_linux_pte_or_hugepte(current->mm->pgd, 601 hva, NULL, NULL); 602 if (ptep) { 603 pte = kvmppc_read_update_linux_pte(ptep, 1); 604 if (pte_write(pte)) 605 write_ok = 1; 606 } 607 local_irq_restore(flags); 608 } 609 } 610 611 if (psize > pte_size) 612 goto out_put; 613 614 /* Check WIMG vs. the actual page we're accessing */ 615 if (!hpte_cache_flags_ok(r, is_ci)) { 616 if (is_ci) 617 goto out_put; 618 /* 619 * Allow guest to map emulated device memory as 620 * uncacheable, but actually make it cacheable. 621 */ 622 r = (r & ~(HPTE_R_W|HPTE_R_I|HPTE_R_G)) | HPTE_R_M; 623 } 624 625 /* 626 * Set the HPTE to point to pfn. 627 * Since the pfn is at PAGE_SIZE granularity, make sure we 628 * don't mask out lower-order bits if psize < PAGE_SIZE.
629 */ 630 if (psize < PAGE_SIZE) 631 psize = PAGE_SIZE; 632 r = (r & HPTE_R_KEY_HI) | (r & ~(HPTE_R_PP0 - psize)) | 633 ((pfn << PAGE_SHIFT) & ~(psize - 1)); 634 if (hpte_is_writable(r) && !write_ok) 635 r = hpte_make_readonly(r); 636 ret = RESUME_GUEST; 637 preempt_disable(); 638 while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) 639 cpu_relax(); 640 hnow_v = be64_to_cpu(hptep[0]); 641 hnow_r = be64_to_cpu(hptep[1]); 642 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 643 hnow_v = hpte_new_to_old_v(hnow_v, hnow_r); 644 hnow_r = hpte_new_to_old_r(hnow_r); 645 } 646 if ((hnow_v & ~HPTE_V_HVLOCK) != hpte[0] || hnow_r != hpte[1] || 647 rev->guest_rpte != hpte[2]) 648 /* HPTE has been changed under us; let the guest retry */ 649 goto out_unlock; 650 hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID; 651 652 /* Always put the HPTE in the rmap chain for the page base address */ 653 rmap = &memslot->arch.rmap[gfn_base - memslot->base_gfn]; 654 lock_rmap(rmap); 655 656 /* Check if we might have been invalidated; let the guest retry if so */ 657 ret = RESUME_GUEST; 658 if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) { 659 unlock_rmap(rmap); 660 goto out_unlock; 661 } 662 663 /* Only set R/C in real HPTE if set in both *rmap and guest_rpte */ 664 rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT; 665 r &= rcbits | ~(HPTE_R_R | HPTE_R_C); 666 667 if (be64_to_cpu(hptep[0]) & HPTE_V_VALID) { 668 /* HPTE was previously valid, so we need to invalidate it */ 669 unlock_rmap(rmap); 670 hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); 671 kvmppc_invalidate_hpte(kvm, hptep, index); 672 /* don't lose previous R and C bits */ 673 r |= be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C); 674 } else { 675 kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0); 676 } 677 678 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 679 r = hpte_old_to_new_r(hpte[0], r); 680 hpte[0] = hpte_old_to_new_v(hpte[0]); 681 } 682 hptep[1] = cpu_to_be64(r); 683 eieio(); 684 __unlock_hpte(hptep, hpte[0]); 685 asm volatile("ptesync" : : : "memory"); 686 preempt_enable(); 687 if (page && hpte_is_writable(r)) 688 SetPageDirty(page); 689 690 out_put: 691 trace_kvm_page_fault_exit(vcpu, hpte, ret); 692 693 if (page) { 694 /* 695 * We drop pages[0] here, not page because page might 696 * have been set to the head page of a compound, but 697 * we have to drop the reference on the correct tail 698 * page to match the get inside gup() 699 */ 700 put_page(pages[0]); 701 } 702 return ret; 703 704 out_unlock: 705 __unlock_hpte(hptep, be64_to_cpu(hptep[0])); 706 preempt_enable(); 707 goto out_put; 708 } 709 710 static void kvmppc_rmap_reset(struct kvm *kvm) 711 { 712 struct kvm_memslots *slots; 713 struct kvm_memory_slot *memslot; 714 int srcu_idx; 715 716 srcu_idx = srcu_read_lock(&kvm->srcu); 717 slots = kvm_memslots(kvm); 718 kvm_for_each_memslot(memslot, slots) { 719 /* 720 * This assumes it is acceptable to lose reference and 721 * change bits across a reset. 
722 */ 723 memset(memslot->arch.rmap, 0, 724 memslot->npages * sizeof(*memslot->arch.rmap)); 725 } 726 srcu_read_unlock(&kvm->srcu, srcu_idx); 727 } 728 729 typedef int (*hva_handler_fn)(struct kvm *kvm, struct kvm_memory_slot *memslot, 730 unsigned long gfn); 731 732 static int kvm_handle_hva_range(struct kvm *kvm, 733 unsigned long start, 734 unsigned long end, 735 hva_handler_fn handler) 736 { 737 int ret; 738 int retval = 0; 739 struct kvm_memslots *slots; 740 struct kvm_memory_slot *memslot; 741 742 slots = kvm_memslots(kvm); 743 kvm_for_each_memslot(memslot, slots) { 744 unsigned long hva_start, hva_end; 745 gfn_t gfn, gfn_end; 746 747 hva_start = max(start, memslot->userspace_addr); 748 hva_end = min(end, memslot->userspace_addr + 749 (memslot->npages << PAGE_SHIFT)); 750 if (hva_start >= hva_end) 751 continue; 752 /* 753 * {gfn(page) | page intersects with [hva_start, hva_end)} = 754 * {gfn, gfn+1, ..., gfn_end-1}. 755 */ 756 gfn = hva_to_gfn_memslot(hva_start, memslot); 757 gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); 758 759 for (; gfn < gfn_end; ++gfn) { 760 ret = handler(kvm, memslot, gfn); 761 retval |= ret; 762 } 763 } 764 765 return retval; 766 } 767 768 static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, 769 hva_handler_fn handler) 770 { 771 return kvm_handle_hva_range(kvm, hva, hva + 1, handler); 772 } 773 774 /* Must be called with both HPTE and rmap locked */ 775 static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i, 776 unsigned long *rmapp, unsigned long gfn) 777 { 778 __be64 *hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4)); 779 struct revmap_entry *rev = kvm->arch.hpt.rev; 780 unsigned long j, h; 781 unsigned long ptel, psize, rcbits; 782 783 j = rev[i].forw; 784 if (j == i) { 785 /* chain is now empty */ 786 *rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX); 787 } else { 788 /* remove i from chain */ 789 h = rev[i].back; 790 rev[h].forw = j; 791 rev[j].back = h; 792 rev[i].forw = rev[i].back = i; 793 *rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j; 794 } 795 796 /* Now check and modify the HPTE */ 797 ptel = rev[i].guest_rpte; 798 psize = hpte_page_size(be64_to_cpu(hptep[0]), ptel); 799 if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) && 800 hpte_rpn(ptel, psize) == gfn) { 801 hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); 802 kvmppc_invalidate_hpte(kvm, hptep, i); 803 hptep[1] &= ~cpu_to_be64(HPTE_R_KEY_HI | HPTE_R_KEY_LO); 804 /* Harvest R and C */ 805 rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C); 806 *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT; 807 if (rcbits & HPTE_R_C) 808 kvmppc_update_rmap_change(rmapp, psize); 809 if (rcbits & ~rev[i].guest_rpte) { 810 rev[i].guest_rpte = ptel | rcbits; 811 note_hpte_modification(kvm, &rev[i]); 812 } 813 } 814 } 815 816 static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, 817 unsigned long gfn) 818 { 819 unsigned long i; 820 __be64 *hptep; 821 unsigned long *rmapp; 822 823 rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; 824 for (;;) { 825 lock_rmap(rmapp); 826 if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { 827 unlock_rmap(rmapp); 828 break; 829 } 830 831 /* 832 * To avoid an ABBA deadlock with the HPTE lock bit, 833 * we can't spin on the HPTE lock while holding the 834 * rmap chain lock. 
835 */ 836 i = *rmapp & KVMPPC_RMAP_INDEX; 837 hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4)); 838 if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { 839 /* unlock rmap before spinning on the HPTE lock */ 840 unlock_rmap(rmapp); 841 while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK) 842 cpu_relax(); 843 continue; 844 } 845 846 kvmppc_unmap_hpte(kvm, i, rmapp, gfn); 847 unlock_rmap(rmapp); 848 __unlock_hpte(hptep, be64_to_cpu(hptep[0])); 849 } 850 return 0; 851 } 852 853 int kvm_unmap_hva_hv(struct kvm *kvm, unsigned long hva) 854 { 855 hva_handler_fn handler; 856 857 handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp; 858 kvm_handle_hva(kvm, hva, handler); 859 return 0; 860 } 861 862 int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end) 863 { 864 hva_handler_fn handler; 865 866 handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp; 867 kvm_handle_hva_range(kvm, start, end, handler); 868 return 0; 869 } 870 871 void kvmppc_core_flush_memslot_hv(struct kvm *kvm, 872 struct kvm_memory_slot *memslot) 873 { 874 unsigned long gfn; 875 unsigned long n; 876 unsigned long *rmapp; 877 878 gfn = memslot->base_gfn; 879 rmapp = memslot->arch.rmap; 880 for (n = memslot->npages; n; --n, ++gfn) { 881 if (kvm_is_radix(kvm)) { 882 kvm_unmap_radix(kvm, memslot, gfn); 883 continue; 884 } 885 /* 886 * Testing the present bit without locking is OK because 887 * the memslot has been marked invalid already, and hence 888 * no new HPTEs referencing this page can be created, 889 * thus the present bit can't go from 0 to 1. 890 */ 891 if (*rmapp & KVMPPC_RMAP_PRESENT) 892 kvm_unmap_rmapp(kvm, memslot, gfn); 893 ++rmapp; 894 } 895 } 896 897 static int kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, 898 unsigned long gfn) 899 { 900 struct revmap_entry *rev = kvm->arch.hpt.rev; 901 unsigned long head, i, j; 902 __be64 *hptep; 903 int ret = 0; 904 unsigned long *rmapp; 905 906 rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; 907 retry: 908 lock_rmap(rmapp); 909 if (*rmapp & KVMPPC_RMAP_REFERENCED) { 910 *rmapp &= ~KVMPPC_RMAP_REFERENCED; 911 ret = 1; 912 } 913 if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { 914 unlock_rmap(rmapp); 915 return ret; 916 } 917 918 i = head = *rmapp & KVMPPC_RMAP_INDEX; 919 do { 920 hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4)); 921 j = rev[i].forw; 922 923 /* If this HPTE isn't referenced, ignore it */ 924 if (!(be64_to_cpu(hptep[1]) & HPTE_R_R)) 925 continue; 926 927 if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { 928 /* unlock rmap before spinning on the HPTE lock */ 929 unlock_rmap(rmapp); 930 while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK) 931 cpu_relax(); 932 goto retry; 933 } 934 935 /* Now check and modify the HPTE */ 936 if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) && 937 (be64_to_cpu(hptep[1]) & HPTE_R_R)) { 938 kvmppc_clear_ref_hpte(kvm, hptep, i); 939 if (!(rev[i].guest_rpte & HPTE_R_R)) { 940 rev[i].guest_rpte |= HPTE_R_R; 941 note_hpte_modification(kvm, &rev[i]); 942 } 943 ret = 1; 944 } 945 __unlock_hpte(hptep, be64_to_cpu(hptep[0])); 946 } while ((i = j) != head); 947 948 unlock_rmap(rmapp); 949 return ret; 950 } 951 952 int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, unsigned long end) 953 { 954 hva_handler_fn handler; 955 956 handler = kvm_is_radix(kvm) ? 
kvm_age_radix : kvm_age_rmapp; 957 return kvm_handle_hva_range(kvm, start, end, handler); 958 } 959 960 static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, 961 unsigned long gfn) 962 { 963 struct revmap_entry *rev = kvm->arch.hpt.rev; 964 unsigned long head, i, j; 965 unsigned long *hp; 966 int ret = 1; 967 unsigned long *rmapp; 968 969 rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; 970 if (*rmapp & KVMPPC_RMAP_REFERENCED) 971 return 1; 972 973 lock_rmap(rmapp); 974 if (*rmapp & KVMPPC_RMAP_REFERENCED) 975 goto out; 976 977 if (*rmapp & KVMPPC_RMAP_PRESENT) { 978 i = head = *rmapp & KVMPPC_RMAP_INDEX; 979 do { 980 hp = (unsigned long *)(kvm->arch.hpt.virt + (i << 4)); 981 j = rev[i].forw; 982 if (be64_to_cpu(hp[1]) & HPTE_R_R) 983 goto out; 984 } while ((i = j) != head); 985 } 986 ret = 0; 987 988 out: 989 unlock_rmap(rmapp); 990 return ret; 991 } 992 993 int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva) 994 { 995 hva_handler_fn handler; 996 997 handler = kvm_is_radix(kvm) ? kvm_test_age_radix : kvm_test_age_rmapp; 998 return kvm_handle_hva(kvm, hva, handler); 999 } 1000 1001 void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte) 1002 { 1003 hva_handler_fn handler; 1004 1005 handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp; 1006 kvm_handle_hva(kvm, hva, handler); 1007 } 1008 1009 static int vcpus_running(struct kvm *kvm) 1010 { 1011 return atomic_read(&kvm->arch.vcpus_running) != 0; 1012 } 1013 1014 /* 1015 * Returns the number of system pages that are dirty. 1016 * This can be more than 1 if we find a huge-page HPTE. 1017 */ 1018 static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp) 1019 { 1020 struct revmap_entry *rev = kvm->arch.hpt.rev; 1021 unsigned long head, i, j; 1022 unsigned long n; 1023 unsigned long v, r; 1024 __be64 *hptep; 1025 int npages_dirty = 0; 1026 1027 retry: 1028 lock_rmap(rmapp); 1029 if (*rmapp & KVMPPC_RMAP_CHANGED) { 1030 long change_order = (*rmapp & KVMPPC_RMAP_CHG_ORDER) 1031 >> KVMPPC_RMAP_CHG_SHIFT; 1032 *rmapp &= ~(KVMPPC_RMAP_CHANGED | KVMPPC_RMAP_CHG_ORDER); 1033 npages_dirty = 1; 1034 if (change_order > PAGE_SHIFT) 1035 npages_dirty = 1ul << (change_order - PAGE_SHIFT); 1036 } 1037 if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { 1038 unlock_rmap(rmapp); 1039 return npages_dirty; 1040 } 1041 1042 i = head = *rmapp & KVMPPC_RMAP_INDEX; 1043 do { 1044 unsigned long hptep1; 1045 hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4)); 1046 j = rev[i].forw; 1047 1048 /* 1049 * Checking the C (changed) bit here is racy since there 1050 * is no guarantee about when the hardware writes it back. 1051 * If the HPTE is not writable then it is stable since the 1052 * page can't be written to, and we would have done a tlbie 1053 * (which forces the hardware to complete any writeback) 1054 * when making the HPTE read-only. 1055 * If vcpus are running then this call is racy anyway 1056 * since the page could get dirtied subsequently, so we 1057 * expect there to be a further call which would pick up 1058 * any delayed C bit writeback. 1059 * Otherwise we need to do the tlbie even if C==0 in 1060 * order to pick up any delayed writeback of C. 
1061 */ 1062 hptep1 = be64_to_cpu(hptep[1]); 1063 if (!(hptep1 & HPTE_R_C) && 1064 (!hpte_is_writable(hptep1) || vcpus_running(kvm))) 1065 continue; 1066 1067 if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { 1068 /* unlock rmap before spinning on the HPTE lock */ 1069 unlock_rmap(rmapp); 1070 while (hptep[0] & cpu_to_be64(HPTE_V_HVLOCK)) 1071 cpu_relax(); 1072 goto retry; 1073 } 1074 1075 /* Now check and modify the HPTE */ 1076 if (!(hptep[0] & cpu_to_be64(HPTE_V_VALID))) { 1077 __unlock_hpte(hptep, be64_to_cpu(hptep[0])); 1078 continue; 1079 } 1080 1081 /* need to make it temporarily absent so C is stable */ 1082 hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); 1083 kvmppc_invalidate_hpte(kvm, hptep, i); 1084 v = be64_to_cpu(hptep[0]); 1085 r = be64_to_cpu(hptep[1]); 1086 if (r & HPTE_R_C) { 1087 hptep[1] = cpu_to_be64(r & ~HPTE_R_C); 1088 if (!(rev[i].guest_rpte & HPTE_R_C)) { 1089 rev[i].guest_rpte |= HPTE_R_C; 1090 note_hpte_modification(kvm, &rev[i]); 1091 } 1092 n = hpte_page_size(v, r); 1093 n = (n + PAGE_SIZE - 1) >> PAGE_SHIFT; 1094 if (n > npages_dirty) 1095 npages_dirty = n; 1096 eieio(); 1097 } 1098 v &= ~HPTE_V_ABSENT; 1099 v |= HPTE_V_VALID; 1100 __unlock_hpte(hptep, v); 1101 } while ((i = j) != head); 1102 1103 unlock_rmap(rmapp); 1104 return npages_dirty; 1105 } 1106 1107 void kvmppc_harvest_vpa_dirty(struct kvmppc_vpa *vpa, 1108 struct kvm_memory_slot *memslot, 1109 unsigned long *map) 1110 { 1111 unsigned long gfn; 1112 1113 if (!vpa->dirty || !vpa->pinned_addr) 1114 return; 1115 gfn = vpa->gpa >> PAGE_SHIFT; 1116 if (gfn < memslot->base_gfn || 1117 gfn >= memslot->base_gfn + memslot->npages) 1118 return; 1119 1120 vpa->dirty = false; 1121 if (map) 1122 __set_bit_le(gfn - memslot->base_gfn, map); 1123 } 1124 1125 long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm, 1126 struct kvm_memory_slot *memslot, unsigned long *map) 1127 { 1128 unsigned long i, j; 1129 unsigned long *rmapp; 1130 1131 preempt_disable(); 1132 rmapp = memslot->arch.rmap; 1133 for (i = 0; i < memslot->npages; ++i) { 1134 int npages = kvm_test_clear_dirty_npages(kvm, rmapp); 1135 /* 1136 * Note that if npages > 0 then i must be a multiple of npages, 1137 * since we always put huge-page HPTEs in the rmap chain 1138 * corresponding to their page base address. 
1139 */ 1140 if (npages && map) 1141 for (j = i; npages; ++j, --npages) 1142 __set_bit_le(j, map); 1143 ++rmapp; 1144 } 1145 preempt_enable(); 1146 return 0; 1147 } 1148 1149 void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa, 1150 unsigned long *nb_ret) 1151 { 1152 struct kvm_memory_slot *memslot; 1153 unsigned long gfn = gpa >> PAGE_SHIFT; 1154 struct page *page, *pages[1]; 1155 int npages; 1156 unsigned long hva, offset; 1157 int srcu_idx; 1158 1159 srcu_idx = srcu_read_lock(&kvm->srcu); 1160 memslot = gfn_to_memslot(kvm, gfn); 1161 if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) 1162 goto err; 1163 hva = gfn_to_hva_memslot(memslot, gfn); 1164 npages = get_user_pages_fast(hva, 1, 1, pages); 1165 if (npages < 1) 1166 goto err; 1167 page = pages[0]; 1168 srcu_read_unlock(&kvm->srcu, srcu_idx); 1169 1170 offset = gpa & (PAGE_SIZE - 1); 1171 if (nb_ret) 1172 *nb_ret = PAGE_SIZE - offset; 1173 return page_address(page) + offset; 1174 1175 err: 1176 srcu_read_unlock(&kvm->srcu, srcu_idx); 1177 return NULL; 1178 } 1179 1180 void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa, 1181 bool dirty) 1182 { 1183 struct page *page = virt_to_page(va); 1184 struct kvm_memory_slot *memslot; 1185 unsigned long gfn; 1186 unsigned long *rmap; 1187 int srcu_idx; 1188 1189 put_page(page); 1190 1191 if (!dirty) 1192 return; 1193 1194 /* We need to mark this page dirty in the rmap chain */ 1195 gfn = gpa >> PAGE_SHIFT; 1196 srcu_idx = srcu_read_lock(&kvm->srcu); 1197 memslot = gfn_to_memslot(kvm, gfn); 1198 if (memslot) { 1199 if (!kvm_is_radix(kvm)) { 1200 rmap = &memslot->arch.rmap[gfn - memslot->base_gfn]; 1201 lock_rmap(rmap); 1202 *rmap |= KVMPPC_RMAP_CHANGED; 1203 unlock_rmap(rmap); 1204 } else if (memslot->dirty_bitmap) { 1205 mark_page_dirty(kvm, gfn); 1206 } 1207 } 1208 srcu_read_unlock(&kvm->srcu, srcu_idx); 1209 } 1210 1211 /* 1212 * HPT resizing 1213 */ 1214 static int resize_hpt_allocate(struct kvm_resize_hpt *resize) 1215 { 1216 int rc; 1217 1218 rc = kvmppc_allocate_hpt(&resize->hpt, resize->order); 1219 if (rc < 0) 1220 return rc; 1221 1222 resize_hpt_debug(resize, "resize_hpt_allocate(): HPT @ 0x%lx\n", 1223 resize->hpt.virt); 1224 1225 return 0; 1226 } 1227 1228 static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize, 1229 unsigned long idx) 1230 { 1231 struct kvm *kvm = resize->kvm; 1232 struct kvm_hpt_info *old = &kvm->arch.hpt; 1233 struct kvm_hpt_info *new = &resize->hpt; 1234 unsigned long old_hash_mask = (1ULL << (old->order - 7)) - 1; 1235 unsigned long new_hash_mask = (1ULL << (new->order - 7)) - 1; 1236 __be64 *hptep, *new_hptep; 1237 unsigned long vpte, rpte, guest_rpte; 1238 int ret; 1239 struct revmap_entry *rev; 1240 unsigned long apsize, psize, avpn, pteg, hash; 1241 unsigned long new_idx, new_pteg, replace_vpte; 1242 1243 hptep = (__be64 *)(old->virt + (idx << 4)); 1244 1245 /* Guest is stopped, so new HPTEs can't be added or faulted 1246 * in, only unmapped or altered by host actions. 
So, it's 1247 * safe to check this before we take the HPTE lock */ 1248 vpte = be64_to_cpu(hptep[0]); 1249 if (!(vpte & HPTE_V_VALID) && !(vpte & HPTE_V_ABSENT)) 1250 return 0; /* nothing to do */ 1251 1252 while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) 1253 cpu_relax(); 1254 1255 vpte = be64_to_cpu(hptep[0]); 1256 1257 ret = 0; 1258 if (!(vpte & HPTE_V_VALID) && !(vpte & HPTE_V_ABSENT)) 1259 /* Nothing to do */ 1260 goto out; 1261 1262 /* Unmap */ 1263 rev = &old->rev[idx]; 1264 guest_rpte = rev->guest_rpte; 1265 1266 ret = -EIO; 1267 apsize = hpte_page_size(vpte, guest_rpte); 1268 if (!apsize) 1269 goto out; 1270 1271 if (vpte & HPTE_V_VALID) { 1272 unsigned long gfn = hpte_rpn(guest_rpte, apsize); 1273 int srcu_idx = srcu_read_lock(&kvm->srcu); 1274 struct kvm_memory_slot *memslot = 1275 __gfn_to_memslot(kvm_memslots(kvm), gfn); 1276 1277 if (memslot) { 1278 unsigned long *rmapp; 1279 rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; 1280 1281 lock_rmap(rmapp); 1282 kvmppc_unmap_hpte(kvm, idx, rmapp, gfn); 1283 unlock_rmap(rmapp); 1284 } 1285 1286 srcu_read_unlock(&kvm->srcu, srcu_idx); 1287 } 1288 1289 /* Reload PTE after unmap */ 1290 vpte = be64_to_cpu(hptep[0]); 1291 1292 BUG_ON(vpte & HPTE_V_VALID); 1293 BUG_ON(!(vpte & HPTE_V_ABSENT)); 1294 1295 ret = 0; 1296 if (!(vpte & HPTE_V_BOLTED)) 1297 goto out; 1298 1299 rpte = be64_to_cpu(hptep[1]); 1300 psize = hpte_base_page_size(vpte, rpte); 1301 avpn = HPTE_V_AVPN_VAL(vpte) & ~((psize - 1) >> 23); 1302 pteg = idx / HPTES_PER_GROUP; 1303 if (vpte & HPTE_V_SECONDARY) 1304 pteg = ~pteg; 1305 1306 if (!(vpte & HPTE_V_1TB_SEG)) { 1307 unsigned long offset, vsid; 1308 1309 /* We only have 28 - 23 bits of offset in avpn */ 1310 offset = (avpn & 0x1f) << 23; 1311 vsid = avpn >> 5; 1312 /* We can find more bits from the pteg value */ 1313 if (psize < (1ULL << 23)) 1314 offset |= ((vsid ^ pteg) & old_hash_mask) * psize; 1315 1316 hash = vsid ^ (offset / psize); 1317 } else { 1318 unsigned long offset, vsid; 1319 1320 /* We only have 40 - 23 bits of seg_off in avpn */ 1321 offset = (avpn & 0x1ffff) << 23; 1322 vsid = avpn >> 17; 1323 if (psize < (1ULL << 23)) 1324 offset |= ((vsid ^ (vsid << 25) ^ pteg) & old_hash_mask) * psize; 1325 1326 hash = vsid ^ (vsid << 25) ^ (offset / psize); 1327 } 1328 1329 new_pteg = hash & new_hash_mask; 1330 if (vpte & HPTE_V_SECONDARY) { 1331 BUG_ON(~pteg != (hash & old_hash_mask)); 1332 new_pteg = ~new_pteg; 1333 } else { 1334 BUG_ON(pteg != (hash & old_hash_mask)); 1335 } 1336 1337 new_idx = new_pteg * HPTES_PER_GROUP + (idx % HPTES_PER_GROUP); 1338 new_hptep = (__be64 *)(new->virt + (new_idx << 4)); 1339 1340 replace_vpte = be64_to_cpu(new_hptep[0]); 1341 1342 if (replace_vpte & (HPTE_V_VALID | HPTE_V_ABSENT)) { 1343 BUG_ON(new->order >= old->order); 1344 1345 if (replace_vpte & HPTE_V_BOLTED) { 1346 if (vpte & HPTE_V_BOLTED) 1347 /* Bolted collision, nothing we can do */ 1348 ret = -ENOSPC; 1349 /* Discard the new HPTE */ 1350 goto out; 1351 } 1352 1353 /* Discard the previous HPTE */ 1354 } 1355 1356 new_hptep[1] = cpu_to_be64(rpte); 1357 new->rev[new_idx].guest_rpte = guest_rpte; 1358 /* No need for a barrier, since new HPT isn't active */ 1359 new_hptep[0] = cpu_to_be64(vpte); 1360 unlock_hpte(new_hptep, vpte); 1361 1362 out: 1363 unlock_hpte(hptep, vpte); 1364 return ret; 1365 } 1366 1367 static int resize_hpt_rehash(struct kvm_resize_hpt *resize) 1368 { 1369 struct kvm *kvm = resize->kvm; 1370 unsigned long i; 1371 int rc; 1372 1373 /* 1374 * resize_hpt_rehash_hpte() doesn't handle the new-format HPTEs 
1375 * that POWER9 uses, and could well hit a BUG_ON on POWER9. 1376 */ 1377 if (cpu_has_feature(CPU_FTR_ARCH_300)) 1378 return -EIO; 1379 for (i = 0; i < kvmppc_hpt_npte(&kvm->arch.hpt); i++) { 1380 rc = resize_hpt_rehash_hpte(resize, i); 1381 if (rc != 0) 1382 return rc; 1383 } 1384 1385 return 0; 1386 } 1387 1388 static void resize_hpt_pivot(struct kvm_resize_hpt *resize) 1389 { 1390 struct kvm *kvm = resize->kvm; 1391 struct kvm_hpt_info hpt_tmp; 1392 1393 /* Exchange the pending tables in the resize structure with 1394 * the active tables */ 1395 1396 resize_hpt_debug(resize, "resize_hpt_pivot()\n"); 1397 1398 spin_lock(&kvm->mmu_lock); 1399 asm volatile("ptesync" : : : "memory"); 1400 1401 hpt_tmp = kvm->arch.hpt; 1402 kvmppc_set_hpt(kvm, &resize->hpt); 1403 resize->hpt = hpt_tmp; 1404 1405 spin_unlock(&kvm->mmu_lock); 1406 1407 synchronize_srcu_expedited(&kvm->srcu); 1408 1409 resize_hpt_debug(resize, "resize_hpt_pivot() done\n"); 1410 } 1411 1412 static void resize_hpt_release(struct kvm *kvm, struct kvm_resize_hpt *resize) 1413 { 1414 BUG_ON(kvm->arch.resize_hpt != resize); 1415 1416 if (!resize) 1417 return; 1418 1419 if (resize->hpt.virt) 1420 kvmppc_free_hpt(&resize->hpt); 1421 1422 kvm->arch.resize_hpt = NULL; 1423 kfree(resize); 1424 } 1425 1426 static void resize_hpt_prepare_work(struct work_struct *work) 1427 { 1428 struct kvm_resize_hpt *resize = container_of(work, 1429 struct kvm_resize_hpt, 1430 work); 1431 struct kvm *kvm = resize->kvm; 1432 int err; 1433 1434 resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n", 1435 resize->order); 1436 1437 err = resize_hpt_allocate(resize); 1438 1439 mutex_lock(&kvm->lock); 1440 1441 resize->error = err; 1442 resize->prepare_done = true; 1443 1444 mutex_unlock(&kvm->lock); 1445 } 1446 1447 long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm, 1448 struct kvm_ppc_resize_hpt *rhpt) 1449 { 1450 unsigned long flags = rhpt->flags; 1451 unsigned long shift = rhpt->shift; 1452 struct kvm_resize_hpt *resize; 1453 int ret; 1454 1455 if (flags != 0) 1456 return -EINVAL; 1457 1458 if (shift && ((shift < 18) || (shift > 46))) 1459 return -EINVAL; 1460 1461 mutex_lock(&kvm->lock); 1462 1463 resize = kvm->arch.resize_hpt; 1464 1465 if (resize) { 1466 if (resize->order == shift) { 1467 /* Suitable resize in progress */ 1468 if (resize->prepare_done) { 1469 ret = resize->error; 1470 if (ret != 0) 1471 resize_hpt_release(kvm, resize); 1472 } else { 1473 ret = 100; /* estimated time in ms */ 1474 } 1475 1476 goto out; 1477 } 1478 1479 /* not suitable, cancel it */ 1480 resize_hpt_release(kvm, resize); 1481 } 1482 1483 ret = 0; 1484 if (!shift) 1485 goto out; /* nothing to do */ 1486 1487 /* start new resize */ 1488 1489 resize = kzalloc(sizeof(*resize), GFP_KERNEL); 1490 resize->order = shift; 1491 resize->kvm = kvm; 1492 INIT_WORK(&resize->work, resize_hpt_prepare_work); 1493 kvm->arch.resize_hpt = resize; 1494 1495 schedule_work(&resize->work); 1496 1497 ret = 100; /* estimated time in ms */ 1498 1499 out: 1500 mutex_unlock(&kvm->lock); 1501 return ret; 1502 } 1503 1504 static void resize_hpt_boot_vcpu(void *opaque) 1505 { 1506 /* Nothing to do, just force a KVM exit */ 1507 } 1508 1509 long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm, 1510 struct kvm_ppc_resize_hpt *rhpt) 1511 { 1512 unsigned long flags = rhpt->flags; 1513 unsigned long shift = rhpt->shift; 1514 struct kvm_resize_hpt *resize; 1515 long ret; 1516 1517 if (flags != 0) 1518 return -EINVAL; 1519 1520 if (shift && ((shift < 18) || (shift > 46))) 1521 return -EINVAL; 
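/* kvm->lock serialises this commit against kvm_vm_ioctl_resize_hpt_prepare(), resize_hpt_prepare_work(), and the other paths that update hpte_setup_done (see kvmppc_alloc_reset_hpt() and kvm_htab_write()). */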
1522 1523 mutex_lock(&kvm->lock); 1524 1525 resize = kvm->arch.resize_hpt; 1526 1527 /* This shouldn't be possible */ 1528 ret = -EIO; 1529 if (WARN_ON(!kvm->arch.hpte_setup_done)) 1530 goto out_no_hpt; 1531 1532 /* Stop VCPUs from running while we mess with the HPT */ 1533 kvm->arch.hpte_setup_done = 0; 1534 smp_mb(); 1535 1536 /* Boot all CPUs out of the guest so they re-read 1537 * hpte_setup_done */ 1538 on_each_cpu(resize_hpt_boot_vcpu, NULL, 1); 1539 1540 ret = -ENXIO; 1541 if (!resize || (resize->order != shift)) 1542 goto out; 1543 1544 ret = -EBUSY; 1545 if (!resize->prepare_done) 1546 goto out; 1547 1548 ret = resize->error; 1549 if (ret != 0) 1550 goto out; 1551 1552 ret = resize_hpt_rehash(resize); 1553 if (ret != 0) 1554 goto out; 1555 1556 resize_hpt_pivot(resize); 1557 1558 out: 1559 /* Let VCPUs run again */ 1560 kvm->arch.hpte_setup_done = 1; 1561 smp_mb(); 1562 out_no_hpt: 1563 resize_hpt_release(kvm, resize); 1564 mutex_unlock(&kvm->lock); 1565 return ret; 1566 } 1567 1568 /* 1569 * Functions for reading and writing the hash table via reads and 1570 * writes on a file descriptor. 1571 * 1572 * Reads return the guest view of the hash table, which has to be 1573 * pieced together from the real hash table and the guest_rpte 1574 * values in the revmap array. 1575 * 1576 * On writes, each HPTE written is considered in turn, and if it 1577 * is valid, it is written to the HPT as if an H_ENTER with the 1578 * exact flag set was done. When the invalid count is non-zero 1579 * in the header written to the stream, the kernel will make 1580 * sure that that many HPTEs are invalid, and invalidate them 1581 * if not. 1582 */ 1583 1584 struct kvm_htab_ctx { 1585 unsigned long index; 1586 unsigned long flags; 1587 struct kvm *kvm; 1588 int first_pass; 1589 }; 1590 1591 #define HPTE_SIZE (2 * sizeof(unsigned long)) 1592 1593 /* 1594 * Returns 1 if this HPT entry has been modified or has pending 1595 * R/C bit changes. 
1596 */ 1597 static int hpte_dirty(struct revmap_entry *revp, __be64 *hptp) 1598 { 1599 unsigned long rcbits_unset; 1600 1601 if (revp->guest_rpte & HPTE_GR_MODIFIED) 1602 return 1; 1603 1604 /* Also need to consider changes in reference and changed bits */ 1605 rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C); 1606 if ((be64_to_cpu(hptp[0]) & HPTE_V_VALID) && 1607 (be64_to_cpu(hptp[1]) & rcbits_unset)) 1608 return 1; 1609 1610 return 0; 1611 } 1612 1613 static long record_hpte(unsigned long flags, __be64 *hptp, 1614 unsigned long *hpte, struct revmap_entry *revp, 1615 int want_valid, int first_pass) 1616 { 1617 unsigned long v, r, hr; 1618 unsigned long rcbits_unset; 1619 int ok = 1; 1620 int valid, dirty; 1621 1622 /* Unmodified entries are uninteresting except on the first pass */ 1623 dirty = hpte_dirty(revp, hptp); 1624 if (!first_pass && !dirty) 1625 return 0; 1626 1627 valid = 0; 1628 if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) { 1629 valid = 1; 1630 if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && 1631 !(be64_to_cpu(hptp[0]) & HPTE_V_BOLTED)) 1632 valid = 0; 1633 } 1634 if (valid != want_valid) 1635 return 0; 1636 1637 v = r = 0; 1638 if (valid || dirty) { 1639 /* lock the HPTE so it's stable and read it */ 1640 preempt_disable(); 1641 while (!try_lock_hpte(hptp, HPTE_V_HVLOCK)) 1642 cpu_relax(); 1643 v = be64_to_cpu(hptp[0]); 1644 hr = be64_to_cpu(hptp[1]); 1645 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 1646 v = hpte_new_to_old_v(v, hr); 1647 hr = hpte_new_to_old_r(hr); 1648 } 1649 1650 /* re-evaluate valid and dirty from synchronized HPTE value */ 1651 valid = !!(v & HPTE_V_VALID); 1652 dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED); 1653 1654 /* Harvest R and C into guest view if necessary */ 1655 rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C); 1656 if (valid && (rcbits_unset & hr)) { 1657 revp->guest_rpte |= (hr & 1658 (HPTE_R_R | HPTE_R_C)) | HPTE_GR_MODIFIED; 1659 dirty = 1; 1660 } 1661 1662 if (v & HPTE_V_ABSENT) { 1663 v &= ~HPTE_V_ABSENT; 1664 v |= HPTE_V_VALID; 1665 valid = 1; 1666 } 1667 if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && !(v & HPTE_V_BOLTED)) 1668 valid = 0; 1669 1670 r = revp->guest_rpte; 1671 /* only clear modified if this is the right sort of entry */ 1672 if (valid == want_valid && dirty) { 1673 r &= ~HPTE_GR_MODIFIED; 1674 revp->guest_rpte = r; 1675 } 1676 unlock_hpte(hptp, be64_to_cpu(hptp[0])); 1677 preempt_enable(); 1678 if (!(valid == want_valid && (first_pass || dirty))) 1679 ok = 0; 1680 } 1681 hpte[0] = cpu_to_be64(v); 1682 hpte[1] = cpu_to_be64(r); 1683 return ok; 1684 } 1685 1686 static ssize_t kvm_htab_read(struct file *file, char __user *buf, 1687 size_t count, loff_t *ppos) 1688 { 1689 struct kvm_htab_ctx *ctx = file->private_data; 1690 struct kvm *kvm = ctx->kvm; 1691 struct kvm_get_htab_header hdr; 1692 __be64 *hptp; 1693 struct revmap_entry *revp; 1694 unsigned long i, nb, nw; 1695 unsigned long __user *lbuf; 1696 struct kvm_get_htab_header __user *hptr; 1697 unsigned long flags; 1698 int first_pass; 1699 unsigned long hpte[2]; 1700 1701 if (!access_ok(VERIFY_WRITE, buf, count)) 1702 return -EFAULT; 1703 1704 first_pass = ctx->first_pass; 1705 flags = ctx->flags; 1706 1707 i = ctx->index; 1708 hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE)); 1709 revp = kvm->arch.hpt.rev + i; 1710 lbuf = (unsigned long __user *)buf; 1711 1712 nb = 0; 1713 while (nb + sizeof(hdr) + HPTE_SIZE < count) { 1714 /* Initialize header */ 1715 hptr = (struct kvm_get_htab_header __user *)buf; 1716 hdr.n_valid = 0; 1717 hdr.n_invalid = 
0; 1718 nw = nb; 1719 nb += sizeof(hdr); 1720 lbuf = (unsigned long __user *)(buf + sizeof(hdr)); 1721 1722 /* Skip uninteresting entries, i.e. clean on not-first pass */ 1723 if (!first_pass) { 1724 while (i < kvmppc_hpt_npte(&kvm->arch.hpt) && 1725 !hpte_dirty(revp, hptp)) { 1726 ++i; 1727 hptp += 2; 1728 ++revp; 1729 } 1730 } 1731 hdr.index = i; 1732 1733 /* Grab a series of valid entries */ 1734 while (i < kvmppc_hpt_npte(&kvm->arch.hpt) && 1735 hdr.n_valid < 0xffff && 1736 nb + HPTE_SIZE < count && 1737 record_hpte(flags, hptp, hpte, revp, 1, first_pass)) { 1738 /* valid entry, write it out */ 1739 ++hdr.n_valid; 1740 if (__put_user(hpte[0], lbuf) || 1741 __put_user(hpte[1], lbuf + 1)) 1742 return -EFAULT; 1743 nb += HPTE_SIZE; 1744 lbuf += 2; 1745 ++i; 1746 hptp += 2; 1747 ++revp; 1748 } 1749 /* Now skip invalid entries while we can */ 1750 while (i < kvmppc_hpt_npte(&kvm->arch.hpt) && 1751 hdr.n_invalid < 0xffff && 1752 record_hpte(flags, hptp, hpte, revp, 0, first_pass)) { 1753 /* found an invalid entry */ 1754 ++hdr.n_invalid; 1755 ++i; 1756 hptp += 2; 1757 ++revp; 1758 } 1759 1760 if (hdr.n_valid || hdr.n_invalid) { 1761 /* write back the header */ 1762 if (__copy_to_user(hptr, &hdr, sizeof(hdr))) 1763 return -EFAULT; 1764 nw = nb; 1765 buf = (char __user *)lbuf; 1766 } else { 1767 nb = nw; 1768 } 1769 1770 /* Check if we've wrapped around the hash table */ 1771 if (i >= kvmppc_hpt_npte(&kvm->arch.hpt)) { 1772 i = 0; 1773 ctx->first_pass = 0; 1774 break; 1775 } 1776 } 1777 1778 ctx->index = i; 1779 1780 return nb; 1781 } 1782 1783 static ssize_t kvm_htab_write(struct file *file, const char __user *buf, 1784 size_t count, loff_t *ppos) 1785 { 1786 struct kvm_htab_ctx *ctx = file->private_data; 1787 struct kvm *kvm = ctx->kvm; 1788 struct kvm_get_htab_header hdr; 1789 unsigned long i, j; 1790 unsigned long v, r; 1791 unsigned long __user *lbuf; 1792 __be64 *hptp; 1793 unsigned long tmp[2]; 1794 ssize_t nb; 1795 long int err, ret; 1796 int hpte_setup; 1797 1798 if (!access_ok(VERIFY_READ, buf, count)) 1799 return -EFAULT; 1800 1801 /* lock out vcpus from running while we're doing this */ 1802 mutex_lock(&kvm->lock); 1803 hpte_setup = kvm->arch.hpte_setup_done; 1804 if (hpte_setup) { 1805 kvm->arch.hpte_setup_done = 0; /* temporarily */ 1806 /* order hpte_setup_done vs. 
vcpus_running */ 1807 smp_mb(); 1808 if (atomic_read(&kvm->arch.vcpus_running)) { 1809 kvm->arch.hpte_setup_done = 1; 1810 mutex_unlock(&kvm->lock); 1811 return -EBUSY; 1812 } 1813 } 1814 1815 err = 0; 1816 for (nb = 0; nb + sizeof(hdr) <= count; ) { 1817 err = -EFAULT; 1818 if (__copy_from_user(&hdr, buf, sizeof(hdr))) 1819 break; 1820 1821 err = 0; 1822 if (nb + hdr.n_valid * HPTE_SIZE > count) 1823 break; 1824 1825 nb += sizeof(hdr); 1826 buf += sizeof(hdr); 1827 1828 err = -EINVAL; 1829 i = hdr.index; 1830 if (i >= kvmppc_hpt_npte(&kvm->arch.hpt) || 1831 i + hdr.n_valid + hdr.n_invalid > kvmppc_hpt_npte(&kvm->arch.hpt)) 1832 break; 1833 1834 hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE)); 1835 lbuf = (unsigned long __user *)buf; 1836 for (j = 0; j < hdr.n_valid; ++j) { 1837 __be64 hpte_v; 1838 __be64 hpte_r; 1839 1840 err = -EFAULT; 1841 if (__get_user(hpte_v, lbuf) || 1842 __get_user(hpte_r, lbuf + 1)) 1843 goto out; 1844 v = be64_to_cpu(hpte_v); 1845 r = be64_to_cpu(hpte_r); 1846 err = -EINVAL; 1847 if (!(v & HPTE_V_VALID)) 1848 goto out; 1849 lbuf += 2; 1850 nb += HPTE_SIZE; 1851 1852 if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) 1853 kvmppc_do_h_remove(kvm, 0, i, 0, tmp); 1854 err = -EIO; 1855 ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, i, v, r, 1856 tmp); 1857 if (ret != H_SUCCESS) { 1858 pr_err("kvm_htab_write ret %ld i=%ld v=%lx " 1859 "r=%lx\n", ret, i, v, r); 1860 goto out; 1861 } 1862 if (!hpte_setup && is_vrma_hpte(v)) { 1863 unsigned long psize = hpte_base_page_size(v, r); 1864 unsigned long senc = slb_pgsize_encoding(psize); 1865 unsigned long lpcr; 1866 1867 kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T | 1868 (VRMA_VSID << SLB_VSID_SHIFT_1T); 1869 lpcr = senc << (LPCR_VRMASD_SH - 4); 1870 kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD); 1871 hpte_setup = 1; 1872 } 1873 ++i; 1874 hptp += 2; 1875 } 1876 1877 for (j = 0; j < hdr.n_invalid; ++j) { 1878 if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) 1879 kvmppc_do_h_remove(kvm, 0, i, 0, tmp); 1880 ++i; 1881 hptp += 2; 1882 } 1883 err = 0; 1884 } 1885 1886 out: 1887 /* Order HPTE updates vs. hpte_setup_done */ 1888 smp_wmb(); 1889 kvm->arch.hpte_setup_done = hpte_setup; 1890 mutex_unlock(&kvm->lock); 1891 1892 if (err) 1893 return err; 1894 return nb; 1895 } 1896 1897 static int kvm_htab_release(struct inode *inode, struct file *filp) 1898 { 1899 struct kvm_htab_ctx *ctx = filp->private_data; 1900 1901 filp->private_data = NULL; 1902 if (!(ctx->flags & KVM_GET_HTAB_WRITE)) 1903 atomic_dec(&ctx->kvm->arch.hpte_mod_interest); 1904 kvm_put_kvm(ctx->kvm); 1905 kfree(ctx); 1906 return 0; 1907 } 1908 1909 static const struct file_operations kvm_htab_fops = { 1910 .read = kvm_htab_read, 1911 .write = kvm_htab_write, 1912 .llseek = default_llseek, 1913 .release = kvm_htab_release, 1914 }; 1915 1916 int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf) 1917 { 1918 int ret; 1919 struct kvm_htab_ctx *ctx; 1920 int rwflag; 1921 1922 /* reject flags we don't recognize */ 1923 if (ghf->flags & ~(KVM_GET_HTAB_BOLTED_ONLY | KVM_GET_HTAB_WRITE)) 1924 return -EINVAL; 1925 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); 1926 if (!ctx) 1927 return -ENOMEM; 1928 kvm_get_kvm(kvm); 1929 ctx->kvm = kvm; 1930 ctx->index = ghf->start_index; 1931 ctx->flags = ghf->flags; 1932 ctx->first_pass = 1; 1933 1934 rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? 
O_WRONLY : O_RDONLY; 1935 ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag | O_CLOEXEC); 1936 if (ret < 0) { 1937 kvm_put_kvm(kvm); 1938 return ret; 1939 } 1940 1941 if (rwflag == O_RDONLY) { 1942 mutex_lock(&kvm->slots_lock); 1943 atomic_inc(&kvm->arch.hpte_mod_interest); 1944 /* make sure kvmppc_do_h_enter etc. see the increment */ 1945 synchronize_srcu_expedited(&kvm->srcu); 1946 mutex_unlock(&kvm->slots_lock); 1947 } 1948 1949 return ret; 1950 } 1951 1952 struct debugfs_htab_state { 1953 struct kvm *kvm; 1954 struct mutex mutex; 1955 unsigned long hpt_index; 1956 int chars_left; 1957 int buf_index; 1958 char buf[64]; 1959 }; 1960 1961 static int debugfs_htab_open(struct inode *inode, struct file *file) 1962 { 1963 struct kvm *kvm = inode->i_private; 1964 struct debugfs_htab_state *p; 1965 1966 p = kzalloc(sizeof(*p), GFP_KERNEL); 1967 if (!p) 1968 return -ENOMEM; 1969 1970 kvm_get_kvm(kvm); 1971 p->kvm = kvm; 1972 mutex_init(&p->mutex); 1973 file->private_data = p; 1974 1975 return nonseekable_open(inode, file); 1976 } 1977 1978 static int debugfs_htab_release(struct inode *inode, struct file *file) 1979 { 1980 struct debugfs_htab_state *p = file->private_data; 1981 1982 kvm_put_kvm(p->kvm); 1983 kfree(p); 1984 return 0; 1985 } 1986 1987 static ssize_t debugfs_htab_read(struct file *file, char __user *buf, 1988 size_t len, loff_t *ppos) 1989 { 1990 struct debugfs_htab_state *p = file->private_data; 1991 ssize_t ret, r; 1992 unsigned long i, n; 1993 unsigned long v, hr, gr; 1994 struct kvm *kvm; 1995 __be64 *hptp; 1996 1997 ret = mutex_lock_interruptible(&p->mutex); 1998 if (ret) 1999 return ret; 2000 2001 if (p->chars_left) { 2002 n = p->chars_left; 2003 if (n > len) 2004 n = len; 2005 r = copy_to_user(buf, p->buf + p->buf_index, n); 2006 n -= r; 2007 p->chars_left -= n; 2008 p->buf_index += n; 2009 buf += n; 2010 len -= n; 2011 ret = n; 2012 if (r) { 2013 if (!n) 2014 ret = -EFAULT; 2015 goto out; 2016 } 2017 } 2018 2019 kvm = p->kvm; 2020 i = p->hpt_index; 2021 hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE)); 2022 for (; len != 0 && i < kvmppc_hpt_npte(&kvm->arch.hpt); 2023 ++i, hptp += 2) { 2024 if (!(be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT))) 2025 continue; 2026 2027 /* lock the HPTE so it's stable and read it */ 2028 preempt_disable(); 2029 while (!try_lock_hpte(hptp, HPTE_V_HVLOCK)) 2030 cpu_relax(); 2031 v = be64_to_cpu(hptp[0]) & ~HPTE_V_HVLOCK; 2032 hr = be64_to_cpu(hptp[1]); 2033 gr = kvm->arch.hpt.rev[i].guest_rpte; 2034 unlock_hpte(hptp, v); 2035 preempt_enable(); 2036 2037 if (!(v & (HPTE_V_VALID | HPTE_V_ABSENT))) 2038 continue; 2039 2040 n = scnprintf(p->buf, sizeof(p->buf), 2041 "%6lx %.16lx %.16lx %.16lx\n", 2042 i, v, hr, gr); 2043 p->chars_left = n; 2044 if (n > len) 2045 n = len; 2046 r = copy_to_user(buf, p->buf, n); 2047 n -= r; 2048 p->chars_left -= n; 2049 p->buf_index = n; 2050 buf += n; 2051 len -= n; 2052 ret += n; 2053 if (r) { 2054 if (!ret) 2055 ret = -EFAULT; 2056 goto out; 2057 } 2058 } 2059 p->hpt_index = i; 2060 2061 out: 2062 mutex_unlock(&p->mutex); 2063 return ret; 2064 } 2065 2066 static ssize_t debugfs_htab_write(struct file *file, const char __user *buf, 2067 size_t len, loff_t *ppos) 2068 { 2069 return -EACCES; 2070 } 2071 2072 static const struct file_operations debugfs_htab_fops = { 2073 .owner = THIS_MODULE, 2074 .open = debugfs_htab_open, 2075 .release = debugfs_htab_release, 2076 .read = debugfs_htab_read, 2077 .write = debugfs_htab_write, 2078 .llseek = generic_file_llseek, 2079 }; 2080 2081 void 
kvmppc_mmu_debugfs_init(struct kvm *kvm) 2082 { 2083 kvm->arch.htab_dentry = debugfs_create_file("htab", 0400, 2084 kvm->arch.debugfs_dir, kvm, 2085 &debugfs_htab_fops); 2086 } 2087 2088 void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu) 2089 { 2090 struct kvmppc_mmu *mmu = &vcpu->arch.mmu; 2091 2092 vcpu->arch.slb_nr = 32; /* POWER7/POWER8 */ 2093 2094 if (kvm_is_radix(vcpu->kvm)) 2095 mmu->xlate = kvmppc_mmu_radix_xlate; 2096 else 2097 mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate; 2098 mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr; 2099 2100 vcpu->arch.hflags |= BOOK3S_HFLAG_SLB; 2101 } 2102