1 /* 2 * This program is free software; you can redistribute it and/or modify 3 * it under the terms of the GNU General Public License, version 2, as 4 * published by the Free Software Foundation. 5 * 6 * This program is distributed in the hope that it will be useful, 7 * but WITHOUT ANY WARRANTY; without even the implied warranty of 8 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 9 * GNU General Public License for more details. 10 * 11 * You should have received a copy of the GNU General Public License 12 * along with this program; if not, write to the Free Software 13 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 14 * 15 * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> 16 */ 17 18 #include <linux/types.h> 19 #include <linux/string.h> 20 #include <linux/kvm.h> 21 #include <linux/kvm_host.h> 22 #include <linux/highmem.h> 23 #include <linux/gfp.h> 24 #include <linux/slab.h> 25 #include <linux/hugetlb.h> 26 #include <linux/vmalloc.h> 27 #include <linux/srcu.h> 28 #include <linux/anon_inodes.h> 29 #include <linux/file.h> 30 #include <linux/debugfs.h> 31 32 #include <asm/tlbflush.h> 33 #include <asm/kvm_ppc.h> 34 #include <asm/kvm_book3s.h> 35 #include <asm/book3s/64/mmu-hash.h> 36 #include <asm/hvcall.h> 37 #include <asm/synch.h> 38 #include <asm/ppc-opcode.h> 39 #include <asm/cputable.h> 40 41 #include "trace_hv.h" 42 43 //#define DEBUG_RESIZE_HPT 1 44 45 #ifdef DEBUG_RESIZE_HPT 46 #define resize_hpt_debug(resize, ...) \ 47 do { \ 48 printk(KERN_DEBUG "RESIZE HPT %p: ", resize); \ 49 printk(__VA_ARGS__); \ 50 } while (0) 51 #else 52 #define resize_hpt_debug(resize, ...) \ 53 do { } while (0) 54 #endif 55 56 static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags, 57 long pte_index, unsigned long pteh, 58 unsigned long ptel, unsigned long *pte_idx_ret); 59 60 struct kvm_resize_hpt { 61 /* These fields read-only after init */ 62 struct kvm *kvm; 63 struct work_struct work; 64 u32 order; 65 66 /* These fields protected by kvm->lock */ 67 int error; 68 bool prepare_done; 69 70 /* Private to the work thread, until prepare_done is true, 71 * then protected by kvm->resize_hpt_sem */ 72 struct kvm_hpt_info hpt; 73 }; 74 75 static void kvmppc_rmap_reset(struct kvm *kvm); 76 77 int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order) 78 { 79 unsigned long hpt = 0; 80 int cma = 0; 81 struct page *page = NULL; 82 struct revmap_entry *rev; 83 unsigned long npte; 84 85 if ((order < PPC_MIN_HPT_ORDER) || (order > PPC_MAX_HPT_ORDER)) 86 return -EINVAL; 87 88 page = kvm_alloc_hpt_cma(1ul << (order - PAGE_SHIFT)); 89 if (page) { 90 hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); 91 memset((void *)hpt, 0, (1ul << order)); 92 cma = 1; 93 } 94 95 if (!hpt) 96 hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_RETRY_MAYFAIL 97 |__GFP_NOWARN, order - PAGE_SHIFT); 98 99 if (!hpt) 100 return -ENOMEM; 101 102 /* HPTEs are 2**4 bytes long */ 103 npte = 1ul << (order - 4); 104 105 /* Allocate reverse map array */ 106 rev = vmalloc(sizeof(struct revmap_entry) * npte); 107 if (!rev) { 108 pr_err("kvmppc_allocate_hpt: Couldn't alloc reverse map array\n"); 109 if (cma) 110 kvm_free_hpt_cma(page, 1 << (order - PAGE_SHIFT)); 111 else 112 free_pages(hpt, order - PAGE_SHIFT); 113 return -ENOMEM; 114 } 115 116 info->order = order; 117 info->virt = hpt; 118 info->cma = cma; 119 info->rev = rev; 120 121 return 0; 122 } 123 124 void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info) 125 { 126 atomic64_set(&kvm->arch.mmio_update, 0); 
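/* Note: SDR1's HTABSIZE field encodes the hash table size as a power of two relative to the architected 256kB (2^18 byte) minimum, which is why (info->order - 18) is OR'd into the real address below. */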
127 kvm->arch.hpt = *info; 128 kvm->arch.sdr1 = __pa(info->virt) | (info->order - 18); 129 130 pr_debug("KVM guest htab at %lx (order %ld), LPID %x\n", 131 info->virt, (long)info->order, kvm->arch.lpid); 132 } 133 134 long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order) 135 { 136 long err = -EBUSY; 137 struct kvm_hpt_info info; 138 139 if (kvm_is_radix(kvm)) 140 return -EINVAL; 141 142 mutex_lock(&kvm->lock); 143 if (kvm->arch.hpte_setup_done) { 144 kvm->arch.hpte_setup_done = 0; 145 /* order hpte_setup_done vs. vcpus_running */ 146 smp_mb(); 147 if (atomic_read(&kvm->arch.vcpus_running)) { 148 kvm->arch.hpte_setup_done = 1; 149 goto out; 150 } 151 } 152 if (kvm->arch.hpt.order == order) { 153 /* We already have a suitable HPT */ 154 155 /* Set the entire HPT to 0, i.e. invalid HPTEs */ 156 memset((void *)kvm->arch.hpt.virt, 0, 1ul << order); 157 /* 158 * Reset all the reverse-mapping chains for all memslots 159 */ 160 kvmppc_rmap_reset(kvm); 161 /* Ensure that each vcpu will flush its TLB on next entry. */ 162 cpumask_setall(&kvm->arch.need_tlb_flush); 163 err = 0; 164 goto out; 165 } 166 167 if (kvm->arch.hpt.virt) 168 kvmppc_free_hpt(&kvm->arch.hpt); 169 170 err = kvmppc_allocate_hpt(&info, order); 171 if (err < 0) 172 goto out; 173 kvmppc_set_hpt(kvm, &info); 174 175 out: 176 mutex_unlock(&kvm->lock); 177 return err; 178 } 179 180 void kvmppc_free_hpt(struct kvm_hpt_info *info) 181 { 182 vfree(info->rev); 183 if (info->cma) 184 kvm_free_hpt_cma(virt_to_page(info->virt), 185 1 << (info->order - PAGE_SHIFT)); 186 else if (info->virt) 187 free_pages(info->virt, info->order - PAGE_SHIFT); 188 info->virt = 0; 189 info->order = 0; 190 } 191 192 /* Bits in first HPTE dword for pagesize 4k, 64k or 16M */ 193 static inline unsigned long hpte0_pgsize_encoding(unsigned long pgsize) 194 { 195 return (pgsize > 0x1000) ? HPTE_V_LARGE : 0; 196 } 197 198 /* Bits in second HPTE dword for pagesize 4k, 64k or 16M */ 199 static inline unsigned long hpte1_pgsize_encoding(unsigned long pgsize) 200 { 201 return (pgsize == 0x10000) ? 0x1000 : 0; 202 } 203 204 void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, 205 unsigned long porder) 206 { 207 unsigned long i; 208 unsigned long npages; 209 unsigned long hp_v, hp_r; 210 unsigned long addr, hash; 211 unsigned long psize; 212 unsigned long hp0, hp1; 213 unsigned long idx_ret; 214 long ret; 215 struct kvm *kvm = vcpu->kvm; 216 217 psize = 1ul << porder; 218 npages = memslot->npages >> (porder - PAGE_SHIFT); 219 220 /* VRMA can't be > 1TB */ 221 if (npages > 1ul << (40 - porder)) 222 npages = 1ul << (40 - porder); 223 /* Can't use more than 1 HPTE per HPTEG */ 224 if (npages > kvmppc_hpt_mask(&kvm->arch.hpt) + 1) 225 npages = kvmppc_hpt_mask(&kvm->arch.hpt) + 1; 226 227 hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) | 228 HPTE_V_BOLTED | hpte0_pgsize_encoding(psize); 229 hp1 = hpte1_pgsize_encoding(psize) | 230 HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX; 231 232 for (i = 0; i < npages; ++i) { 233 addr = i << porder; 234 /* can't use hpt_hash since va > 64 bits */ 235 hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) 236 & kvmppc_hpt_mask(&kvm->arch.hpt); 237 /* 238 * We assume that the hash table is empty and no 239 * vcpus are using it at this stage. Since we create 240 * at most one HPTE per HPTEG, we just assume entry 7 241 * is available and use it. 
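* (Each HPTEG holds 8 HPTEs of 16 bytes each, so the "(hash << 3) + 7" below selects the last slot of the chosen group.)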
242 */ 243 hash = (hash << 3) + 7; 244 hp_v = hp0 | ((addr >> 16) & ~0x7fUL); 245 hp_r = hp1 | addr; 246 ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, hash, hp_v, hp_r, 247 &idx_ret); 248 if (ret != H_SUCCESS) { 249 pr_err("KVM: map_vrma at %lx failed, ret=%ld\n", 250 addr, ret); 251 break; 252 } 253 } 254 } 255 256 int kvmppc_mmu_hv_init(void) 257 { 258 unsigned long host_lpid, rsvd_lpid; 259 260 if (!cpu_has_feature(CPU_FTR_HVMODE)) 261 return -EINVAL; 262 263 /* POWER7 has 10-bit LPIDs (12-bit in POWER8) */ 264 host_lpid = mfspr(SPRN_LPID); 265 rsvd_lpid = LPID_RSVD; 266 267 kvmppc_init_lpid(rsvd_lpid + 1); 268 269 kvmppc_claim_lpid(host_lpid); 270 /* rsvd_lpid is reserved for use in partition switching */ 271 kvmppc_claim_lpid(rsvd_lpid); 272 273 return 0; 274 } 275 276 static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu) 277 { 278 unsigned long msr = vcpu->arch.intr_msr; 279 280 /* If transactional, change to suspend mode on IRQ delivery */ 281 if (MSR_TM_TRANSACTIONAL(vcpu->arch.shregs.msr)) 282 msr |= MSR_TS_S; 283 else 284 msr |= vcpu->arch.shregs.msr & MSR_TS_MASK; 285 kvmppc_set_msr(vcpu, msr); 286 } 287 288 static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags, 289 long pte_index, unsigned long pteh, 290 unsigned long ptel, unsigned long *pte_idx_ret) 291 { 292 long ret; 293 294 /* Protect linux PTE lookup from page table destruction */ 295 rcu_read_lock_sched(); /* this disables preemption too */ 296 ret = kvmppc_do_h_enter(kvm, flags, pte_index, pteh, ptel, 297 current->mm->pgd, false, pte_idx_ret); 298 rcu_read_unlock_sched(); 299 if (ret == H_TOO_HARD) { 300 /* this can't happen */ 301 pr_err("KVM: Oops, kvmppc_h_enter returned too hard!\n"); 302 ret = H_RESOURCE; /* or something */ 303 } 304 return ret; 305 306 } 307 308 static struct kvmppc_slb *kvmppc_mmu_book3s_hv_find_slbe(struct kvm_vcpu *vcpu, 309 gva_t eaddr) 310 { 311 u64 mask; 312 int i; 313 314 for (i = 0; i < vcpu->arch.slb_nr; i++) { 315 if (!(vcpu->arch.slb[i].orige & SLB_ESID_V)) 316 continue; 317 318 if (vcpu->arch.slb[i].origv & SLB_VSID_B_1T) 319 mask = ESID_MASK_1T; 320 else 321 mask = ESID_MASK; 322 323 if (((vcpu->arch.slb[i].orige ^ eaddr) & mask) == 0) 324 return &vcpu->arch.slb[i]; 325 } 326 return NULL; 327 } 328 329 static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r, 330 unsigned long ea) 331 { 332 unsigned long ra_mask; 333 334 ra_mask = hpte_page_size(v, r) - 1; 335 return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask); 336 } 337 338 static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, 339 struct kvmppc_pte *gpte, bool data, bool iswrite) 340 { 341 struct kvm *kvm = vcpu->kvm; 342 struct kvmppc_slb *slbe; 343 unsigned long slb_v; 344 unsigned long pp, key; 345 unsigned long v, orig_v, gr; 346 __be64 *hptep; 347 int index; 348 int virtmode = vcpu->arch.shregs.msr & (data ? 
MSR_DR : MSR_IR); 349 350 /* Get SLB entry */ 351 if (virtmode) { 352 slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr); 353 if (!slbe) 354 return -EINVAL; 355 slb_v = slbe->origv; 356 } else { 357 /* real mode access */ 358 slb_v = vcpu->kvm->arch.vrma_slb_v; 359 } 360 361 preempt_disable(); 362 /* Find the HPTE in the hash table */ 363 index = kvmppc_hv_find_lock_hpte(kvm, eaddr, slb_v, 364 HPTE_V_VALID | HPTE_V_ABSENT); 365 if (index < 0) { 366 preempt_enable(); 367 return -ENOENT; 368 } 369 hptep = (__be64 *)(kvm->arch.hpt.virt + (index << 4)); 370 v = orig_v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK; 371 if (cpu_has_feature(CPU_FTR_ARCH_300)) 372 v = hpte_new_to_old_v(v, be64_to_cpu(hptep[1])); 373 gr = kvm->arch.hpt.rev[index].guest_rpte; 374 375 unlock_hpte(hptep, orig_v); 376 preempt_enable(); 377 378 gpte->eaddr = eaddr; 379 gpte->vpage = ((v & HPTE_V_AVPN) << 4) | ((eaddr >> 12) & 0xfff); 380 381 /* Get PP bits and key for permission check */ 382 pp = gr & (HPTE_R_PP0 | HPTE_R_PP); 383 key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS; 384 key &= slb_v; 385 386 /* Calculate permissions */ 387 gpte->may_read = hpte_read_permission(pp, key); 388 gpte->may_write = hpte_write_permission(pp, key); 389 gpte->may_execute = gpte->may_read && !(gr & (HPTE_R_N | HPTE_R_G)); 390 391 /* Storage key permission check for POWER7 */ 392 if (data && virtmode) { 393 int amrfield = hpte_get_skey_perm(gr, vcpu->arch.amr); 394 if (amrfield & 1) 395 gpte->may_read = 0; 396 if (amrfield & 2) 397 gpte->may_write = 0; 398 } 399 400 /* Get the guest physical address */ 401 gpte->raddr = kvmppc_mmu_get_real_addr(v, gr, eaddr); 402 return 0; 403 } 404 405 /* 406 * Quick test for whether an instruction is a load or a store. 407 * If the instruction is a load or a store, then this will indicate 408 * which it is, at least on server processors. (Embedded processors 409 * have some external PID instructions that don't follow the rule 410 * embodied here.) If the instruction isn't a load or store, then 411 * this doesn't return anything useful. 412 */ 413 static int instruction_is_store(unsigned int instr) 414 { 415 unsigned int mask; 416 417 mask = 0x10000000; 418 if ((instr & 0xfc000000) == 0x7c000000) 419 mask = 0x100; /* major opcode 31 */ 420 return (instr & mask) != 0; 421 } 422 423 int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu, 424 unsigned long gpa, gva_t ea, int is_store) 425 { 426 u32 last_inst; 427 428 /* 429 * If we fail, we just return to the guest and try executing it again. 430 */ 431 if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst) != 432 EMULATE_DONE) 433 return RESUME_GUEST; 434 435 /* 436 * WARNING: We do not know for sure whether the instruction we just 437 * read from memory is the same that caused the fault in the first 438 * place. If the instruction we read is neither a load nor a store, 439 * then it can't access memory, so we don't need to worry about 440 * enforcing access permissions. So, assuming it is a load or 441 * store, we just check that its direction (load or store) is 442 * consistent with the original fault, since that's what we 443 * checked the access permissions against. If there is a mismatch 444 * we just return and retry the instruction. 445 */ 446 447 if (instruction_is_store(last_inst) != !!is_store) 448 return RESUME_GUEST; 449 450 /* 451 * Emulated accesses work by looking at the hash table for the 452 * translation once, then performing the access later.
The 453 * translation could be invalidated in the meantime, at which 454 * point performing the subsequent memory access on the old 455 * physical address could possibly be a security hole for the 456 * guest (but not the host). 457 * 458 * This is less of an issue for MMIO stores since they aren't 459 * globally visible. It could be an issue for MMIO loads to 460 * a certain extent but we'll ignore it for now. 461 */ 462 463 vcpu->arch.paddr_accessed = gpa; 464 vcpu->arch.vaddr_accessed = ea; 465 return kvmppc_emulate_mmio(run, vcpu); 466 } 467 468 int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, 469 unsigned long ea, unsigned long dsisr) 470 { 471 struct kvm *kvm = vcpu->kvm; 472 unsigned long hpte[3], r; 473 unsigned long hnow_v, hnow_r; 474 __be64 *hptep; 475 unsigned long mmu_seq, psize, pte_size; 476 unsigned long gpa_base, gfn_base; 477 unsigned long gpa, gfn, hva, pfn; 478 struct kvm_memory_slot *memslot; 479 unsigned long *rmap; 480 struct revmap_entry *rev; 481 struct page *page, *pages[1]; 482 long index, ret, npages; 483 bool is_ci; 484 unsigned int writing, write_ok; 485 struct vm_area_struct *vma; 486 unsigned long rcbits; 487 long mmio_update; 488 489 if (kvm_is_radix(kvm)) 490 return kvmppc_book3s_radix_page_fault(run, vcpu, ea, dsisr); 491 492 /* 493 * Real-mode code has already searched the HPT and found the 494 * entry we're interested in. Lock the entry and check that 495 * it hasn't changed. If it has, just return and re-execute the 496 * instruction. 497 */ 498 if (ea != vcpu->arch.pgfault_addr) 499 return RESUME_GUEST; 500 501 if (vcpu->arch.pgfault_cache) { 502 mmio_update = atomic64_read(&kvm->arch.mmio_update); 503 if (mmio_update == vcpu->arch.pgfault_cache->mmio_update) { 504 r = vcpu->arch.pgfault_cache->rpte; 505 psize = hpte_page_size(vcpu->arch.pgfault_hpte[0], r); 506 gpa_base = r & HPTE_R_RPN & ~(psize - 1); 507 gfn_base = gpa_base >> PAGE_SHIFT; 508 gpa = gpa_base | (ea & (psize - 1)); 509 return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, 510 dsisr & DSISR_ISSTORE); 511 } 512 } 513 index = vcpu->arch.pgfault_index; 514 hptep = (__be64 *)(kvm->arch.hpt.virt + (index << 4)); 515 rev = &kvm->arch.hpt.rev[index]; 516 preempt_disable(); 517 while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) 518 cpu_relax(); 519 hpte[0] = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK; 520 hpte[1] = be64_to_cpu(hptep[1]); 521 hpte[2] = r = rev->guest_rpte; 522 unlock_hpte(hptep, hpte[0]); 523 preempt_enable(); 524 525 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 526 hpte[0] = hpte_new_to_old_v(hpte[0], hpte[1]); 527 hpte[1] = hpte_new_to_old_r(hpte[1]); 528 } 529 if (hpte[0] != vcpu->arch.pgfault_hpte[0] || 530 hpte[1] != vcpu->arch.pgfault_hpte[1]) 531 return RESUME_GUEST; 532 533 /* Translate the logical address and get the page */ 534 psize = hpte_page_size(hpte[0], r); 535 gpa_base = r & HPTE_R_RPN & ~(psize - 1); 536 gfn_base = gpa_base >> PAGE_SHIFT; 537 gpa = gpa_base | (ea & (psize - 1)); 538 gfn = gpa >> PAGE_SHIFT; 539 memslot = gfn_to_memslot(kvm, gfn); 540 541 trace_kvm_page_fault_enter(vcpu, hpte, memslot, ea, dsisr); 542 543 /* No memslot means it's an emulated MMIO region */ 544 if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) 545 return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, 546 dsisr & DSISR_ISSTORE); 547 548 /* 549 * This should never happen, because of the slot_is_aligned() 550 * check in kvmppc_do_h_enter().
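* (That check only admits large-page HPTEs for memslots aligned to the HPTE's page size, so the psize-aligned gpa_base of a faulting address inside the slot cannot fall below the slot's base_gfn.)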
551 */ 552 if (gfn_base < memslot->base_gfn) 553 return -EFAULT; 554 555 /* used to check for invalidations in progress */ 556 mmu_seq = kvm->mmu_notifier_seq; 557 smp_rmb(); 558 559 ret = -EFAULT; 560 is_ci = false; 561 pfn = 0; 562 page = NULL; 563 pte_size = PAGE_SIZE; 564 writing = (dsisr & DSISR_ISSTORE) != 0; 565 /* If writing != 0, then the HPTE must allow writing, if we get here */ 566 write_ok = writing; 567 hva = gfn_to_hva_memslot(memslot, gfn); 568 npages = get_user_pages_fast(hva, 1, writing, pages); 569 if (npages < 1) { 570 /* Check if it's an I/O mapping */ 571 down_read(&current->mm->mmap_sem); 572 vma = find_vma(current->mm, hva); 573 if (vma && vma->vm_start <= hva && hva + psize <= vma->vm_end && 574 (vma->vm_flags & VM_PFNMAP)) { 575 pfn = vma->vm_pgoff + 576 ((hva - vma->vm_start) >> PAGE_SHIFT); 577 pte_size = psize; 578 is_ci = pte_ci(__pte((pgprot_val(vma->vm_page_prot)))); 579 write_ok = vma->vm_flags & VM_WRITE; 580 } 581 up_read(&current->mm->mmap_sem); 582 if (!pfn) 583 goto out_put; 584 } else { 585 page = pages[0]; 586 pfn = page_to_pfn(page); 587 if (PageHuge(page)) { 588 page = compound_head(page); 589 pte_size <<= compound_order(page); 590 } 591 /* if the guest wants write access, see if that is OK */ 592 if (!writing && hpte_is_writable(r)) { 593 pte_t *ptep, pte; 594 unsigned long flags; 595 /* 596 * We need to protect against page table destruction, 597 * hugepage split and collapse. 598 */ 599 local_irq_save(flags); 600 ptep = find_linux_pte_or_hugepte(current->mm->pgd, 601 hva, NULL, NULL); 602 if (ptep) { 603 pte = kvmppc_read_update_linux_pte(ptep, 1); 604 if (__pte_write(pte)) 605 write_ok = 1; 606 } 607 local_irq_restore(flags); 608 } 609 } 610 611 if (psize > pte_size) 612 goto out_put; 613 614 /* Check WIMG vs. the actual page we're accessing */ 615 if (!hpte_cache_flags_ok(r, is_ci)) { 616 if (is_ci) 617 goto out_put; 618 /* 619 * Allow guest to map emulated device memory as 620 * uncacheable, but actually make it cacheable. 621 */ 622 r = (r & ~(HPTE_R_W|HPTE_R_I|HPTE_R_G)) | HPTE_R_M; 623 } 624 625 /* 626 * Set the HPTE to point to pfn. 627 * Since the pfn is at PAGE_SIZE granularity, make sure we 628 * don't mask out lower-order bits if psize < PAGE_SIZE.
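* (The assignment below keeps HPTE_R_KEY_HI, the PP0 bit and the low attribute bits under psize, which is what ~(HPTE_R_PP0 - psize) preserves, and ORs in the new pfn address aligned down to psize.)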
629 */ 630 if (psize < PAGE_SIZE) 631 psize = PAGE_SIZE; 632 r = (r & HPTE_R_KEY_HI) | (r & ~(HPTE_R_PP0 - psize)) | 633 ((pfn << PAGE_SHIFT) & ~(psize - 1)); 634 if (hpte_is_writable(r) && !write_ok) 635 r = hpte_make_readonly(r); 636 ret = RESUME_GUEST; 637 preempt_disable(); 638 while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) 639 cpu_relax(); 640 hnow_v = be64_to_cpu(hptep[0]); 641 hnow_r = be64_to_cpu(hptep[1]); 642 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 643 hnow_v = hpte_new_to_old_v(hnow_v, hnow_r); 644 hnow_r = hpte_new_to_old_r(hnow_r); 645 } 646 if ((hnow_v & ~HPTE_V_HVLOCK) != hpte[0] || hnow_r != hpte[1] || 647 rev->guest_rpte != hpte[2]) 648 /* HPTE has been changed under us; let the guest retry */ 649 goto out_unlock; 650 hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID; 651 652 /* Always put the HPTE in the rmap chain for the page base address */ 653 rmap = &memslot->arch.rmap[gfn_base - memslot->base_gfn]; 654 lock_rmap(rmap); 655 656 /* Check if we might have been invalidated; let the guest retry if so */ 657 ret = RESUME_GUEST; 658 if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) { 659 unlock_rmap(rmap); 660 goto out_unlock; 661 } 662 663 /* Only set R/C in real HPTE if set in both *rmap and guest_rpte */ 664 rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT; 665 r &= rcbits | ~(HPTE_R_R | HPTE_R_C); 666 667 if (be64_to_cpu(hptep[0]) & HPTE_V_VALID) { 668 /* HPTE was previously valid, so we need to invalidate it */ 669 unlock_rmap(rmap); 670 hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); 671 kvmppc_invalidate_hpte(kvm, hptep, index); 672 /* don't lose previous R and C bits */ 673 r |= be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C); 674 } else { 675 kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0); 676 } 677 678 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 679 r = hpte_old_to_new_r(hpte[0], r); 680 hpte[0] = hpte_old_to_new_v(hpte[0]); 681 } 682 hptep[1] = cpu_to_be64(r); 683 eieio(); 684 __unlock_hpte(hptep, hpte[0]); 685 asm volatile("ptesync" : : : "memory"); 686 preempt_enable(); 687 if (page && hpte_is_writable(r)) 688 SetPageDirty(page); 689 690 out_put: 691 trace_kvm_page_fault_exit(vcpu, hpte, ret); 692 693 if (page) { 694 /* 695 * We drop pages[0] here, not page because page might 696 * have been set to the head page of a compound, but 697 * we have to drop the reference on the correct tail 698 * page to match the get inside gup() 699 */ 700 put_page(pages[0]); 701 } 702 return ret; 703 704 out_unlock: 705 __unlock_hpte(hptep, be64_to_cpu(hptep[0])); 706 preempt_enable(); 707 goto out_put; 708 } 709 710 static void kvmppc_rmap_reset(struct kvm *kvm) 711 { 712 struct kvm_memslots *slots; 713 struct kvm_memory_slot *memslot; 714 int srcu_idx; 715 716 srcu_idx = srcu_read_lock(&kvm->srcu); 717 slots = kvm_memslots(kvm); 718 kvm_for_each_memslot(memslot, slots) { 719 /* 720 * This assumes it is acceptable to lose reference and 721 * change bits across a reset. 
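* The rmap words also carry the heads of the HPTE reverse-map chains; those chains are rebuilt as HPTEs are re-entered into the freshly cleared HPT.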
722 */ 723 memset(memslot->arch.rmap, 0, 724 memslot->npages * sizeof(*memslot->arch.rmap)); 725 } 726 srcu_read_unlock(&kvm->srcu, srcu_idx); 727 } 728 729 typedef int (*hva_handler_fn)(struct kvm *kvm, struct kvm_memory_slot *memslot, 730 unsigned long gfn); 731 732 static int kvm_handle_hva_range(struct kvm *kvm, 733 unsigned long start, 734 unsigned long end, 735 hva_handler_fn handler) 736 { 737 int ret; 738 int retval = 0; 739 struct kvm_memslots *slots; 740 struct kvm_memory_slot *memslot; 741 742 slots = kvm_memslots(kvm); 743 kvm_for_each_memslot(memslot, slots) { 744 unsigned long hva_start, hva_end; 745 gfn_t gfn, gfn_end; 746 747 hva_start = max(start, memslot->userspace_addr); 748 hva_end = min(end, memslot->userspace_addr + 749 (memslot->npages << PAGE_SHIFT)); 750 if (hva_start >= hva_end) 751 continue; 752 /* 753 * {gfn(page) | page intersects with [hva_start, hva_end)} = 754 * {gfn, gfn+1, ..., gfn_end-1}. 755 */ 756 gfn = hva_to_gfn_memslot(hva_start, memslot); 757 gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); 758 759 for (; gfn < gfn_end; ++gfn) { 760 ret = handler(kvm, memslot, gfn); 761 retval |= ret; 762 } 763 } 764 765 return retval; 766 } 767 768 static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, 769 hva_handler_fn handler) 770 { 771 return kvm_handle_hva_range(kvm, hva, hva + 1, handler); 772 } 773 774 /* Must be called with both HPTE and rmap locked */ 775 static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i, 776 unsigned long *rmapp, unsigned long gfn) 777 { 778 __be64 *hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4)); 779 struct revmap_entry *rev = kvm->arch.hpt.rev; 780 unsigned long j, h; 781 unsigned long ptel, psize, rcbits; 782 783 j = rev[i].forw; 784 if (j == i) { 785 /* chain is now empty */ 786 *rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX); 787 } else { 788 /* remove i from chain */ 789 h = rev[i].back; 790 rev[h].forw = j; 791 rev[j].back = h; 792 rev[i].forw = rev[i].back = i; 793 *rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j; 794 } 795 796 /* Now check and modify the HPTE */ 797 ptel = rev[i].guest_rpte; 798 psize = hpte_page_size(be64_to_cpu(hptep[0]), ptel); 799 if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) && 800 hpte_rpn(ptel, psize) == gfn) { 801 hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); 802 kvmppc_invalidate_hpte(kvm, hptep, i); 803 hptep[1] &= ~cpu_to_be64(HPTE_R_KEY_HI | HPTE_R_KEY_LO); 804 /* Harvest R and C */ 805 rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C); 806 *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT; 807 if (rcbits & HPTE_R_C) 808 kvmppc_update_rmap_change(rmapp, psize); 809 if (rcbits & ~rev[i].guest_rpte) { 810 rev[i].guest_rpte = ptel | rcbits; 811 note_hpte_modification(kvm, &rev[i]); 812 } 813 } 814 } 815 816 static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, 817 unsigned long gfn) 818 { 819 unsigned long i; 820 __be64 *hptep; 821 unsigned long *rmapp; 822 823 rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; 824 for (;;) { 825 lock_rmap(rmapp); 826 if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { 827 unlock_rmap(rmapp); 828 break; 829 } 830 831 /* 832 * To avoid an ABBA deadlock with the HPTE lock bit, 833 * we can't spin on the HPTE lock while holding the 834 * rmap chain lock. 
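* Instead we try_lock the HPTE below; if that fails we drop the rmap lock, spin until HPTE_V_HVLOCK clears, and retry the whole sequence from the top.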
835 */ 836 i = *rmapp & KVMPPC_RMAP_INDEX; 837 hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4)); 838 if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { 839 /* unlock rmap before spinning on the HPTE lock */ 840 unlock_rmap(rmapp); 841 while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK) 842 cpu_relax(); 843 continue; 844 } 845 846 kvmppc_unmap_hpte(kvm, i, rmapp, gfn); 847 unlock_rmap(rmapp); 848 __unlock_hpte(hptep, be64_to_cpu(hptep[0])); 849 } 850 return 0; 851 } 852 853 int kvm_unmap_hva_hv(struct kvm *kvm, unsigned long hva) 854 { 855 hva_handler_fn handler; 856 857 handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp; 858 kvm_handle_hva(kvm, hva, handler); 859 return 0; 860 } 861 862 int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end) 863 { 864 hva_handler_fn handler; 865 866 handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp; 867 kvm_handle_hva_range(kvm, start, end, handler); 868 return 0; 869 } 870 871 void kvmppc_core_flush_memslot_hv(struct kvm *kvm, 872 struct kvm_memory_slot *memslot) 873 { 874 unsigned long gfn; 875 unsigned long n; 876 unsigned long *rmapp; 877 878 gfn = memslot->base_gfn; 879 rmapp = memslot->arch.rmap; 880 for (n = memslot->npages; n; --n, ++gfn) { 881 if (kvm_is_radix(kvm)) { 882 kvm_unmap_radix(kvm, memslot, gfn); 883 continue; 884 } 885 /* 886 * Testing the present bit without locking is OK because 887 * the memslot has been marked invalid already, and hence 888 * no new HPTEs referencing this page can be created, 889 * thus the present bit can't go from 0 to 1. 890 */ 891 if (*rmapp & KVMPPC_RMAP_PRESENT) 892 kvm_unmap_rmapp(kvm, memslot, gfn); 893 ++rmapp; 894 } 895 } 896 897 static int kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, 898 unsigned long gfn) 899 { 900 struct revmap_entry *rev = kvm->arch.hpt.rev; 901 unsigned long head, i, j; 902 __be64 *hptep; 903 int ret = 0; 904 unsigned long *rmapp; 905 906 rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; 907 retry: 908 lock_rmap(rmapp); 909 if (*rmapp & KVMPPC_RMAP_REFERENCED) { 910 *rmapp &= ~KVMPPC_RMAP_REFERENCED; 911 ret = 1; 912 } 913 if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { 914 unlock_rmap(rmapp); 915 return ret; 916 } 917 918 i = head = *rmapp & KVMPPC_RMAP_INDEX; 919 do { 920 hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4)); 921 j = rev[i].forw; 922 923 /* If this HPTE isn't referenced, ignore it */ 924 if (!(be64_to_cpu(hptep[1]) & HPTE_R_R)) 925 continue; 926 927 if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { 928 /* unlock rmap before spinning on the HPTE lock */ 929 unlock_rmap(rmapp); 930 while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK) 931 cpu_relax(); 932 goto retry; 933 } 934 935 /* Now check and modify the HPTE */ 936 if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) && 937 (be64_to_cpu(hptep[1]) & HPTE_R_R)) { 938 kvmppc_clear_ref_hpte(kvm, hptep, i); 939 if (!(rev[i].guest_rpte & HPTE_R_R)) { 940 rev[i].guest_rpte |= HPTE_R_R; 941 note_hpte_modification(kvm, &rev[i]); 942 } 943 ret = 1; 944 } 945 __unlock_hpte(hptep, be64_to_cpu(hptep[0])); 946 } while ((i = j) != head); 947 948 unlock_rmap(rmapp); 949 return ret; 950 } 951 952 int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, unsigned long end) 953 { 954 hva_handler_fn handler; 955 956 handler = kvm_is_radix(kvm) ? 
kvm_age_radix : kvm_age_rmapp; 957 return kvm_handle_hva_range(kvm, start, end, handler); 958 } 959 960 static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, 961 unsigned long gfn) 962 { 963 struct revmap_entry *rev = kvm->arch.hpt.rev; 964 unsigned long head, i, j; 965 unsigned long *hp; 966 int ret = 1; 967 unsigned long *rmapp; 968 969 rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; 970 if (*rmapp & KVMPPC_RMAP_REFERENCED) 971 return 1; 972 973 lock_rmap(rmapp); 974 if (*rmapp & KVMPPC_RMAP_REFERENCED) 975 goto out; 976 977 if (*rmapp & KVMPPC_RMAP_PRESENT) { 978 i = head = *rmapp & KVMPPC_RMAP_INDEX; 979 do { 980 hp = (unsigned long *)(kvm->arch.hpt.virt + (i << 4)); 981 j = rev[i].forw; 982 if (be64_to_cpu(hp[1]) & HPTE_R_R) 983 goto out; 984 } while ((i = j) != head); 985 } 986 ret = 0; 987 988 out: 989 unlock_rmap(rmapp); 990 return ret; 991 } 992 993 int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva) 994 { 995 hva_handler_fn handler; 996 997 handler = kvm_is_radix(kvm) ? kvm_test_age_radix : kvm_test_age_rmapp; 998 return kvm_handle_hva(kvm, hva, handler); 999 } 1000 1001 void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte) 1002 { 1003 hva_handler_fn handler; 1004 1005 handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp; 1006 kvm_handle_hva(kvm, hva, handler); 1007 } 1008 1009 static int vcpus_running(struct kvm *kvm) 1010 { 1011 return atomic_read(&kvm->arch.vcpus_running) != 0; 1012 } 1013 1014 /* 1015 * Returns the number of system pages that are dirty. 1016 * This can be more than 1 if we find a huge-page HPTE. 1017 */ 1018 static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp) 1019 { 1020 struct revmap_entry *rev = kvm->arch.hpt.rev; 1021 unsigned long head, i, j; 1022 unsigned long n; 1023 unsigned long v, r; 1024 __be64 *hptep; 1025 int npages_dirty = 0; 1026 1027 retry: 1028 lock_rmap(rmapp); 1029 if (*rmapp & KVMPPC_RMAP_CHANGED) { 1030 long change_order = (*rmapp & KVMPPC_RMAP_CHG_ORDER) 1031 >> KVMPPC_RMAP_CHG_SHIFT; 1032 *rmapp &= ~(KVMPPC_RMAP_CHANGED | KVMPPC_RMAP_CHG_ORDER); 1033 npages_dirty = 1; 1034 if (change_order > PAGE_SHIFT) 1035 npages_dirty = 1ul << (change_order - PAGE_SHIFT); 1036 } 1037 if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { 1038 unlock_rmap(rmapp); 1039 return npages_dirty; 1040 } 1041 1042 i = head = *rmapp & KVMPPC_RMAP_INDEX; 1043 do { 1044 unsigned long hptep1; 1045 hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4)); 1046 j = rev[i].forw; 1047 1048 /* 1049 * Checking the C (changed) bit here is racy since there 1050 * is no guarantee about when the hardware writes it back. 1051 * If the HPTE is not writable then it is stable since the 1052 * page can't be written to, and we would have done a tlbie 1053 * (which forces the hardware to complete any writeback) 1054 * when making the HPTE read-only. 1055 * If vcpus are running then this call is racy anyway 1056 * since the page could get dirtied subsequently, so we 1057 * expect there to be a further call which would pick up 1058 * any delayed C bit writeback. 1059 * Otherwise we need to do the tlbie even if C==0 in 1060 * order to pick up any delayed writeback of C. 
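* In short: skip this HPTE below only if C is clear and either the HPTE is read-only or vcpus are still running.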
1061 */ 1062 hptep1 = be64_to_cpu(hptep[1]); 1063 if (!(hptep1 & HPTE_R_C) && 1064 (!hpte_is_writable(hptep1) || vcpus_running(kvm))) 1065 continue; 1066 1067 if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { 1068 /* unlock rmap before spinning on the HPTE lock */ 1069 unlock_rmap(rmapp); 1070 while (hptep[0] & cpu_to_be64(HPTE_V_HVLOCK)) 1071 cpu_relax(); 1072 goto retry; 1073 } 1074 1075 /* Now check and modify the HPTE */ 1076 if (!(hptep[0] & cpu_to_be64(HPTE_V_VALID))) { 1077 __unlock_hpte(hptep, be64_to_cpu(hptep[0])); 1078 continue; 1079 } 1080 1081 /* need to make it temporarily absent so C is stable */ 1082 hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); 1083 kvmppc_invalidate_hpte(kvm, hptep, i); 1084 v = be64_to_cpu(hptep[0]); 1085 r = be64_to_cpu(hptep[1]); 1086 if (r & HPTE_R_C) { 1087 hptep[1] = cpu_to_be64(r & ~HPTE_R_C); 1088 if (!(rev[i].guest_rpte & HPTE_R_C)) { 1089 rev[i].guest_rpte |= HPTE_R_C; 1090 note_hpte_modification(kvm, &rev[i]); 1091 } 1092 n = hpte_page_size(v, r); 1093 n = (n + PAGE_SIZE - 1) >> PAGE_SHIFT; 1094 if (n > npages_dirty) 1095 npages_dirty = n; 1096 eieio(); 1097 } 1098 v &= ~HPTE_V_ABSENT; 1099 v |= HPTE_V_VALID; 1100 __unlock_hpte(hptep, v); 1101 } while ((i = j) != head); 1102 1103 unlock_rmap(rmapp); 1104 return npages_dirty; 1105 } 1106 1107 void kvmppc_harvest_vpa_dirty(struct kvmppc_vpa *vpa, 1108 struct kvm_memory_slot *memslot, 1109 unsigned long *map) 1110 { 1111 unsigned long gfn; 1112 1113 if (!vpa->dirty || !vpa->pinned_addr) 1114 return; 1115 gfn = vpa->gpa >> PAGE_SHIFT; 1116 if (gfn < memslot->base_gfn || 1117 gfn >= memslot->base_gfn + memslot->npages) 1118 return; 1119 1120 vpa->dirty = false; 1121 if (map) 1122 __set_bit_le(gfn - memslot->base_gfn, map); 1123 } 1124 1125 long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm, 1126 struct kvm_memory_slot *memslot, unsigned long *map) 1127 { 1128 unsigned long i, j; 1129 unsigned long *rmapp; 1130 1131 preempt_disable(); 1132 rmapp = memslot->arch.rmap; 1133 for (i = 0; i < memslot->npages; ++i) { 1134 int npages = kvm_test_clear_dirty_npages(kvm, rmapp); 1135 /* 1136 * Note that if npages > 0 then i must be a multiple of npages, 1137 * since we always put huge-page HPTEs in the rmap chain 1138 * corresponding to their page base address. 
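* (For example, a 16MB HPTE reports 16MB / PAGE_SIZE dirty system pages, all attributed to the single rmap entry at its naturally aligned base gfn, so the bits set below form one aligned, contiguous run.)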
1139 */ 1140 if (npages && map) 1141 for (j = i; npages; ++j, --npages) 1142 __set_bit_le(j, map); 1143 ++rmapp; 1144 } 1145 preempt_enable(); 1146 return 0; 1147 } 1148 1149 void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa, 1150 unsigned long *nb_ret) 1151 { 1152 struct kvm_memory_slot *memslot; 1153 unsigned long gfn = gpa >> PAGE_SHIFT; 1154 struct page *page, *pages[1]; 1155 int npages; 1156 unsigned long hva, offset; 1157 int srcu_idx; 1158 1159 srcu_idx = srcu_read_lock(&kvm->srcu); 1160 memslot = gfn_to_memslot(kvm, gfn); 1161 if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) 1162 goto err; 1163 hva = gfn_to_hva_memslot(memslot, gfn); 1164 npages = get_user_pages_fast(hva, 1, 1, pages); 1165 if (npages < 1) 1166 goto err; 1167 page = pages[0]; 1168 srcu_read_unlock(&kvm->srcu, srcu_idx); 1169 1170 offset = gpa & (PAGE_SIZE - 1); 1171 if (nb_ret) 1172 *nb_ret = PAGE_SIZE - offset; 1173 return page_address(page) + offset; 1174 1175 err: 1176 srcu_read_unlock(&kvm->srcu, srcu_idx); 1177 return NULL; 1178 } 1179 1180 void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa, 1181 bool dirty) 1182 { 1183 struct page *page = virt_to_page(va); 1184 struct kvm_memory_slot *memslot; 1185 unsigned long gfn; 1186 unsigned long *rmap; 1187 int srcu_idx; 1188 1189 put_page(page); 1190 1191 if (!dirty) 1192 return; 1193 1194 /* We need to mark this page dirty in the rmap chain */ 1195 gfn = gpa >> PAGE_SHIFT; 1196 srcu_idx = srcu_read_lock(&kvm->srcu); 1197 memslot = gfn_to_memslot(kvm, gfn); 1198 if (memslot) { 1199 if (!kvm_is_radix(kvm)) { 1200 rmap = &memslot->arch.rmap[gfn - memslot->base_gfn]; 1201 lock_rmap(rmap); 1202 *rmap |= KVMPPC_RMAP_CHANGED; 1203 unlock_rmap(rmap); 1204 } else if (memslot->dirty_bitmap) { 1205 mark_page_dirty(kvm, gfn); 1206 } 1207 } 1208 srcu_read_unlock(&kvm->srcu, srcu_idx); 1209 } 1210 1211 /* 1212 * HPT resizing 1213 */ 1214 static int resize_hpt_allocate(struct kvm_resize_hpt *resize) 1215 { 1216 int rc; 1217 1218 rc = kvmppc_allocate_hpt(&resize->hpt, resize->order); 1219 if (rc < 0) 1220 return rc; 1221 1222 resize_hpt_debug(resize, "resize_hpt_allocate(): HPT @ 0x%lx\n", 1223 resize->hpt.virt); 1224 1225 return 0; 1226 } 1227 1228 static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize, 1229 unsigned long idx) 1230 { 1231 struct kvm *kvm = resize->kvm; 1232 struct kvm_hpt_info *old = &kvm->arch.hpt; 1233 struct kvm_hpt_info *new = &resize->hpt; 1234 unsigned long old_hash_mask = (1ULL << (old->order - 7)) - 1; 1235 unsigned long new_hash_mask = (1ULL << (new->order - 7)) - 1; 1236 __be64 *hptep, *new_hptep; 1237 unsigned long vpte, rpte, guest_rpte; 1238 int ret; 1239 struct revmap_entry *rev; 1240 unsigned long apsize, psize, avpn, pteg, hash; 1241 unsigned long new_idx, new_pteg, replace_vpte; 1242 1243 hptep = (__be64 *)(old->virt + (idx << 4)); 1244 1245 /* Guest is stopped, so new HPTEs can't be added or faulted 1246 * in, only unmapped or altered by host actions. 
So, it's 1247 * safe to check this before we take the HPTE lock */ 1248 vpte = be64_to_cpu(hptep[0]); 1249 if (!(vpte & HPTE_V_VALID) && !(vpte & HPTE_V_ABSENT)) 1250 return 0; /* nothing to do */ 1251 1252 while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) 1253 cpu_relax(); 1254 1255 vpte = be64_to_cpu(hptep[0]); 1256 1257 ret = 0; 1258 if (!(vpte & HPTE_V_VALID) && !(vpte & HPTE_V_ABSENT)) 1259 /* Nothing to do */ 1260 goto out; 1261 1262 /* Unmap */ 1263 rev = &old->rev[idx]; 1264 guest_rpte = rev->guest_rpte; 1265 1266 ret = -EIO; 1267 apsize = hpte_page_size(vpte, guest_rpte); 1268 if (!apsize) 1269 goto out; 1270 1271 if (vpte & HPTE_V_VALID) { 1272 unsigned long gfn = hpte_rpn(guest_rpte, apsize); 1273 int srcu_idx = srcu_read_lock(&kvm->srcu); 1274 struct kvm_memory_slot *memslot = 1275 __gfn_to_memslot(kvm_memslots(kvm), gfn); 1276 1277 if (memslot) { 1278 unsigned long *rmapp; 1279 rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; 1280 1281 lock_rmap(rmapp); 1282 kvmppc_unmap_hpte(kvm, idx, rmapp, gfn); 1283 unlock_rmap(rmapp); 1284 } 1285 1286 srcu_read_unlock(&kvm->srcu, srcu_idx); 1287 } 1288 1289 /* Reload PTE after unmap */ 1290 vpte = be64_to_cpu(hptep[0]); 1291 1292 BUG_ON(vpte & HPTE_V_VALID); 1293 BUG_ON(!(vpte & HPTE_V_ABSENT)); 1294 1295 ret = 0; 1296 if (!(vpte & HPTE_V_BOLTED)) 1297 goto out; 1298 1299 rpte = be64_to_cpu(hptep[1]); 1300 psize = hpte_base_page_size(vpte, rpte); 1301 avpn = HPTE_V_AVPN_VAL(vpte) & ~((psize - 1) >> 23); 1302 pteg = idx / HPTES_PER_GROUP; 1303 if (vpte & HPTE_V_SECONDARY) 1304 pteg = ~pteg; 1305 1306 if (!(vpte & HPTE_V_1TB_SEG)) { 1307 unsigned long offset, vsid; 1308 1309 /* We only have 28 - 23 bits of offset in avpn */ 1310 offset = (avpn & 0x1f) << 23; 1311 vsid = avpn >> 5; 1312 /* We can find more bits from the pteg value */ 1313 if (psize < (1ULL << 23)) 1314 offset |= ((vsid ^ pteg) & old_hash_mask) * psize; 1315 1316 hash = vsid ^ (offset / psize); 1317 } else { 1318 unsigned long offset, vsid; 1319 1320 /* We only have 40 - 23 bits of seg_off in avpn */ 1321 offset = (avpn & 0x1ffff) << 23; 1322 vsid = avpn >> 17; 1323 if (psize < (1ULL << 23)) 1324 offset |= ((vsid ^ (vsid << 25) ^ pteg) & old_hash_mask) * psize; 1325 1326 hash = vsid ^ (vsid << 25) ^ (offset / psize); 1327 } 1328 1329 new_pteg = hash & new_hash_mask; 1330 if (vpte & HPTE_V_SECONDARY) { 1331 BUG_ON(~pteg != (hash & old_hash_mask)); 1332 new_pteg = ~new_pteg; 1333 } else { 1334 BUG_ON(pteg != (hash & old_hash_mask)); 1335 } 1336 1337 new_idx = new_pteg * HPTES_PER_GROUP + (idx % HPTES_PER_GROUP); 1338 new_hptep = (__be64 *)(new->virt + (new_idx << 4)); 1339 1340 replace_vpte = be64_to_cpu(new_hptep[0]); 1341 1342 if (replace_vpte & (HPTE_V_VALID | HPTE_V_ABSENT)) { 1343 BUG_ON(new->order >= old->order); 1344 1345 if (replace_vpte & HPTE_V_BOLTED) { 1346 if (vpte & HPTE_V_BOLTED) 1347 /* Bolted collision, nothing we can do */ 1348 ret = -ENOSPC; 1349 /* Discard the new HPTE */ 1350 goto out; 1351 } 1352 1353 /* Discard the previous HPTE */ 1354 } 1355 1356 new_hptep[1] = cpu_to_be64(rpte); 1357 new->rev[new_idx].guest_rpte = guest_rpte; 1358 /* No need for a barrier, since new HPT isn't active */ 1359 new_hptep[0] = cpu_to_be64(vpte); 1360 unlock_hpte(new_hptep, vpte); 1361 1362 out: 1363 unlock_hpte(hptep, vpte); 1364 return ret; 1365 } 1366 1367 static int resize_hpt_rehash(struct kvm_resize_hpt *resize) 1368 { 1369 struct kvm *kvm = resize->kvm; 1370 unsigned long i; 1371 int rc; 1372 1373 /* 1374 * resize_hpt_rehash_hpte() doesn't handle the new-format HPTEs 
1375 * that POWER9 uses, and could well hit a BUG_ON on POWER9. 1376 */ 1377 if (cpu_has_feature(CPU_FTR_ARCH_300)) 1378 return -EIO; 1379 for (i = 0; i < kvmppc_hpt_npte(&kvm->arch.hpt); i++) { 1380 rc = resize_hpt_rehash_hpte(resize, i); 1381 if (rc != 0) 1382 return rc; 1383 } 1384 1385 return 0; 1386 } 1387 1388 static void resize_hpt_pivot(struct kvm_resize_hpt *resize) 1389 { 1390 struct kvm *kvm = resize->kvm; 1391 struct kvm_hpt_info hpt_tmp; 1392 1393 /* Exchange the pending tables in the resize structure with 1394 * the active tables */ 1395 1396 resize_hpt_debug(resize, "resize_hpt_pivot()\n"); 1397 1398 spin_lock(&kvm->mmu_lock); 1399 asm volatile("ptesync" : : : "memory"); 1400 1401 hpt_tmp = kvm->arch.hpt; 1402 kvmppc_set_hpt(kvm, &resize->hpt); 1403 resize->hpt = hpt_tmp; 1404 1405 spin_unlock(&kvm->mmu_lock); 1406 1407 synchronize_srcu_expedited(&kvm->srcu); 1408 1409 resize_hpt_debug(resize, "resize_hpt_pivot() done\n"); 1410 } 1411 1412 static void resize_hpt_release(struct kvm *kvm, struct kvm_resize_hpt *resize) 1413 { 1414 BUG_ON(kvm->arch.resize_hpt != resize); 1415 1416 if (!resize) 1417 return; 1418 1419 if (resize->hpt.virt) 1420 kvmppc_free_hpt(&resize->hpt); 1421 1422 kvm->arch.resize_hpt = NULL; 1423 kfree(resize); 1424 } 1425 1426 static void resize_hpt_prepare_work(struct work_struct *work) 1427 { 1428 struct kvm_resize_hpt *resize = container_of(work, 1429 struct kvm_resize_hpt, 1430 work); 1431 struct kvm *kvm = resize->kvm; 1432 int err; 1433 1434 resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n", 1435 resize->order); 1436 1437 err = resize_hpt_allocate(resize); 1438 1439 mutex_lock(&kvm->lock); 1440 1441 resize->error = err; 1442 resize->prepare_done = true; 1443 1444 mutex_unlock(&kvm->lock); 1445 } 1446 1447 long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm, 1448 struct kvm_ppc_resize_hpt *rhpt) 1449 { 1450 unsigned long flags = rhpt->flags; 1451 unsigned long shift = rhpt->shift; 1452 struct kvm_resize_hpt *resize; 1453 int ret; 1454 1455 if (flags != 0) 1456 return -EINVAL; 1457 1458 if (shift && ((shift < 18) || (shift > 46))) 1459 return -EINVAL; 1460 1461 mutex_lock(&kvm->lock); 1462 1463 resize = kvm->arch.resize_hpt; 1464 1465 if (resize) { 1466 if (resize->order == shift) { 1467 /* Suitable resize in progress */ 1468 if (resize->prepare_done) { 1469 ret = resize->error; 1470 if (ret != 0) 1471 resize_hpt_release(kvm, resize); 1472 } else { 1473 ret = 100; /* estimated time in ms */ 1474 } 1475 1476 goto out; 1477 } 1478 1479 /* not suitable, cancel it */ 1480 resize_hpt_release(kvm, resize); 1481 } 1482 1483 ret = 0; 1484 if (!shift) 1485 goto out; /* nothing to do */ 1486 1487 /* start new resize */ 1488 1489 resize = kzalloc(sizeof(*resize), GFP_KERNEL); 1490 if (!resize) { 1491 ret = -ENOMEM; 1492 goto out; 1493 } 1494 resize->order = shift; 1495 resize->kvm = kvm; 1496 INIT_WORK(&resize->work, resize_hpt_prepare_work); 1497 kvm->arch.resize_hpt = resize; 1498 1499 schedule_work(&resize->work); 1500 1501 ret = 100; /* estimated time in ms */ 1502 1503 out: 1504 mutex_unlock(&kvm->lock); 1505 return ret; 1506 } 1507 1508 static void resize_hpt_boot_vcpu(void *opaque) 1509 { 1510 /* Nothing to do, just force a KVM exit */ 1511 } 1512 1513 long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm, 1514 struct kvm_ppc_resize_hpt *rhpt) 1515 { 1516 unsigned long flags = rhpt->flags; 1517 unsigned long shift = rhpt->shift; 1518 struct kvm_resize_hpt *resize; 1519 long ret; 1520 1521 if (flags != 0) 1522 return -EINVAL; 1523 1524 if 
(shift && ((shift < 18) || (shift > 46))) 1525 return -EINVAL; 1526 1527 mutex_lock(&kvm->lock); 1528 1529 resize = kvm->arch.resize_hpt; 1530 1531 /* This shouldn't be possible */ 1532 ret = -EIO; 1533 if (WARN_ON(!kvm->arch.hpte_setup_done)) 1534 goto out_no_hpt; 1535 1536 /* Stop VCPUs from running while we mess with the HPT */ 1537 kvm->arch.hpte_setup_done = 0; 1538 smp_mb(); 1539 1540 /* Boot all CPUs out of the guest so they re-read 1541 * hpte_setup_done */ 1542 on_each_cpu(resize_hpt_boot_vcpu, NULL, 1); 1543 1544 ret = -ENXIO; 1545 if (!resize || (resize->order != shift)) 1546 goto out; 1547 1548 ret = -EBUSY; 1549 if (!resize->prepare_done) 1550 goto out; 1551 1552 ret = resize->error; 1553 if (ret != 0) 1554 goto out; 1555 1556 ret = resize_hpt_rehash(resize); 1557 if (ret != 0) 1558 goto out; 1559 1560 resize_hpt_pivot(resize); 1561 1562 out: 1563 /* Let VCPUs run again */ 1564 kvm->arch.hpte_setup_done = 1; 1565 smp_mb(); 1566 out_no_hpt: 1567 resize_hpt_release(kvm, resize); 1568 mutex_unlock(&kvm->lock); 1569 return ret; 1570 } 1571 1572 /* 1573 * Functions for reading and writing the hash table via reads and 1574 * writes on a file descriptor. 1575 * 1576 * Reads return the guest view of the hash table, which has to be 1577 * pieced together from the real hash table and the guest_rpte 1578 * values in the revmap array. 1579 * 1580 * On writes, each HPTE written is considered in turn, and if it 1581 * is valid, it is written to the HPT as if an H_ENTER with the 1582 * exact flag set was done. When the invalid count is non-zero 1583 * in the header written to the stream, the kernel will make 1584 * sure that that many HPTEs are invalid, and invalidate them 1585 * if not. 1586 */ 1587 1588 struct kvm_htab_ctx { 1589 unsigned long index; 1590 unsigned long flags; 1591 struct kvm *kvm; 1592 int first_pass; 1593 }; 1594 1595 #define HPTE_SIZE (2 * sizeof(unsigned long)) 1596 1597 /* 1598 * Returns 1 if this HPT entry has been modified or has pending 1599 * R/C bit changes. 
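* ("Pending" here means hardware has set R or C in the real HPTE but the guest view in guest_rpte has not caught up yet.)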
1600 */ 1601 static int hpte_dirty(struct revmap_entry *revp, __be64 *hptp) 1602 { 1603 unsigned long rcbits_unset; 1604 1605 if (revp->guest_rpte & HPTE_GR_MODIFIED) 1606 return 1; 1607 1608 /* Also need to consider changes in reference and changed bits */ 1609 rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C); 1610 if ((be64_to_cpu(hptp[0]) & HPTE_V_VALID) && 1611 (be64_to_cpu(hptp[1]) & rcbits_unset)) 1612 return 1; 1613 1614 return 0; 1615 } 1616 1617 static long record_hpte(unsigned long flags, __be64 *hptp, 1618 unsigned long *hpte, struct revmap_entry *revp, 1619 int want_valid, int first_pass) 1620 { 1621 unsigned long v, r, hr; 1622 unsigned long rcbits_unset; 1623 int ok = 1; 1624 int valid, dirty; 1625 1626 /* Unmodified entries are uninteresting except on the first pass */ 1627 dirty = hpte_dirty(revp, hptp); 1628 if (!first_pass && !dirty) 1629 return 0; 1630 1631 valid = 0; 1632 if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) { 1633 valid = 1; 1634 if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && 1635 !(be64_to_cpu(hptp[0]) & HPTE_V_BOLTED)) 1636 valid = 0; 1637 } 1638 if (valid != want_valid) 1639 return 0; 1640 1641 v = r = 0; 1642 if (valid || dirty) { 1643 /* lock the HPTE so it's stable and read it */ 1644 preempt_disable(); 1645 while (!try_lock_hpte(hptp, HPTE_V_HVLOCK)) 1646 cpu_relax(); 1647 v = be64_to_cpu(hptp[0]); 1648 hr = be64_to_cpu(hptp[1]); 1649 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 1650 v = hpte_new_to_old_v(v, hr); 1651 hr = hpte_new_to_old_r(hr); 1652 } 1653 1654 /* re-evaluate valid and dirty from synchronized HPTE value */ 1655 valid = !!(v & HPTE_V_VALID); 1656 dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED); 1657 1658 /* Harvest R and C into guest view if necessary */ 1659 rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C); 1660 if (valid && (rcbits_unset & hr)) { 1661 revp->guest_rpte |= (hr & 1662 (HPTE_R_R | HPTE_R_C)) | HPTE_GR_MODIFIED; 1663 dirty = 1; 1664 } 1665 1666 if (v & HPTE_V_ABSENT) { 1667 v &= ~HPTE_V_ABSENT; 1668 v |= HPTE_V_VALID; 1669 valid = 1; 1670 } 1671 if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && !(v & HPTE_V_BOLTED)) 1672 valid = 0; 1673 1674 r = revp->guest_rpte; 1675 /* only clear modified if this is the right sort of entry */ 1676 if (valid == want_valid && dirty) { 1677 r &= ~HPTE_GR_MODIFIED; 1678 revp->guest_rpte = r; 1679 } 1680 unlock_hpte(hptp, be64_to_cpu(hptp[0])); 1681 preempt_enable(); 1682 if (!(valid == want_valid && (first_pass || dirty))) 1683 ok = 0; 1684 } 1685 hpte[0] = cpu_to_be64(v); 1686 hpte[1] = cpu_to_be64(r); 1687 return ok; 1688 } 1689 1690 static ssize_t kvm_htab_read(struct file *file, char __user *buf, 1691 size_t count, loff_t *ppos) 1692 { 1693 struct kvm_htab_ctx *ctx = file->private_data; 1694 struct kvm *kvm = ctx->kvm; 1695 struct kvm_get_htab_header hdr; 1696 __be64 *hptp; 1697 struct revmap_entry *revp; 1698 unsigned long i, nb, nw; 1699 unsigned long __user *lbuf; 1700 struct kvm_get_htab_header __user *hptr; 1701 unsigned long flags; 1702 int first_pass; 1703 unsigned long hpte[2]; 1704 1705 if (!access_ok(VERIFY_WRITE, buf, count)) 1706 return -EFAULT; 1707 1708 first_pass = ctx->first_pass; 1709 flags = ctx->flags; 1710 1711 i = ctx->index; 1712 hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE)); 1713 revp = kvm->arch.hpt.rev + i; 1714 lbuf = (unsigned long __user *)buf; 1715 1716 nb = 0; 1717 while (nb + sizeof(hdr) + HPTE_SIZE < count) { 1718 /* Initialize header */ 1719 hptr = (struct kvm_get_htab_header __user *)buf; 1720 hdr.n_valid = 0; 1721 hdr.n_invalid = 
0; 1722 nw = nb; 1723 nb += sizeof(hdr); 1724 lbuf = (unsigned long __user *)(buf + sizeof(hdr)); 1725 1726 /* Skip uninteresting entries, i.e. clean on not-first pass */ 1727 if (!first_pass) { 1728 while (i < kvmppc_hpt_npte(&kvm->arch.hpt) && 1729 !hpte_dirty(revp, hptp)) { 1730 ++i; 1731 hptp += 2; 1732 ++revp; 1733 } 1734 } 1735 hdr.index = i; 1736 1737 /* Grab a series of valid entries */ 1738 while (i < kvmppc_hpt_npte(&kvm->arch.hpt) && 1739 hdr.n_valid < 0xffff && 1740 nb + HPTE_SIZE < count && 1741 record_hpte(flags, hptp, hpte, revp, 1, first_pass)) { 1742 /* valid entry, write it out */ 1743 ++hdr.n_valid; 1744 if (__put_user(hpte[0], lbuf) || 1745 __put_user(hpte[1], lbuf + 1)) 1746 return -EFAULT; 1747 nb += HPTE_SIZE; 1748 lbuf += 2; 1749 ++i; 1750 hptp += 2; 1751 ++revp; 1752 } 1753 /* Now skip invalid entries while we can */ 1754 while (i < kvmppc_hpt_npte(&kvm->arch.hpt) && 1755 hdr.n_invalid < 0xffff && 1756 record_hpte(flags, hptp, hpte, revp, 0, first_pass)) { 1757 /* found an invalid entry */ 1758 ++hdr.n_invalid; 1759 ++i; 1760 hptp += 2; 1761 ++revp; 1762 } 1763 1764 if (hdr.n_valid || hdr.n_invalid) { 1765 /* write back the header */ 1766 if (__copy_to_user(hptr, &hdr, sizeof(hdr))) 1767 return -EFAULT; 1768 nw = nb; 1769 buf = (char __user *)lbuf; 1770 } else { 1771 nb = nw; 1772 } 1773 1774 /* Check if we've wrapped around the hash table */ 1775 if (i >= kvmppc_hpt_npte(&kvm->arch.hpt)) { 1776 i = 0; 1777 ctx->first_pass = 0; 1778 break; 1779 } 1780 } 1781 1782 ctx->index = i; 1783 1784 return nb; 1785 } 1786 1787 static ssize_t kvm_htab_write(struct file *file, const char __user *buf, 1788 size_t count, loff_t *ppos) 1789 { 1790 struct kvm_htab_ctx *ctx = file->private_data; 1791 struct kvm *kvm = ctx->kvm; 1792 struct kvm_get_htab_header hdr; 1793 unsigned long i, j; 1794 unsigned long v, r; 1795 unsigned long __user *lbuf; 1796 __be64 *hptp; 1797 unsigned long tmp[2]; 1798 ssize_t nb; 1799 long int err, ret; 1800 int hpte_setup; 1801 1802 if (!access_ok(VERIFY_READ, buf, count)) 1803 return -EFAULT; 1804 1805 /* lock out vcpus from running while we're doing this */ 1806 mutex_lock(&kvm->lock); 1807 hpte_setup = kvm->arch.hpte_setup_done; 1808 if (hpte_setup) { 1809 kvm->arch.hpte_setup_done = 0; /* temporarily */ 1810 /* order hpte_setup_done vs. 
vcpus_running */ 1811 smp_mb(); 1812 if (atomic_read(&kvm->arch.vcpus_running)) { 1813 kvm->arch.hpte_setup_done = 1; 1814 mutex_unlock(&kvm->lock); 1815 return -EBUSY; 1816 } 1817 } 1818 1819 err = 0; 1820 for (nb = 0; nb + sizeof(hdr) <= count; ) { 1821 err = -EFAULT; 1822 if (__copy_from_user(&hdr, buf, sizeof(hdr))) 1823 break; 1824 1825 err = 0; 1826 if (nb + hdr.n_valid * HPTE_SIZE > count) 1827 break; 1828 1829 nb += sizeof(hdr); 1830 buf += sizeof(hdr); 1831 1832 err = -EINVAL; 1833 i = hdr.index; 1834 if (i >= kvmppc_hpt_npte(&kvm->arch.hpt) || 1835 i + hdr.n_valid + hdr.n_invalid > kvmppc_hpt_npte(&kvm->arch.hpt)) 1836 break; 1837 1838 hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE)); 1839 lbuf = (unsigned long __user *)buf; 1840 for (j = 0; j < hdr.n_valid; ++j) { 1841 __be64 hpte_v; 1842 __be64 hpte_r; 1843 1844 err = -EFAULT; 1845 if (__get_user(hpte_v, lbuf) || 1846 __get_user(hpte_r, lbuf + 1)) 1847 goto out; 1848 v = be64_to_cpu(hpte_v); 1849 r = be64_to_cpu(hpte_r); 1850 err = -EINVAL; 1851 if (!(v & HPTE_V_VALID)) 1852 goto out; 1853 lbuf += 2; 1854 nb += HPTE_SIZE; 1855 1856 if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) 1857 kvmppc_do_h_remove(kvm, 0, i, 0, tmp); 1858 err = -EIO; 1859 ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, i, v, r, 1860 tmp); 1861 if (ret != H_SUCCESS) { 1862 pr_err("kvm_htab_write ret %ld i=%ld v=%lx " 1863 "r=%lx\n", ret, i, v, r); 1864 goto out; 1865 } 1866 if (!hpte_setup && is_vrma_hpte(v)) { 1867 unsigned long psize = hpte_base_page_size(v, r); 1868 unsigned long senc = slb_pgsize_encoding(psize); 1869 unsigned long lpcr; 1870 1871 kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T | 1872 (VRMA_VSID << SLB_VSID_SHIFT_1T); 1873 lpcr = senc << (LPCR_VRMASD_SH - 4); 1874 kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD); 1875 hpte_setup = 1; 1876 } 1877 ++i; 1878 hptp += 2; 1879 } 1880 1881 for (j = 0; j < hdr.n_invalid; ++j) { 1882 if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) 1883 kvmppc_do_h_remove(kvm, 0, i, 0, tmp); 1884 ++i; 1885 hptp += 2; 1886 } 1887 err = 0; 1888 } 1889 1890 out: 1891 /* Order HPTE updates vs. hpte_setup_done */ 1892 smp_wmb(); 1893 kvm->arch.hpte_setup_done = hpte_setup; 1894 mutex_unlock(&kvm->lock); 1895 1896 if (err) 1897 return err; 1898 return nb; 1899 } 1900 1901 static int kvm_htab_release(struct inode *inode, struct file *filp) 1902 { 1903 struct kvm_htab_ctx *ctx = filp->private_data; 1904 1905 filp->private_data = NULL; 1906 if (!(ctx->flags & KVM_GET_HTAB_WRITE)) 1907 atomic_dec(&ctx->kvm->arch.hpte_mod_interest); 1908 kvm_put_kvm(ctx->kvm); 1909 kfree(ctx); 1910 return 0; 1911 } 1912 1913 static const struct file_operations kvm_htab_fops = { 1914 .read = kvm_htab_read, 1915 .write = kvm_htab_write, 1916 .llseek = default_llseek, 1917 .release = kvm_htab_release, 1918 }; 1919 1920 int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf) 1921 { 1922 int ret; 1923 struct kvm_htab_ctx *ctx; 1924 int rwflag; 1925 1926 /* reject flags we don't recognize */ 1927 if (ghf->flags & ~(KVM_GET_HTAB_BOLTED_ONLY | KVM_GET_HTAB_WRITE)) 1928 return -EINVAL; 1929 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); 1930 if (!ctx) 1931 return -ENOMEM; 1932 kvm_get_kvm(kvm); 1933 ctx->kvm = kvm; 1934 ctx->index = ghf->start_index; 1935 ctx->flags = ghf->flags; 1936 ctx->first_pass = 1; 1937 1938 rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? 
O_WRONLY : O_RDONLY; 1939 ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag | O_CLOEXEC); 1940 if (ret < 0) { 1941 kvm_put_kvm(kvm); 1942 return ret; 1943 } 1944 1945 if (rwflag == O_RDONLY) { 1946 mutex_lock(&kvm->slots_lock); 1947 atomic_inc(&kvm->arch.hpte_mod_interest); 1948 /* make sure kvmppc_do_h_enter etc. see the increment */ 1949 synchronize_srcu_expedited(&kvm->srcu); 1950 mutex_unlock(&kvm->slots_lock); 1951 } 1952 1953 return ret; 1954 } 1955 1956 struct debugfs_htab_state { 1957 struct kvm *kvm; 1958 struct mutex mutex; 1959 unsigned long hpt_index; 1960 int chars_left; 1961 int buf_index; 1962 char buf[64]; 1963 }; 1964 1965 static int debugfs_htab_open(struct inode *inode, struct file *file) 1966 { 1967 struct kvm *kvm = inode->i_private; 1968 struct debugfs_htab_state *p; 1969 1970 p = kzalloc(sizeof(*p), GFP_KERNEL); 1971 if (!p) 1972 return -ENOMEM; 1973 1974 kvm_get_kvm(kvm); 1975 p->kvm = kvm; 1976 mutex_init(&p->mutex); 1977 file->private_data = p; 1978 1979 return nonseekable_open(inode, file); 1980 } 1981 1982 static int debugfs_htab_release(struct inode *inode, struct file *file) 1983 { 1984 struct debugfs_htab_state *p = file->private_data; 1985 1986 kvm_put_kvm(p->kvm); 1987 kfree(p); 1988 return 0; 1989 } 1990 1991 static ssize_t debugfs_htab_read(struct file *file, char __user *buf, 1992 size_t len, loff_t *ppos) 1993 { 1994 struct debugfs_htab_state *p = file->private_data; 1995 ssize_t ret, r; 1996 unsigned long i, n; 1997 unsigned long v, hr, gr; 1998 struct kvm *kvm; 1999 __be64 *hptp; 2000 2001 ret = mutex_lock_interruptible(&p->mutex); 2002 if (ret) 2003 return ret; 2004 2005 if (p->chars_left) { 2006 n = p->chars_left; 2007 if (n > len) 2008 n = len; 2009 r = copy_to_user(buf, p->buf + p->buf_index, n); 2010 n -= r; 2011 p->chars_left -= n; 2012 p->buf_index += n; 2013 buf += n; 2014 len -= n; 2015 ret = n; 2016 if (r) { 2017 if (!n) 2018 ret = -EFAULT; 2019 goto out; 2020 } 2021 } 2022 2023 kvm = p->kvm; 2024 i = p->hpt_index; 2025 hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE)); 2026 for (; len != 0 && i < kvmppc_hpt_npte(&kvm->arch.hpt); 2027 ++i, hptp += 2) { 2028 if (!(be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT))) 2029 continue; 2030 2031 /* lock the HPTE so it's stable and read it */ 2032 preempt_disable(); 2033 while (!try_lock_hpte(hptp, HPTE_V_HVLOCK)) 2034 cpu_relax(); 2035 v = be64_to_cpu(hptp[0]) & ~HPTE_V_HVLOCK; 2036 hr = be64_to_cpu(hptp[1]); 2037 gr = kvm->arch.hpt.rev[i].guest_rpte; 2038 unlock_hpte(hptp, v); 2039 preempt_enable(); 2040 2041 if (!(v & (HPTE_V_VALID | HPTE_V_ABSENT))) 2042 continue; 2043 2044 n = scnprintf(p->buf, sizeof(p->buf), 2045 "%6lx %.16lx %.16lx %.16lx\n", 2046 i, v, hr, gr); 2047 p->chars_left = n; 2048 if (n > len) 2049 n = len; 2050 r = copy_to_user(buf, p->buf, n); 2051 n -= r; 2052 p->chars_left -= n; 2053 p->buf_index = n; 2054 buf += n; 2055 len -= n; 2056 ret += n; 2057 if (r) { 2058 if (!ret) 2059 ret = -EFAULT; 2060 goto out; 2061 } 2062 } 2063 p->hpt_index = i; 2064 2065 out: 2066 mutex_unlock(&p->mutex); 2067 return ret; 2068 } 2069 2070 static ssize_t debugfs_htab_write(struct file *file, const char __user *buf, 2071 size_t len, loff_t *ppos) 2072 { 2073 return -EACCES; 2074 } 2075 2076 static const struct file_operations debugfs_htab_fops = { 2077 .owner = THIS_MODULE, 2078 .open = debugfs_htab_open, 2079 .release = debugfs_htab_release, 2080 .read = debugfs_htab_read, 2081 .write = debugfs_htab_write, 2082 .llseek = generic_file_llseek, 2083 }; 2084 2085 void 
kvmppc_mmu_debugfs_init(struct kvm *kvm) 2086 { 2087 kvm->arch.htab_dentry = debugfs_create_file("htab", 0400, 2088 kvm->arch.debugfs_dir, kvm, 2089 &debugfs_htab_fops); 2090 } 2091 2092 void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu) 2093 { 2094 struct kvmppc_mmu *mmu = &vcpu->arch.mmu; 2095 2096 vcpu->arch.slb_nr = 32; /* POWER7/POWER8 */ 2097 2098 if (kvm_is_radix(vcpu->kvm)) 2099 mmu->xlate = kvmppc_mmu_radix_xlate; 2100 else 2101 mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate; 2102 mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr; 2103 2104 vcpu->arch.hflags |= BOOK3S_HFLAG_SLB; 2105 } 2106