/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 *
 * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 */

#include <linux/types.h>
#include <linux/string.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/highmem.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
#include <linux/srcu.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/debugfs.h>

#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include <asm/book3s/64/mmu-hash.h>
#include <asm/hvcall.h>
#include <asm/synch.h>
#include <asm/ppc-opcode.h>
#include <asm/cputable.h>
#include <asm/pte-walk.h>

#include "trace_hv.h"

//#define DEBUG_RESIZE_HPT 1

#ifdef DEBUG_RESIZE_HPT
#define resize_hpt_debug(resize, ...)				\
	do {							\
		printk(KERN_DEBUG "RESIZE HPT %p: ", resize);	\
		printk(__VA_ARGS__);				\
	} while (0)
#else
#define resize_hpt_debug(resize, ...)				\
	do { } while (0)
#endif

static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
				long pte_index, unsigned long pteh,
				unsigned long ptel, unsigned long *pte_idx_ret);

struct kvm_resize_hpt {
	/* These fields read-only after init */
	struct kvm *kvm;
	struct work_struct work;
	u32 order;

	/* These fields protected by kvm->lock */

	/* Possible values and their usage:
	 *  <0     an error occurred during allocation,
	 *  -EBUSY allocation is in progress,
	 *  0      allocation made successfully.
	 */
	int error;

	/* Private to the work thread, until error != -EBUSY,
	 * then protected by kvm->lock.
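	 * (This is the tentative new HPT built by the resize worker; it is
	 * swapped with the active HPT in resize_hpt_pivot().)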
77 */ 78 struct kvm_hpt_info hpt; 79 }; 80 81 int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order) 82 { 83 unsigned long hpt = 0; 84 int cma = 0; 85 struct page *page = NULL; 86 struct revmap_entry *rev; 87 unsigned long npte; 88 89 if ((order < PPC_MIN_HPT_ORDER) || (order > PPC_MAX_HPT_ORDER)) 90 return -EINVAL; 91 92 page = kvm_alloc_hpt_cma(1ul << (order - PAGE_SHIFT)); 93 if (page) { 94 hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); 95 memset((void *)hpt, 0, (1ul << order)); 96 cma = 1; 97 } 98 99 if (!hpt) 100 hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_RETRY_MAYFAIL 101 |__GFP_NOWARN, order - PAGE_SHIFT); 102 103 if (!hpt) 104 return -ENOMEM; 105 106 /* HPTEs are 2**4 bytes long */ 107 npte = 1ul << (order - 4); 108 109 /* Allocate reverse map array */ 110 rev = vmalloc(array_size(npte, sizeof(struct revmap_entry))); 111 if (!rev) { 112 if (cma) 113 kvm_free_hpt_cma(page, 1 << (order - PAGE_SHIFT)); 114 else 115 free_pages(hpt, order - PAGE_SHIFT); 116 return -ENOMEM; 117 } 118 119 info->order = order; 120 info->virt = hpt; 121 info->cma = cma; 122 info->rev = rev; 123 124 return 0; 125 } 126 127 void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info) 128 { 129 atomic64_set(&kvm->arch.mmio_update, 0); 130 kvm->arch.hpt = *info; 131 kvm->arch.sdr1 = __pa(info->virt) | (info->order - 18); 132 133 pr_debug("KVM guest htab at %lx (order %ld), LPID %x\n", 134 info->virt, (long)info->order, kvm->arch.lpid); 135 } 136 137 long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order) 138 { 139 long err = -EBUSY; 140 struct kvm_hpt_info info; 141 142 mutex_lock(&kvm->lock); 143 if (kvm->arch.mmu_ready) { 144 kvm->arch.mmu_ready = 0; 145 /* order mmu_ready vs. vcpus_running */ 146 smp_mb(); 147 if (atomic_read(&kvm->arch.vcpus_running)) { 148 kvm->arch.mmu_ready = 1; 149 goto out; 150 } 151 } 152 if (kvm_is_radix(kvm)) { 153 err = kvmppc_switch_mmu_to_hpt(kvm); 154 if (err) 155 goto out; 156 } 157 158 if (kvm->arch.hpt.order == order) { 159 /* We already have a suitable HPT */ 160 161 /* Set the entire HPT to 0, i.e. invalid HPTEs */ 162 memset((void *)kvm->arch.hpt.virt, 0, 1ul << order); 163 /* 164 * Reset all the reverse-mapping chains for all memslots 165 */ 166 kvmppc_rmap_reset(kvm); 167 err = 0; 168 goto out; 169 } 170 171 if (kvm->arch.hpt.virt) { 172 kvmppc_free_hpt(&kvm->arch.hpt); 173 kvmppc_rmap_reset(kvm); 174 } 175 176 err = kvmppc_allocate_hpt(&info, order); 177 if (err < 0) 178 goto out; 179 kvmppc_set_hpt(kvm, &info); 180 181 out: 182 if (err == 0) 183 /* Ensure that each vcpu will flush its TLB on next entry. */ 184 cpumask_setall(&kvm->arch.need_tlb_flush); 185 186 mutex_unlock(&kvm->lock); 187 return err; 188 } 189 190 void kvmppc_free_hpt(struct kvm_hpt_info *info) 191 { 192 vfree(info->rev); 193 info->rev = NULL; 194 if (info->cma) 195 kvm_free_hpt_cma(virt_to_page(info->virt), 196 1 << (info->order - PAGE_SHIFT)); 197 else if (info->virt) 198 free_pages(info->virt, info->order - PAGE_SHIFT); 199 info->virt = 0; 200 info->order = 0; 201 } 202 203 /* Bits in first HPTE dword for pagesize 4k, 64k or 16M */ 204 static inline unsigned long hpte0_pgsize_encoding(unsigned long pgsize) 205 { 206 return (pgsize > 0x1000) ? HPTE_V_LARGE : 0; 207 } 208 209 /* Bits in second HPTE dword for pagesize 4k, 64k or 16M */ 210 static inline unsigned long hpte1_pgsize_encoding(unsigned long pgsize) 211 { 212 return (pgsize == 0x10000) ? 
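	/* only 64k pages need extra encoding bits in the second doubleword */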
0x1000 : 0; 213 } 214 215 void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, 216 unsigned long porder) 217 { 218 unsigned long i; 219 unsigned long npages; 220 unsigned long hp_v, hp_r; 221 unsigned long addr, hash; 222 unsigned long psize; 223 unsigned long hp0, hp1; 224 unsigned long idx_ret; 225 long ret; 226 struct kvm *kvm = vcpu->kvm; 227 228 psize = 1ul << porder; 229 npages = memslot->npages >> (porder - PAGE_SHIFT); 230 231 /* VRMA can't be > 1TB */ 232 if (npages > 1ul << (40 - porder)) 233 npages = 1ul << (40 - porder); 234 /* Can't use more than 1 HPTE per HPTEG */ 235 if (npages > kvmppc_hpt_mask(&kvm->arch.hpt) + 1) 236 npages = kvmppc_hpt_mask(&kvm->arch.hpt) + 1; 237 238 hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) | 239 HPTE_V_BOLTED | hpte0_pgsize_encoding(psize); 240 hp1 = hpte1_pgsize_encoding(psize) | 241 HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX; 242 243 for (i = 0; i < npages; ++i) { 244 addr = i << porder; 245 /* can't use hpt_hash since va > 64 bits */ 246 hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) 247 & kvmppc_hpt_mask(&kvm->arch.hpt); 248 /* 249 * We assume that the hash table is empty and no 250 * vcpus are using it at this stage. Since we create 251 * at most one HPTE per HPTEG, we just assume entry 7 252 * is available and use it. 253 */ 254 hash = (hash << 3) + 7; 255 hp_v = hp0 | ((addr >> 16) & ~0x7fUL); 256 hp_r = hp1 | addr; 257 ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, hash, hp_v, hp_r, 258 &idx_ret); 259 if (ret != H_SUCCESS) { 260 pr_err("KVM: map_vrma at %lx failed, ret=%ld\n", 261 addr, ret); 262 break; 263 } 264 } 265 } 266 267 int kvmppc_mmu_hv_init(void) 268 { 269 unsigned long host_lpid, rsvd_lpid; 270 271 if (!cpu_has_feature(CPU_FTR_HVMODE)) 272 return -EINVAL; 273 274 if (!mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE)) 275 return -EINVAL; 276 277 /* POWER7 has 10-bit LPIDs (12-bit in POWER8) */ 278 host_lpid = mfspr(SPRN_LPID); 279 rsvd_lpid = LPID_RSVD; 280 281 kvmppc_init_lpid(rsvd_lpid + 1); 282 283 kvmppc_claim_lpid(host_lpid); 284 /* rsvd_lpid is reserved for use in partition switching */ 285 kvmppc_claim_lpid(rsvd_lpid); 286 287 return 0; 288 } 289 290 static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu) 291 { 292 unsigned long msr = vcpu->arch.intr_msr; 293 294 /* If transactional, change to suspend mode on IRQ delivery */ 295 if (MSR_TM_TRANSACTIONAL(vcpu->arch.shregs.msr)) 296 msr |= MSR_TS_S; 297 else 298 msr |= vcpu->arch.shregs.msr & MSR_TS_MASK; 299 kvmppc_set_msr(vcpu, msr); 300 } 301 302 static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags, 303 long pte_index, unsigned long pteh, 304 unsigned long ptel, unsigned long *pte_idx_ret) 305 { 306 long ret; 307 308 /* Protect linux PTE lookup from page table destruction */ 309 rcu_read_lock_sched(); /* this disables preemption too */ 310 ret = kvmppc_do_h_enter(kvm, flags, pte_index, pteh, ptel, 311 current->mm->pgd, false, pte_idx_ret); 312 rcu_read_unlock_sched(); 313 if (ret == H_TOO_HARD) { 314 /* this can't happen */ 315 pr_err("KVM: Oops, kvmppc_h_enter returned too hard!\n"); 316 ret = H_RESOURCE; /* or something */ 317 } 318 return ret; 319 320 } 321 322 static struct kvmppc_slb *kvmppc_mmu_book3s_hv_find_slbe(struct kvm_vcpu *vcpu, 323 gva_t eaddr) 324 { 325 u64 mask; 326 int i; 327 328 for (i = 0; i < vcpu->arch.slb_nr; i++) { 329 if (!(vcpu->arch.slb[i].orige & SLB_ESID_V)) 330 continue; 331 332 if (vcpu->arch.slb[i].origv & SLB_VSID_B_1T) 333 mask = ESID_MASK_1T; 334 else 335 mask = ESID_MASK; 336 337 if 
(((vcpu->arch.slb[i].orige ^ eaddr) & mask) == 0) 338 return &vcpu->arch.slb[i]; 339 } 340 return NULL; 341 } 342 343 static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r, 344 unsigned long ea) 345 { 346 unsigned long ra_mask; 347 348 ra_mask = kvmppc_actual_pgsz(v, r) - 1; 349 return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask); 350 } 351 352 static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, 353 struct kvmppc_pte *gpte, bool data, bool iswrite) 354 { 355 struct kvm *kvm = vcpu->kvm; 356 struct kvmppc_slb *slbe; 357 unsigned long slb_v; 358 unsigned long pp, key; 359 unsigned long v, orig_v, gr; 360 __be64 *hptep; 361 int index; 362 int virtmode = vcpu->arch.shregs.msr & (data ? MSR_DR : MSR_IR); 363 364 if (kvm_is_radix(vcpu->kvm)) 365 return kvmppc_mmu_radix_xlate(vcpu, eaddr, gpte, data, iswrite); 366 367 /* Get SLB entry */ 368 if (virtmode) { 369 slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr); 370 if (!slbe) 371 return -EINVAL; 372 slb_v = slbe->origv; 373 } else { 374 /* real mode access */ 375 slb_v = vcpu->kvm->arch.vrma_slb_v; 376 } 377 378 preempt_disable(); 379 /* Find the HPTE in the hash table */ 380 index = kvmppc_hv_find_lock_hpte(kvm, eaddr, slb_v, 381 HPTE_V_VALID | HPTE_V_ABSENT); 382 if (index < 0) { 383 preempt_enable(); 384 return -ENOENT; 385 } 386 hptep = (__be64 *)(kvm->arch.hpt.virt + (index << 4)); 387 v = orig_v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK; 388 if (cpu_has_feature(CPU_FTR_ARCH_300)) 389 v = hpte_new_to_old_v(v, be64_to_cpu(hptep[1])); 390 gr = kvm->arch.hpt.rev[index].guest_rpte; 391 392 unlock_hpte(hptep, orig_v); 393 preempt_enable(); 394 395 gpte->eaddr = eaddr; 396 gpte->vpage = ((v & HPTE_V_AVPN) << 4) | ((eaddr >> 12) & 0xfff); 397 398 /* Get PP bits and key for permission check */ 399 pp = gr & (HPTE_R_PP0 | HPTE_R_PP); 400 key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS; 401 key &= slb_v; 402 403 /* Calculate permissions */ 404 gpte->may_read = hpte_read_permission(pp, key); 405 gpte->may_write = hpte_write_permission(pp, key); 406 gpte->may_execute = gpte->may_read && !(gr & (HPTE_R_N | HPTE_R_G)); 407 408 /* Storage key permission check for POWER7 */ 409 if (data && virtmode) { 410 int amrfield = hpte_get_skey_perm(gr, vcpu->arch.amr); 411 if (amrfield & 1) 412 gpte->may_read = 0; 413 if (amrfield & 2) 414 gpte->may_write = 0; 415 } 416 417 /* Get the guest physical address */ 418 gpte->raddr = kvmppc_mmu_get_real_addr(v, gr, eaddr); 419 return 0; 420 } 421 422 /* 423 * Quick test for whether an instruction is a load or a store. 424 * If the instruction is a load or a store, then this will indicate 425 * which it is, at least on server processors. (Embedded processors 426 * have some external PID instructions that don't follow the rule 427 * embodied here.) If the instruction isn't a load or store, then 428 * this doesn't return anything useful. 429 */ 430 static int instruction_is_store(unsigned int instr) 431 { 432 unsigned int mask; 433 434 mask = 0x10000000; 435 if ((instr & 0xfc000000) == 0x7c000000) 436 mask = 0x100; /* major opcode 31 */ 437 return (instr & mask) != 0; 438 } 439 440 int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu, 441 unsigned long gpa, gva_t ea, int is_store) 442 { 443 u32 last_inst; 444 445 /* 446 * If we fail, we just return to the guest and try executing it again. 
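	 * (That is, if kvmppc_get_last_inst() below cannot fetch the
	 * instruction that caused the fault.)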
	 */
	if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst) !=
	    EMULATE_DONE)
		return RESUME_GUEST;

	/*
	 * WARNING: We do not know for sure whether the instruction we just
	 * read from memory is the same that caused the fault in the first
	 * place.  If the instruction we read is neither a load nor a store,
	 * then it can't access memory, so we don't need to worry about
	 * enforcing access permissions.  So, assuming it is a load or
	 * store, we just check that its direction (load or store) is
	 * consistent with the original fault, since that's what we
	 * checked the access permissions against.  If there is a mismatch
	 * we just return and retry the instruction.
	 */

	if (instruction_is_store(last_inst) != !!is_store)
		return RESUME_GUEST;

	/*
	 * Emulated accesses are emulated by looking at the hash for
	 * translation once, then performing the access later.  The
	 * translation could be invalidated in the meantime, at which
	 * point performing the subsequent memory access on the old
	 * physical address could possibly be a security hole for the
	 * guest (but not the host).
	 *
	 * This is less of an issue for MMIO stores since they aren't
	 * globally visible.  It could be an issue for MMIO loads to
	 * a certain extent but we'll ignore it for now.
	 */

	vcpu->arch.paddr_accessed = gpa;
	vcpu->arch.vaddr_accessed = ea;
	return kvmppc_emulate_mmio(run, vcpu);
}

int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
				unsigned long ea, unsigned long dsisr)
{
	struct kvm *kvm = vcpu->kvm;
	unsigned long hpte[3], r;
	unsigned long hnow_v, hnow_r;
	__be64 *hptep;
	unsigned long mmu_seq, psize, pte_size;
	unsigned long gpa_base, gfn_base;
	unsigned long gpa, gfn, hva, pfn;
	struct kvm_memory_slot *memslot;
	unsigned long *rmap;
	struct revmap_entry *rev;
	struct page *page, *pages[1];
	long index, ret, npages;
	bool is_ci;
	unsigned int writing, write_ok;
	struct vm_area_struct *vma;
	unsigned long rcbits;
	long mmio_update;

	if (kvm_is_radix(kvm))
		return kvmppc_book3s_radix_page_fault(run, vcpu, ea, dsisr);

	/*
	 * Real-mode code has already searched the HPT and found the
	 * entry we're interested in.  Lock the entry and check that
	 * it hasn't changed.  If it has, just return and re-execute the
	 * instruction.
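	 * (The real-mode handler saved the faulting address, HPTE index and
	 * HPTE contents in vcpu->arch.pgfault_*, which we use below.)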
	 */
	if (ea != vcpu->arch.pgfault_addr)
		return RESUME_GUEST;

	if (vcpu->arch.pgfault_cache) {
		mmio_update = atomic64_read(&kvm->arch.mmio_update);
		if (mmio_update == vcpu->arch.pgfault_cache->mmio_update) {
			r = vcpu->arch.pgfault_cache->rpte;
			psize = kvmppc_actual_pgsz(vcpu->arch.pgfault_hpte[0],
						   r);
			gpa_base = r & HPTE_R_RPN & ~(psize - 1);
			gfn_base = gpa_base >> PAGE_SHIFT;
			gpa = gpa_base | (ea & (psize - 1));
			return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
						dsisr & DSISR_ISSTORE);
		}
	}
	index = vcpu->arch.pgfault_index;
	hptep = (__be64 *)(kvm->arch.hpt.virt + (index << 4));
	rev = &kvm->arch.hpt.rev[index];
	preempt_disable();
	while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
		cpu_relax();
	hpte[0] = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
	hpte[1] = be64_to_cpu(hptep[1]);
	hpte[2] = r = rev->guest_rpte;
	unlock_hpte(hptep, hpte[0]);
	preempt_enable();

	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
		hpte[0] = hpte_new_to_old_v(hpte[0], hpte[1]);
		hpte[1] = hpte_new_to_old_r(hpte[1]);
	}
	if (hpte[0] != vcpu->arch.pgfault_hpte[0] ||
	    hpte[1] != vcpu->arch.pgfault_hpte[1])
		return RESUME_GUEST;

	/* Translate the logical address and get the page */
	psize = kvmppc_actual_pgsz(hpte[0], r);
	gpa_base = r & HPTE_R_RPN & ~(psize - 1);
	gfn_base = gpa_base >> PAGE_SHIFT;
	gpa = gpa_base | (ea & (psize - 1));
	gfn = gpa >> PAGE_SHIFT;
	memslot = gfn_to_memslot(kvm, gfn);

	trace_kvm_page_fault_enter(vcpu, hpte, memslot, ea, dsisr);

	/* No memslot means it's an emulated MMIO region */
	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
		return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
					      dsisr & DSISR_ISSTORE);

	/*
	 * This should never happen, because of the slot_is_aligned()
	 * check in kvmppc_do_h_enter().
	 */
	if (gfn_base < memslot->base_gfn)
		return -EFAULT;

	/* used to check for invalidations in progress */
	mmu_seq = kvm->mmu_notifier_seq;
	smp_rmb();

	ret = -EFAULT;
	is_ci = false;
	pfn = 0;
	page = NULL;
	pte_size = PAGE_SIZE;
	writing = (dsisr & DSISR_ISSTORE) != 0;
	/* If writing != 0, then the HPTE must allow writing, if we get here */
	write_ok = writing;
	hva = gfn_to_hva_memslot(memslot, gfn);
	npages = get_user_pages_fast(hva, 1, writing, pages);
	if (npages < 1) {
		/* Check if it's an I/O mapping */
		down_read(&current->mm->mmap_sem);
		vma = find_vma(current->mm, hva);
		if (vma && vma->vm_start <= hva && hva + psize <= vma->vm_end &&
		    (vma->vm_flags & VM_PFNMAP)) {
			pfn = vma->vm_pgoff +
				((hva - vma->vm_start) >> PAGE_SHIFT);
			pte_size = psize;
			is_ci = pte_ci(__pte((pgprot_val(vma->vm_page_prot))));
			write_ok = vma->vm_flags & VM_WRITE;
		}
		up_read(&current->mm->mmap_sem);
		if (!pfn)
			goto out_put;
	} else {
		page = pages[0];
		pfn = page_to_pfn(page);
		if (PageHuge(page)) {
			page = compound_head(page);
			pte_size <<= compound_order(page);
		}
		/* if the guest wants write access, see if that is OK */
		if (!writing && hpte_is_writable(r)) {
			pte_t *ptep, pte;
			unsigned long flags;
			/*
			 * We need to protect against page table destruction
			 * hugepage split and collapse.
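			 * (That is why interrupts are disabled around the
			 * find_current_mm_pte() call below.)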
616 */ 617 local_irq_save(flags); 618 ptep = find_current_mm_pte(current->mm->pgd, 619 hva, NULL, NULL); 620 if (ptep) { 621 pte = kvmppc_read_update_linux_pte(ptep, 1); 622 if (__pte_write(pte)) 623 write_ok = 1; 624 } 625 local_irq_restore(flags); 626 } 627 } 628 629 if (psize > pte_size) 630 goto out_put; 631 632 /* Check WIMG vs. the actual page we're accessing */ 633 if (!hpte_cache_flags_ok(r, is_ci)) { 634 if (is_ci) 635 goto out_put; 636 /* 637 * Allow guest to map emulated device memory as 638 * uncacheable, but actually make it cacheable. 639 */ 640 r = (r & ~(HPTE_R_W|HPTE_R_I|HPTE_R_G)) | HPTE_R_M; 641 } 642 643 /* 644 * Set the HPTE to point to pfn. 645 * Since the pfn is at PAGE_SIZE granularity, make sure we 646 * don't mask out lower-order bits if psize < PAGE_SIZE. 647 */ 648 if (psize < PAGE_SIZE) 649 psize = PAGE_SIZE; 650 r = (r & HPTE_R_KEY_HI) | (r & ~(HPTE_R_PP0 - psize)) | 651 ((pfn << PAGE_SHIFT) & ~(psize - 1)); 652 if (hpte_is_writable(r) && !write_ok) 653 r = hpte_make_readonly(r); 654 ret = RESUME_GUEST; 655 preempt_disable(); 656 while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) 657 cpu_relax(); 658 hnow_v = be64_to_cpu(hptep[0]); 659 hnow_r = be64_to_cpu(hptep[1]); 660 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 661 hnow_v = hpte_new_to_old_v(hnow_v, hnow_r); 662 hnow_r = hpte_new_to_old_r(hnow_r); 663 } 664 665 /* 666 * If the HPT is being resized, don't update the HPTE, 667 * instead let the guest retry after the resize operation is complete. 668 * The synchronization for mmu_ready test vs. set is provided 669 * by the HPTE lock. 670 */ 671 if (!kvm->arch.mmu_ready) 672 goto out_unlock; 673 674 if ((hnow_v & ~HPTE_V_HVLOCK) != hpte[0] || hnow_r != hpte[1] || 675 rev->guest_rpte != hpte[2]) 676 /* HPTE has been changed under us; let the guest retry */ 677 goto out_unlock; 678 hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID; 679 680 /* Always put the HPTE in the rmap chain for the page base address */ 681 rmap = &memslot->arch.rmap[gfn_base - memslot->base_gfn]; 682 lock_rmap(rmap); 683 684 /* Check if we might have been invalidated; let the guest retry if so */ 685 ret = RESUME_GUEST; 686 if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) { 687 unlock_rmap(rmap); 688 goto out_unlock; 689 } 690 691 /* Only set R/C in real HPTE if set in both *rmap and guest_rpte */ 692 rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT; 693 r &= rcbits | ~(HPTE_R_R | HPTE_R_C); 694 695 if (be64_to_cpu(hptep[0]) & HPTE_V_VALID) { 696 /* HPTE was previously valid, so we need to invalidate it */ 697 unlock_rmap(rmap); 698 hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); 699 kvmppc_invalidate_hpte(kvm, hptep, index); 700 /* don't lose previous R and C bits */ 701 r |= be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C); 702 } else { 703 kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0); 704 } 705 706 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 707 r = hpte_old_to_new_r(hpte[0], r); 708 hpte[0] = hpte_old_to_new_v(hpte[0]); 709 } 710 hptep[1] = cpu_to_be64(r); 711 eieio(); 712 __unlock_hpte(hptep, hpte[0]); 713 asm volatile("ptesync" : : : "memory"); 714 preempt_enable(); 715 if (page && hpte_is_writable(r)) 716 SetPageDirty(page); 717 718 out_put: 719 trace_kvm_page_fault_exit(vcpu, hpte, ret); 720 721 if (page) { 722 /* 723 * We drop pages[0] here, not page because page might 724 * have been set to the head page of a compound, but 725 * we have to drop the reference on the correct tail 726 * page to match the get inside gup() 727 */ 728 put_page(pages[0]); 729 } 730 return ret; 731 732 out_unlock: 733 
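	/*
	 * The HPT is being resized, the HPTE changed under us, or an MMU
	 * notifier invalidation is in progress: drop the HPTE lock and let
	 * the guest retry.
	 */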
__unlock_hpte(hptep, be64_to_cpu(hptep[0])); 734 preempt_enable(); 735 goto out_put; 736 } 737 738 void kvmppc_rmap_reset(struct kvm *kvm) 739 { 740 struct kvm_memslots *slots; 741 struct kvm_memory_slot *memslot; 742 int srcu_idx; 743 744 srcu_idx = srcu_read_lock(&kvm->srcu); 745 slots = kvm_memslots(kvm); 746 kvm_for_each_memslot(memslot, slots) { 747 /* 748 * This assumes it is acceptable to lose reference and 749 * change bits across a reset. 750 */ 751 memset(memslot->arch.rmap, 0, 752 memslot->npages * sizeof(*memslot->arch.rmap)); 753 } 754 srcu_read_unlock(&kvm->srcu, srcu_idx); 755 } 756 757 typedef int (*hva_handler_fn)(struct kvm *kvm, struct kvm_memory_slot *memslot, 758 unsigned long gfn); 759 760 static int kvm_handle_hva_range(struct kvm *kvm, 761 unsigned long start, 762 unsigned long end, 763 hva_handler_fn handler) 764 { 765 int ret; 766 int retval = 0; 767 struct kvm_memslots *slots; 768 struct kvm_memory_slot *memslot; 769 770 slots = kvm_memslots(kvm); 771 kvm_for_each_memslot(memslot, slots) { 772 unsigned long hva_start, hva_end; 773 gfn_t gfn, gfn_end; 774 775 hva_start = max(start, memslot->userspace_addr); 776 hva_end = min(end, memslot->userspace_addr + 777 (memslot->npages << PAGE_SHIFT)); 778 if (hva_start >= hva_end) 779 continue; 780 /* 781 * {gfn(page) | page intersects with [hva_start, hva_end)} = 782 * {gfn, gfn+1, ..., gfn_end-1}. 783 */ 784 gfn = hva_to_gfn_memslot(hva_start, memslot); 785 gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); 786 787 for (; gfn < gfn_end; ++gfn) { 788 ret = handler(kvm, memslot, gfn); 789 retval |= ret; 790 } 791 } 792 793 return retval; 794 } 795 796 static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, 797 hva_handler_fn handler) 798 { 799 return kvm_handle_hva_range(kvm, hva, hva + 1, handler); 800 } 801 802 /* Must be called with both HPTE and rmap locked */ 803 static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i, 804 struct kvm_memory_slot *memslot, 805 unsigned long *rmapp, unsigned long gfn) 806 { 807 __be64 *hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4)); 808 struct revmap_entry *rev = kvm->arch.hpt.rev; 809 unsigned long j, h; 810 unsigned long ptel, psize, rcbits; 811 812 j = rev[i].forw; 813 if (j == i) { 814 /* chain is now empty */ 815 *rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX); 816 } else { 817 /* remove i from chain */ 818 h = rev[i].back; 819 rev[h].forw = j; 820 rev[j].back = h; 821 rev[i].forw = rev[i].back = i; 822 *rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j; 823 } 824 825 /* Now check and modify the HPTE */ 826 ptel = rev[i].guest_rpte; 827 psize = kvmppc_actual_pgsz(be64_to_cpu(hptep[0]), ptel); 828 if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) && 829 hpte_rpn(ptel, psize) == gfn) { 830 hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); 831 kvmppc_invalidate_hpte(kvm, hptep, i); 832 hptep[1] &= ~cpu_to_be64(HPTE_R_KEY_HI | HPTE_R_KEY_LO); 833 /* Harvest R and C */ 834 rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C); 835 *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT; 836 if ((rcbits & HPTE_R_C) && memslot->dirty_bitmap) 837 kvmppc_update_dirty_map(memslot, gfn, psize); 838 if (rcbits & ~rev[i].guest_rpte) { 839 rev[i].guest_rpte = ptel | rcbits; 840 note_hpte_modification(kvm, &rev[i]); 841 } 842 } 843 } 844 845 static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, 846 unsigned long gfn) 847 { 848 unsigned long i; 849 __be64 *hptep; 850 unsigned long *rmapp; 851 852 rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; 853 for (;;) { 854 
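		/*
		 * Remove the HPTEs on this page's rmap chain one at a time
		 * until the chain is empty.
		 */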
lock_rmap(rmapp); 855 if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { 856 unlock_rmap(rmapp); 857 break; 858 } 859 860 /* 861 * To avoid an ABBA deadlock with the HPTE lock bit, 862 * we can't spin on the HPTE lock while holding the 863 * rmap chain lock. 864 */ 865 i = *rmapp & KVMPPC_RMAP_INDEX; 866 hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4)); 867 if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { 868 /* unlock rmap before spinning on the HPTE lock */ 869 unlock_rmap(rmapp); 870 while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK) 871 cpu_relax(); 872 continue; 873 } 874 875 kvmppc_unmap_hpte(kvm, i, memslot, rmapp, gfn); 876 unlock_rmap(rmapp); 877 __unlock_hpte(hptep, be64_to_cpu(hptep[0])); 878 } 879 return 0; 880 } 881 882 int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end) 883 { 884 hva_handler_fn handler; 885 886 handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp; 887 kvm_handle_hva_range(kvm, start, end, handler); 888 return 0; 889 } 890 891 void kvmppc_core_flush_memslot_hv(struct kvm *kvm, 892 struct kvm_memory_slot *memslot) 893 { 894 unsigned long gfn; 895 unsigned long n; 896 unsigned long *rmapp; 897 898 gfn = memslot->base_gfn; 899 rmapp = memslot->arch.rmap; 900 for (n = memslot->npages; n; --n, ++gfn) { 901 if (kvm_is_radix(kvm)) { 902 kvm_unmap_radix(kvm, memslot, gfn); 903 continue; 904 } 905 /* 906 * Testing the present bit without locking is OK because 907 * the memslot has been marked invalid already, and hence 908 * no new HPTEs referencing this page can be created, 909 * thus the present bit can't go from 0 to 1. 910 */ 911 if (*rmapp & KVMPPC_RMAP_PRESENT) 912 kvm_unmap_rmapp(kvm, memslot, gfn); 913 ++rmapp; 914 } 915 } 916 917 static int kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, 918 unsigned long gfn) 919 { 920 struct revmap_entry *rev = kvm->arch.hpt.rev; 921 unsigned long head, i, j; 922 __be64 *hptep; 923 int ret = 0; 924 unsigned long *rmapp; 925 926 rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; 927 retry: 928 lock_rmap(rmapp); 929 if (*rmapp & KVMPPC_RMAP_REFERENCED) { 930 *rmapp &= ~KVMPPC_RMAP_REFERENCED; 931 ret = 1; 932 } 933 if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { 934 unlock_rmap(rmapp); 935 return ret; 936 } 937 938 i = head = *rmapp & KVMPPC_RMAP_INDEX; 939 do { 940 hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4)); 941 j = rev[i].forw; 942 943 /* If this HPTE isn't referenced, ignore it */ 944 if (!(be64_to_cpu(hptep[1]) & HPTE_R_R)) 945 continue; 946 947 if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { 948 /* unlock rmap before spinning on the HPTE lock */ 949 unlock_rmap(rmapp); 950 while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK) 951 cpu_relax(); 952 goto retry; 953 } 954 955 /* Now check and modify the HPTE */ 956 if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) && 957 (be64_to_cpu(hptep[1]) & HPTE_R_R)) { 958 kvmppc_clear_ref_hpte(kvm, hptep, i); 959 if (!(rev[i].guest_rpte & HPTE_R_R)) { 960 rev[i].guest_rpte |= HPTE_R_R; 961 note_hpte_modification(kvm, &rev[i]); 962 } 963 ret = 1; 964 } 965 __unlock_hpte(hptep, be64_to_cpu(hptep[0])); 966 } while ((i = j) != head); 967 968 unlock_rmap(rmapp); 969 return ret; 970 } 971 972 int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, unsigned long end) 973 { 974 hva_handler_fn handler; 975 976 handler = kvm_is_radix(kvm) ? 
kvm_age_radix : kvm_age_rmapp; 977 return kvm_handle_hva_range(kvm, start, end, handler); 978 } 979 980 static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, 981 unsigned long gfn) 982 { 983 struct revmap_entry *rev = kvm->arch.hpt.rev; 984 unsigned long head, i, j; 985 unsigned long *hp; 986 int ret = 1; 987 unsigned long *rmapp; 988 989 rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; 990 if (*rmapp & KVMPPC_RMAP_REFERENCED) 991 return 1; 992 993 lock_rmap(rmapp); 994 if (*rmapp & KVMPPC_RMAP_REFERENCED) 995 goto out; 996 997 if (*rmapp & KVMPPC_RMAP_PRESENT) { 998 i = head = *rmapp & KVMPPC_RMAP_INDEX; 999 do { 1000 hp = (unsigned long *)(kvm->arch.hpt.virt + (i << 4)); 1001 j = rev[i].forw; 1002 if (be64_to_cpu(hp[1]) & HPTE_R_R) 1003 goto out; 1004 } while ((i = j) != head); 1005 } 1006 ret = 0; 1007 1008 out: 1009 unlock_rmap(rmapp); 1010 return ret; 1011 } 1012 1013 int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva) 1014 { 1015 hva_handler_fn handler; 1016 1017 handler = kvm_is_radix(kvm) ? kvm_test_age_radix : kvm_test_age_rmapp; 1018 return kvm_handle_hva(kvm, hva, handler); 1019 } 1020 1021 void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte) 1022 { 1023 hva_handler_fn handler; 1024 1025 handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp; 1026 kvm_handle_hva(kvm, hva, handler); 1027 } 1028 1029 static int vcpus_running(struct kvm *kvm) 1030 { 1031 return atomic_read(&kvm->arch.vcpus_running) != 0; 1032 } 1033 1034 /* 1035 * Returns the number of system pages that are dirty. 1036 * This can be more than 1 if we find a huge-page HPTE. 1037 */ 1038 static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp) 1039 { 1040 struct revmap_entry *rev = kvm->arch.hpt.rev; 1041 unsigned long head, i, j; 1042 unsigned long n; 1043 unsigned long v, r; 1044 __be64 *hptep; 1045 int npages_dirty = 0; 1046 1047 retry: 1048 lock_rmap(rmapp); 1049 if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { 1050 unlock_rmap(rmapp); 1051 return npages_dirty; 1052 } 1053 1054 i = head = *rmapp & KVMPPC_RMAP_INDEX; 1055 do { 1056 unsigned long hptep1; 1057 hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4)); 1058 j = rev[i].forw; 1059 1060 /* 1061 * Checking the C (changed) bit here is racy since there 1062 * is no guarantee about when the hardware writes it back. 1063 * If the HPTE is not writable then it is stable since the 1064 * page can't be written to, and we would have done a tlbie 1065 * (which forces the hardware to complete any writeback) 1066 * when making the HPTE read-only. 1067 * If vcpus are running then this call is racy anyway 1068 * since the page could get dirtied subsequently, so we 1069 * expect there to be a further call which would pick up 1070 * any delayed C bit writeback. 1071 * Otherwise we need to do the tlbie even if C==0 in 1072 * order to pick up any delayed writeback of C. 
1073 */ 1074 hptep1 = be64_to_cpu(hptep[1]); 1075 if (!(hptep1 & HPTE_R_C) && 1076 (!hpte_is_writable(hptep1) || vcpus_running(kvm))) 1077 continue; 1078 1079 if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { 1080 /* unlock rmap before spinning on the HPTE lock */ 1081 unlock_rmap(rmapp); 1082 while (hptep[0] & cpu_to_be64(HPTE_V_HVLOCK)) 1083 cpu_relax(); 1084 goto retry; 1085 } 1086 1087 /* Now check and modify the HPTE */ 1088 if (!(hptep[0] & cpu_to_be64(HPTE_V_VALID))) { 1089 __unlock_hpte(hptep, be64_to_cpu(hptep[0])); 1090 continue; 1091 } 1092 1093 /* need to make it temporarily absent so C is stable */ 1094 hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); 1095 kvmppc_invalidate_hpte(kvm, hptep, i); 1096 v = be64_to_cpu(hptep[0]); 1097 r = be64_to_cpu(hptep[1]); 1098 if (r & HPTE_R_C) { 1099 hptep[1] = cpu_to_be64(r & ~HPTE_R_C); 1100 if (!(rev[i].guest_rpte & HPTE_R_C)) { 1101 rev[i].guest_rpte |= HPTE_R_C; 1102 note_hpte_modification(kvm, &rev[i]); 1103 } 1104 n = kvmppc_actual_pgsz(v, r); 1105 n = (n + PAGE_SIZE - 1) >> PAGE_SHIFT; 1106 if (n > npages_dirty) 1107 npages_dirty = n; 1108 eieio(); 1109 } 1110 v &= ~HPTE_V_ABSENT; 1111 v |= HPTE_V_VALID; 1112 __unlock_hpte(hptep, v); 1113 } while ((i = j) != head); 1114 1115 unlock_rmap(rmapp); 1116 return npages_dirty; 1117 } 1118 1119 void kvmppc_harvest_vpa_dirty(struct kvmppc_vpa *vpa, 1120 struct kvm_memory_slot *memslot, 1121 unsigned long *map) 1122 { 1123 unsigned long gfn; 1124 1125 if (!vpa->dirty || !vpa->pinned_addr) 1126 return; 1127 gfn = vpa->gpa >> PAGE_SHIFT; 1128 if (gfn < memslot->base_gfn || 1129 gfn >= memslot->base_gfn + memslot->npages) 1130 return; 1131 1132 vpa->dirty = false; 1133 if (map) 1134 __set_bit_le(gfn - memslot->base_gfn, map); 1135 } 1136 1137 long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm, 1138 struct kvm_memory_slot *memslot, unsigned long *map) 1139 { 1140 unsigned long i; 1141 unsigned long *rmapp; 1142 1143 preempt_disable(); 1144 rmapp = memslot->arch.rmap; 1145 for (i = 0; i < memslot->npages; ++i) { 1146 int npages = kvm_test_clear_dirty_npages(kvm, rmapp); 1147 /* 1148 * Note that if npages > 0 then i must be a multiple of npages, 1149 * since we always put huge-page HPTEs in the rmap chain 1150 * corresponding to their page base address. 
1151 */ 1152 if (npages) 1153 set_dirty_bits(map, i, npages); 1154 ++rmapp; 1155 } 1156 preempt_enable(); 1157 return 0; 1158 } 1159 1160 void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa, 1161 unsigned long *nb_ret) 1162 { 1163 struct kvm_memory_slot *memslot; 1164 unsigned long gfn = gpa >> PAGE_SHIFT; 1165 struct page *page, *pages[1]; 1166 int npages; 1167 unsigned long hva, offset; 1168 int srcu_idx; 1169 1170 srcu_idx = srcu_read_lock(&kvm->srcu); 1171 memslot = gfn_to_memslot(kvm, gfn); 1172 if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) 1173 goto err; 1174 hva = gfn_to_hva_memslot(memslot, gfn); 1175 npages = get_user_pages_fast(hva, 1, 1, pages); 1176 if (npages < 1) 1177 goto err; 1178 page = pages[0]; 1179 srcu_read_unlock(&kvm->srcu, srcu_idx); 1180 1181 offset = gpa & (PAGE_SIZE - 1); 1182 if (nb_ret) 1183 *nb_ret = PAGE_SIZE - offset; 1184 return page_address(page) + offset; 1185 1186 err: 1187 srcu_read_unlock(&kvm->srcu, srcu_idx); 1188 return NULL; 1189 } 1190 1191 void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa, 1192 bool dirty) 1193 { 1194 struct page *page = virt_to_page(va); 1195 struct kvm_memory_slot *memslot; 1196 unsigned long gfn; 1197 int srcu_idx; 1198 1199 put_page(page); 1200 1201 if (!dirty) 1202 return; 1203 1204 /* We need to mark this page dirty in the memslot dirty_bitmap, if any */ 1205 gfn = gpa >> PAGE_SHIFT; 1206 srcu_idx = srcu_read_lock(&kvm->srcu); 1207 memslot = gfn_to_memslot(kvm, gfn); 1208 if (memslot && memslot->dirty_bitmap) 1209 set_bit_le(gfn - memslot->base_gfn, memslot->dirty_bitmap); 1210 srcu_read_unlock(&kvm->srcu, srcu_idx); 1211 } 1212 1213 /* 1214 * HPT resizing 1215 */ 1216 static int resize_hpt_allocate(struct kvm_resize_hpt *resize) 1217 { 1218 int rc; 1219 1220 rc = kvmppc_allocate_hpt(&resize->hpt, resize->order); 1221 if (rc < 0) 1222 return rc; 1223 1224 resize_hpt_debug(resize, "resize_hpt_allocate(): HPT @ 0x%lx\n", 1225 resize->hpt.virt); 1226 1227 return 0; 1228 } 1229 1230 static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize, 1231 unsigned long idx) 1232 { 1233 struct kvm *kvm = resize->kvm; 1234 struct kvm_hpt_info *old = &kvm->arch.hpt; 1235 struct kvm_hpt_info *new = &resize->hpt; 1236 unsigned long old_hash_mask = (1ULL << (old->order - 7)) - 1; 1237 unsigned long new_hash_mask = (1ULL << (new->order - 7)) - 1; 1238 __be64 *hptep, *new_hptep; 1239 unsigned long vpte, rpte, guest_rpte; 1240 int ret; 1241 struct revmap_entry *rev; 1242 unsigned long apsize, avpn, pteg, hash; 1243 unsigned long new_idx, new_pteg, replace_vpte; 1244 int pshift; 1245 1246 hptep = (__be64 *)(old->virt + (idx << 4)); 1247 1248 /* Guest is stopped, so new HPTEs can't be added or faulted 1249 * in, only unmapped or altered by host actions. 
So, it's 1250 * safe to check this before we take the HPTE lock */ 1251 vpte = be64_to_cpu(hptep[0]); 1252 if (!(vpte & HPTE_V_VALID) && !(vpte & HPTE_V_ABSENT)) 1253 return 0; /* nothing to do */ 1254 1255 while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) 1256 cpu_relax(); 1257 1258 vpte = be64_to_cpu(hptep[0]); 1259 1260 ret = 0; 1261 if (!(vpte & HPTE_V_VALID) && !(vpte & HPTE_V_ABSENT)) 1262 /* Nothing to do */ 1263 goto out; 1264 1265 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 1266 rpte = be64_to_cpu(hptep[1]); 1267 vpte = hpte_new_to_old_v(vpte, rpte); 1268 } 1269 1270 /* Unmap */ 1271 rev = &old->rev[idx]; 1272 guest_rpte = rev->guest_rpte; 1273 1274 ret = -EIO; 1275 apsize = kvmppc_actual_pgsz(vpte, guest_rpte); 1276 if (!apsize) 1277 goto out; 1278 1279 if (vpte & HPTE_V_VALID) { 1280 unsigned long gfn = hpte_rpn(guest_rpte, apsize); 1281 int srcu_idx = srcu_read_lock(&kvm->srcu); 1282 struct kvm_memory_slot *memslot = 1283 __gfn_to_memslot(kvm_memslots(kvm), gfn); 1284 1285 if (memslot) { 1286 unsigned long *rmapp; 1287 rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; 1288 1289 lock_rmap(rmapp); 1290 kvmppc_unmap_hpte(kvm, idx, memslot, rmapp, gfn); 1291 unlock_rmap(rmapp); 1292 } 1293 1294 srcu_read_unlock(&kvm->srcu, srcu_idx); 1295 } 1296 1297 /* Reload PTE after unmap */ 1298 vpte = be64_to_cpu(hptep[0]); 1299 BUG_ON(vpte & HPTE_V_VALID); 1300 BUG_ON(!(vpte & HPTE_V_ABSENT)); 1301 1302 ret = 0; 1303 if (!(vpte & HPTE_V_BOLTED)) 1304 goto out; 1305 1306 rpte = be64_to_cpu(hptep[1]); 1307 1308 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 1309 vpte = hpte_new_to_old_v(vpte, rpte); 1310 rpte = hpte_new_to_old_r(rpte); 1311 } 1312 1313 pshift = kvmppc_hpte_base_page_shift(vpte, rpte); 1314 avpn = HPTE_V_AVPN_VAL(vpte) & ~(((1ul << pshift) - 1) >> 23); 1315 pteg = idx / HPTES_PER_GROUP; 1316 if (vpte & HPTE_V_SECONDARY) 1317 pteg = ~pteg; 1318 1319 if (!(vpte & HPTE_V_1TB_SEG)) { 1320 unsigned long offset, vsid; 1321 1322 /* We only have 28 - 23 bits of offset in avpn */ 1323 offset = (avpn & 0x1f) << 23; 1324 vsid = avpn >> 5; 1325 /* We can find more bits from the pteg value */ 1326 if (pshift < 23) 1327 offset |= ((vsid ^ pteg) & old_hash_mask) << pshift; 1328 1329 hash = vsid ^ (offset >> pshift); 1330 } else { 1331 unsigned long offset, vsid; 1332 1333 /* We only have 40 - 23 bits of seg_off in avpn */ 1334 offset = (avpn & 0x1ffff) << 23; 1335 vsid = avpn >> 17; 1336 if (pshift < 23) 1337 offset |= ((vsid ^ (vsid << 25) ^ pteg) & old_hash_mask) << pshift; 1338 1339 hash = vsid ^ (vsid << 25) ^ (offset >> pshift); 1340 } 1341 1342 new_pteg = hash & new_hash_mask; 1343 if (vpte & HPTE_V_SECONDARY) 1344 new_pteg = ~hash & new_hash_mask; 1345 1346 new_idx = new_pteg * HPTES_PER_GROUP + (idx % HPTES_PER_GROUP); 1347 new_hptep = (__be64 *)(new->virt + (new_idx << 4)); 1348 1349 replace_vpte = be64_to_cpu(new_hptep[0]); 1350 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 1351 unsigned long replace_rpte = be64_to_cpu(new_hptep[1]); 1352 replace_vpte = hpte_new_to_old_v(replace_vpte, replace_rpte); 1353 } 1354 1355 if (replace_vpte & (HPTE_V_VALID | HPTE_V_ABSENT)) { 1356 BUG_ON(new->order >= old->order); 1357 1358 if (replace_vpte & HPTE_V_BOLTED) { 1359 if (vpte & HPTE_V_BOLTED) 1360 /* Bolted collision, nothing we can do */ 1361 ret = -ENOSPC; 1362 /* Discard the new HPTE */ 1363 goto out; 1364 } 1365 1366 /* Discard the previous HPTE */ 1367 } 1368 1369 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 1370 rpte = hpte_old_to_new_r(vpte, rpte); 1371 vpte = hpte_old_to_new_v(vpte); 1372 } 1373 1374 
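	/* Copy the rehashed HPTE into its slot in the new hash table. */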
new_hptep[1] = cpu_to_be64(rpte); 1375 new->rev[new_idx].guest_rpte = guest_rpte; 1376 /* No need for a barrier, since new HPT isn't active */ 1377 new_hptep[0] = cpu_to_be64(vpte); 1378 unlock_hpte(new_hptep, vpte); 1379 1380 out: 1381 unlock_hpte(hptep, vpte); 1382 return ret; 1383 } 1384 1385 static int resize_hpt_rehash(struct kvm_resize_hpt *resize) 1386 { 1387 struct kvm *kvm = resize->kvm; 1388 unsigned long i; 1389 int rc; 1390 1391 for (i = 0; i < kvmppc_hpt_npte(&kvm->arch.hpt); i++) { 1392 rc = resize_hpt_rehash_hpte(resize, i); 1393 if (rc != 0) 1394 return rc; 1395 } 1396 1397 return 0; 1398 } 1399 1400 static void resize_hpt_pivot(struct kvm_resize_hpt *resize) 1401 { 1402 struct kvm *kvm = resize->kvm; 1403 struct kvm_hpt_info hpt_tmp; 1404 1405 /* Exchange the pending tables in the resize structure with 1406 * the active tables */ 1407 1408 resize_hpt_debug(resize, "resize_hpt_pivot()\n"); 1409 1410 spin_lock(&kvm->mmu_lock); 1411 asm volatile("ptesync" : : : "memory"); 1412 1413 hpt_tmp = kvm->arch.hpt; 1414 kvmppc_set_hpt(kvm, &resize->hpt); 1415 resize->hpt = hpt_tmp; 1416 1417 spin_unlock(&kvm->mmu_lock); 1418 1419 synchronize_srcu_expedited(&kvm->srcu); 1420 1421 if (cpu_has_feature(CPU_FTR_ARCH_300)) 1422 kvmppc_setup_partition_table(kvm); 1423 1424 resize_hpt_debug(resize, "resize_hpt_pivot() done\n"); 1425 } 1426 1427 static void resize_hpt_release(struct kvm *kvm, struct kvm_resize_hpt *resize) 1428 { 1429 if (WARN_ON(!mutex_is_locked(&kvm->lock))) 1430 return; 1431 1432 if (!resize) 1433 return; 1434 1435 if (resize->error != -EBUSY) { 1436 if (resize->hpt.virt) 1437 kvmppc_free_hpt(&resize->hpt); 1438 kfree(resize); 1439 } 1440 1441 if (kvm->arch.resize_hpt == resize) 1442 kvm->arch.resize_hpt = NULL; 1443 } 1444 1445 static void resize_hpt_prepare_work(struct work_struct *work) 1446 { 1447 struct kvm_resize_hpt *resize = container_of(work, 1448 struct kvm_resize_hpt, 1449 work); 1450 struct kvm *kvm = resize->kvm; 1451 int err = 0; 1452 1453 if (WARN_ON(resize->error != -EBUSY)) 1454 return; 1455 1456 mutex_lock(&kvm->lock); 1457 1458 /* Request is still current? */ 1459 if (kvm->arch.resize_hpt == resize) { 1460 /* We may request large allocations here: 1461 * do not sleep with kvm->lock held for a while. 1462 */ 1463 mutex_unlock(&kvm->lock); 1464 1465 resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n", 1466 resize->order); 1467 1468 err = resize_hpt_allocate(resize); 1469 1470 /* We have strict assumption about -EBUSY 1471 * when preparing for HPT resize. 1472 */ 1473 if (WARN_ON(err == -EBUSY)) 1474 err = -EINPROGRESS; 1475 1476 mutex_lock(&kvm->lock); 1477 /* It is possible that kvm->arch.resize_hpt != resize 1478 * after we grab kvm->lock again. 1479 */ 1480 } 1481 1482 resize->error = err; 1483 1484 if (kvm->arch.resize_hpt != resize) 1485 resize_hpt_release(kvm, resize); 1486 1487 mutex_unlock(&kvm->lock); 1488 } 1489 1490 long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm, 1491 struct kvm_ppc_resize_hpt *rhpt) 1492 { 1493 unsigned long flags = rhpt->flags; 1494 unsigned long shift = rhpt->shift; 1495 struct kvm_resize_hpt *resize; 1496 int ret; 1497 1498 if (flags != 0 || kvm_is_radix(kvm)) 1499 return -EINVAL; 1500 1501 if (shift && ((shift < 18) || (shift > 46))) 1502 return -EINVAL; 1503 1504 mutex_lock(&kvm->lock); 1505 1506 resize = kvm->arch.resize_hpt; 1507 1508 if (resize) { 1509 if (resize->order == shift) { 1510 /* Suitable resize in progress? 
*/ 1511 ret = resize->error; 1512 if (ret == -EBUSY) 1513 ret = 100; /* estimated time in ms */ 1514 else if (ret) 1515 resize_hpt_release(kvm, resize); 1516 1517 goto out; 1518 } 1519 1520 /* not suitable, cancel it */ 1521 resize_hpt_release(kvm, resize); 1522 } 1523 1524 ret = 0; 1525 if (!shift) 1526 goto out; /* nothing to do */ 1527 1528 /* start new resize */ 1529 1530 resize = kzalloc(sizeof(*resize), GFP_KERNEL); 1531 if (!resize) { 1532 ret = -ENOMEM; 1533 goto out; 1534 } 1535 1536 resize->error = -EBUSY; 1537 resize->order = shift; 1538 resize->kvm = kvm; 1539 INIT_WORK(&resize->work, resize_hpt_prepare_work); 1540 kvm->arch.resize_hpt = resize; 1541 1542 schedule_work(&resize->work); 1543 1544 ret = 100; /* estimated time in ms */ 1545 1546 out: 1547 mutex_unlock(&kvm->lock); 1548 return ret; 1549 } 1550 1551 static void resize_hpt_boot_vcpu(void *opaque) 1552 { 1553 /* Nothing to do, just force a KVM exit */ 1554 } 1555 1556 long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm, 1557 struct kvm_ppc_resize_hpt *rhpt) 1558 { 1559 unsigned long flags = rhpt->flags; 1560 unsigned long shift = rhpt->shift; 1561 struct kvm_resize_hpt *resize; 1562 long ret; 1563 1564 if (flags != 0 || kvm_is_radix(kvm)) 1565 return -EINVAL; 1566 1567 if (shift && ((shift < 18) || (shift > 46))) 1568 return -EINVAL; 1569 1570 mutex_lock(&kvm->lock); 1571 1572 resize = kvm->arch.resize_hpt; 1573 1574 /* This shouldn't be possible */ 1575 ret = -EIO; 1576 if (WARN_ON(!kvm->arch.mmu_ready)) 1577 goto out_no_hpt; 1578 1579 /* Stop VCPUs from running while we mess with the HPT */ 1580 kvm->arch.mmu_ready = 0; 1581 smp_mb(); 1582 1583 /* Boot all CPUs out of the guest so they re-read 1584 * mmu_ready */ 1585 on_each_cpu(resize_hpt_boot_vcpu, NULL, 1); 1586 1587 ret = -ENXIO; 1588 if (!resize || (resize->order != shift)) 1589 goto out; 1590 1591 ret = resize->error; 1592 if (ret) 1593 goto out; 1594 1595 ret = resize_hpt_rehash(resize); 1596 if (ret) 1597 goto out; 1598 1599 resize_hpt_pivot(resize); 1600 1601 out: 1602 /* Let VCPUs run again */ 1603 kvm->arch.mmu_ready = 1; 1604 smp_mb(); 1605 out_no_hpt: 1606 resize_hpt_release(kvm, resize); 1607 mutex_unlock(&kvm->lock); 1608 return ret; 1609 } 1610 1611 /* 1612 * Functions for reading and writing the hash table via reads and 1613 * writes on a file descriptor. 1614 * 1615 * Reads return the guest view of the hash table, which has to be 1616 * pieced together from the real hash table and the guest_rpte 1617 * values in the revmap array. 1618 * 1619 * On writes, each HPTE written is considered in turn, and if it 1620 * is valid, it is written to the HPT as if an H_ENTER with the 1621 * exact flag set was done. When the invalid count is non-zero 1622 * in the header written to the stream, the kernel will make 1623 * sure that that many HPTEs are invalid, and invalidate them 1624 * if not. 1625 */ 1626 1627 struct kvm_htab_ctx { 1628 unsigned long index; 1629 unsigned long flags; 1630 struct kvm *kvm; 1631 int first_pass; 1632 }; 1633 1634 #define HPTE_SIZE (2 * sizeof(unsigned long)) 1635 1636 /* 1637 * Returns 1 if this HPT entry has been modified or has pending 1638 * R/C bit changes. 
1639 */ 1640 static int hpte_dirty(struct revmap_entry *revp, __be64 *hptp) 1641 { 1642 unsigned long rcbits_unset; 1643 1644 if (revp->guest_rpte & HPTE_GR_MODIFIED) 1645 return 1; 1646 1647 /* Also need to consider changes in reference and changed bits */ 1648 rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C); 1649 if ((be64_to_cpu(hptp[0]) & HPTE_V_VALID) && 1650 (be64_to_cpu(hptp[1]) & rcbits_unset)) 1651 return 1; 1652 1653 return 0; 1654 } 1655 1656 static long record_hpte(unsigned long flags, __be64 *hptp, 1657 unsigned long *hpte, struct revmap_entry *revp, 1658 int want_valid, int first_pass) 1659 { 1660 unsigned long v, r, hr; 1661 unsigned long rcbits_unset; 1662 int ok = 1; 1663 int valid, dirty; 1664 1665 /* Unmodified entries are uninteresting except on the first pass */ 1666 dirty = hpte_dirty(revp, hptp); 1667 if (!first_pass && !dirty) 1668 return 0; 1669 1670 valid = 0; 1671 if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) { 1672 valid = 1; 1673 if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && 1674 !(be64_to_cpu(hptp[0]) & HPTE_V_BOLTED)) 1675 valid = 0; 1676 } 1677 if (valid != want_valid) 1678 return 0; 1679 1680 v = r = 0; 1681 if (valid || dirty) { 1682 /* lock the HPTE so it's stable and read it */ 1683 preempt_disable(); 1684 while (!try_lock_hpte(hptp, HPTE_V_HVLOCK)) 1685 cpu_relax(); 1686 v = be64_to_cpu(hptp[0]); 1687 hr = be64_to_cpu(hptp[1]); 1688 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 1689 v = hpte_new_to_old_v(v, hr); 1690 hr = hpte_new_to_old_r(hr); 1691 } 1692 1693 /* re-evaluate valid and dirty from synchronized HPTE value */ 1694 valid = !!(v & HPTE_V_VALID); 1695 dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED); 1696 1697 /* Harvest R and C into guest view if necessary */ 1698 rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C); 1699 if (valid && (rcbits_unset & hr)) { 1700 revp->guest_rpte |= (hr & 1701 (HPTE_R_R | HPTE_R_C)) | HPTE_GR_MODIFIED; 1702 dirty = 1; 1703 } 1704 1705 if (v & HPTE_V_ABSENT) { 1706 v &= ~HPTE_V_ABSENT; 1707 v |= HPTE_V_VALID; 1708 valid = 1; 1709 } 1710 if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && !(v & HPTE_V_BOLTED)) 1711 valid = 0; 1712 1713 r = revp->guest_rpte; 1714 /* only clear modified if this is the right sort of entry */ 1715 if (valid == want_valid && dirty) { 1716 r &= ~HPTE_GR_MODIFIED; 1717 revp->guest_rpte = r; 1718 } 1719 unlock_hpte(hptp, be64_to_cpu(hptp[0])); 1720 preempt_enable(); 1721 if (!(valid == want_valid && (first_pass || dirty))) 1722 ok = 0; 1723 } 1724 hpte[0] = cpu_to_be64(v); 1725 hpte[1] = cpu_to_be64(r); 1726 return ok; 1727 } 1728 1729 static ssize_t kvm_htab_read(struct file *file, char __user *buf, 1730 size_t count, loff_t *ppos) 1731 { 1732 struct kvm_htab_ctx *ctx = file->private_data; 1733 struct kvm *kvm = ctx->kvm; 1734 struct kvm_get_htab_header hdr; 1735 __be64 *hptp; 1736 struct revmap_entry *revp; 1737 unsigned long i, nb, nw; 1738 unsigned long __user *lbuf; 1739 struct kvm_get_htab_header __user *hptr; 1740 unsigned long flags; 1741 int first_pass; 1742 unsigned long hpte[2]; 1743 1744 if (!access_ok(VERIFY_WRITE, buf, count)) 1745 return -EFAULT; 1746 if (kvm_is_radix(kvm)) 1747 return 0; 1748 1749 first_pass = ctx->first_pass; 1750 flags = ctx->flags; 1751 1752 i = ctx->index; 1753 hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE)); 1754 revp = kvm->arch.hpt.rev + i; 1755 lbuf = (unsigned long __user *)buf; 1756 1757 nb = 0; 1758 while (nb + sizeof(hdr) + HPTE_SIZE < count) { 1759 /* Initialize header */ 1760 hptr = (struct kvm_get_htab_header __user *)buf; 
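		/*
		 * Each record written to the stream is a kvm_get_htab_header
		 * followed by hdr.n_valid HPTEs of HPTE_SIZE bytes each.
		 */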
1761 hdr.n_valid = 0; 1762 hdr.n_invalid = 0; 1763 nw = nb; 1764 nb += sizeof(hdr); 1765 lbuf = (unsigned long __user *)(buf + sizeof(hdr)); 1766 1767 /* Skip uninteresting entries, i.e. clean on not-first pass */ 1768 if (!first_pass) { 1769 while (i < kvmppc_hpt_npte(&kvm->arch.hpt) && 1770 !hpte_dirty(revp, hptp)) { 1771 ++i; 1772 hptp += 2; 1773 ++revp; 1774 } 1775 } 1776 hdr.index = i; 1777 1778 /* Grab a series of valid entries */ 1779 while (i < kvmppc_hpt_npte(&kvm->arch.hpt) && 1780 hdr.n_valid < 0xffff && 1781 nb + HPTE_SIZE < count && 1782 record_hpte(flags, hptp, hpte, revp, 1, first_pass)) { 1783 /* valid entry, write it out */ 1784 ++hdr.n_valid; 1785 if (__put_user(hpte[0], lbuf) || 1786 __put_user(hpte[1], lbuf + 1)) 1787 return -EFAULT; 1788 nb += HPTE_SIZE; 1789 lbuf += 2; 1790 ++i; 1791 hptp += 2; 1792 ++revp; 1793 } 1794 /* Now skip invalid entries while we can */ 1795 while (i < kvmppc_hpt_npte(&kvm->arch.hpt) && 1796 hdr.n_invalid < 0xffff && 1797 record_hpte(flags, hptp, hpte, revp, 0, first_pass)) { 1798 /* found an invalid entry */ 1799 ++hdr.n_invalid; 1800 ++i; 1801 hptp += 2; 1802 ++revp; 1803 } 1804 1805 if (hdr.n_valid || hdr.n_invalid) { 1806 /* write back the header */ 1807 if (__copy_to_user(hptr, &hdr, sizeof(hdr))) 1808 return -EFAULT; 1809 nw = nb; 1810 buf = (char __user *)lbuf; 1811 } else { 1812 nb = nw; 1813 } 1814 1815 /* Check if we've wrapped around the hash table */ 1816 if (i >= kvmppc_hpt_npte(&kvm->arch.hpt)) { 1817 i = 0; 1818 ctx->first_pass = 0; 1819 break; 1820 } 1821 } 1822 1823 ctx->index = i; 1824 1825 return nb; 1826 } 1827 1828 static ssize_t kvm_htab_write(struct file *file, const char __user *buf, 1829 size_t count, loff_t *ppos) 1830 { 1831 struct kvm_htab_ctx *ctx = file->private_data; 1832 struct kvm *kvm = ctx->kvm; 1833 struct kvm_get_htab_header hdr; 1834 unsigned long i, j; 1835 unsigned long v, r; 1836 unsigned long __user *lbuf; 1837 __be64 *hptp; 1838 unsigned long tmp[2]; 1839 ssize_t nb; 1840 long int err, ret; 1841 int mmu_ready; 1842 int pshift; 1843 1844 if (!access_ok(VERIFY_READ, buf, count)) 1845 return -EFAULT; 1846 if (kvm_is_radix(kvm)) 1847 return -EINVAL; 1848 1849 /* lock out vcpus from running while we're doing this */ 1850 mutex_lock(&kvm->lock); 1851 mmu_ready = kvm->arch.mmu_ready; 1852 if (mmu_ready) { 1853 kvm->arch.mmu_ready = 0; /* temporarily */ 1854 /* order mmu_ready vs. 
vcpus_running */ 1855 smp_mb(); 1856 if (atomic_read(&kvm->arch.vcpus_running)) { 1857 kvm->arch.mmu_ready = 1; 1858 mutex_unlock(&kvm->lock); 1859 return -EBUSY; 1860 } 1861 } 1862 1863 err = 0; 1864 for (nb = 0; nb + sizeof(hdr) <= count; ) { 1865 err = -EFAULT; 1866 if (__copy_from_user(&hdr, buf, sizeof(hdr))) 1867 break; 1868 1869 err = 0; 1870 if (nb + hdr.n_valid * HPTE_SIZE > count) 1871 break; 1872 1873 nb += sizeof(hdr); 1874 buf += sizeof(hdr); 1875 1876 err = -EINVAL; 1877 i = hdr.index; 1878 if (i >= kvmppc_hpt_npte(&kvm->arch.hpt) || 1879 i + hdr.n_valid + hdr.n_invalid > kvmppc_hpt_npte(&kvm->arch.hpt)) 1880 break; 1881 1882 hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE)); 1883 lbuf = (unsigned long __user *)buf; 1884 for (j = 0; j < hdr.n_valid; ++j) { 1885 __be64 hpte_v; 1886 __be64 hpte_r; 1887 1888 err = -EFAULT; 1889 if (__get_user(hpte_v, lbuf) || 1890 __get_user(hpte_r, lbuf + 1)) 1891 goto out; 1892 v = be64_to_cpu(hpte_v); 1893 r = be64_to_cpu(hpte_r); 1894 err = -EINVAL; 1895 if (!(v & HPTE_V_VALID)) 1896 goto out; 1897 pshift = kvmppc_hpte_base_page_shift(v, r); 1898 if (pshift <= 0) 1899 goto out; 1900 lbuf += 2; 1901 nb += HPTE_SIZE; 1902 1903 if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) 1904 kvmppc_do_h_remove(kvm, 0, i, 0, tmp); 1905 err = -EIO; 1906 ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, i, v, r, 1907 tmp); 1908 if (ret != H_SUCCESS) { 1909 pr_err("kvm_htab_write ret %ld i=%ld v=%lx " 1910 "r=%lx\n", ret, i, v, r); 1911 goto out; 1912 } 1913 if (!mmu_ready && is_vrma_hpte(v)) { 1914 unsigned long senc, lpcr; 1915 1916 senc = slb_pgsize_encoding(1ul << pshift); 1917 kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T | 1918 (VRMA_VSID << SLB_VSID_SHIFT_1T); 1919 if (!cpu_has_feature(CPU_FTR_ARCH_300)) { 1920 lpcr = senc << (LPCR_VRMASD_SH - 4); 1921 kvmppc_update_lpcr(kvm, lpcr, 1922 LPCR_VRMASD); 1923 } else { 1924 kvmppc_setup_partition_table(kvm); 1925 } 1926 mmu_ready = 1; 1927 } 1928 ++i; 1929 hptp += 2; 1930 } 1931 1932 for (j = 0; j < hdr.n_invalid; ++j) { 1933 if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) 1934 kvmppc_do_h_remove(kvm, 0, i, 0, tmp); 1935 ++i; 1936 hptp += 2; 1937 } 1938 err = 0; 1939 } 1940 1941 out: 1942 /* Order HPTE updates vs. mmu_ready */ 1943 smp_wmb(); 1944 kvm->arch.mmu_ready = mmu_ready; 1945 mutex_unlock(&kvm->lock); 1946 1947 if (err) 1948 return err; 1949 return nb; 1950 } 1951 1952 static int kvm_htab_release(struct inode *inode, struct file *filp) 1953 { 1954 struct kvm_htab_ctx *ctx = filp->private_data; 1955 1956 filp->private_data = NULL; 1957 if (!(ctx->flags & KVM_GET_HTAB_WRITE)) 1958 atomic_dec(&ctx->kvm->arch.hpte_mod_interest); 1959 kvm_put_kvm(ctx->kvm); 1960 kfree(ctx); 1961 return 0; 1962 } 1963 1964 static const struct file_operations kvm_htab_fops = { 1965 .read = kvm_htab_read, 1966 .write = kvm_htab_write, 1967 .llseek = default_llseek, 1968 .release = kvm_htab_release, 1969 }; 1970 1971 int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf) 1972 { 1973 int ret; 1974 struct kvm_htab_ctx *ctx; 1975 int rwflag; 1976 1977 /* reject flags we don't recognize */ 1978 if (ghf->flags & ~(KVM_GET_HTAB_BOLTED_ONLY | KVM_GET_HTAB_WRITE)) 1979 return -EINVAL; 1980 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); 1981 if (!ctx) 1982 return -ENOMEM; 1983 kvm_get_kvm(kvm); 1984 ctx->kvm = kvm; 1985 ctx->index = ghf->start_index; 1986 ctx->flags = ghf->flags; 1987 ctx->first_pass = 1; 1988 1989 rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? 
O_WRONLY : O_RDONLY; 1990 ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag | O_CLOEXEC); 1991 if (ret < 0) { 1992 kfree(ctx); 1993 kvm_put_kvm(kvm); 1994 return ret; 1995 } 1996 1997 if (rwflag == O_RDONLY) { 1998 mutex_lock(&kvm->slots_lock); 1999 atomic_inc(&kvm->arch.hpte_mod_interest); 2000 /* make sure kvmppc_do_h_enter etc. see the increment */ 2001 synchronize_srcu_expedited(&kvm->srcu); 2002 mutex_unlock(&kvm->slots_lock); 2003 } 2004 2005 return ret; 2006 } 2007 2008 struct debugfs_htab_state { 2009 struct kvm *kvm; 2010 struct mutex mutex; 2011 unsigned long hpt_index; 2012 int chars_left; 2013 int buf_index; 2014 char buf[64]; 2015 }; 2016 2017 static int debugfs_htab_open(struct inode *inode, struct file *file) 2018 { 2019 struct kvm *kvm = inode->i_private; 2020 struct debugfs_htab_state *p; 2021 2022 p = kzalloc(sizeof(*p), GFP_KERNEL); 2023 if (!p) 2024 return -ENOMEM; 2025 2026 kvm_get_kvm(kvm); 2027 p->kvm = kvm; 2028 mutex_init(&p->mutex); 2029 file->private_data = p; 2030 2031 return nonseekable_open(inode, file); 2032 } 2033 2034 static int debugfs_htab_release(struct inode *inode, struct file *file) 2035 { 2036 struct debugfs_htab_state *p = file->private_data; 2037 2038 kvm_put_kvm(p->kvm); 2039 kfree(p); 2040 return 0; 2041 } 2042 2043 static ssize_t debugfs_htab_read(struct file *file, char __user *buf, 2044 size_t len, loff_t *ppos) 2045 { 2046 struct debugfs_htab_state *p = file->private_data; 2047 ssize_t ret, r; 2048 unsigned long i, n; 2049 unsigned long v, hr, gr; 2050 struct kvm *kvm; 2051 __be64 *hptp; 2052 2053 kvm = p->kvm; 2054 if (kvm_is_radix(kvm)) 2055 return 0; 2056 2057 ret = mutex_lock_interruptible(&p->mutex); 2058 if (ret) 2059 return ret; 2060 2061 if (p->chars_left) { 2062 n = p->chars_left; 2063 if (n > len) 2064 n = len; 2065 r = copy_to_user(buf, p->buf + p->buf_index, n); 2066 n -= r; 2067 p->chars_left -= n; 2068 p->buf_index += n; 2069 buf += n; 2070 len -= n; 2071 ret = n; 2072 if (r) { 2073 if (!n) 2074 ret = -EFAULT; 2075 goto out; 2076 } 2077 } 2078 2079 i = p->hpt_index; 2080 hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE)); 2081 for (; len != 0 && i < kvmppc_hpt_npte(&kvm->arch.hpt); 2082 ++i, hptp += 2) { 2083 if (!(be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT))) 2084 continue; 2085 2086 /* lock the HPTE so it's stable and read it */ 2087 preempt_disable(); 2088 while (!try_lock_hpte(hptp, HPTE_V_HVLOCK)) 2089 cpu_relax(); 2090 v = be64_to_cpu(hptp[0]) & ~HPTE_V_HVLOCK; 2091 hr = be64_to_cpu(hptp[1]); 2092 gr = kvm->arch.hpt.rev[i].guest_rpte; 2093 unlock_hpte(hptp, v); 2094 preempt_enable(); 2095 2096 if (!(v & (HPTE_V_VALID | HPTE_V_ABSENT))) 2097 continue; 2098 2099 n = scnprintf(p->buf, sizeof(p->buf), 2100 "%6lx %.16lx %.16lx %.16lx\n", 2101 i, v, hr, gr); 2102 p->chars_left = n; 2103 if (n > len) 2104 n = len; 2105 r = copy_to_user(buf, p->buf, n); 2106 n -= r; 2107 p->chars_left -= n; 2108 p->buf_index = n; 2109 buf += n; 2110 len -= n; 2111 ret += n; 2112 if (r) { 2113 if (!ret) 2114 ret = -EFAULT; 2115 goto out; 2116 } 2117 } 2118 p->hpt_index = i; 2119 2120 out: 2121 mutex_unlock(&p->mutex); 2122 return ret; 2123 } 2124 2125 static ssize_t debugfs_htab_write(struct file *file, const char __user *buf, 2126 size_t len, loff_t *ppos) 2127 { 2128 return -EACCES; 2129 } 2130 2131 static const struct file_operations debugfs_htab_fops = { 2132 .owner = THIS_MODULE, 2133 .open = debugfs_htab_open, 2134 .release = debugfs_htab_release, 2135 .read = debugfs_htab_read, 2136 .write = debugfs_htab_write, 
2137 .llseek = generic_file_llseek, 2138 }; 2139 2140 void kvmppc_mmu_debugfs_init(struct kvm *kvm) 2141 { 2142 kvm->arch.htab_dentry = debugfs_create_file("htab", 0400, 2143 kvm->arch.debugfs_dir, kvm, 2144 &debugfs_htab_fops); 2145 } 2146 2147 void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu) 2148 { 2149 struct kvmppc_mmu *mmu = &vcpu->arch.mmu; 2150 2151 vcpu->arch.slb_nr = 32; /* POWER7/POWER8 */ 2152 2153 mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate; 2154 mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr; 2155 2156 vcpu->arch.hflags |= BOOK3S_HFLAG_SLB; 2157 } 2158