// SPDX-License-Identifier: GPL-2.0-only
/*
 *
 * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 */

#include <linux/types.h>
#include <linux/string.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/highmem.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
#include <linux/srcu.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/debugfs.h>

#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include <asm/book3s/64/mmu-hash.h>
#include <asm/hvcall.h>
#include <asm/synch.h>
#include <asm/ppc-opcode.h>
#include <asm/cputable.h>
#include <asm/pte-walk.h>

#include "trace_hv.h"

//#define DEBUG_RESIZE_HPT	1

#ifdef DEBUG_RESIZE_HPT
#define resize_hpt_debug(resize, ...)				\
	do {							\
		printk(KERN_DEBUG "RESIZE HPT %p: ", resize);	\
		printk(__VA_ARGS__);				\
	} while (0)
#else
#define resize_hpt_debug(resize, ...)				\
	do { } while (0)
#endif

static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
				long pte_index, unsigned long pteh,
				unsigned long ptel, unsigned long *pte_idx_ret);

struct kvm_resize_hpt {
	/* These fields read-only after init */
	struct kvm *kvm;
	struct work_struct work;
	u32 order;

	/* These fields protected by kvm->arch.mmu_setup_lock */

	/* Possible values and their usage:
	 *  <0     an error occurred during allocation,
	 *  -EBUSY allocation is in progress,
	 *  0      allocation made successfully.
	 */
	int error;

	/* Private to the work thread, until error != -EBUSY,
	 * then protected by kvm->arch.mmu_setup_lock.
	 */
	struct kvm_hpt_info hpt;
};

int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order)
{
	unsigned long hpt = 0;
	int cma = 0;
	struct page *page = NULL;
	struct revmap_entry *rev;
	unsigned long npte;

	if ((order < PPC_MIN_HPT_ORDER) || (order > PPC_MAX_HPT_ORDER))
		return -EINVAL;

	page = kvm_alloc_hpt_cma(1ul << (order - PAGE_SHIFT));
	if (page) {
		hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
		memset((void *)hpt, 0, (1ul << order));
		cma = 1;
	}

	if (!hpt)
		hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_RETRY_MAYFAIL
				       |__GFP_NOWARN, order - PAGE_SHIFT);

	if (!hpt)
		return -ENOMEM;

	/* HPTEs are 2**4 bytes long */
	npte = 1ul << (order - 4);

	/* Allocate reverse map array */
	rev = vmalloc(array_size(npte, sizeof(struct revmap_entry)));
	if (!rev) {
		if (cma)
			kvm_free_hpt_cma(page, 1 << (order - PAGE_SHIFT));
		else
			free_pages(hpt, order - PAGE_SHIFT);
		return -ENOMEM;
	}

	info->order = order;
	info->virt = hpt;
	info->cma = cma;
	info->rev = rev;

	return 0;
}

void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info)
{
	atomic64_set(&kvm->arch.mmio_update, 0);
	kvm->arch.hpt = *info;
	kvm->arch.sdr1 = __pa(info->virt) | (info->order - 18);

	pr_debug("KVM guest htab at %lx (order %ld), LPID %x\n",
		 info->virt, (long)info->order, kvm->arch.lpid);
}

long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order)
{
	long err = -EBUSY;
	struct kvm_hpt_info info;

	mutex_lock(&kvm->arch.mmu_setup_lock);
	if (kvm->arch.mmu_ready) {
		kvm->arch.mmu_ready = 0;
		/* order mmu_ready vs. vcpus_running */
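		/*
		 * This pairing is an assumption about the vcpu entry path
		 * (outside this file): it is expected to bump vcpus_running
		 * before testing mmu_ready, so either we observe a running
		 * vcpu here and back off, or the entering vcpu observes
		 * mmu_ready == 0 and waits for the HPT to be rebuilt.
		 */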
		smp_mb();
		if (atomic_read(&kvm->arch.vcpus_running)) {
			kvm->arch.mmu_ready = 1;
			goto out;
		}
	}
	if (kvm_is_radix(kvm)) {
		err = kvmppc_switch_mmu_to_hpt(kvm);
		if (err)
			goto out;
	}

	if (kvm->arch.hpt.order == order) {
		/* We already have a suitable HPT */

		/* Set the entire HPT to 0, i.e. invalid HPTEs */
		memset((void *)kvm->arch.hpt.virt, 0, 1ul << order);
		/*
		 * Reset all the reverse-mapping chains for all memslots
		 */
		kvmppc_rmap_reset(kvm);
		err = 0;
		goto out;
	}

	if (kvm->arch.hpt.virt) {
		kvmppc_free_hpt(&kvm->arch.hpt);
		kvmppc_rmap_reset(kvm);
	}

	err = kvmppc_allocate_hpt(&info, order);
	if (err < 0)
		goto out;
	kvmppc_set_hpt(kvm, &info);

out:
	if (err == 0)
		/* Ensure that each vcpu will flush its TLB on next entry. */
		cpumask_setall(&kvm->arch.need_tlb_flush);

	mutex_unlock(&kvm->arch.mmu_setup_lock);
	return err;
}

void kvmppc_free_hpt(struct kvm_hpt_info *info)
{
	vfree(info->rev);
	info->rev = NULL;
	if (info->cma)
		kvm_free_hpt_cma(virt_to_page(info->virt),
				 1 << (info->order - PAGE_SHIFT));
	else if (info->virt)
		free_pages(info->virt, info->order - PAGE_SHIFT);
	info->virt = 0;
	info->order = 0;
}

/* Bits in first HPTE dword for pagesize 4k, 64k or 16M */
static inline unsigned long hpte0_pgsize_encoding(unsigned long pgsize)
{
	return (pgsize > 0x1000) ? HPTE_V_LARGE : 0;
}

/* Bits in second HPTE dword for pagesize 4k, 64k or 16M */
static inline unsigned long hpte1_pgsize_encoding(unsigned long pgsize)
{
	return (pgsize == 0x10000) ? 0x1000 : 0;
}

void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
		     unsigned long porder)
{
	unsigned long i;
	unsigned long npages;
	unsigned long hp_v, hp_r;
	unsigned long addr, hash;
	unsigned long psize;
	unsigned long hp0, hp1;
	unsigned long idx_ret;
	long ret;
	struct kvm *kvm = vcpu->kvm;

	psize = 1ul << porder;
	npages = memslot->npages >> (porder - PAGE_SHIFT);

	/* VRMA can't be > 1TB */
	if (npages > 1ul << (40 - porder))
		npages = 1ul << (40 - porder);
	/* Can't use more than 1 HPTE per HPTEG */
	if (npages > kvmppc_hpt_mask(&kvm->arch.hpt) + 1)
		npages = kvmppc_hpt_mask(&kvm->arch.hpt) + 1;

	hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
		HPTE_V_BOLTED | hpte0_pgsize_encoding(psize);
	hp1 = hpte1_pgsize_encoding(psize) |
		HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX;

	for (i = 0; i < npages; ++i) {
		addr = i << porder;
		/* can't use hpt_hash since va > 64 bits */
		hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25)))
			& kvmppc_hpt_mask(&kvm->arch.hpt);
		/*
		 * We assume that the hash table is empty and no
		 * vcpus are using it at this stage.  Since we create
		 * at most one HPTE per HPTEG, we just assume entry 7
		 * is available and use it.
		 */
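		/*
		 * An HPTE group (HPTEG) is 8 HPTEs of 16 bytes each, so
		 * "hash << 3" converts the group number computed above into
		 * an HPTE index and "+ 7" picks the last slot in that group.
		 */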
		hash = (hash << 3) + 7;
		hp_v = hp0 | ((addr >> 16) & ~0x7fUL);
		hp_r = hp1 | addr;
		ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, hash, hp_v, hp_r,
						 &idx_ret);
		if (ret != H_SUCCESS) {
			pr_err("KVM: map_vrma at %lx failed, ret=%ld\n",
			       addr, ret);
			break;
		}
	}
}

int kvmppc_mmu_hv_init(void)
{
	unsigned long host_lpid, rsvd_lpid;

	if (!mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE))
		return -EINVAL;

	/* POWER7 has 10-bit LPIDs (12-bit in POWER8) */
	host_lpid = 0;
	if (cpu_has_feature(CPU_FTR_HVMODE))
		host_lpid = mfspr(SPRN_LPID);
	rsvd_lpid = LPID_RSVD;

	kvmppc_init_lpid(rsvd_lpid + 1);

	kvmppc_claim_lpid(host_lpid);
	/* rsvd_lpid is reserved for use in partition switching */
	kvmppc_claim_lpid(rsvd_lpid);

	return 0;
}

static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu)
{
	unsigned long msr = vcpu->arch.intr_msr;

	/* If transactional, change to suspend mode on IRQ delivery */
	if (MSR_TM_TRANSACTIONAL(vcpu->arch.shregs.msr))
		msr |= MSR_TS_S;
	else
		msr |= vcpu->arch.shregs.msr & MSR_TS_MASK;
	kvmppc_set_msr(vcpu, msr);
}

static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
				long pte_index, unsigned long pteh,
				unsigned long ptel, unsigned long *pte_idx_ret)
{
	long ret;

	/* Protect linux PTE lookup from page table destruction */
	rcu_read_lock_sched();	/* this disables preemption too */
	ret = kvmppc_do_h_enter(kvm, flags, pte_index, pteh, ptel,
				current->mm->pgd, false, pte_idx_ret);
	rcu_read_unlock_sched();
	if (ret == H_TOO_HARD) {
		/* this can't happen */
		pr_err("KVM: Oops, kvmppc_h_enter returned too hard!\n");
		ret = H_RESOURCE;	/* or something */
	}
	return ret;

}

static struct kvmppc_slb *kvmppc_mmu_book3s_hv_find_slbe(struct kvm_vcpu *vcpu,
							 gva_t eaddr)
{
	u64 mask;
	int i;

	for (i = 0; i < vcpu->arch.slb_nr; i++) {
		if (!(vcpu->arch.slb[i].orige & SLB_ESID_V))
			continue;

		if (vcpu->arch.slb[i].origv & SLB_VSID_B_1T)
			mask = ESID_MASK_1T;
		else
			mask = ESID_MASK;

		if (((vcpu->arch.slb[i].orige ^ eaddr) & mask) == 0)
			return &vcpu->arch.slb[i];
	}
	return NULL;
}

static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r,
			unsigned long ea)
{
	unsigned long ra_mask;

	ra_mask = kvmppc_actual_pgsz(v, r) - 1;
	return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask);
}

static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
			struct kvmppc_pte *gpte, bool data, bool iswrite)
{
	struct kvm *kvm = vcpu->kvm;
	struct kvmppc_slb *slbe;
	unsigned long slb_v;
	unsigned long pp, key;
	unsigned long v, orig_v, gr;
	__be64 *hptep;
	long int index;
	int virtmode = vcpu->arch.shregs.msr & (data ? MSR_DR : MSR_IR);
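	/*
	 * virtmode is non-zero when the guest had relocation on for this
	 * access (MSR_DR for data, MSR_IR for instruction fetches); only
	 * then is the shadow SLB searched below, otherwise the access is
	 * translated via the VRMA SLB value used for guest real mode.
	 */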

	if (kvm_is_radix(vcpu->kvm))
		return kvmppc_mmu_radix_xlate(vcpu, eaddr, gpte, data, iswrite);

	/* Get SLB entry */
	if (virtmode) {
		slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr);
		if (!slbe)
			return -EINVAL;
		slb_v = slbe->origv;
	} else {
		/* real mode access */
		slb_v = vcpu->kvm->arch.vrma_slb_v;
	}

	preempt_disable();
	/* Find the HPTE in the hash table */
	index = kvmppc_hv_find_lock_hpte(kvm, eaddr, slb_v,
					 HPTE_V_VALID | HPTE_V_ABSENT);
	if (index < 0) {
		preempt_enable();
		return -ENOENT;
	}
	hptep = (__be64 *)(kvm->arch.hpt.virt + (index << 4));
	v = orig_v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
	if (cpu_has_feature(CPU_FTR_ARCH_300))
		v = hpte_new_to_old_v(v, be64_to_cpu(hptep[1]));
	gr = kvm->arch.hpt.rev[index].guest_rpte;

	unlock_hpte(hptep, orig_v);
	preempt_enable();

	gpte->eaddr = eaddr;
	gpte->vpage = ((v & HPTE_V_AVPN) << 4) | ((eaddr >> 12) & 0xfff);

	/* Get PP bits and key for permission check */
	pp = gr & (HPTE_R_PP0 | HPTE_R_PP);
	key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS;
	key &= slb_v;

	/* Calculate permissions */
	gpte->may_read = hpte_read_permission(pp, key);
	gpte->may_write = hpte_write_permission(pp, key);
	gpte->may_execute = gpte->may_read && !(gr & (HPTE_R_N | HPTE_R_G));

	/* Storage key permission check for POWER7 */
	if (data && virtmode) {
		int amrfield = hpte_get_skey_perm(gr, vcpu->arch.amr);
		if (amrfield & 1)
			gpte->may_read = 0;
		if (amrfield & 2)
			gpte->may_write = 0;
	}

	/* Get the guest physical address */
	gpte->raddr = kvmppc_mmu_get_real_addr(v, gr, eaddr);
	return 0;
}

/*
 * Quick test for whether an instruction is a load or a store.
 * If the instruction is a load or a store, then this will indicate
 * which it is, at least on server processors.  (Embedded processors
 * have some external PID instructions that don't follow the rule
 * embodied here.)  If the instruction isn't a load or store, then
 * this doesn't return anything useful.
 */
static int instruction_is_store(unsigned int instr)
{
	unsigned int mask;

	mask = 0x10000000;
	if ((instr & 0xfc000000) == 0x7c000000)
		mask = 0x100;		/* major opcode 31 */
	return (instr & mask) != 0;
}

int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu,
			   unsigned long gpa, gva_t ea, int is_store)
{
	u32 last_inst;

	/*
	 * Fast path - check if the guest physical address corresponds to a
	 * device on the FAST_MMIO_BUS; if so, we can avoid loading the
	 * instruction altogether and just handle it and return.
	 */
	if (is_store) {
		int idx, ret;

		idx = srcu_read_lock(&vcpu->kvm->srcu);
		ret = kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, (gpa_t) gpa, 0,
				       NULL);
		srcu_read_unlock(&vcpu->kvm->srcu, idx);
		if (!ret) {
			kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) + 4);
			return RESUME_GUEST;
		}
	}

	/*
	 * If we fail, we just return to the guest and try executing it again.
	 */
	if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst) !=
		EMULATE_DONE)
		return RESUME_GUEST;

	/*
	 * WARNING: We do not know for sure whether the instruction we just
	 * read from memory is the same that caused the fault in the first
	 * place.  If the instruction we read is neither a load nor a store,
	 * then it can't access memory, so we don't need to worry about
	 * enforcing access permissions.  So, assuming it is a load or
	 * store, we just check that its direction (load or store) is
	 * consistent with the original fault, since that's what we
	 * checked the access permissions against.  If there is a mismatch
	 * we just return and retry the instruction.
	 */

	if (instruction_is_store(last_inst) != !!is_store)
		return RESUME_GUEST;

	/*
	 * Emulated accesses are handled by looking up the hash translation
	 * once, then performing the access later.  The translation could
	 * be invalidated in the meantime, at which point performing the
	 * subsequent memory access on the old physical address could
	 * possibly be a security hole for the guest (but not the host).
	 *
	 * This is less of an issue for MMIO stores since they aren't
	 * globally visible.  It could be an issue for MMIO loads to
	 * a certain extent but we'll ignore it for now.
	 */

	vcpu->arch.paddr_accessed = gpa;
	vcpu->arch.vaddr_accessed = ea;
	return kvmppc_emulate_mmio(run, vcpu);
}

int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
				unsigned long ea, unsigned long dsisr)
{
	struct kvm *kvm = vcpu->kvm;
	unsigned long hpte[3], r;
	unsigned long hnow_v, hnow_r;
	__be64 *hptep;
	unsigned long mmu_seq, psize, pte_size;
	unsigned long gpa_base, gfn_base;
	unsigned long gpa, gfn, hva, pfn;
	struct kvm_memory_slot *memslot;
	unsigned long *rmap;
	struct revmap_entry *rev;
	struct page *page, *pages[1];
	long index, ret, npages;
	bool is_ci;
	unsigned int writing, write_ok;
	struct vm_area_struct *vma;
	unsigned long rcbits;
	long mmio_update;

	if (kvm_is_radix(kvm))
		return kvmppc_book3s_radix_page_fault(run, vcpu, ea, dsisr);

	/*
	 * Real-mode code has already searched the HPT and found the
	 * entry we're interested in.  Lock the entry and check that
	 * it hasn't changed.  If it has, just return and re-execute the
	 * instruction.
	 */
	if (ea != vcpu->arch.pgfault_addr)
		return RESUME_GUEST;

	if (vcpu->arch.pgfault_cache) {
		mmio_update = atomic64_read(&kvm->arch.mmio_update);
		if (mmio_update == vcpu->arch.pgfault_cache->mmio_update) {
			r = vcpu->arch.pgfault_cache->rpte;
			psize = kvmppc_actual_pgsz(vcpu->arch.pgfault_hpte[0],
						   r);
			gpa_base = r & HPTE_R_RPN & ~(psize - 1);
			gfn_base = gpa_base >> PAGE_SHIFT;
			gpa = gpa_base | (ea & (psize - 1));
			return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
						dsisr & DSISR_ISSTORE);
		}
	}
	index = vcpu->arch.pgfault_index;
	hptep = (__be64 *)(kvm->arch.hpt.virt + (index << 4));
	rev = &kvm->arch.hpt.rev[index];
	preempt_disable();
	while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
		cpu_relax();
	hpte[0] = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
	hpte[1] = be64_to_cpu(hptep[1]);
	hpte[2] = r = rev->guest_rpte;
	unlock_hpte(hptep, hpte[0]);
	preempt_enable();

	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
		hpte[0] = hpte_new_to_old_v(hpte[0], hpte[1]);
		hpte[1] = hpte_new_to_old_r(hpte[1]);
	}
	if (hpte[0] != vcpu->arch.pgfault_hpte[0] ||
	    hpte[1] != vcpu->arch.pgfault_hpte[1])
		return RESUME_GUEST;

	/* Translate the logical address and get the page */
	psize = kvmppc_actual_pgsz(hpte[0], r);
	gpa_base = r & HPTE_R_RPN & ~(psize - 1);
	gfn_base = gpa_base >> PAGE_SHIFT;
	gpa = gpa_base | (ea & (psize - 1));
	gfn = gpa >> PAGE_SHIFT;
	memslot = gfn_to_memslot(kvm, gfn);

	trace_kvm_page_fault_enter(vcpu, hpte, memslot, ea, dsisr);

	/* No memslot means it's an emulated MMIO region */
	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
		return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
					      dsisr & DSISR_ISSTORE);

	/*
	 * This should never happen, because of the slot_is_aligned()
	 * check in kvmppc_do_h_enter().
	 */
	if (gfn_base < memslot->base_gfn)
		return -EFAULT;

	/* used to check for invalidations in progress */
	mmu_seq = kvm->mmu_notifier_seq;
	smp_rmb();

	ret = -EFAULT;
	is_ci = false;
	pfn = 0;
	page = NULL;
	pte_size = PAGE_SIZE;
	writing = (dsisr & DSISR_ISSTORE) != 0;
	/* If writing != 0, then the HPTE must allow writing, if we get here */
	write_ok = writing;
	hva = gfn_to_hva_memslot(memslot, gfn);
	npages = get_user_pages_fast(hva, 1, writing ? FOLL_WRITE : 0, pages);
	if (npages < 1) {
		/* Check if it's an I/O mapping */
		down_read(&current->mm->mmap_sem);
		vma = find_vma(current->mm, hva);
		if (vma && vma->vm_start <= hva && hva + psize <= vma->vm_end &&
		    (vma->vm_flags & VM_PFNMAP)) {
			pfn = vma->vm_pgoff +
				((hva - vma->vm_start) >> PAGE_SHIFT);
			pte_size = psize;
			is_ci = pte_ci(__pte((pgprot_val(vma->vm_page_prot))));
			write_ok = vma->vm_flags & VM_WRITE;
		}
		up_read(&current->mm->mmap_sem);
		if (!pfn)
			goto out_put;
	} else {
		page = pages[0];
		pfn = page_to_pfn(page);
		if (PageHuge(page)) {
			page = compound_head(page);
			pte_size <<= compound_order(page);
		}
		/* if the guest wants write access, see if that is OK */
		if (!writing && hpte_is_writable(r)) {
			pte_t *ptep, pte;
			unsigned long flags;
			/*
			 * We need to protect against page table destruction
			 * hugepage split and collapse.
			 */
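			/*
			 * Keeping interrupts off is what makes the lockless
			 * find_current_mm_pte() walk below safe: powerpc
			 * defers freeing page tables (and THP collapse) until
			 * no CPU can be inside such an IRQs-off walk, so the
			 * PTE pointer cannot be freed under us here.
			 */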
			local_irq_save(flags);
			ptep = find_current_mm_pte(current->mm->pgd,
						   hva, NULL, NULL);
			if (ptep) {
				pte = kvmppc_read_update_linux_pte(ptep, 1);
				if (__pte_write(pte))
					write_ok = 1;
			}
			local_irq_restore(flags);
		}
	}

	if (psize > pte_size)
		goto out_put;

	/* Check WIMG vs. the actual page we're accessing */
	if (!hpte_cache_flags_ok(r, is_ci)) {
		if (is_ci)
			goto out_put;
		/*
		 * Allow guest to map emulated device memory as
		 * uncacheable, but actually make it cacheable.
		 */
		r = (r & ~(HPTE_R_W|HPTE_R_I|HPTE_R_G)) | HPTE_R_M;
	}

	/*
	 * Set the HPTE to point to pfn.
	 * Since the pfn is at PAGE_SIZE granularity, make sure we
	 * don't mask out lower-order bits if psize < PAGE_SIZE.
	 */
	if (psize < PAGE_SIZE)
		psize = PAGE_SIZE;
	r = (r & HPTE_R_KEY_HI) | (r & ~(HPTE_R_PP0 - psize)) |
					((pfn << PAGE_SHIFT) & ~(psize - 1));
	if (hpte_is_writable(r) && !write_ok)
		r = hpte_make_readonly(r);
	ret = RESUME_GUEST;
	preempt_disable();
	while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
		cpu_relax();
	hnow_v = be64_to_cpu(hptep[0]);
	hnow_r = be64_to_cpu(hptep[1]);
	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
		hnow_v = hpte_new_to_old_v(hnow_v, hnow_r);
		hnow_r = hpte_new_to_old_r(hnow_r);
	}

	/*
	 * If the HPT is being resized, don't update the HPTE,
	 * instead let the guest retry after the resize operation is complete.
	 * The synchronization for mmu_ready test vs. set is provided
	 * by the HPTE lock.
	 */
	if (!kvm->arch.mmu_ready)
		goto out_unlock;

	if ((hnow_v & ~HPTE_V_HVLOCK) != hpte[0] || hnow_r != hpte[1] ||
	    rev->guest_rpte != hpte[2])
		/* HPTE has been changed under us; let the guest retry */
		goto out_unlock;
	hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;

	/* Always put the HPTE in the rmap chain for the page base address */
	rmap = &memslot->arch.rmap[gfn_base - memslot->base_gfn];
	lock_rmap(rmap);

	/* Check if we might have been invalidated; let the guest retry if so */
	ret = RESUME_GUEST;
	if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) {
		unlock_rmap(rmap);
		goto out_unlock;
	}

	/* Only set R/C in real HPTE if set in both *rmap and guest_rpte */
	rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT;
	r &= rcbits | ~(HPTE_R_R | HPTE_R_C);

	if (be64_to_cpu(hptep[0]) & HPTE_V_VALID) {
		/* HPTE was previously valid, so we need to invalidate it */
		unlock_rmap(rmap);
		hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
		kvmppc_invalidate_hpte(kvm, hptep, index);
		/* don't lose previous R and C bits */
		r |= be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
	} else {
		kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0);
	}

	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
		r = hpte_old_to_new_r(hpte[0], r);
		hpte[0] = hpte_old_to_new_v(hpte[0]);
	}
	hptep[1] = cpu_to_be64(r);
	eieio();
	__unlock_hpte(hptep, hpte[0]);
	asm volatile("ptesync" : : : "memory");
	preempt_enable();
	if (page && hpte_is_writable(r))
		SetPageDirty(page);

out_put:
	trace_kvm_page_fault_exit(vcpu, hpte, ret);

	if (page) {
		/*
		 * We drop pages[0] here, not page because page might
		 * have been set to the head page of a compound, but
		 * we have to drop the reference on the correct tail
		 * page to match the get inside gup()
		 */
		put_page(pages[0]);
	}
	return ret;

out_unlock:
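	/*
	 * Error path: drop the HPTE lock by rewriting the current first
	 * dword (which clears HPTE_V_HVLOCK) without otherwise changing the
	 * entry, then take the out_put path so any page reference obtained
	 * above is still released.
	 */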
	__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
	preempt_enable();
	goto out_put;
}

void kvmppc_rmap_reset(struct kvm *kvm)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int srcu_idx;

	srcu_idx = srcu_read_lock(&kvm->srcu);
	slots = kvm_memslots(kvm);
	kvm_for_each_memslot(memslot, slots) {
		/* Mutual exclusion with kvm_unmap_hva_range etc. */
		spin_lock(&kvm->mmu_lock);
		/*
		 * This assumes it is acceptable to lose reference and
		 * change bits across a reset.
		 */
		memset(memslot->arch.rmap, 0,
		       memslot->npages * sizeof(*memslot->arch.rmap));
		spin_unlock(&kvm->mmu_lock);
	}
	srcu_read_unlock(&kvm->srcu, srcu_idx);
}

typedef int (*hva_handler_fn)(struct kvm *kvm, struct kvm_memory_slot *memslot,
			      unsigned long gfn);

static int kvm_handle_hva_range(struct kvm *kvm,
				unsigned long start,
				unsigned long end,
				hva_handler_fn handler)
{
	int ret;
	int retval = 0;
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;

	slots = kvm_memslots(kvm);
	kvm_for_each_memslot(memslot, slots) {
		unsigned long hva_start, hva_end;
		gfn_t gfn, gfn_end;

		hva_start = max(start, memslot->userspace_addr);
		hva_end = min(end, memslot->userspace_addr +
					(memslot->npages << PAGE_SHIFT));
		if (hva_start >= hva_end)
			continue;
		/*
		 * {gfn(page) | page intersects with [hva_start, hva_end)} =
		 * {gfn, gfn+1, ..., gfn_end-1}.
		 */
		gfn = hva_to_gfn_memslot(hva_start, memslot);
		gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);

		for (; gfn < gfn_end; ++gfn) {
			ret = handler(kvm, memslot, gfn);
			retval |= ret;
		}
	}

	return retval;
}

static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
			  hva_handler_fn handler)
{
	return kvm_handle_hva_range(kvm, hva, hva + 1, handler);
}

/* Must be called with both HPTE and rmap locked */
static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i,
			      struct kvm_memory_slot *memslot,
			      unsigned long *rmapp, unsigned long gfn)
{
	__be64 *hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
	struct revmap_entry *rev = kvm->arch.hpt.rev;
	unsigned long j, h;
	unsigned long ptel, psize, rcbits;

	j = rev[i].forw;
	if (j == i) {
		/* chain is now empty */
		*rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX);
	} else {
		/* remove i from chain */
		h = rev[i].back;
		rev[h].forw = j;
		rev[j].back = h;
		rev[i].forw = rev[i].back = i;
		*rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j;
	}

	/* Now check and modify the HPTE */
	ptel = rev[i].guest_rpte;
	psize = kvmppc_actual_pgsz(be64_to_cpu(hptep[0]), ptel);
	if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) &&
	    hpte_rpn(ptel, psize) == gfn) {
		hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
		kvmppc_invalidate_hpte(kvm, hptep, i);
		hptep[1] &= ~cpu_to_be64(HPTE_R_KEY_HI | HPTE_R_KEY_LO);
		/* Harvest R and C */
		rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
		*rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
		if ((rcbits & HPTE_R_C) && memslot->dirty_bitmap)
			kvmppc_update_dirty_map(memslot, gfn, psize);
		if (rcbits & ~rev[i].guest_rpte) {
			rev[i].guest_rpte = ptel | rcbits;
			note_hpte_modification(kvm, &rev[i]);
		}
	}
}

static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
			   unsigned long gfn)
{
	unsigned long i;
	__be64 *hptep;
	unsigned long *rmapp;

	rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
	for (;;) {
		lock_rmap(rmapp);
		if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
			unlock_rmap(rmapp);
			break;
		}

		/*
		 * To avoid an ABBA deadlock with the HPTE lock bit,
		 * we can't spin on the HPTE lock while holding the
		 * rmap chain lock.
		 */
		i = *rmapp & KVMPPC_RMAP_INDEX;
		hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
		if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
			/* unlock rmap before spinning on the HPTE lock */
			unlock_rmap(rmapp);
			while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK)
				cpu_relax();
			continue;
		}

		kvmppc_unmap_hpte(kvm, i, memslot, rmapp, gfn);
		unlock_rmap(rmapp);
		__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
	}
	return 0;
}

int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end)
{
	hva_handler_fn handler;

	handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp;
	kvm_handle_hva_range(kvm, start, end, handler);
	return 0;
}

void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
				  struct kvm_memory_slot *memslot)
{
	unsigned long gfn;
	unsigned long n;
	unsigned long *rmapp;

	gfn = memslot->base_gfn;
	rmapp = memslot->arch.rmap;
	if (kvm_is_radix(kvm)) {
		kvmppc_radix_flush_memslot(kvm, memslot);
		return;
	}

	for (n = memslot->npages; n; --n, ++gfn) {
		/*
		 * Testing the present bit without locking is OK because
		 * the memslot has been marked invalid already, and hence
		 * no new HPTEs referencing this page can be created,
		 * thus the present bit can't go from 0 to 1.
		 */
		if (*rmapp & KVMPPC_RMAP_PRESENT)
			kvm_unmap_rmapp(kvm, memslot, gfn);
		++rmapp;
	}
}

static int kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
			 unsigned long gfn)
{
	struct revmap_entry *rev = kvm->arch.hpt.rev;
	unsigned long head, i, j;
	__be64 *hptep;
	int ret = 0;
	unsigned long *rmapp;

	rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
retry:
	lock_rmap(rmapp);
	if (*rmapp & KVMPPC_RMAP_REFERENCED) {
		*rmapp &= ~KVMPPC_RMAP_REFERENCED;
		ret = 1;
	}
	if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
		unlock_rmap(rmapp);
		return ret;
	}

	i = head = *rmapp & KVMPPC_RMAP_INDEX;
	do {
		hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
		j = rev[i].forw;

		/* If this HPTE isn't referenced, ignore it */
		if (!(be64_to_cpu(hptep[1]) & HPTE_R_R))
			continue;

		if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
			/* unlock rmap before spinning on the HPTE lock */
			unlock_rmap(rmapp);
			while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK)
				cpu_relax();
			goto retry;
		}

		/* Now check and modify the HPTE */
		if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) &&
		    (be64_to_cpu(hptep[1]) & HPTE_R_R)) {
			kvmppc_clear_ref_hpte(kvm, hptep, i);
			if (!(rev[i].guest_rpte & HPTE_R_R)) {
				rev[i].guest_rpte |= HPTE_R_R;
				note_hpte_modification(kvm, &rev[i]);
			}
			ret = 1;
		}
		__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
	} while ((i = j) != head);

	unlock_rmap(rmapp);
	return ret;
}

int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, unsigned long end)
{
	hva_handler_fn handler;

	handler = kvm_is_radix(kvm) ? kvm_age_radix : kvm_age_rmapp;
	return kvm_handle_hva_range(kvm, start, end, handler);
}

static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
			      unsigned long gfn)
{
	struct revmap_entry *rev = kvm->arch.hpt.rev;
	unsigned long head, i, j;
	unsigned long *hp;
	int ret = 1;
	unsigned long *rmapp;

	rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
	if (*rmapp & KVMPPC_RMAP_REFERENCED)
		return 1;

	lock_rmap(rmapp);
	if (*rmapp & KVMPPC_RMAP_REFERENCED)
		goto out;

	if (*rmapp & KVMPPC_RMAP_PRESENT) {
		i = head = *rmapp & KVMPPC_RMAP_INDEX;
		do {
			hp = (unsigned long *)(kvm->arch.hpt.virt + (i << 4));
			j = rev[i].forw;
			if (be64_to_cpu(hp[1]) & HPTE_R_R)
				goto out;
		} while ((i = j) != head);
	}
	ret = 0;

out:
	unlock_rmap(rmapp);
	return ret;
}

int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva)
{
	hva_handler_fn handler;

	handler = kvm_is_radix(kvm) ? kvm_test_age_radix : kvm_test_age_rmapp;
	return kvm_handle_hva(kvm, hva, handler);
}

void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte)
{
	hva_handler_fn handler;

	handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp;
	kvm_handle_hva(kvm, hva, handler);
}

static int vcpus_running(struct kvm *kvm)
{
	return atomic_read(&kvm->arch.vcpus_running) != 0;
}

/*
 * Returns the number of system pages that are dirty.
 * This can be more than 1 if we find a huge-page HPTE.
 */
static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
{
	struct revmap_entry *rev = kvm->arch.hpt.rev;
	unsigned long head, i, j;
	unsigned long n;
	unsigned long v, r;
	__be64 *hptep;
	int npages_dirty = 0;

retry:
	lock_rmap(rmapp);
	if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
		unlock_rmap(rmapp);
		return npages_dirty;
	}

	i = head = *rmapp & KVMPPC_RMAP_INDEX;
	do {
		unsigned long hptep1;
		hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
		j = rev[i].forw;

		/*
		 * Checking the C (changed) bit here is racy since there
		 * is no guarantee about when the hardware writes it back.
		 * If the HPTE is not writable then it is stable since the
		 * page can't be written to, and we would have done a tlbie
		 * (which forces the hardware to complete any writeback)
		 * when making the HPTE read-only.
		 * If vcpus are running then this call is racy anyway
		 * since the page could get dirtied subsequently, so we
		 * expect there to be a further call which would pick up
		 * any delayed C bit writeback.
		 * Otherwise we need to do the tlbie even if C==0 in
		 * order to pick up any delayed writeback of C.
		 */
		hptep1 = be64_to_cpu(hptep[1]);
		if (!(hptep1 & HPTE_R_C) &&
		    (!hpte_is_writable(hptep1) || vcpus_running(kvm)))
			continue;

		if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
			/* unlock rmap before spinning on the HPTE lock */
			unlock_rmap(rmapp);
			while (hptep[0] & cpu_to_be64(HPTE_V_HVLOCK))
				cpu_relax();
			goto retry;
		}

		/* Now check and modify the HPTE */
		if (!(hptep[0] & cpu_to_be64(HPTE_V_VALID))) {
			__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
			continue;
		}

		/* need to make it temporarily absent so C is stable */
		hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
		kvmppc_invalidate_hpte(kvm, hptep, i);
		v = be64_to_cpu(hptep[0]);
		r = be64_to_cpu(hptep[1]);
		if (r & HPTE_R_C) {
			hptep[1] = cpu_to_be64(r & ~HPTE_R_C);
			if (!(rev[i].guest_rpte & HPTE_R_C)) {
				rev[i].guest_rpte |= HPTE_R_C;
				note_hpte_modification(kvm, &rev[i]);
			}
			n = kvmppc_actual_pgsz(v, r);
			n = (n + PAGE_SIZE - 1) >> PAGE_SHIFT;
			if (n > npages_dirty)
				npages_dirty = n;
			eieio();
		}
		v &= ~HPTE_V_ABSENT;
		v |= HPTE_V_VALID;
		__unlock_hpte(hptep, v);
	} while ((i = j) != head);

	unlock_rmap(rmapp);
	return npages_dirty;
}

void kvmppc_harvest_vpa_dirty(struct kvmppc_vpa *vpa,
			      struct kvm_memory_slot *memslot,
			      unsigned long *map)
{
	unsigned long gfn;

	if (!vpa->dirty || !vpa->pinned_addr)
		return;
	gfn = vpa->gpa >> PAGE_SHIFT;
	if (gfn < memslot->base_gfn ||
	    gfn >= memslot->base_gfn + memslot->npages)
		return;

	vpa->dirty = false;
	if (map)
		__set_bit_le(gfn - memslot->base_gfn, map);
}

long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm,
			struct kvm_memory_slot *memslot, unsigned long *map)
{
	unsigned long i;
	unsigned long *rmapp;

	preempt_disable();
	rmapp = memslot->arch.rmap;
	for (i = 0; i < memslot->npages; ++i) {
		int npages = kvm_test_clear_dirty_npages(kvm, rmapp);
		/*
		 * Note that if npages > 0 then i must be a multiple of npages,
		 * since we always put huge-page HPTEs in the rmap chain
		 * corresponding to their page base address.
		 */
		if (npages)
			set_dirty_bits(map, i, npages);
		++rmapp;
	}
	preempt_enable();
	return 0;
}

void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
			    unsigned long *nb_ret)
{
	struct kvm_memory_slot *memslot;
	unsigned long gfn = gpa >> PAGE_SHIFT;
	struct page *page, *pages[1];
	int npages;
	unsigned long hva, offset;
	int srcu_idx;

	srcu_idx = srcu_read_lock(&kvm->srcu);
	memslot = gfn_to_memslot(kvm, gfn);
	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
		goto err;
	hva = gfn_to_hva_memslot(memslot, gfn);
	npages = get_user_pages_fast(hva, 1, FOLL_WRITE, pages);
	if (npages < 1)
		goto err;
	page = pages[0];
	srcu_read_unlock(&kvm->srcu, srcu_idx);

	offset = gpa & (PAGE_SIZE - 1);
	if (nb_ret)
		*nb_ret = PAGE_SIZE - offset;
	return page_address(page) + offset;

err:
	srcu_read_unlock(&kvm->srcu, srcu_idx);
	return NULL;
}

void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa,
			     bool dirty)
{
	struct page *page = virt_to_page(va);
	struct kvm_memory_slot *memslot;
	unsigned long gfn;
	int srcu_idx;

	put_page(page);

	if (!dirty)
		return;

	/* We need to mark this page dirty in the memslot dirty_bitmap, if any */
	gfn = gpa >> PAGE_SHIFT;
	srcu_idx = srcu_read_lock(&kvm->srcu);
	memslot = gfn_to_memslot(kvm, gfn);
	if (memslot && memslot->dirty_bitmap)
		set_bit_le(gfn - memslot->base_gfn, memslot->dirty_bitmap);
	srcu_read_unlock(&kvm->srcu, srcu_idx);
}

/*
 * HPT resizing
 */
static int resize_hpt_allocate(struct kvm_resize_hpt *resize)
{
	int rc;

	rc = kvmppc_allocate_hpt(&resize->hpt, resize->order);
	if (rc < 0)
		return rc;

	resize_hpt_debug(resize, "resize_hpt_allocate(): HPT @ 0x%lx\n",
			 resize->hpt.virt);

	return 0;
}

static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize,
					    unsigned long idx)
{
	struct kvm *kvm = resize->kvm;
	struct kvm_hpt_info *old = &kvm->arch.hpt;
	struct kvm_hpt_info *new = &resize->hpt;
	unsigned long old_hash_mask = (1ULL << (old->order - 7)) - 1;
	unsigned long new_hash_mask = (1ULL << (new->order - 7)) - 1;
	__be64 *hptep, *new_hptep;
	unsigned long vpte, rpte, guest_rpte;
	int ret;
	struct revmap_entry *rev;
	unsigned long apsize, avpn, pteg, hash;
	unsigned long new_idx, new_pteg, replace_vpte;
	int pshift;

	hptep = (__be64 *)(old->virt + (idx << 4));

	/* Guest is stopped, so new HPTEs can't be added or faulted
	 * in, only unmapped or altered by host actions.  So, it's
	 * safe to check this before we take the HPTE lock */
	vpte = be64_to_cpu(hptep[0]);
	if (!(vpte & HPTE_V_VALID) && !(vpte & HPTE_V_ABSENT))
		return 0;	/* nothing to do */

	while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
		cpu_relax();

	vpte = be64_to_cpu(hptep[0]);

	ret = 0;
	if (!(vpte & HPTE_V_VALID) && !(vpte & HPTE_V_ABSENT))
		/* Nothing to do */
		goto out;

	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
		rpte = be64_to_cpu(hptep[1]);
		vpte = hpte_new_to_old_v(vpte, rpte);
	}

	/* Unmap */
	rev = &old->rev[idx];
	guest_rpte = rev->guest_rpte;

	ret = -EIO;
	apsize = kvmppc_actual_pgsz(vpte, guest_rpte);
	if (!apsize)
		goto out;

	if (vpte & HPTE_V_VALID) {
		unsigned long gfn = hpte_rpn(guest_rpte, apsize);
		int srcu_idx = srcu_read_lock(&kvm->srcu);
		struct kvm_memory_slot *memslot =
			__gfn_to_memslot(kvm_memslots(kvm), gfn);

		if (memslot) {
			unsigned long *rmapp;
			rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];

			lock_rmap(rmapp);
			kvmppc_unmap_hpte(kvm, idx, memslot, rmapp, gfn);
			unlock_rmap(rmapp);
		}

		srcu_read_unlock(&kvm->srcu, srcu_idx);
	}

	/* Reload PTE after unmap */
	vpte = be64_to_cpu(hptep[0]);
	BUG_ON(vpte & HPTE_V_VALID);
	BUG_ON(!(vpte & HPTE_V_ABSENT));

	ret = 0;
	if (!(vpte & HPTE_V_BOLTED))
		goto out;

	rpte = be64_to_cpu(hptep[1]);

	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
		vpte = hpte_new_to_old_v(vpte, rpte);
		rpte = hpte_new_to_old_r(rpte);
	}

	pshift = kvmppc_hpte_base_page_shift(vpte, rpte);
	avpn = HPTE_V_AVPN_VAL(vpte) & ~(((1ul << pshift) - 1) >> 23);
	pteg = idx / HPTES_PER_GROUP;
	if (vpte & HPTE_V_SECONDARY)
		pteg = ~pteg;

	if (!(vpte & HPTE_V_1TB_SEG)) {
		unsigned long offset, vsid;

		/* We only have 28 - 23 bits of offset in avpn */
		offset = (avpn & 0x1f) << 23;
		vsid = avpn >> 5;
		/* We can find more bits from the pteg value */
		if (pshift < 23)
			offset |= ((vsid ^ pteg) & old_hash_mask) << pshift;

		hash = vsid ^ (offset >> pshift);
	} else {
		unsigned long offset, vsid;

		/* We only have 40 - 23 bits of seg_off in avpn */
		offset = (avpn & 0x1ffff) << 23;
		vsid = avpn >> 17;
		if (pshift < 23)
			offset |= ((vsid ^ (vsid << 25) ^ pteg) & old_hash_mask) << pshift;

		hash = vsid ^ (vsid << 25) ^ (offset >> pshift);
	}

	new_pteg = hash & new_hash_mask;
	if (vpte & HPTE_V_SECONDARY)
		new_pteg = ~hash & new_hash_mask;

	new_idx = new_pteg * HPTES_PER_GROUP + (idx % HPTES_PER_GROUP);
	new_hptep = (__be64 *)(new->virt + (new_idx << 4));

	replace_vpte = be64_to_cpu(new_hptep[0]);
	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
		unsigned long replace_rpte = be64_to_cpu(new_hptep[1]);
		replace_vpte = hpte_new_to_old_v(replace_vpte, replace_rpte);
	}

	if (replace_vpte & (HPTE_V_VALID | HPTE_V_ABSENT)) {
		BUG_ON(new->order >= old->order);

		if (replace_vpte & HPTE_V_BOLTED) {
			if (vpte & HPTE_V_BOLTED)
				/* Bolted collision, nothing we can do */
				ret = -ENOSPC;
			/* Discard the new HPTE */
			goto out;
		}

		/* Discard the previous HPTE */
	}

	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
		rpte = hpte_old_to_new_r(vpte, rpte);
		vpte = hpte_old_to_new_v(vpte);
	}

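	/*
	 * Install the entry in the new HPT: the second dword goes in first
	 * and the first (valid) dword last, the usual order for creating an
	 * HPTE.  As the comment below notes, no barrier is needed yet since
	 * the guest cannot use the new HPT until resize_hpt_pivot().
	 */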
	new_hptep[1] = cpu_to_be64(rpte);
	new->rev[new_idx].guest_rpte = guest_rpte;
	/* No need for a barrier, since new HPT isn't active */
	new_hptep[0] = cpu_to_be64(vpte);
	unlock_hpte(new_hptep, vpte);

out:
	unlock_hpte(hptep, vpte);
	return ret;
}

static int resize_hpt_rehash(struct kvm_resize_hpt *resize)
{
	struct kvm *kvm = resize->kvm;
	unsigned long i;
	int rc;

	for (i = 0; i < kvmppc_hpt_npte(&kvm->arch.hpt); i++) {
		rc = resize_hpt_rehash_hpte(resize, i);
		if (rc != 0)
			return rc;
	}

	return 0;
}

static void resize_hpt_pivot(struct kvm_resize_hpt *resize)
{
	struct kvm *kvm = resize->kvm;
	struct kvm_hpt_info hpt_tmp;

	/* Exchange the pending tables in the resize structure with
	 * the active tables */

	resize_hpt_debug(resize, "resize_hpt_pivot()\n");

	spin_lock(&kvm->mmu_lock);
	asm volatile("ptesync" : : : "memory");

	hpt_tmp = kvm->arch.hpt;
	kvmppc_set_hpt(kvm, &resize->hpt);
	resize->hpt = hpt_tmp;

	spin_unlock(&kvm->mmu_lock);

	synchronize_srcu_expedited(&kvm->srcu);

	if (cpu_has_feature(CPU_FTR_ARCH_300))
		kvmppc_setup_partition_table(kvm);

	resize_hpt_debug(resize, "resize_hpt_pivot() done\n");
}

static void resize_hpt_release(struct kvm *kvm, struct kvm_resize_hpt *resize)
{
	if (WARN_ON(!mutex_is_locked(&kvm->arch.mmu_setup_lock)))
		return;

	if (!resize)
		return;

	if (resize->error != -EBUSY) {
		if (resize->hpt.virt)
			kvmppc_free_hpt(&resize->hpt);
		kfree(resize);
	}

	if (kvm->arch.resize_hpt == resize)
		kvm->arch.resize_hpt = NULL;
}

static void resize_hpt_prepare_work(struct work_struct *work)
{
	struct kvm_resize_hpt *resize = container_of(work,
						     struct kvm_resize_hpt,
						     work);
	struct kvm *kvm = resize->kvm;
	int err = 0;

	if (WARN_ON(resize->error != -EBUSY))
		return;

	mutex_lock(&kvm->arch.mmu_setup_lock);

	/* Request is still current? */
	if (kvm->arch.resize_hpt == resize) {
		/* Large allocations may sleep for a while: do not hold
		 * kvm->arch.mmu_setup_lock across them.
		 */
		mutex_unlock(&kvm->arch.mmu_setup_lock);

		resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n",
				 resize->order);

		err = resize_hpt_allocate(resize);

		/* -EBUSY is reserved to mean "allocation still in progress",
		 * so the allocator must never return it here.
		 */
		if (WARN_ON(err == -EBUSY))
			err = -EINPROGRESS;

		mutex_lock(&kvm->arch.mmu_setup_lock);
		/* It is possible that kvm->arch.resize_hpt != resize
		 * after we grab kvm->arch.mmu_setup_lock again.
		 */
	}

	resize->error = err;

	if (kvm->arch.resize_hpt != resize)
		resize_hpt_release(kvm, resize);

	mutex_unlock(&kvm->arch.mmu_setup_lock);
}

long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm,
				     struct kvm_ppc_resize_hpt *rhpt)
{
	unsigned long flags = rhpt->flags;
	unsigned long shift = rhpt->shift;
	struct kvm_resize_hpt *resize;
	int ret;

	if (flags != 0 || kvm_is_radix(kvm))
		return -EINVAL;

	if (shift && ((shift < 18) || (shift > 46)))
		return -EINVAL;

	mutex_lock(&kvm->arch.mmu_setup_lock);

	resize = kvm->arch.resize_hpt;

	if (resize) {
		if (resize->order == shift) {
			/* Suitable resize in progress? */
			ret = resize->error;
			if (ret == -EBUSY)
				ret = 100; /* estimated time in ms */
			else if (ret)
				resize_hpt_release(kvm, resize);

			goto out;
		}

		/* not suitable, cancel it */
		resize_hpt_release(kvm, resize);
	}

	ret = 0;
	if (!shift)
		goto out; /* nothing to do */

	/* start new resize */

	resize = kzalloc(sizeof(*resize), GFP_KERNEL);
	if (!resize) {
		ret = -ENOMEM;
		goto out;
	}

	resize->error = -EBUSY;
	resize->order = shift;
	resize->kvm = kvm;
	INIT_WORK(&resize->work, resize_hpt_prepare_work);
	kvm->arch.resize_hpt = resize;

	schedule_work(&resize->work);

	ret = 100; /* estimated time in ms */

out:
	mutex_unlock(&kvm->arch.mmu_setup_lock);
	return ret;
}

static void resize_hpt_boot_vcpu(void *opaque)
{
	/* Nothing to do, just force a KVM exit */
}

long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm,
				    struct kvm_ppc_resize_hpt *rhpt)
{
	unsigned long flags = rhpt->flags;
	unsigned long shift = rhpt->shift;
	struct kvm_resize_hpt *resize;
	long ret;

	if (flags != 0 || kvm_is_radix(kvm))
		return -EINVAL;

	if (shift && ((shift < 18) || (shift > 46)))
		return -EINVAL;

	mutex_lock(&kvm->arch.mmu_setup_lock);

	resize = kvm->arch.resize_hpt;

	/* This shouldn't be possible */
	ret = -EIO;
	if (WARN_ON(!kvm->arch.mmu_ready))
		goto out_no_hpt;

	/* Stop VCPUs from running while we mess with the HPT */
	kvm->arch.mmu_ready = 0;
	smp_mb();

	/* Boot all CPUs out of the guest so they re-read
	 * mmu_ready */
	on_each_cpu(resize_hpt_boot_vcpu, NULL, 1);

	ret = -ENXIO;
	if (!resize || (resize->order != shift))
		goto out;

	ret = resize->error;
	if (ret)
		goto out;

	ret = resize_hpt_rehash(resize);
	if (ret)
		goto out;

	resize_hpt_pivot(resize);

out:
	/* Let VCPUs run again */
	kvm->arch.mmu_ready = 1;
	smp_mb();
out_no_hpt:
	resize_hpt_release(kvm, resize);
	mutex_unlock(&kvm->arch.mmu_setup_lock);
	return ret;
}

/*
 * Functions for reading and writing the hash table via reads and
 * writes on a file descriptor.
 *
 * Reads return the guest view of the hash table, which has to be
 * pieced together from the real hash table and the guest_rpte
 * values in the revmap array.
 *
 * On writes, each HPTE written is considered in turn, and if it
 * is valid, it is written to the HPT as if an H_ENTER with the
 * exact flag set was done.  When the invalid count is non-zero
 * in the header written to the stream, the kernel will make
 * sure that that many HPTEs are invalid, and invalidate them
 * if not.
 */
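/*
 * Illustrative sketch of how userspace might consume the stream produced
 * by kvm_htab_read() below; the header layout is struct kvm_get_htab_header
 * from the powerpc KVM uapi.  A single read() on the fd returned by
 * KVM_PPC_GET_HTAB_FD yields a batch of headers, each followed directly by
 * n_valid HPTEs of two 64-bit doublewords; the next n_invalid entries carry
 * no data and are simply known to be invalid:
 *
 *	char buf[65536], *p;
 *	ssize_t n = read(fd, buf, sizeof(buf));
 *
 *	for (p = buf; p + sizeof(struct kvm_get_htab_header) <= buf + n; ) {
 *		struct kvm_get_htab_header *hdr = (void *)p;
 *		__u64 *hpte = (__u64 *)(hdr + 1);
 *
 *		// entries hdr->index .. hdr->index + hdr->n_valid - 1 are
 *		// valid, with dwords hpte[2*k] and hpte[2*k + 1]; the
 *		// following hdr->n_invalid entries are invalid
 *		p = (char *)(hpte + 2 * hdr->n_valid);
 *	}
 *
 * Bounds checking and short-read handling are omitted here; see the KVM API
 * documentation for the authoritative description of the format.
 */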

struct kvm_htab_ctx {
	unsigned long index;
	unsigned long flags;
	struct kvm *kvm;
	int first_pass;
};

#define HPTE_SIZE	(2 * sizeof(unsigned long))

/*
 * Returns 1 if this HPT entry has been modified or has pending
 * R/C bit changes.
 */
static int hpte_dirty(struct revmap_entry *revp, __be64 *hptp)
{
	unsigned long rcbits_unset;

	if (revp->guest_rpte & HPTE_GR_MODIFIED)
		return 1;

	/* Also need to consider changes in reference and changed bits */
	rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C);
	if ((be64_to_cpu(hptp[0]) & HPTE_V_VALID) &&
	    (be64_to_cpu(hptp[1]) & rcbits_unset))
		return 1;

	return 0;
}

static long record_hpte(unsigned long flags, __be64 *hptp,
			unsigned long *hpte, struct revmap_entry *revp,
			int want_valid, int first_pass)
{
	unsigned long v, r, hr;
	unsigned long rcbits_unset;
	int ok = 1;
	int valid, dirty;

	/* Unmodified entries are uninteresting except on the first pass */
	dirty = hpte_dirty(revp, hptp);
	if (!first_pass && !dirty)
		return 0;

	valid = 0;
	if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) {
		valid = 1;
		if ((flags & KVM_GET_HTAB_BOLTED_ONLY) &&
		    !(be64_to_cpu(hptp[0]) & HPTE_V_BOLTED))
			valid = 0;
	}
	if (valid != want_valid)
		return 0;

	v = r = 0;
	if (valid || dirty) {
		/* lock the HPTE so it's stable and read it */
		preempt_disable();
		while (!try_lock_hpte(hptp, HPTE_V_HVLOCK))
			cpu_relax();
		v = be64_to_cpu(hptp[0]);
		hr = be64_to_cpu(hptp[1]);
		if (cpu_has_feature(CPU_FTR_ARCH_300)) {
			v = hpte_new_to_old_v(v, hr);
			hr = hpte_new_to_old_r(hr);
		}

		/* re-evaluate valid and dirty from synchronized HPTE value */
		valid = !!(v & HPTE_V_VALID);
		dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED);

		/* Harvest R and C into guest view if necessary */
		rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C);
		if (valid && (rcbits_unset & hr)) {
			revp->guest_rpte |= (hr &
				(HPTE_R_R | HPTE_R_C)) | HPTE_GR_MODIFIED;
			dirty = 1;
		}

		if (v & HPTE_V_ABSENT) {
			v &= ~HPTE_V_ABSENT;
			v |= HPTE_V_VALID;
			valid = 1;
		}
		if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && !(v & HPTE_V_BOLTED))
			valid = 0;

		r = revp->guest_rpte;
		/* only clear modified if this is the right sort of entry */
		if (valid == want_valid && dirty) {
			r &= ~HPTE_GR_MODIFIED;
			revp->guest_rpte = r;
		}
		unlock_hpte(hptp, be64_to_cpu(hptp[0]));
		preempt_enable();
		if (!(valid == want_valid && (first_pass || dirty)))
			ok = 0;
	}
	hpte[0] = cpu_to_be64(v);
	hpte[1] = cpu_to_be64(r);
	return ok;
}

static ssize_t kvm_htab_read(struct file *file, char __user *buf,
			     size_t count, loff_t *ppos)
{
	struct kvm_htab_ctx *ctx = file->private_data;
	struct kvm *kvm = ctx->kvm;
	struct kvm_get_htab_header hdr;
	__be64 *hptp;
	struct revmap_entry *revp;
	unsigned long i, nb, nw;
	unsigned long __user *lbuf;
	struct kvm_get_htab_header __user *hptr;
	unsigned long flags;
	int first_pass;
	unsigned long hpte[2];

	if (!access_ok(buf, count))
		return -EFAULT;
	if (kvm_is_radix(kvm))
		return 0;

	first_pass = ctx->first_pass;
	flags = ctx->flags;

	i = ctx->index;
	hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE));
	revp = kvm->arch.hpt.rev + i;
	lbuf = (unsigned long __user *)buf;

	nb = 0;
	while (nb + sizeof(hdr) + HPTE_SIZE < count) {
		/* Initialize header */
		hptr = (struct kvm_get_htab_header __user *)buf;
		hdr.n_valid = 0;
		hdr.n_invalid = 0;
		nw = nb;
		nb += sizeof(hdr);
		lbuf = (unsigned long __user *)(buf + sizeof(hdr));

		/* Skip uninteresting entries, i.e. clean on not-first pass */
		if (!first_pass) {
			while (i < kvmppc_hpt_npte(&kvm->arch.hpt) &&
			       !hpte_dirty(revp, hptp)) {
				++i;
				hptp += 2;
				++revp;
			}
		}
		hdr.index = i;

		/* Grab a series of valid entries */
		while (i < kvmppc_hpt_npte(&kvm->arch.hpt) &&
		       hdr.n_valid < 0xffff &&
		       nb + HPTE_SIZE < count &&
		       record_hpte(flags, hptp, hpte, revp, 1, first_pass)) {
			/* valid entry, write it out */
			++hdr.n_valid;
			if (__put_user(hpte[0], lbuf) ||
			    __put_user(hpte[1], lbuf + 1))
				return -EFAULT;
			nb += HPTE_SIZE;
			lbuf += 2;
			++i;
			hptp += 2;
			++revp;
		}
		/* Now skip invalid entries while we can */
		while (i < kvmppc_hpt_npte(&kvm->arch.hpt) &&
		       hdr.n_invalid < 0xffff &&
		       record_hpte(flags, hptp, hpte, revp, 0, first_pass)) {
			/* found an invalid entry */
			++hdr.n_invalid;
			++i;
			hptp += 2;
			++revp;
		}

		if (hdr.n_valid || hdr.n_invalid) {
			/* write back the header */
			if (__copy_to_user(hptr, &hdr, sizeof(hdr)))
				return -EFAULT;
			nw = nb;
			buf = (char __user *)lbuf;
		} else {
			nb = nw;
		}

		/* Check if we've wrapped around the hash table */
		if (i >= kvmppc_hpt_npte(&kvm->arch.hpt)) {
			i = 0;
			ctx->first_pass = 0;
			break;
		}
	}

	ctx->index = i;

	return nb;
}

static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
			      size_t count, loff_t *ppos)
{
	struct kvm_htab_ctx *ctx = file->private_data;
	struct kvm *kvm = ctx->kvm;
	struct kvm_get_htab_header hdr;
	unsigned long i, j;
	unsigned long v, r;
	unsigned long __user *lbuf;
	__be64 *hptp;
	unsigned long tmp[2];
	ssize_t nb;
	long int err, ret;
	int mmu_ready;
	int pshift;

	if (!access_ok(buf, count))
		return -EFAULT;
	if (kvm_is_radix(kvm))
		return -EINVAL;

	/* lock out vcpus from running while we're doing this */
	mutex_lock(&kvm->arch.mmu_setup_lock);
	mmu_ready = kvm->arch.mmu_ready;
	if (mmu_ready) {
		kvm->arch.mmu_ready = 0;	/* temporarily */
		/* order mmu_ready vs. vcpus_running */
		smp_mb();
		if (atomic_read(&kvm->arch.vcpus_running)) {
			kvm->arch.mmu_ready = 1;
			mutex_unlock(&kvm->arch.mmu_setup_lock);
			return -EBUSY;
		}
	}

	err = 0;
	for (nb = 0; nb + sizeof(hdr) <= count; ) {
		err = -EFAULT;
		if (__copy_from_user(&hdr, buf, sizeof(hdr)))
			break;

		err = 0;
		if (nb + hdr.n_valid * HPTE_SIZE > count)
			break;

		nb += sizeof(hdr);
		buf += sizeof(hdr);

		err = -EINVAL;
		i = hdr.index;
		if (i >= kvmppc_hpt_npte(&kvm->arch.hpt) ||
		    i + hdr.n_valid + hdr.n_invalid > kvmppc_hpt_npte(&kvm->arch.hpt))
			break;

		hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE));
		lbuf = (unsigned long __user *)buf;
		for (j = 0; j < hdr.n_valid; ++j) {
			__be64 hpte_v;
			__be64 hpte_r;

			err = -EFAULT;
			if (__get_user(hpte_v, lbuf) ||
			    __get_user(hpte_r, lbuf + 1))
				goto out;
			v = be64_to_cpu(hpte_v);
			r = be64_to_cpu(hpte_r);
			err = -EINVAL;
			if (!(v & HPTE_V_VALID))
				goto out;
			pshift = kvmppc_hpte_base_page_shift(v, r);
			if (pshift <= 0)
				goto out;
			lbuf += 2;
			nb += HPTE_SIZE;

			if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT))
				kvmppc_do_h_remove(kvm, 0, i, 0, tmp);
			err = -EIO;
			ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, i, v, r,
							 tmp);
			if (ret != H_SUCCESS) {
				pr_err("kvm_htab_write ret %ld i=%ld v=%lx "
				       "r=%lx\n", ret, i, v, r);
				goto out;
			}
			if (!mmu_ready && is_vrma_hpte(v)) {
				unsigned long senc, lpcr;

				senc = slb_pgsize_encoding(1ul << pshift);
				kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T |
					(VRMA_VSID << SLB_VSID_SHIFT_1T);
				if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
					lpcr = senc << (LPCR_VRMASD_SH - 4);
					kvmppc_update_lpcr(kvm, lpcr,
							   LPCR_VRMASD);
				} else {
					kvmppc_setup_partition_table(kvm);
				}
				mmu_ready = 1;
			}
			++i;
			hptp += 2;
		}

		for (j = 0; j < hdr.n_invalid; ++j) {
			if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT))
				kvmppc_do_h_remove(kvm, 0, i, 0, tmp);
			++i;
			hptp += 2;
		}
		err = 0;
	}

out:
	/* Order HPTE updates vs. mmu_ready */
	smp_wmb();
	kvm->arch.mmu_ready = mmu_ready;
	mutex_unlock(&kvm->arch.mmu_setup_lock);

	if (err)
		return err;
	return nb;
}

static int kvm_htab_release(struct inode *inode, struct file *filp)
{
	struct kvm_htab_ctx *ctx = filp->private_data;

	filp->private_data = NULL;
	if (!(ctx->flags & KVM_GET_HTAB_WRITE))
		atomic_dec(&ctx->kvm->arch.hpte_mod_interest);
	kvm_put_kvm(ctx->kvm);
	kfree(ctx);
	return 0;
}

static const struct file_operations kvm_htab_fops = {
	.read		= kvm_htab_read,
	.write		= kvm_htab_write,
	.llseek		= default_llseek,
	.release	= kvm_htab_release,
};

int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf)
{
	int ret;
	struct kvm_htab_ctx *ctx;
	int rwflag;

	/* reject flags we don't recognize */
	if (ghf->flags & ~(KVM_GET_HTAB_BOLTED_ONLY | KVM_GET_HTAB_WRITE))
		return -EINVAL;
	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;
	kvm_get_kvm(kvm);
	ctx->kvm = kvm;
	ctx->index = ghf->start_index;
	ctx->flags = ghf->flags;
	ctx->first_pass = 1;

	rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? O_WRONLY : O_RDONLY;
	ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag | O_CLOEXEC);
	if (ret < 0) {
		kfree(ctx);
		kvm_put_kvm(kvm);
		return ret;
	}

	if (rwflag == O_RDONLY) {
		mutex_lock(&kvm->slots_lock);
		atomic_inc(&kvm->arch.hpte_mod_interest);
		/* make sure kvmppc_do_h_enter etc. see the increment */
		synchronize_srcu_expedited(&kvm->srcu);
		mutex_unlock(&kvm->slots_lock);
	}

	return ret;
}

struct debugfs_htab_state {
	struct kvm	*kvm;
	struct mutex	mutex;
	unsigned long	hpt_index;
	int		chars_left;
	int		buf_index;
	char		buf[64];
};

static int debugfs_htab_open(struct inode *inode, struct file *file)
{
	struct kvm *kvm = inode->i_private;
	struct debugfs_htab_state *p;

	p = kzalloc(sizeof(*p), GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	kvm_get_kvm(kvm);
	p->kvm = kvm;
	mutex_init(&p->mutex);
	file->private_data = p;

	return nonseekable_open(inode, file);
}

static int debugfs_htab_release(struct inode *inode, struct file *file)
{
	struct debugfs_htab_state *p = file->private_data;

	kvm_put_kvm(p->kvm);
	kfree(p);
	return 0;
}

static ssize_t debugfs_htab_read(struct file *file, char __user *buf,
				 size_t len, loff_t *ppos)
{
	struct debugfs_htab_state *p = file->private_data;
	ssize_t ret, r;
	unsigned long i, n;
	unsigned long v, hr, gr;
	struct kvm *kvm;
	__be64 *hptp;

	kvm = p->kvm;
	if (kvm_is_radix(kvm))
		return 0;

	ret = mutex_lock_interruptible(&p->mutex);
	if (ret)
		return ret;

	if (p->chars_left) {
		n = p->chars_left;
		if (n > len)
			n = len;
		r = copy_to_user(buf, p->buf + p->buf_index, n);
		n -= r;
		p->chars_left -= n;
		p->buf_index += n;
		buf += n;
		len -= n;
		ret = n;
		if (r) {
			if (!n)
				ret = -EFAULT;
			goto out;
		}
	}

	i = p->hpt_index;
	hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE));
	for (; len != 0 && i < kvmppc_hpt_npte(&kvm->arch.hpt);
	     ++i, hptp += 2) {
		if (!(be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)))
			continue;

		/* lock the HPTE so it's stable and read it */
		preempt_disable();
		while (!try_lock_hpte(hptp, HPTE_V_HVLOCK))
			cpu_relax();
		v = be64_to_cpu(hptp[0]) & ~HPTE_V_HVLOCK;
		hr = be64_to_cpu(hptp[1]);
		gr = kvm->arch.hpt.rev[i].guest_rpte;
		unlock_hpte(hptp, v);
		preempt_enable();

		if (!(v & (HPTE_V_VALID | HPTE_V_ABSENT)))
			continue;

		n = scnprintf(p->buf, sizeof(p->buf),
			      "%6lx %.16lx %.16lx %.16lx\n",
			      i, v, hr, gr);
		p->chars_left = n;
		if (n > len)
			n = len;
		r = copy_to_user(buf, p->buf, n);
		n -= r;
		p->chars_left -= n;
		p->buf_index = n;
		buf += n;
		len -= n;
		ret += n;
		if (r) {
			if (!ret)
				ret = -EFAULT;
			goto out;
		}
	}
	p->hpt_index = i;

out:
	mutex_unlock(&p->mutex);
	return ret;
}

static ssize_t debugfs_htab_write(struct file *file, const char __user *buf,
				  size_t len, loff_t *ppos)
{
	return -EACCES;
}

static const struct file_operations debugfs_htab_fops = {
	.owner	 = THIS_MODULE,
	.open	 = debugfs_htab_open,
	.release = debugfs_htab_release,
	.read	 = debugfs_htab_read,
	.write	 = debugfs_htab_write,
	.llseek	 = generic_file_llseek,
};

void kvmppc_mmu_debugfs_init(struct kvm *kvm)
{
	kvm->arch.htab_dentry = debugfs_create_file("htab", 0400,
						    kvm->arch.debugfs_dir, kvm,
						    &debugfs_htab_fops);
}

void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
{
	struct kvmppc_mmu *mmu = &vcpu->arch.mmu;

	vcpu->arch.slb_nr = 32;		/* POWER7/POWER8 */

	mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
	mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr;

	vcpu->arch.hflags |= BOOK3S_HFLAG_SLB;
}