// SPDX-License-Identifier: GPL-2.0-only
/*
 *
 * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 */

#include <linux/types.h>
#include <linux/string.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/highmem.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
#include <linux/srcu.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/debugfs.h>

#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include <asm/book3s/64/mmu-hash.h>
#include <asm/hvcall.h>
#include <asm/synch.h>
#include <asm/ppc-opcode.h>
#include <asm/cputable.h>
#include <asm/pte-walk.h>

#include "trace_hv.h"

//#define DEBUG_RESIZE_HPT	1

#ifdef DEBUG_RESIZE_HPT
#define resize_hpt_debug(resize, ...)				\
	do {							\
		printk(KERN_DEBUG "RESIZE HPT %p: ", resize);	\
		printk(__VA_ARGS__);				\
	} while (0)
#else
#define resize_hpt_debug(resize, ...)				\
	do { } while (0)
#endif

static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
				long pte_index, unsigned long pteh,
				unsigned long ptel, unsigned long *pte_idx_ret);

struct kvm_resize_hpt {
	/* These fields read-only after init */
	struct kvm *kvm;
	struct work_struct work;
	u32 order;

	/* These fields protected by kvm->arch.mmu_setup_lock */

	/* Possible values and their usage:
	 *  <0     an error occurred during allocation,
	 *  -EBUSY allocation is in progress,
	 *  0      allocation made successfully.
	 */
	int error;

	/* Private to the work thread, until error != -EBUSY,
	 * then protected by kvm->arch.mmu_setup_lock.
	 */
	struct kvm_hpt_info hpt;
};

int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order)
{
	unsigned long hpt = 0;
	int cma = 0;
	struct page *page = NULL;
	struct revmap_entry *rev;
	unsigned long npte;

	if ((order < PPC_MIN_HPT_ORDER) || (order > PPC_MAX_HPT_ORDER))
		return -EINVAL;

	page = kvm_alloc_hpt_cma(1ul << (order - PAGE_SHIFT));
	if (page) {
		hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
		memset((void *)hpt, 0, (1ul << order));
		cma = 1;
	}

	if (!hpt)
		hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_RETRY_MAYFAIL
				       |__GFP_NOWARN, order - PAGE_SHIFT);

	if (!hpt)
		return -ENOMEM;

	/* HPTEs are 2**4 bytes long */
	npte = 1ul << (order - 4);

	/* Allocate reverse map array */
	rev = vmalloc(array_size(npte, sizeof(struct revmap_entry)));
	if (!rev) {
		if (cma)
			kvm_free_hpt_cma(page, 1 << (order - PAGE_SHIFT));
		else
			free_pages(hpt, order - PAGE_SHIFT);
		return -ENOMEM;
	}

	info->order = order;
	info->virt = hpt;
	info->cma = cma;
	info->rev = rev;

	return 0;
}

void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info)
{
	atomic64_set(&kvm->arch.mmio_update, 0);
	kvm->arch.hpt = *info;
	kvm->arch.sdr1 = __pa(info->virt) | (info->order - 18);

	pr_debug("KVM guest htab at %lx (order %ld), LPID %x\n",
		 info->virt, (long)info->order, kvm->arch.lpid);
}

long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order)
{
	long err = -EBUSY;
	struct kvm_hpt_info info;

	mutex_lock(&kvm->arch.mmu_setup_lock);
	if (kvm->arch.mmu_ready) {
		kvm->arch.mmu_ready = 0;
		/* order mmu_ready vs.
vcpus_running */ 135 smp_mb(); 136 if (atomic_read(&kvm->arch.vcpus_running)) { 137 kvm->arch.mmu_ready = 1; 138 goto out; 139 } 140 } 141 if (kvm_is_radix(kvm)) { 142 err = kvmppc_switch_mmu_to_hpt(kvm); 143 if (err) 144 goto out; 145 } 146 147 if (kvm->arch.hpt.order == order) { 148 /* We already have a suitable HPT */ 149 150 /* Set the entire HPT to 0, i.e. invalid HPTEs */ 151 memset((void *)kvm->arch.hpt.virt, 0, 1ul << order); 152 /* 153 * Reset all the reverse-mapping chains for all memslots 154 */ 155 kvmppc_rmap_reset(kvm); 156 err = 0; 157 goto out; 158 } 159 160 if (kvm->arch.hpt.virt) { 161 kvmppc_free_hpt(&kvm->arch.hpt); 162 kvmppc_rmap_reset(kvm); 163 } 164 165 err = kvmppc_allocate_hpt(&info, order); 166 if (err < 0) 167 goto out; 168 kvmppc_set_hpt(kvm, &info); 169 170 out: 171 if (err == 0) 172 /* Ensure that each vcpu will flush its TLB on next entry. */ 173 cpumask_setall(&kvm->arch.need_tlb_flush); 174 175 mutex_unlock(&kvm->arch.mmu_setup_lock); 176 return err; 177 } 178 179 void kvmppc_free_hpt(struct kvm_hpt_info *info) 180 { 181 vfree(info->rev); 182 info->rev = NULL; 183 if (info->cma) 184 kvm_free_hpt_cma(virt_to_page(info->virt), 185 1 << (info->order - PAGE_SHIFT)); 186 else if (info->virt) 187 free_pages(info->virt, info->order - PAGE_SHIFT); 188 info->virt = 0; 189 info->order = 0; 190 } 191 192 /* Bits in first HPTE dword for pagesize 4k, 64k or 16M */ 193 static inline unsigned long hpte0_pgsize_encoding(unsigned long pgsize) 194 { 195 return (pgsize > 0x1000) ? HPTE_V_LARGE : 0; 196 } 197 198 /* Bits in second HPTE dword for pagesize 4k, 64k or 16M */ 199 static inline unsigned long hpte1_pgsize_encoding(unsigned long pgsize) 200 { 201 return (pgsize == 0x10000) ? 0x1000 : 0; 202 } 203 204 void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, 205 unsigned long porder) 206 { 207 unsigned long i; 208 unsigned long npages; 209 unsigned long hp_v, hp_r; 210 unsigned long addr, hash; 211 unsigned long psize; 212 unsigned long hp0, hp1; 213 unsigned long idx_ret; 214 long ret; 215 struct kvm *kvm = vcpu->kvm; 216 217 psize = 1ul << porder; 218 npages = memslot->npages >> (porder - PAGE_SHIFT); 219 220 /* VRMA can't be > 1TB */ 221 if (npages > 1ul << (40 - porder)) 222 npages = 1ul << (40 - porder); 223 /* Can't use more than 1 HPTE per HPTEG */ 224 if (npages > kvmppc_hpt_mask(&kvm->arch.hpt) + 1) 225 npages = kvmppc_hpt_mask(&kvm->arch.hpt) + 1; 226 227 hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) | 228 HPTE_V_BOLTED | hpte0_pgsize_encoding(psize); 229 hp1 = hpte1_pgsize_encoding(psize) | 230 HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX; 231 232 for (i = 0; i < npages; ++i) { 233 addr = i << porder; 234 /* can't use hpt_hash since va > 64 bits */ 235 hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) 236 & kvmppc_hpt_mask(&kvm->arch.hpt); 237 /* 238 * We assume that the hash table is empty and no 239 * vcpus are using it at this stage. Since we create 240 * at most one HPTE per HPTEG, we just assume entry 7 241 * is available and use it. 
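		 * (Each HPTEG holds eight 16-byte HPTEs, so "(hash << 3) + 7"
		 * below indexes the last slot of the selected group.)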
242 */ 243 hash = (hash << 3) + 7; 244 hp_v = hp0 | ((addr >> 16) & ~0x7fUL); 245 hp_r = hp1 | addr; 246 ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, hash, hp_v, hp_r, 247 &idx_ret); 248 if (ret != H_SUCCESS) { 249 pr_err("KVM: map_vrma at %lx failed, ret=%ld\n", 250 addr, ret); 251 break; 252 } 253 } 254 } 255 256 int kvmppc_mmu_hv_init(void) 257 { 258 unsigned long host_lpid, rsvd_lpid; 259 260 if (!mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE)) 261 return -EINVAL; 262 263 /* POWER7 has 10-bit LPIDs (12-bit in POWER8) */ 264 host_lpid = 0; 265 if (cpu_has_feature(CPU_FTR_HVMODE)) 266 host_lpid = mfspr(SPRN_LPID); 267 rsvd_lpid = LPID_RSVD; 268 269 kvmppc_init_lpid(rsvd_lpid + 1); 270 271 kvmppc_claim_lpid(host_lpid); 272 /* rsvd_lpid is reserved for use in partition switching */ 273 kvmppc_claim_lpid(rsvd_lpid); 274 275 return 0; 276 } 277 278 static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags, 279 long pte_index, unsigned long pteh, 280 unsigned long ptel, unsigned long *pte_idx_ret) 281 { 282 long ret; 283 284 preempt_disable(); 285 ret = kvmppc_do_h_enter(kvm, flags, pte_index, pteh, ptel, 286 kvm->mm->pgd, false, pte_idx_ret); 287 preempt_enable(); 288 if (ret == H_TOO_HARD) { 289 /* this can't happen */ 290 pr_err("KVM: Oops, kvmppc_h_enter returned too hard!\n"); 291 ret = H_RESOURCE; /* or something */ 292 } 293 return ret; 294 295 } 296 297 static struct kvmppc_slb *kvmppc_mmu_book3s_hv_find_slbe(struct kvm_vcpu *vcpu, 298 gva_t eaddr) 299 { 300 u64 mask; 301 int i; 302 303 for (i = 0; i < vcpu->arch.slb_nr; i++) { 304 if (!(vcpu->arch.slb[i].orige & SLB_ESID_V)) 305 continue; 306 307 if (vcpu->arch.slb[i].origv & SLB_VSID_B_1T) 308 mask = ESID_MASK_1T; 309 else 310 mask = ESID_MASK; 311 312 if (((vcpu->arch.slb[i].orige ^ eaddr) & mask) == 0) 313 return &vcpu->arch.slb[i]; 314 } 315 return NULL; 316 } 317 318 static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r, 319 unsigned long ea) 320 { 321 unsigned long ra_mask; 322 323 ra_mask = kvmppc_actual_pgsz(v, r) - 1; 324 return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask); 325 } 326 327 static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, 328 struct kvmppc_pte *gpte, bool data, bool iswrite) 329 { 330 struct kvm *kvm = vcpu->kvm; 331 struct kvmppc_slb *slbe; 332 unsigned long slb_v; 333 unsigned long pp, key; 334 unsigned long v, orig_v, gr; 335 __be64 *hptep; 336 long int index; 337 int virtmode = vcpu->arch.shregs.msr & (data ? 
MSR_DR : MSR_IR); 338 339 if (kvm_is_radix(vcpu->kvm)) 340 return kvmppc_mmu_radix_xlate(vcpu, eaddr, gpte, data, iswrite); 341 342 /* Get SLB entry */ 343 if (virtmode) { 344 slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr); 345 if (!slbe) 346 return -EINVAL; 347 slb_v = slbe->origv; 348 } else { 349 /* real mode access */ 350 slb_v = vcpu->kvm->arch.vrma_slb_v; 351 } 352 353 preempt_disable(); 354 /* Find the HPTE in the hash table */ 355 index = kvmppc_hv_find_lock_hpte(kvm, eaddr, slb_v, 356 HPTE_V_VALID | HPTE_V_ABSENT); 357 if (index < 0) { 358 preempt_enable(); 359 return -ENOENT; 360 } 361 hptep = (__be64 *)(kvm->arch.hpt.virt + (index << 4)); 362 v = orig_v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK; 363 if (cpu_has_feature(CPU_FTR_ARCH_300)) 364 v = hpte_new_to_old_v(v, be64_to_cpu(hptep[1])); 365 gr = kvm->arch.hpt.rev[index].guest_rpte; 366 367 unlock_hpte(hptep, orig_v); 368 preempt_enable(); 369 370 gpte->eaddr = eaddr; 371 gpte->vpage = ((v & HPTE_V_AVPN) << 4) | ((eaddr >> 12) & 0xfff); 372 373 /* Get PP bits and key for permission check */ 374 pp = gr & (HPTE_R_PP0 | HPTE_R_PP); 375 key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS; 376 key &= slb_v; 377 378 /* Calculate permissions */ 379 gpte->may_read = hpte_read_permission(pp, key); 380 gpte->may_write = hpte_write_permission(pp, key); 381 gpte->may_execute = gpte->may_read && !(gr & (HPTE_R_N | HPTE_R_G)); 382 383 /* Storage key permission check for POWER7 */ 384 if (data && virtmode) { 385 int amrfield = hpte_get_skey_perm(gr, vcpu->arch.amr); 386 if (amrfield & 1) 387 gpte->may_read = 0; 388 if (amrfield & 2) 389 gpte->may_write = 0; 390 } 391 392 /* Get the guest physical address */ 393 gpte->raddr = kvmppc_mmu_get_real_addr(v, gr, eaddr); 394 return 0; 395 } 396 397 /* 398 * Quick test for whether an instruction is a load or a store. 399 * If the instruction is a load or a store, then this will indicate 400 * which it is, at least on server processors. (Embedded processors 401 * have some external PID instructions that don't follow the rule 402 * embodied here.) If the instruction isn't a load or store, then 403 * this doesn't return anything useful. 404 */ 405 static int instruction_is_store(unsigned int instr) 406 { 407 unsigned int mask; 408 409 mask = 0x10000000; 410 if ((instr & 0xfc000000) == 0x7c000000) 411 mask = 0x100; /* major opcode 31 */ 412 return (instr & mask) != 0; 413 } 414 415 int kvmppc_hv_emulate_mmio(struct kvm_vcpu *vcpu, 416 unsigned long gpa, gva_t ea, int is_store) 417 { 418 u32 last_inst; 419 420 /* 421 * Fast path - check if the guest physical address corresponds to a 422 * device on the FAST_MMIO_BUS, if so we can avoid loading the 423 * instruction all together, then we can just handle it and return. 424 */ 425 if (is_store) { 426 int idx, ret; 427 428 idx = srcu_read_lock(&vcpu->kvm->srcu); 429 ret = kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, (gpa_t) gpa, 0, 430 NULL); 431 srcu_read_unlock(&vcpu->kvm->srcu, idx); 432 if (!ret) { 433 kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) + 4); 434 return RESUME_GUEST; 435 } 436 } 437 438 /* 439 * If we fail, we just return to the guest and try executing it again. 440 */ 441 if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst) != 442 EMULATE_DONE) 443 return RESUME_GUEST; 444 445 /* 446 * WARNING: We do not know for sure whether the instruction we just 447 * read from memory is the same that caused the fault in the first 448 * place. 
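	 * (The guest could have modified or remapped that memory in the
	 * meantime.)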
	 * If the instruction we read is neither a load nor a store,
	 * then it can't access memory, so we don't need to worry about
	 * enforcing access permissions.  So, assuming it is a load or
	 * store, we just check that its direction (load or store) is
	 * consistent with the original fault, since that's what we
	 * checked the access permissions against.  If there is a mismatch
	 * we just return and retry the instruction.
	 */

	if (instruction_is_store(last_inst) != !!is_store)
		return RESUME_GUEST;

	/*
	 * Emulated accesses are handled by looking at the hash for
	 * translation once, then performing the access later.  The
	 * translation could be invalidated in the meantime, at which
	 * point performing the subsequent memory access on the old
	 * physical address could possibly be a security hole for the
	 * guest (but not the host).
	 *
	 * This is less of an issue for MMIO stores since they aren't
	 * globally visible.  It could be an issue for MMIO loads to
	 * a certain extent but we'll ignore it for now.
	 */

	vcpu->arch.paddr_accessed = gpa;
	vcpu->arch.vaddr_accessed = ea;
	return kvmppc_emulate_mmio(vcpu);
}

int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu,
				unsigned long ea, unsigned long dsisr)
{
	struct kvm *kvm = vcpu->kvm;
	unsigned long hpte[3], r;
	unsigned long hnow_v, hnow_r;
	__be64 *hptep;
	unsigned long mmu_seq, psize, pte_size;
	unsigned long gpa_base, gfn_base;
	unsigned long gpa, gfn, hva, pfn, hpa;
	struct kvm_memory_slot *memslot;
	unsigned long *rmap;
	struct revmap_entry *rev;
	struct page *page;
	long index, ret;
	bool is_ci;
	bool writing, write_ok;
	unsigned int shift;
	unsigned long rcbits;
	long mmio_update;
	pte_t pte, *ptep;

	if (kvm_is_radix(kvm))
		return kvmppc_book3s_radix_page_fault(vcpu, ea, dsisr);

	/*
	 * Real-mode code has already searched the HPT and found the
	 * entry we're interested in.  Lock the entry and check that
	 * it hasn't changed.  If it has, just return and re-execute the
	 * instruction.
508 */ 509 if (ea != vcpu->arch.pgfault_addr) 510 return RESUME_GUEST; 511 512 if (vcpu->arch.pgfault_cache) { 513 mmio_update = atomic64_read(&kvm->arch.mmio_update); 514 if (mmio_update == vcpu->arch.pgfault_cache->mmio_update) { 515 r = vcpu->arch.pgfault_cache->rpte; 516 psize = kvmppc_actual_pgsz(vcpu->arch.pgfault_hpte[0], 517 r); 518 gpa_base = r & HPTE_R_RPN & ~(psize - 1); 519 gfn_base = gpa_base >> PAGE_SHIFT; 520 gpa = gpa_base | (ea & (psize - 1)); 521 return kvmppc_hv_emulate_mmio(vcpu, gpa, ea, 522 dsisr & DSISR_ISSTORE); 523 } 524 } 525 index = vcpu->arch.pgfault_index; 526 hptep = (__be64 *)(kvm->arch.hpt.virt + (index << 4)); 527 rev = &kvm->arch.hpt.rev[index]; 528 preempt_disable(); 529 while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) 530 cpu_relax(); 531 hpte[0] = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK; 532 hpte[1] = be64_to_cpu(hptep[1]); 533 hpte[2] = r = rev->guest_rpte; 534 unlock_hpte(hptep, hpte[0]); 535 preempt_enable(); 536 537 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 538 hpte[0] = hpte_new_to_old_v(hpte[0], hpte[1]); 539 hpte[1] = hpte_new_to_old_r(hpte[1]); 540 } 541 if (hpte[0] != vcpu->arch.pgfault_hpte[0] || 542 hpte[1] != vcpu->arch.pgfault_hpte[1]) 543 return RESUME_GUEST; 544 545 /* Translate the logical address and get the page */ 546 psize = kvmppc_actual_pgsz(hpte[0], r); 547 gpa_base = r & HPTE_R_RPN & ~(psize - 1); 548 gfn_base = gpa_base >> PAGE_SHIFT; 549 gpa = gpa_base | (ea & (psize - 1)); 550 gfn = gpa >> PAGE_SHIFT; 551 memslot = gfn_to_memslot(kvm, gfn); 552 553 trace_kvm_page_fault_enter(vcpu, hpte, memslot, ea, dsisr); 554 555 /* No memslot means it's an emulated MMIO region */ 556 if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) 557 return kvmppc_hv_emulate_mmio(vcpu, gpa, ea, 558 dsisr & DSISR_ISSTORE); 559 560 /* 561 * This should never happen, because of the slot_is_aligned() 562 * check in kvmppc_do_h_enter(). 563 */ 564 if (gfn_base < memslot->base_gfn) 565 return -EFAULT; 566 567 /* used to check for invalidations in progress */ 568 mmu_seq = kvm->mmu_notifier_seq; 569 smp_rmb(); 570 571 ret = -EFAULT; 572 page = NULL; 573 writing = (dsisr & DSISR_ISSTORE) != 0; 574 /* If writing != 0, then the HPTE must allow writing, if we get here */ 575 write_ok = writing; 576 hva = gfn_to_hva_memslot(memslot, gfn); 577 578 /* 579 * Do a fast check first, since __gfn_to_pfn_memslot doesn't 580 * do it with !atomic && !async, which is how we call it. 581 * We always ask for write permission since the common case 582 * is that the page is writable. 583 */ 584 if (get_user_page_fast_only(hva, FOLL_WRITE, &page)) { 585 write_ok = true; 586 } else { 587 /* Call KVM generic code to do the slow-path check */ 588 pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, 589 writing, &write_ok); 590 if (is_error_noslot_pfn(pfn)) 591 return -EFAULT; 592 page = NULL; 593 if (pfn_valid(pfn)) { 594 page = pfn_to_page(pfn); 595 if (PageReserved(page)) 596 page = NULL; 597 } 598 } 599 600 /* 601 * Read the PTE from the process' radix tree and use that 602 * so we get the shift and attribute bits. 603 */ 604 spin_lock(&kvm->mmu_lock); 605 ptep = find_kvm_host_pte(kvm, mmu_seq, hva, &shift); 606 pte = __pte(0); 607 if (ptep) 608 pte = READ_ONCE(*ptep); 609 spin_unlock(&kvm->mmu_lock); 610 /* 611 * If the PTE disappeared temporarily due to a THP 612 * collapse, just return and let the guest try again. 
613 */ 614 if (!pte_present(pte)) { 615 if (page) 616 put_page(page); 617 return RESUME_GUEST; 618 } 619 hpa = pte_pfn(pte) << PAGE_SHIFT; 620 pte_size = PAGE_SIZE; 621 if (shift) 622 pte_size = 1ul << shift; 623 is_ci = pte_ci(pte); 624 625 if (psize > pte_size) 626 goto out_put; 627 if (pte_size > psize) 628 hpa |= hva & (pte_size - psize); 629 630 /* Check WIMG vs. the actual page we're accessing */ 631 if (!hpte_cache_flags_ok(r, is_ci)) { 632 if (is_ci) 633 goto out_put; 634 /* 635 * Allow guest to map emulated device memory as 636 * uncacheable, but actually make it cacheable. 637 */ 638 r = (r & ~(HPTE_R_W|HPTE_R_I|HPTE_R_G)) | HPTE_R_M; 639 } 640 641 /* 642 * Set the HPTE to point to hpa. 643 * Since the hpa is at PAGE_SIZE granularity, make sure we 644 * don't mask out lower-order bits if psize < PAGE_SIZE. 645 */ 646 if (psize < PAGE_SIZE) 647 psize = PAGE_SIZE; 648 r = (r & HPTE_R_KEY_HI) | (r & ~(HPTE_R_PP0 - psize)) | hpa; 649 if (hpte_is_writable(r) && !write_ok) 650 r = hpte_make_readonly(r); 651 ret = RESUME_GUEST; 652 preempt_disable(); 653 while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) 654 cpu_relax(); 655 hnow_v = be64_to_cpu(hptep[0]); 656 hnow_r = be64_to_cpu(hptep[1]); 657 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 658 hnow_v = hpte_new_to_old_v(hnow_v, hnow_r); 659 hnow_r = hpte_new_to_old_r(hnow_r); 660 } 661 662 /* 663 * If the HPT is being resized, don't update the HPTE, 664 * instead let the guest retry after the resize operation is complete. 665 * The synchronization for mmu_ready test vs. set is provided 666 * by the HPTE lock. 667 */ 668 if (!kvm->arch.mmu_ready) 669 goto out_unlock; 670 671 if ((hnow_v & ~HPTE_V_HVLOCK) != hpte[0] || hnow_r != hpte[1] || 672 rev->guest_rpte != hpte[2]) 673 /* HPTE has been changed under us; let the guest retry */ 674 goto out_unlock; 675 hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID; 676 677 /* Always put the HPTE in the rmap chain for the page base address */ 678 rmap = &memslot->arch.rmap[gfn_base - memslot->base_gfn]; 679 lock_rmap(rmap); 680 681 /* Check if we might have been invalidated; let the guest retry if so */ 682 ret = RESUME_GUEST; 683 if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) { 684 unlock_rmap(rmap); 685 goto out_unlock; 686 } 687 688 /* Only set R/C in real HPTE if set in both *rmap and guest_rpte */ 689 rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT; 690 r &= rcbits | ~(HPTE_R_R | HPTE_R_C); 691 692 if (be64_to_cpu(hptep[0]) & HPTE_V_VALID) { 693 /* HPTE was previously valid, so we need to invalidate it */ 694 unlock_rmap(rmap); 695 hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); 696 kvmppc_invalidate_hpte(kvm, hptep, index); 697 /* don't lose previous R and C bits */ 698 r |= be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C); 699 } else { 700 kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0); 701 } 702 703 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 704 r = hpte_old_to_new_r(hpte[0], r); 705 hpte[0] = hpte_old_to_new_v(hpte[0]); 706 } 707 hptep[1] = cpu_to_be64(r); 708 eieio(); 709 __unlock_hpte(hptep, hpte[0]); 710 asm volatile("ptesync" : : : "memory"); 711 preempt_enable(); 712 if (page && hpte_is_writable(r)) 713 set_page_dirty_lock(page); 714 715 out_put: 716 trace_kvm_page_fault_exit(vcpu, hpte, ret); 717 718 if (page) 719 put_page(page); 720 return ret; 721 722 out_unlock: 723 __unlock_hpte(hptep, be64_to_cpu(hptep[0])); 724 preempt_enable(); 725 goto out_put; 726 } 727 728 void kvmppc_rmap_reset(struct kvm *kvm) 729 { 730 struct kvm_memslots *slots; 731 struct kvm_memory_slot *memslot; 732 int srcu_idx; 733 734 srcu_idx 
= srcu_read_lock(&kvm->srcu); 735 slots = kvm_memslots(kvm); 736 kvm_for_each_memslot(memslot, slots) { 737 /* Mutual exclusion with kvm_unmap_hva_range etc. */ 738 spin_lock(&kvm->mmu_lock); 739 /* 740 * This assumes it is acceptable to lose reference and 741 * change bits across a reset. 742 */ 743 memset(memslot->arch.rmap, 0, 744 memslot->npages * sizeof(*memslot->arch.rmap)); 745 spin_unlock(&kvm->mmu_lock); 746 } 747 srcu_read_unlock(&kvm->srcu, srcu_idx); 748 } 749 750 typedef int (*hva_handler_fn)(struct kvm *kvm, struct kvm_memory_slot *memslot, 751 unsigned long gfn); 752 753 static int kvm_handle_hva_range(struct kvm *kvm, 754 unsigned long start, 755 unsigned long end, 756 hva_handler_fn handler) 757 { 758 int ret; 759 int retval = 0; 760 struct kvm_memslots *slots; 761 struct kvm_memory_slot *memslot; 762 763 slots = kvm_memslots(kvm); 764 kvm_for_each_memslot(memslot, slots) { 765 unsigned long hva_start, hva_end; 766 gfn_t gfn, gfn_end; 767 768 hva_start = max(start, memslot->userspace_addr); 769 hva_end = min(end, memslot->userspace_addr + 770 (memslot->npages << PAGE_SHIFT)); 771 if (hva_start >= hva_end) 772 continue; 773 /* 774 * {gfn(page) | page intersects with [hva_start, hva_end)} = 775 * {gfn, gfn+1, ..., gfn_end-1}. 776 */ 777 gfn = hva_to_gfn_memslot(hva_start, memslot); 778 gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); 779 780 for (; gfn < gfn_end; ++gfn) { 781 ret = handler(kvm, memslot, gfn); 782 retval |= ret; 783 } 784 } 785 786 return retval; 787 } 788 789 static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, 790 hva_handler_fn handler) 791 { 792 return kvm_handle_hva_range(kvm, hva, hva + 1, handler); 793 } 794 795 /* Must be called with both HPTE and rmap locked */ 796 static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i, 797 struct kvm_memory_slot *memslot, 798 unsigned long *rmapp, unsigned long gfn) 799 { 800 __be64 *hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4)); 801 struct revmap_entry *rev = kvm->arch.hpt.rev; 802 unsigned long j, h; 803 unsigned long ptel, psize, rcbits; 804 805 j = rev[i].forw; 806 if (j == i) { 807 /* chain is now empty */ 808 *rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX); 809 } else { 810 /* remove i from chain */ 811 h = rev[i].back; 812 rev[h].forw = j; 813 rev[j].back = h; 814 rev[i].forw = rev[i].back = i; 815 *rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j; 816 } 817 818 /* Now check and modify the HPTE */ 819 ptel = rev[i].guest_rpte; 820 psize = kvmppc_actual_pgsz(be64_to_cpu(hptep[0]), ptel); 821 if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) && 822 hpte_rpn(ptel, psize) == gfn) { 823 hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); 824 kvmppc_invalidate_hpte(kvm, hptep, i); 825 hptep[1] &= ~cpu_to_be64(HPTE_R_KEY_HI | HPTE_R_KEY_LO); 826 /* Harvest R and C */ 827 rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C); 828 *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT; 829 if ((rcbits & HPTE_R_C) && memslot->dirty_bitmap) 830 kvmppc_update_dirty_map(memslot, gfn, psize); 831 if (rcbits & ~rev[i].guest_rpte) { 832 rev[i].guest_rpte = ptel | rcbits; 833 note_hpte_modification(kvm, &rev[i]); 834 } 835 } 836 } 837 838 static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, 839 unsigned long gfn) 840 { 841 unsigned long i; 842 __be64 *hptep; 843 unsigned long *rmapp; 844 845 rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; 846 for (;;) { 847 lock_rmap(rmapp); 848 if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { 849 unlock_rmap(rmapp); 850 break; 851 } 852 853 /* 854 * To avoid an ABBA 
deadlock with the HPTE lock bit, 855 * we can't spin on the HPTE lock while holding the 856 * rmap chain lock. 857 */ 858 i = *rmapp & KVMPPC_RMAP_INDEX; 859 hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4)); 860 if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { 861 /* unlock rmap before spinning on the HPTE lock */ 862 unlock_rmap(rmapp); 863 while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK) 864 cpu_relax(); 865 continue; 866 } 867 868 kvmppc_unmap_hpte(kvm, i, memslot, rmapp, gfn); 869 unlock_rmap(rmapp); 870 __unlock_hpte(hptep, be64_to_cpu(hptep[0])); 871 } 872 return 0; 873 } 874 875 int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end) 876 { 877 hva_handler_fn handler; 878 879 handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp; 880 kvm_handle_hva_range(kvm, start, end, handler); 881 return 0; 882 } 883 884 void kvmppc_core_flush_memslot_hv(struct kvm *kvm, 885 struct kvm_memory_slot *memslot) 886 { 887 unsigned long gfn; 888 unsigned long n; 889 unsigned long *rmapp; 890 891 gfn = memslot->base_gfn; 892 rmapp = memslot->arch.rmap; 893 if (kvm_is_radix(kvm)) { 894 kvmppc_radix_flush_memslot(kvm, memslot); 895 return; 896 } 897 898 for (n = memslot->npages; n; --n, ++gfn) { 899 /* 900 * Testing the present bit without locking is OK because 901 * the memslot has been marked invalid already, and hence 902 * no new HPTEs referencing this page can be created, 903 * thus the present bit can't go from 0 to 1. 904 */ 905 if (*rmapp & KVMPPC_RMAP_PRESENT) 906 kvm_unmap_rmapp(kvm, memslot, gfn); 907 ++rmapp; 908 } 909 } 910 911 static int kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, 912 unsigned long gfn) 913 { 914 struct revmap_entry *rev = kvm->arch.hpt.rev; 915 unsigned long head, i, j; 916 __be64 *hptep; 917 int ret = 0; 918 unsigned long *rmapp; 919 920 rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; 921 retry: 922 lock_rmap(rmapp); 923 if (*rmapp & KVMPPC_RMAP_REFERENCED) { 924 *rmapp &= ~KVMPPC_RMAP_REFERENCED; 925 ret = 1; 926 } 927 if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { 928 unlock_rmap(rmapp); 929 return ret; 930 } 931 932 i = head = *rmapp & KVMPPC_RMAP_INDEX; 933 do { 934 hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4)); 935 j = rev[i].forw; 936 937 /* If this HPTE isn't referenced, ignore it */ 938 if (!(be64_to_cpu(hptep[1]) & HPTE_R_R)) 939 continue; 940 941 if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { 942 /* unlock rmap before spinning on the HPTE lock */ 943 unlock_rmap(rmapp); 944 while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK) 945 cpu_relax(); 946 goto retry; 947 } 948 949 /* Now check and modify the HPTE */ 950 if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) && 951 (be64_to_cpu(hptep[1]) & HPTE_R_R)) { 952 kvmppc_clear_ref_hpte(kvm, hptep, i); 953 if (!(rev[i].guest_rpte & HPTE_R_R)) { 954 rev[i].guest_rpte |= HPTE_R_R; 955 note_hpte_modification(kvm, &rev[i]); 956 } 957 ret = 1; 958 } 959 __unlock_hpte(hptep, be64_to_cpu(hptep[0])); 960 } while ((i = j) != head); 961 962 unlock_rmap(rmapp); 963 return ret; 964 } 965 966 int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, unsigned long end) 967 { 968 hva_handler_fn handler; 969 970 handler = kvm_is_radix(kvm) ? 
kvm_age_radix : kvm_age_rmapp; 971 return kvm_handle_hva_range(kvm, start, end, handler); 972 } 973 974 static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, 975 unsigned long gfn) 976 { 977 struct revmap_entry *rev = kvm->arch.hpt.rev; 978 unsigned long head, i, j; 979 unsigned long *hp; 980 int ret = 1; 981 unsigned long *rmapp; 982 983 rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; 984 if (*rmapp & KVMPPC_RMAP_REFERENCED) 985 return 1; 986 987 lock_rmap(rmapp); 988 if (*rmapp & KVMPPC_RMAP_REFERENCED) 989 goto out; 990 991 if (*rmapp & KVMPPC_RMAP_PRESENT) { 992 i = head = *rmapp & KVMPPC_RMAP_INDEX; 993 do { 994 hp = (unsigned long *)(kvm->arch.hpt.virt + (i << 4)); 995 j = rev[i].forw; 996 if (be64_to_cpu(hp[1]) & HPTE_R_R) 997 goto out; 998 } while ((i = j) != head); 999 } 1000 ret = 0; 1001 1002 out: 1003 unlock_rmap(rmapp); 1004 return ret; 1005 } 1006 1007 int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva) 1008 { 1009 hva_handler_fn handler; 1010 1011 handler = kvm_is_radix(kvm) ? kvm_test_age_radix : kvm_test_age_rmapp; 1012 return kvm_handle_hva(kvm, hva, handler); 1013 } 1014 1015 void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte) 1016 { 1017 hva_handler_fn handler; 1018 1019 handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp; 1020 kvm_handle_hva(kvm, hva, handler); 1021 } 1022 1023 static int vcpus_running(struct kvm *kvm) 1024 { 1025 return atomic_read(&kvm->arch.vcpus_running) != 0; 1026 } 1027 1028 /* 1029 * Returns the number of system pages that are dirty. 1030 * This can be more than 1 if we find a huge-page HPTE. 1031 */ 1032 static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp) 1033 { 1034 struct revmap_entry *rev = kvm->arch.hpt.rev; 1035 unsigned long head, i, j; 1036 unsigned long n; 1037 unsigned long v, r; 1038 __be64 *hptep; 1039 int npages_dirty = 0; 1040 1041 retry: 1042 lock_rmap(rmapp); 1043 if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { 1044 unlock_rmap(rmapp); 1045 return npages_dirty; 1046 } 1047 1048 i = head = *rmapp & KVMPPC_RMAP_INDEX; 1049 do { 1050 unsigned long hptep1; 1051 hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4)); 1052 j = rev[i].forw; 1053 1054 /* 1055 * Checking the C (changed) bit here is racy since there 1056 * is no guarantee about when the hardware writes it back. 1057 * If the HPTE is not writable then it is stable since the 1058 * page can't be written to, and we would have done a tlbie 1059 * (which forces the hardware to complete any writeback) 1060 * when making the HPTE read-only. 1061 * If vcpus are running then this call is racy anyway 1062 * since the page could get dirtied subsequently, so we 1063 * expect there to be a further call which would pick up 1064 * any delayed C bit writeback. 1065 * Otherwise we need to do the tlbie even if C==0 in 1066 * order to pick up any delayed writeback of C. 
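		 * (Hence the test below: the entry is skipped only when C is
		 * clear and it is either read-only or vcpus are running.)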
1067 */ 1068 hptep1 = be64_to_cpu(hptep[1]); 1069 if (!(hptep1 & HPTE_R_C) && 1070 (!hpte_is_writable(hptep1) || vcpus_running(kvm))) 1071 continue; 1072 1073 if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { 1074 /* unlock rmap before spinning on the HPTE lock */ 1075 unlock_rmap(rmapp); 1076 while (hptep[0] & cpu_to_be64(HPTE_V_HVLOCK)) 1077 cpu_relax(); 1078 goto retry; 1079 } 1080 1081 /* Now check and modify the HPTE */ 1082 if (!(hptep[0] & cpu_to_be64(HPTE_V_VALID))) { 1083 __unlock_hpte(hptep, be64_to_cpu(hptep[0])); 1084 continue; 1085 } 1086 1087 /* need to make it temporarily absent so C is stable */ 1088 hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); 1089 kvmppc_invalidate_hpte(kvm, hptep, i); 1090 v = be64_to_cpu(hptep[0]); 1091 r = be64_to_cpu(hptep[1]); 1092 if (r & HPTE_R_C) { 1093 hptep[1] = cpu_to_be64(r & ~HPTE_R_C); 1094 if (!(rev[i].guest_rpte & HPTE_R_C)) { 1095 rev[i].guest_rpte |= HPTE_R_C; 1096 note_hpte_modification(kvm, &rev[i]); 1097 } 1098 n = kvmppc_actual_pgsz(v, r); 1099 n = (n + PAGE_SIZE - 1) >> PAGE_SHIFT; 1100 if (n > npages_dirty) 1101 npages_dirty = n; 1102 eieio(); 1103 } 1104 v &= ~HPTE_V_ABSENT; 1105 v |= HPTE_V_VALID; 1106 __unlock_hpte(hptep, v); 1107 } while ((i = j) != head); 1108 1109 unlock_rmap(rmapp); 1110 return npages_dirty; 1111 } 1112 1113 void kvmppc_harvest_vpa_dirty(struct kvmppc_vpa *vpa, 1114 struct kvm_memory_slot *memslot, 1115 unsigned long *map) 1116 { 1117 unsigned long gfn; 1118 1119 if (!vpa->dirty || !vpa->pinned_addr) 1120 return; 1121 gfn = vpa->gpa >> PAGE_SHIFT; 1122 if (gfn < memslot->base_gfn || 1123 gfn >= memslot->base_gfn + memslot->npages) 1124 return; 1125 1126 vpa->dirty = false; 1127 if (map) 1128 __set_bit_le(gfn - memslot->base_gfn, map); 1129 } 1130 1131 long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm, 1132 struct kvm_memory_slot *memslot, unsigned long *map) 1133 { 1134 unsigned long i; 1135 unsigned long *rmapp; 1136 1137 preempt_disable(); 1138 rmapp = memslot->arch.rmap; 1139 for (i = 0; i < memslot->npages; ++i) { 1140 int npages = kvm_test_clear_dirty_npages(kvm, rmapp); 1141 /* 1142 * Note that if npages > 0 then i must be a multiple of npages, 1143 * since we always put huge-page HPTEs in the rmap chain 1144 * corresponding to their page base address. 
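		 * (For example, a 16 MB HPTE with 64 kB system pages dirties
		 * 256 bits of the map, starting at a multiple-of-256 index.)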
1145 */ 1146 if (npages) 1147 set_dirty_bits(map, i, npages); 1148 ++rmapp; 1149 } 1150 preempt_enable(); 1151 return 0; 1152 } 1153 1154 void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa, 1155 unsigned long *nb_ret) 1156 { 1157 struct kvm_memory_slot *memslot; 1158 unsigned long gfn = gpa >> PAGE_SHIFT; 1159 struct page *page, *pages[1]; 1160 int npages; 1161 unsigned long hva, offset; 1162 int srcu_idx; 1163 1164 srcu_idx = srcu_read_lock(&kvm->srcu); 1165 memslot = gfn_to_memslot(kvm, gfn); 1166 if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) 1167 goto err; 1168 hva = gfn_to_hva_memslot(memslot, gfn); 1169 npages = get_user_pages_fast(hva, 1, FOLL_WRITE, pages); 1170 if (npages < 1) 1171 goto err; 1172 page = pages[0]; 1173 srcu_read_unlock(&kvm->srcu, srcu_idx); 1174 1175 offset = gpa & (PAGE_SIZE - 1); 1176 if (nb_ret) 1177 *nb_ret = PAGE_SIZE - offset; 1178 return page_address(page) + offset; 1179 1180 err: 1181 srcu_read_unlock(&kvm->srcu, srcu_idx); 1182 return NULL; 1183 } 1184 1185 void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa, 1186 bool dirty) 1187 { 1188 struct page *page = virt_to_page(va); 1189 struct kvm_memory_slot *memslot; 1190 unsigned long gfn; 1191 int srcu_idx; 1192 1193 put_page(page); 1194 1195 if (!dirty) 1196 return; 1197 1198 /* We need to mark this page dirty in the memslot dirty_bitmap, if any */ 1199 gfn = gpa >> PAGE_SHIFT; 1200 srcu_idx = srcu_read_lock(&kvm->srcu); 1201 memslot = gfn_to_memslot(kvm, gfn); 1202 if (memslot && memslot->dirty_bitmap) 1203 set_bit_le(gfn - memslot->base_gfn, memslot->dirty_bitmap); 1204 srcu_read_unlock(&kvm->srcu, srcu_idx); 1205 } 1206 1207 /* 1208 * HPT resizing 1209 */ 1210 static int resize_hpt_allocate(struct kvm_resize_hpt *resize) 1211 { 1212 int rc; 1213 1214 rc = kvmppc_allocate_hpt(&resize->hpt, resize->order); 1215 if (rc < 0) 1216 return rc; 1217 1218 resize_hpt_debug(resize, "resize_hpt_allocate(): HPT @ 0x%lx\n", 1219 resize->hpt.virt); 1220 1221 return 0; 1222 } 1223 1224 static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize, 1225 unsigned long idx) 1226 { 1227 struct kvm *kvm = resize->kvm; 1228 struct kvm_hpt_info *old = &kvm->arch.hpt; 1229 struct kvm_hpt_info *new = &resize->hpt; 1230 unsigned long old_hash_mask = (1ULL << (old->order - 7)) - 1; 1231 unsigned long new_hash_mask = (1ULL << (new->order - 7)) - 1; 1232 __be64 *hptep, *new_hptep; 1233 unsigned long vpte, rpte, guest_rpte; 1234 int ret; 1235 struct revmap_entry *rev; 1236 unsigned long apsize, avpn, pteg, hash; 1237 unsigned long new_idx, new_pteg, replace_vpte; 1238 int pshift; 1239 1240 hptep = (__be64 *)(old->virt + (idx << 4)); 1241 1242 /* Guest is stopped, so new HPTEs can't be added or faulted 1243 * in, only unmapped or altered by host actions. 
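	 * (The vcpus were booted out and mmu_ready cleared by the
	 * resize-commit ioctl before rehashing began.)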
So, it's 1244 * safe to check this before we take the HPTE lock */ 1245 vpte = be64_to_cpu(hptep[0]); 1246 if (!(vpte & HPTE_V_VALID) && !(vpte & HPTE_V_ABSENT)) 1247 return 0; /* nothing to do */ 1248 1249 while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) 1250 cpu_relax(); 1251 1252 vpte = be64_to_cpu(hptep[0]); 1253 1254 ret = 0; 1255 if (!(vpte & HPTE_V_VALID) && !(vpte & HPTE_V_ABSENT)) 1256 /* Nothing to do */ 1257 goto out; 1258 1259 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 1260 rpte = be64_to_cpu(hptep[1]); 1261 vpte = hpte_new_to_old_v(vpte, rpte); 1262 } 1263 1264 /* Unmap */ 1265 rev = &old->rev[idx]; 1266 guest_rpte = rev->guest_rpte; 1267 1268 ret = -EIO; 1269 apsize = kvmppc_actual_pgsz(vpte, guest_rpte); 1270 if (!apsize) 1271 goto out; 1272 1273 if (vpte & HPTE_V_VALID) { 1274 unsigned long gfn = hpte_rpn(guest_rpte, apsize); 1275 int srcu_idx = srcu_read_lock(&kvm->srcu); 1276 struct kvm_memory_slot *memslot = 1277 __gfn_to_memslot(kvm_memslots(kvm), gfn); 1278 1279 if (memslot) { 1280 unsigned long *rmapp; 1281 rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; 1282 1283 lock_rmap(rmapp); 1284 kvmppc_unmap_hpte(kvm, idx, memslot, rmapp, gfn); 1285 unlock_rmap(rmapp); 1286 } 1287 1288 srcu_read_unlock(&kvm->srcu, srcu_idx); 1289 } 1290 1291 /* Reload PTE after unmap */ 1292 vpte = be64_to_cpu(hptep[0]); 1293 BUG_ON(vpte & HPTE_V_VALID); 1294 BUG_ON(!(vpte & HPTE_V_ABSENT)); 1295 1296 ret = 0; 1297 if (!(vpte & HPTE_V_BOLTED)) 1298 goto out; 1299 1300 rpte = be64_to_cpu(hptep[1]); 1301 1302 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 1303 vpte = hpte_new_to_old_v(vpte, rpte); 1304 rpte = hpte_new_to_old_r(rpte); 1305 } 1306 1307 pshift = kvmppc_hpte_base_page_shift(vpte, rpte); 1308 avpn = HPTE_V_AVPN_VAL(vpte) & ~(((1ul << pshift) - 1) >> 23); 1309 pteg = idx / HPTES_PER_GROUP; 1310 if (vpte & HPTE_V_SECONDARY) 1311 pteg = ~pteg; 1312 1313 if (!(vpte & HPTE_V_1TB_SEG)) { 1314 unsigned long offset, vsid; 1315 1316 /* We only have 28 - 23 bits of offset in avpn */ 1317 offset = (avpn & 0x1f) << 23; 1318 vsid = avpn >> 5; 1319 /* We can find more bits from the pteg value */ 1320 if (pshift < 23) 1321 offset |= ((vsid ^ pteg) & old_hash_mask) << pshift; 1322 1323 hash = vsid ^ (offset >> pshift); 1324 } else { 1325 unsigned long offset, vsid; 1326 1327 /* We only have 40 - 23 bits of seg_off in avpn */ 1328 offset = (avpn & 0x1ffff) << 23; 1329 vsid = avpn >> 17; 1330 if (pshift < 23) 1331 offset |= ((vsid ^ (vsid << 25) ^ pteg) & old_hash_mask) << pshift; 1332 1333 hash = vsid ^ (vsid << 25) ^ (offset >> pshift); 1334 } 1335 1336 new_pteg = hash & new_hash_mask; 1337 if (vpte & HPTE_V_SECONDARY) 1338 new_pteg = ~hash & new_hash_mask; 1339 1340 new_idx = new_pteg * HPTES_PER_GROUP + (idx % HPTES_PER_GROUP); 1341 new_hptep = (__be64 *)(new->virt + (new_idx << 4)); 1342 1343 replace_vpte = be64_to_cpu(new_hptep[0]); 1344 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 1345 unsigned long replace_rpte = be64_to_cpu(new_hptep[1]); 1346 replace_vpte = hpte_new_to_old_v(replace_vpte, replace_rpte); 1347 } 1348 1349 if (replace_vpte & (HPTE_V_VALID | HPTE_V_ABSENT)) { 1350 BUG_ON(new->order >= old->order); 1351 1352 if (replace_vpte & HPTE_V_BOLTED) { 1353 if (vpte & HPTE_V_BOLTED) 1354 /* Bolted collision, nothing we can do */ 1355 ret = -ENOSPC; 1356 /* Discard the new HPTE */ 1357 goto out; 1358 } 1359 1360 /* Discard the previous HPTE */ 1361 } 1362 1363 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 1364 rpte = hpte_old_to_new_r(vpte, rpte); 1365 vpte = hpte_old_to_new_v(vpte); 1366 } 1367 1368 
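	/* Install the rehashed entry in its slot in the new HPT */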
new_hptep[1] = cpu_to_be64(rpte); 1369 new->rev[new_idx].guest_rpte = guest_rpte; 1370 /* No need for a barrier, since new HPT isn't active */ 1371 new_hptep[0] = cpu_to_be64(vpte); 1372 unlock_hpte(new_hptep, vpte); 1373 1374 out: 1375 unlock_hpte(hptep, vpte); 1376 return ret; 1377 } 1378 1379 static int resize_hpt_rehash(struct kvm_resize_hpt *resize) 1380 { 1381 struct kvm *kvm = resize->kvm; 1382 unsigned long i; 1383 int rc; 1384 1385 for (i = 0; i < kvmppc_hpt_npte(&kvm->arch.hpt); i++) { 1386 rc = resize_hpt_rehash_hpte(resize, i); 1387 if (rc != 0) 1388 return rc; 1389 } 1390 1391 return 0; 1392 } 1393 1394 static void resize_hpt_pivot(struct kvm_resize_hpt *resize) 1395 { 1396 struct kvm *kvm = resize->kvm; 1397 struct kvm_hpt_info hpt_tmp; 1398 1399 /* Exchange the pending tables in the resize structure with 1400 * the active tables */ 1401 1402 resize_hpt_debug(resize, "resize_hpt_pivot()\n"); 1403 1404 spin_lock(&kvm->mmu_lock); 1405 asm volatile("ptesync" : : : "memory"); 1406 1407 hpt_tmp = kvm->arch.hpt; 1408 kvmppc_set_hpt(kvm, &resize->hpt); 1409 resize->hpt = hpt_tmp; 1410 1411 spin_unlock(&kvm->mmu_lock); 1412 1413 synchronize_srcu_expedited(&kvm->srcu); 1414 1415 if (cpu_has_feature(CPU_FTR_ARCH_300)) 1416 kvmppc_setup_partition_table(kvm); 1417 1418 resize_hpt_debug(resize, "resize_hpt_pivot() done\n"); 1419 } 1420 1421 static void resize_hpt_release(struct kvm *kvm, struct kvm_resize_hpt *resize) 1422 { 1423 if (WARN_ON(!mutex_is_locked(&kvm->arch.mmu_setup_lock))) 1424 return; 1425 1426 if (!resize) 1427 return; 1428 1429 if (resize->error != -EBUSY) { 1430 if (resize->hpt.virt) 1431 kvmppc_free_hpt(&resize->hpt); 1432 kfree(resize); 1433 } 1434 1435 if (kvm->arch.resize_hpt == resize) 1436 kvm->arch.resize_hpt = NULL; 1437 } 1438 1439 static void resize_hpt_prepare_work(struct work_struct *work) 1440 { 1441 struct kvm_resize_hpt *resize = container_of(work, 1442 struct kvm_resize_hpt, 1443 work); 1444 struct kvm *kvm = resize->kvm; 1445 int err = 0; 1446 1447 if (WARN_ON(resize->error != -EBUSY)) 1448 return; 1449 1450 mutex_lock(&kvm->arch.mmu_setup_lock); 1451 1452 /* Request is still current? */ 1453 if (kvm->arch.resize_hpt == resize) { 1454 /* We may request large allocations here: 1455 * do not sleep with kvm->arch.mmu_setup_lock held for a while. 1456 */ 1457 mutex_unlock(&kvm->arch.mmu_setup_lock); 1458 1459 resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n", 1460 resize->order); 1461 1462 err = resize_hpt_allocate(resize); 1463 1464 /* We have strict assumption about -EBUSY 1465 * when preparing for HPT resize. 1466 */ 1467 if (WARN_ON(err == -EBUSY)) 1468 err = -EINPROGRESS; 1469 1470 mutex_lock(&kvm->arch.mmu_setup_lock); 1471 /* It is possible that kvm->arch.resize_hpt != resize 1472 * after we grab kvm->arch.mmu_setup_lock again. 
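		 * If so, this request has been superseded and
		 * resize_hpt_release() below will free it.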
1473 */ 1474 } 1475 1476 resize->error = err; 1477 1478 if (kvm->arch.resize_hpt != resize) 1479 resize_hpt_release(kvm, resize); 1480 1481 mutex_unlock(&kvm->arch.mmu_setup_lock); 1482 } 1483 1484 long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm, 1485 struct kvm_ppc_resize_hpt *rhpt) 1486 { 1487 unsigned long flags = rhpt->flags; 1488 unsigned long shift = rhpt->shift; 1489 struct kvm_resize_hpt *resize; 1490 int ret; 1491 1492 if (flags != 0 || kvm_is_radix(kvm)) 1493 return -EINVAL; 1494 1495 if (shift && ((shift < 18) || (shift > 46))) 1496 return -EINVAL; 1497 1498 mutex_lock(&kvm->arch.mmu_setup_lock); 1499 1500 resize = kvm->arch.resize_hpt; 1501 1502 if (resize) { 1503 if (resize->order == shift) { 1504 /* Suitable resize in progress? */ 1505 ret = resize->error; 1506 if (ret == -EBUSY) 1507 ret = 100; /* estimated time in ms */ 1508 else if (ret) 1509 resize_hpt_release(kvm, resize); 1510 1511 goto out; 1512 } 1513 1514 /* not suitable, cancel it */ 1515 resize_hpt_release(kvm, resize); 1516 } 1517 1518 ret = 0; 1519 if (!shift) 1520 goto out; /* nothing to do */ 1521 1522 /* start new resize */ 1523 1524 resize = kzalloc(sizeof(*resize), GFP_KERNEL); 1525 if (!resize) { 1526 ret = -ENOMEM; 1527 goto out; 1528 } 1529 1530 resize->error = -EBUSY; 1531 resize->order = shift; 1532 resize->kvm = kvm; 1533 INIT_WORK(&resize->work, resize_hpt_prepare_work); 1534 kvm->arch.resize_hpt = resize; 1535 1536 schedule_work(&resize->work); 1537 1538 ret = 100; /* estimated time in ms */ 1539 1540 out: 1541 mutex_unlock(&kvm->arch.mmu_setup_lock); 1542 return ret; 1543 } 1544 1545 static void resize_hpt_boot_vcpu(void *opaque) 1546 { 1547 /* Nothing to do, just force a KVM exit */ 1548 } 1549 1550 long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm, 1551 struct kvm_ppc_resize_hpt *rhpt) 1552 { 1553 unsigned long flags = rhpt->flags; 1554 unsigned long shift = rhpt->shift; 1555 struct kvm_resize_hpt *resize; 1556 long ret; 1557 1558 if (flags != 0 || kvm_is_radix(kvm)) 1559 return -EINVAL; 1560 1561 if (shift && ((shift < 18) || (shift > 46))) 1562 return -EINVAL; 1563 1564 mutex_lock(&kvm->arch.mmu_setup_lock); 1565 1566 resize = kvm->arch.resize_hpt; 1567 1568 /* This shouldn't be possible */ 1569 ret = -EIO; 1570 if (WARN_ON(!kvm->arch.mmu_ready)) 1571 goto out_no_hpt; 1572 1573 /* Stop VCPUs from running while we mess with the HPT */ 1574 kvm->arch.mmu_ready = 0; 1575 smp_mb(); 1576 1577 /* Boot all CPUs out of the guest so they re-read 1578 * mmu_ready */ 1579 on_each_cpu(resize_hpt_boot_vcpu, NULL, 1); 1580 1581 ret = -ENXIO; 1582 if (!resize || (resize->order != shift)) 1583 goto out; 1584 1585 ret = resize->error; 1586 if (ret) 1587 goto out; 1588 1589 ret = resize_hpt_rehash(resize); 1590 if (ret) 1591 goto out; 1592 1593 resize_hpt_pivot(resize); 1594 1595 out: 1596 /* Let VCPUs run again */ 1597 kvm->arch.mmu_ready = 1; 1598 smp_mb(); 1599 out_no_hpt: 1600 resize_hpt_release(kvm, resize); 1601 mutex_unlock(&kvm->arch.mmu_setup_lock); 1602 return ret; 1603 } 1604 1605 /* 1606 * Functions for reading and writing the hash table via reads and 1607 * writes on a file descriptor. 1608 * 1609 * Reads return the guest view of the hash table, which has to be 1610 * pieced together from the real hash table and the guest_rpte 1611 * values in the revmap array. 1612 * 1613 * On writes, each HPTE written is considered in turn, and if it 1614 * is valid, it is written to the HPT as if an H_ENTER with the 1615 * exact flag set was done. 
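 * (Each chunk of the stream is a struct kvm_get_htab_header giving an
 * index plus n_valid and n_invalid counts, followed by n_valid pairs
 * of HPTE doublewords.)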
When the invalid count is non-zero 1616 * in the header written to the stream, the kernel will make 1617 * sure that that many HPTEs are invalid, and invalidate them 1618 * if not. 1619 */ 1620 1621 struct kvm_htab_ctx { 1622 unsigned long index; 1623 unsigned long flags; 1624 struct kvm *kvm; 1625 int first_pass; 1626 }; 1627 1628 #define HPTE_SIZE (2 * sizeof(unsigned long)) 1629 1630 /* 1631 * Returns 1 if this HPT entry has been modified or has pending 1632 * R/C bit changes. 1633 */ 1634 static int hpte_dirty(struct revmap_entry *revp, __be64 *hptp) 1635 { 1636 unsigned long rcbits_unset; 1637 1638 if (revp->guest_rpte & HPTE_GR_MODIFIED) 1639 return 1; 1640 1641 /* Also need to consider changes in reference and changed bits */ 1642 rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C); 1643 if ((be64_to_cpu(hptp[0]) & HPTE_V_VALID) && 1644 (be64_to_cpu(hptp[1]) & rcbits_unset)) 1645 return 1; 1646 1647 return 0; 1648 } 1649 1650 static long record_hpte(unsigned long flags, __be64 *hptp, 1651 unsigned long *hpte, struct revmap_entry *revp, 1652 int want_valid, int first_pass) 1653 { 1654 unsigned long v, r, hr; 1655 unsigned long rcbits_unset; 1656 int ok = 1; 1657 int valid, dirty; 1658 1659 /* Unmodified entries are uninteresting except on the first pass */ 1660 dirty = hpte_dirty(revp, hptp); 1661 if (!first_pass && !dirty) 1662 return 0; 1663 1664 valid = 0; 1665 if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) { 1666 valid = 1; 1667 if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && 1668 !(be64_to_cpu(hptp[0]) & HPTE_V_BOLTED)) 1669 valid = 0; 1670 } 1671 if (valid != want_valid) 1672 return 0; 1673 1674 v = r = 0; 1675 if (valid || dirty) { 1676 /* lock the HPTE so it's stable and read it */ 1677 preempt_disable(); 1678 while (!try_lock_hpte(hptp, HPTE_V_HVLOCK)) 1679 cpu_relax(); 1680 v = be64_to_cpu(hptp[0]); 1681 hr = be64_to_cpu(hptp[1]); 1682 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 1683 v = hpte_new_to_old_v(v, hr); 1684 hr = hpte_new_to_old_r(hr); 1685 } 1686 1687 /* re-evaluate valid and dirty from synchronized HPTE value */ 1688 valid = !!(v & HPTE_V_VALID); 1689 dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED); 1690 1691 /* Harvest R and C into guest view if necessary */ 1692 rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C); 1693 if (valid && (rcbits_unset & hr)) { 1694 revp->guest_rpte |= (hr & 1695 (HPTE_R_R | HPTE_R_C)) | HPTE_GR_MODIFIED; 1696 dirty = 1; 1697 } 1698 1699 if (v & HPTE_V_ABSENT) { 1700 v &= ~HPTE_V_ABSENT; 1701 v |= HPTE_V_VALID; 1702 valid = 1; 1703 } 1704 if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && !(v & HPTE_V_BOLTED)) 1705 valid = 0; 1706 1707 r = revp->guest_rpte; 1708 /* only clear modified if this is the right sort of entry */ 1709 if (valid == want_valid && dirty) { 1710 r &= ~HPTE_GR_MODIFIED; 1711 revp->guest_rpte = r; 1712 } 1713 unlock_hpte(hptp, be64_to_cpu(hptp[0])); 1714 preempt_enable(); 1715 if (!(valid == want_valid && (first_pass || dirty))) 1716 ok = 0; 1717 } 1718 hpte[0] = cpu_to_be64(v); 1719 hpte[1] = cpu_to_be64(r); 1720 return ok; 1721 } 1722 1723 static ssize_t kvm_htab_read(struct file *file, char __user *buf, 1724 size_t count, loff_t *ppos) 1725 { 1726 struct kvm_htab_ctx *ctx = file->private_data; 1727 struct kvm *kvm = ctx->kvm; 1728 struct kvm_get_htab_header hdr; 1729 __be64 *hptp; 1730 struct revmap_entry *revp; 1731 unsigned long i, nb, nw; 1732 unsigned long __user *lbuf; 1733 struct kvm_get_htab_header __user *hptr; 1734 unsigned long flags; 1735 int first_pass; 1736 unsigned long hpte[2]; 1737 1738 if 
(!access_ok(buf, count)) 1739 return -EFAULT; 1740 if (kvm_is_radix(kvm)) 1741 return 0; 1742 1743 first_pass = ctx->first_pass; 1744 flags = ctx->flags; 1745 1746 i = ctx->index; 1747 hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE)); 1748 revp = kvm->arch.hpt.rev + i; 1749 lbuf = (unsigned long __user *)buf; 1750 1751 nb = 0; 1752 while (nb + sizeof(hdr) + HPTE_SIZE < count) { 1753 /* Initialize header */ 1754 hptr = (struct kvm_get_htab_header __user *)buf; 1755 hdr.n_valid = 0; 1756 hdr.n_invalid = 0; 1757 nw = nb; 1758 nb += sizeof(hdr); 1759 lbuf = (unsigned long __user *)(buf + sizeof(hdr)); 1760 1761 /* Skip uninteresting entries, i.e. clean on not-first pass */ 1762 if (!first_pass) { 1763 while (i < kvmppc_hpt_npte(&kvm->arch.hpt) && 1764 !hpte_dirty(revp, hptp)) { 1765 ++i; 1766 hptp += 2; 1767 ++revp; 1768 } 1769 } 1770 hdr.index = i; 1771 1772 /* Grab a series of valid entries */ 1773 while (i < kvmppc_hpt_npte(&kvm->arch.hpt) && 1774 hdr.n_valid < 0xffff && 1775 nb + HPTE_SIZE < count && 1776 record_hpte(flags, hptp, hpte, revp, 1, first_pass)) { 1777 /* valid entry, write it out */ 1778 ++hdr.n_valid; 1779 if (__put_user(hpte[0], lbuf) || 1780 __put_user(hpte[1], lbuf + 1)) 1781 return -EFAULT; 1782 nb += HPTE_SIZE; 1783 lbuf += 2; 1784 ++i; 1785 hptp += 2; 1786 ++revp; 1787 } 1788 /* Now skip invalid entries while we can */ 1789 while (i < kvmppc_hpt_npte(&kvm->arch.hpt) && 1790 hdr.n_invalid < 0xffff && 1791 record_hpte(flags, hptp, hpte, revp, 0, first_pass)) { 1792 /* found an invalid entry */ 1793 ++hdr.n_invalid; 1794 ++i; 1795 hptp += 2; 1796 ++revp; 1797 } 1798 1799 if (hdr.n_valid || hdr.n_invalid) { 1800 /* write back the header */ 1801 if (__copy_to_user(hptr, &hdr, sizeof(hdr))) 1802 return -EFAULT; 1803 nw = nb; 1804 buf = (char __user *)lbuf; 1805 } else { 1806 nb = nw; 1807 } 1808 1809 /* Check if we've wrapped around the hash table */ 1810 if (i >= kvmppc_hpt_npte(&kvm->arch.hpt)) { 1811 i = 0; 1812 ctx->first_pass = 0; 1813 break; 1814 } 1815 } 1816 1817 ctx->index = i; 1818 1819 return nb; 1820 } 1821 1822 static ssize_t kvm_htab_write(struct file *file, const char __user *buf, 1823 size_t count, loff_t *ppos) 1824 { 1825 struct kvm_htab_ctx *ctx = file->private_data; 1826 struct kvm *kvm = ctx->kvm; 1827 struct kvm_get_htab_header hdr; 1828 unsigned long i, j; 1829 unsigned long v, r; 1830 unsigned long __user *lbuf; 1831 __be64 *hptp; 1832 unsigned long tmp[2]; 1833 ssize_t nb; 1834 long int err, ret; 1835 int mmu_ready; 1836 int pshift; 1837 1838 if (!access_ok(buf, count)) 1839 return -EFAULT; 1840 if (kvm_is_radix(kvm)) 1841 return -EINVAL; 1842 1843 /* lock out vcpus from running while we're doing this */ 1844 mutex_lock(&kvm->arch.mmu_setup_lock); 1845 mmu_ready = kvm->arch.mmu_ready; 1846 if (mmu_ready) { 1847 kvm->arch.mmu_ready = 0; /* temporarily */ 1848 /* order mmu_ready vs. 
vcpus_running */ 1849 smp_mb(); 1850 if (atomic_read(&kvm->arch.vcpus_running)) { 1851 kvm->arch.mmu_ready = 1; 1852 mutex_unlock(&kvm->arch.mmu_setup_lock); 1853 return -EBUSY; 1854 } 1855 } 1856 1857 err = 0; 1858 for (nb = 0; nb + sizeof(hdr) <= count; ) { 1859 err = -EFAULT; 1860 if (__copy_from_user(&hdr, buf, sizeof(hdr))) 1861 break; 1862 1863 err = 0; 1864 if (nb + hdr.n_valid * HPTE_SIZE > count) 1865 break; 1866 1867 nb += sizeof(hdr); 1868 buf += sizeof(hdr); 1869 1870 err = -EINVAL; 1871 i = hdr.index; 1872 if (i >= kvmppc_hpt_npte(&kvm->arch.hpt) || 1873 i + hdr.n_valid + hdr.n_invalid > kvmppc_hpt_npte(&kvm->arch.hpt)) 1874 break; 1875 1876 hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE)); 1877 lbuf = (unsigned long __user *)buf; 1878 for (j = 0; j < hdr.n_valid; ++j) { 1879 __be64 hpte_v; 1880 __be64 hpte_r; 1881 1882 err = -EFAULT; 1883 if (__get_user(hpte_v, lbuf) || 1884 __get_user(hpte_r, lbuf + 1)) 1885 goto out; 1886 v = be64_to_cpu(hpte_v); 1887 r = be64_to_cpu(hpte_r); 1888 err = -EINVAL; 1889 if (!(v & HPTE_V_VALID)) 1890 goto out; 1891 pshift = kvmppc_hpte_base_page_shift(v, r); 1892 if (pshift <= 0) 1893 goto out; 1894 lbuf += 2; 1895 nb += HPTE_SIZE; 1896 1897 if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) 1898 kvmppc_do_h_remove(kvm, 0, i, 0, tmp); 1899 err = -EIO; 1900 ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, i, v, r, 1901 tmp); 1902 if (ret != H_SUCCESS) { 1903 pr_err("kvm_htab_write ret %ld i=%ld v=%lx " 1904 "r=%lx\n", ret, i, v, r); 1905 goto out; 1906 } 1907 if (!mmu_ready && is_vrma_hpte(v)) { 1908 unsigned long senc, lpcr; 1909 1910 senc = slb_pgsize_encoding(1ul << pshift); 1911 kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T | 1912 (VRMA_VSID << SLB_VSID_SHIFT_1T); 1913 if (!cpu_has_feature(CPU_FTR_ARCH_300)) { 1914 lpcr = senc << (LPCR_VRMASD_SH - 4); 1915 kvmppc_update_lpcr(kvm, lpcr, 1916 LPCR_VRMASD); 1917 } else { 1918 kvmppc_setup_partition_table(kvm); 1919 } 1920 mmu_ready = 1; 1921 } 1922 ++i; 1923 hptp += 2; 1924 } 1925 1926 for (j = 0; j < hdr.n_invalid; ++j) { 1927 if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) 1928 kvmppc_do_h_remove(kvm, 0, i, 0, tmp); 1929 ++i; 1930 hptp += 2; 1931 } 1932 err = 0; 1933 } 1934 1935 out: 1936 /* Order HPTE updates vs. mmu_ready */ 1937 smp_wmb(); 1938 kvm->arch.mmu_ready = mmu_ready; 1939 mutex_unlock(&kvm->arch.mmu_setup_lock); 1940 1941 if (err) 1942 return err; 1943 return nb; 1944 } 1945 1946 static int kvm_htab_release(struct inode *inode, struct file *filp) 1947 { 1948 struct kvm_htab_ctx *ctx = filp->private_data; 1949 1950 filp->private_data = NULL; 1951 if (!(ctx->flags & KVM_GET_HTAB_WRITE)) 1952 atomic_dec(&ctx->kvm->arch.hpte_mod_interest); 1953 kvm_put_kvm(ctx->kvm); 1954 kfree(ctx); 1955 return 0; 1956 } 1957 1958 static const struct file_operations kvm_htab_fops = { 1959 .read = kvm_htab_read, 1960 .write = kvm_htab_write, 1961 .llseek = default_llseek, 1962 .release = kvm_htab_release, 1963 }; 1964 1965 int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf) 1966 { 1967 int ret; 1968 struct kvm_htab_ctx *ctx; 1969 int rwflag; 1970 1971 /* reject flags we don't recognize */ 1972 if (ghf->flags & ~(KVM_GET_HTAB_BOLTED_ONLY | KVM_GET_HTAB_WRITE)) 1973 return -EINVAL; 1974 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); 1975 if (!ctx) 1976 return -ENOMEM; 1977 kvm_get_kvm(kvm); 1978 ctx->kvm = kvm; 1979 ctx->index = ghf->start_index; 1980 ctx->flags = ghf->flags; 1981 ctx->first_pass = 1; 1982 1983 rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? 
O_WRONLY : O_RDONLY; 1984 ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag | O_CLOEXEC); 1985 if (ret < 0) { 1986 kfree(ctx); 1987 kvm_put_kvm_no_destroy(kvm); 1988 return ret; 1989 } 1990 1991 if (rwflag == O_RDONLY) { 1992 mutex_lock(&kvm->slots_lock); 1993 atomic_inc(&kvm->arch.hpte_mod_interest); 1994 /* make sure kvmppc_do_h_enter etc. see the increment */ 1995 synchronize_srcu_expedited(&kvm->srcu); 1996 mutex_unlock(&kvm->slots_lock); 1997 } 1998 1999 return ret; 2000 } 2001 2002 struct debugfs_htab_state { 2003 struct kvm *kvm; 2004 struct mutex mutex; 2005 unsigned long hpt_index; 2006 int chars_left; 2007 int buf_index; 2008 char buf[64]; 2009 }; 2010 2011 static int debugfs_htab_open(struct inode *inode, struct file *file) 2012 { 2013 struct kvm *kvm = inode->i_private; 2014 struct debugfs_htab_state *p; 2015 2016 p = kzalloc(sizeof(*p), GFP_KERNEL); 2017 if (!p) 2018 return -ENOMEM; 2019 2020 kvm_get_kvm(kvm); 2021 p->kvm = kvm; 2022 mutex_init(&p->mutex); 2023 file->private_data = p; 2024 2025 return nonseekable_open(inode, file); 2026 } 2027 2028 static int debugfs_htab_release(struct inode *inode, struct file *file) 2029 { 2030 struct debugfs_htab_state *p = file->private_data; 2031 2032 kvm_put_kvm(p->kvm); 2033 kfree(p); 2034 return 0; 2035 } 2036 2037 static ssize_t debugfs_htab_read(struct file *file, char __user *buf, 2038 size_t len, loff_t *ppos) 2039 { 2040 struct debugfs_htab_state *p = file->private_data; 2041 ssize_t ret, r; 2042 unsigned long i, n; 2043 unsigned long v, hr, gr; 2044 struct kvm *kvm; 2045 __be64 *hptp; 2046 2047 kvm = p->kvm; 2048 if (kvm_is_radix(kvm)) 2049 return 0; 2050 2051 ret = mutex_lock_interruptible(&p->mutex); 2052 if (ret) 2053 return ret; 2054 2055 if (p->chars_left) { 2056 n = p->chars_left; 2057 if (n > len) 2058 n = len; 2059 r = copy_to_user(buf, p->buf + p->buf_index, n); 2060 n -= r; 2061 p->chars_left -= n; 2062 p->buf_index += n; 2063 buf += n; 2064 len -= n; 2065 ret = n; 2066 if (r) { 2067 if (!n) 2068 ret = -EFAULT; 2069 goto out; 2070 } 2071 } 2072 2073 i = p->hpt_index; 2074 hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE)); 2075 for (; len != 0 && i < kvmppc_hpt_npte(&kvm->arch.hpt); 2076 ++i, hptp += 2) { 2077 if (!(be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT))) 2078 continue; 2079 2080 /* lock the HPTE so it's stable and read it */ 2081 preempt_disable(); 2082 while (!try_lock_hpte(hptp, HPTE_V_HVLOCK)) 2083 cpu_relax(); 2084 v = be64_to_cpu(hptp[0]) & ~HPTE_V_HVLOCK; 2085 hr = be64_to_cpu(hptp[1]); 2086 gr = kvm->arch.hpt.rev[i].guest_rpte; 2087 unlock_hpte(hptp, v); 2088 preempt_enable(); 2089 2090 if (!(v & (HPTE_V_VALID | HPTE_V_ABSENT))) 2091 continue; 2092 2093 n = scnprintf(p->buf, sizeof(p->buf), 2094 "%6lx %.16lx %.16lx %.16lx\n", 2095 i, v, hr, gr); 2096 p->chars_left = n; 2097 if (n > len) 2098 n = len; 2099 r = copy_to_user(buf, p->buf, n); 2100 n -= r; 2101 p->chars_left -= n; 2102 p->buf_index = n; 2103 buf += n; 2104 len -= n; 2105 ret += n; 2106 if (r) { 2107 if (!ret) 2108 ret = -EFAULT; 2109 goto out; 2110 } 2111 } 2112 p->hpt_index = i; 2113 2114 out: 2115 mutex_unlock(&p->mutex); 2116 return ret; 2117 } 2118 2119 static ssize_t debugfs_htab_write(struct file *file, const char __user *buf, 2120 size_t len, loff_t *ppos) 2121 { 2122 return -EACCES; 2123 } 2124 2125 static const struct file_operations debugfs_htab_fops = { 2126 .owner = THIS_MODULE, 2127 .open = debugfs_htab_open, 2128 .release = debugfs_htab_release, 2129 .read = debugfs_htab_read, 2130 .write = 
debugfs_htab_write,
	.llseek	 = generic_file_llseek,
};

void kvmppc_mmu_debugfs_init(struct kvm *kvm)
{
	debugfs_create_file("htab", 0400, kvm->arch.debugfs_dir, kvm,
			    &debugfs_htab_fops);
}

void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
{
	struct kvmppc_mmu *mmu = &vcpu->arch.mmu;

	vcpu->arch.slb_nr = 32;		/* POWER7/POWER8 */

	mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;

	vcpu->arch.hflags |= BOOK3S_HFLAG_SLB;
}