// SPDX-License-Identifier: GPL-2.0-only
/*
 *
 * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 */

#include <linux/types.h>
#include <linux/string.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/highmem.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
#include <linux/srcu.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/debugfs.h>

#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include <asm/book3s/64/mmu-hash.h>
#include <asm/hvcall.h>
#include <asm/synch.h>
#include <asm/ppc-opcode.h>
#include <asm/cputable.h>
#include <asm/pte-walk.h>

#include "book3s.h"
#include "trace_hv.h"

//#define DEBUG_RESIZE_HPT	1

#ifdef DEBUG_RESIZE_HPT
#define resize_hpt_debug(resize, ...)				\
	do {							\
		printk(KERN_DEBUG "RESIZE HPT %p: ", resize);	\
		printk(__VA_ARGS__);				\
	} while (0)
#else
#define resize_hpt_debug(resize, ...)				\
	do { } while (0)
#endif

static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
				long pte_index, unsigned long pteh,
				unsigned long ptel, unsigned long *pte_idx_ret);

struct kvm_resize_hpt {
	/* These fields read-only after init */
	struct kvm *kvm;
	struct work_struct work;
	u32 order;

	/* These fields protected by kvm->arch.mmu_setup_lock */

	/* Possible values and their usage:
	 *  <0     an error occurred during allocation,
	 *  -EBUSY allocation is in progress,
	 *  0      allocation made successfully.
	 */
	int error;

	/* Private to the work thread, until error != -EBUSY,
	 * then protected by kvm->arch.mmu_setup_lock.
	 */
	struct kvm_hpt_info hpt;
};

int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order)
{
	unsigned long hpt = 0;
	int cma = 0;
	struct page *page = NULL;
	struct revmap_entry *rev;
	unsigned long npte;

	if ((order < PPC_MIN_HPT_ORDER) || (order > PPC_MAX_HPT_ORDER))
		return -EINVAL;

	page = kvm_alloc_hpt_cma(1ul << (order - PAGE_SHIFT));
	if (page) {
		hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
		memset((void *)hpt, 0, (1ul << order));
		cma = 1;
	}

	if (!hpt)
		hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_RETRY_MAYFAIL
				       |__GFP_NOWARN, order - PAGE_SHIFT);

	if (!hpt)
		return -ENOMEM;

	/* HPTEs are 2**4 bytes long */
	npte = 1ul << (order - 4);

	/* Allocate reverse map array */
	rev = vmalloc(array_size(npte, sizeof(struct revmap_entry)));
	if (!rev) {
		if (cma)
			kvm_free_hpt_cma(page, 1 << (order - PAGE_SHIFT));
		else
			free_pages(hpt, order - PAGE_SHIFT);
		return -ENOMEM;
	}

	info->order = order;
	info->virt = hpt;
	info->cma = cma;
	info->rev = rev;

	return 0;
}

void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info)
{
	atomic64_set(&kvm->arch.mmio_update, 0);
	kvm->arch.hpt = *info;
	kvm->arch.sdr1 = __pa(info->virt) | (info->order - 18);

	pr_debug("KVM guest htab at %lx (order %ld), LPID %x\n",
		 info->virt, (long)info->order, kvm->arch.lpid);
}

long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order)
{
	long err = -EBUSY;
	struct kvm_hpt_info info;

	mutex_lock(&kvm->arch.mmu_setup_lock);
	if (kvm->arch.mmu_ready) {
		kvm->arch.mmu_ready = 0;
		/* order mmu_ready vs.
vcpus_running */ 136 smp_mb(); 137 if (atomic_read(&kvm->arch.vcpus_running)) { 138 kvm->arch.mmu_ready = 1; 139 goto out; 140 } 141 } 142 if (kvm_is_radix(kvm)) { 143 err = kvmppc_switch_mmu_to_hpt(kvm); 144 if (err) 145 goto out; 146 } 147 148 if (kvm->arch.hpt.order == order) { 149 /* We already have a suitable HPT */ 150 151 /* Set the entire HPT to 0, i.e. invalid HPTEs */ 152 memset((void *)kvm->arch.hpt.virt, 0, 1ul << order); 153 /* 154 * Reset all the reverse-mapping chains for all memslots 155 */ 156 kvmppc_rmap_reset(kvm); 157 err = 0; 158 goto out; 159 } 160 161 if (kvm->arch.hpt.virt) { 162 kvmppc_free_hpt(&kvm->arch.hpt); 163 kvmppc_rmap_reset(kvm); 164 } 165 166 err = kvmppc_allocate_hpt(&info, order); 167 if (err < 0) 168 goto out; 169 kvmppc_set_hpt(kvm, &info); 170 171 out: 172 if (err == 0) 173 /* Ensure that each vcpu will flush its TLB on next entry. */ 174 cpumask_setall(&kvm->arch.need_tlb_flush); 175 176 mutex_unlock(&kvm->arch.mmu_setup_lock); 177 return err; 178 } 179 180 void kvmppc_free_hpt(struct kvm_hpt_info *info) 181 { 182 vfree(info->rev); 183 info->rev = NULL; 184 if (info->cma) 185 kvm_free_hpt_cma(virt_to_page(info->virt), 186 1 << (info->order - PAGE_SHIFT)); 187 else if (info->virt) 188 free_pages(info->virt, info->order - PAGE_SHIFT); 189 info->virt = 0; 190 info->order = 0; 191 } 192 193 /* Bits in first HPTE dword for pagesize 4k, 64k or 16M */ 194 static inline unsigned long hpte0_pgsize_encoding(unsigned long pgsize) 195 { 196 return (pgsize > 0x1000) ? HPTE_V_LARGE : 0; 197 } 198 199 /* Bits in second HPTE dword for pagesize 4k, 64k or 16M */ 200 static inline unsigned long hpte1_pgsize_encoding(unsigned long pgsize) 201 { 202 return (pgsize == 0x10000) ? 0x1000 : 0; 203 } 204 205 void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, 206 unsigned long porder) 207 { 208 unsigned long i; 209 unsigned long npages; 210 unsigned long hp_v, hp_r; 211 unsigned long addr, hash; 212 unsigned long psize; 213 unsigned long hp0, hp1; 214 unsigned long idx_ret; 215 long ret; 216 struct kvm *kvm = vcpu->kvm; 217 218 psize = 1ul << porder; 219 npages = memslot->npages >> (porder - PAGE_SHIFT); 220 221 /* VRMA can't be > 1TB */ 222 if (npages > 1ul << (40 - porder)) 223 npages = 1ul << (40 - porder); 224 /* Can't use more than 1 HPTE per HPTEG */ 225 if (npages > kvmppc_hpt_mask(&kvm->arch.hpt) + 1) 226 npages = kvmppc_hpt_mask(&kvm->arch.hpt) + 1; 227 228 hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) | 229 HPTE_V_BOLTED | hpte0_pgsize_encoding(psize); 230 hp1 = hpte1_pgsize_encoding(psize) | 231 HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX; 232 233 for (i = 0; i < npages; ++i) { 234 addr = i << porder; 235 /* can't use hpt_hash since va > 64 bits */ 236 hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) 237 & kvmppc_hpt_mask(&kvm->arch.hpt); 238 /* 239 * We assume that the hash table is empty and no 240 * vcpus are using it at this stage. Since we create 241 * at most one HPTE per HPTEG, we just assume entry 7 242 * is available and use it. 
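		 * (Illustration: with the H_EXACT flag used below, the low
		 *  three bits of the pte_index select the slot within the
		 *  8-entry HPTEG, so "(hash << 3) + 7" requests slot 7 of
		 *  the group indexed by "hash".)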
243 */ 244 hash = (hash << 3) + 7; 245 hp_v = hp0 | ((addr >> 16) & ~0x7fUL); 246 hp_r = hp1 | addr; 247 ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, hash, hp_v, hp_r, 248 &idx_ret); 249 if (ret != H_SUCCESS) { 250 pr_err("KVM: map_vrma at %lx failed, ret=%ld\n", 251 addr, ret); 252 break; 253 } 254 } 255 } 256 257 int kvmppc_mmu_hv_init(void) 258 { 259 unsigned long host_lpid, rsvd_lpid; 260 261 if (!mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE)) 262 return -EINVAL; 263 264 host_lpid = 0; 265 if (cpu_has_feature(CPU_FTR_HVMODE)) 266 host_lpid = mfspr(SPRN_LPID); 267 268 /* POWER8 and above have 12-bit LPIDs (10-bit in POWER7) */ 269 if (cpu_has_feature(CPU_FTR_ARCH_207S)) 270 rsvd_lpid = LPID_RSVD; 271 else 272 rsvd_lpid = LPID_RSVD_POWER7; 273 274 kvmppc_init_lpid(rsvd_lpid + 1); 275 276 kvmppc_claim_lpid(host_lpid); 277 /* rsvd_lpid is reserved for use in partition switching */ 278 kvmppc_claim_lpid(rsvd_lpid); 279 280 return 0; 281 } 282 283 static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags, 284 long pte_index, unsigned long pteh, 285 unsigned long ptel, unsigned long *pte_idx_ret) 286 { 287 long ret; 288 289 preempt_disable(); 290 ret = kvmppc_do_h_enter(kvm, flags, pte_index, pteh, ptel, 291 kvm->mm->pgd, false, pte_idx_ret); 292 preempt_enable(); 293 if (ret == H_TOO_HARD) { 294 /* this can't happen */ 295 pr_err("KVM: Oops, kvmppc_h_enter returned too hard!\n"); 296 ret = H_RESOURCE; /* or something */ 297 } 298 return ret; 299 300 } 301 302 static struct kvmppc_slb *kvmppc_mmu_book3s_hv_find_slbe(struct kvm_vcpu *vcpu, 303 gva_t eaddr) 304 { 305 u64 mask; 306 int i; 307 308 for (i = 0; i < vcpu->arch.slb_nr; i++) { 309 if (!(vcpu->arch.slb[i].orige & SLB_ESID_V)) 310 continue; 311 312 if (vcpu->arch.slb[i].origv & SLB_VSID_B_1T) 313 mask = ESID_MASK_1T; 314 else 315 mask = ESID_MASK; 316 317 if (((vcpu->arch.slb[i].orige ^ eaddr) & mask) == 0) 318 return &vcpu->arch.slb[i]; 319 } 320 return NULL; 321 } 322 323 static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r, 324 unsigned long ea) 325 { 326 unsigned long ra_mask; 327 328 ra_mask = kvmppc_actual_pgsz(v, r) - 1; 329 return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask); 330 } 331 332 static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, 333 struct kvmppc_pte *gpte, bool data, bool iswrite) 334 { 335 struct kvm *kvm = vcpu->kvm; 336 struct kvmppc_slb *slbe; 337 unsigned long slb_v; 338 unsigned long pp, key; 339 unsigned long v, orig_v, gr; 340 __be64 *hptep; 341 long int index; 342 int virtmode = vcpu->arch.shregs.msr & (data ? 
MSR_DR : MSR_IR); 343 344 if (kvm_is_radix(vcpu->kvm)) 345 return kvmppc_mmu_radix_xlate(vcpu, eaddr, gpte, data, iswrite); 346 347 /* Get SLB entry */ 348 if (virtmode) { 349 slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr); 350 if (!slbe) 351 return -EINVAL; 352 slb_v = slbe->origv; 353 } else { 354 /* real mode access */ 355 slb_v = vcpu->kvm->arch.vrma_slb_v; 356 } 357 358 preempt_disable(); 359 /* Find the HPTE in the hash table */ 360 index = kvmppc_hv_find_lock_hpte(kvm, eaddr, slb_v, 361 HPTE_V_VALID | HPTE_V_ABSENT); 362 if (index < 0) { 363 preempt_enable(); 364 return -ENOENT; 365 } 366 hptep = (__be64 *)(kvm->arch.hpt.virt + (index << 4)); 367 v = orig_v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK; 368 if (cpu_has_feature(CPU_FTR_ARCH_300)) 369 v = hpte_new_to_old_v(v, be64_to_cpu(hptep[1])); 370 gr = kvm->arch.hpt.rev[index].guest_rpte; 371 372 unlock_hpte(hptep, orig_v); 373 preempt_enable(); 374 375 gpte->eaddr = eaddr; 376 gpte->vpage = ((v & HPTE_V_AVPN) << 4) | ((eaddr >> 12) & 0xfff); 377 378 /* Get PP bits and key for permission check */ 379 pp = gr & (HPTE_R_PP0 | HPTE_R_PP); 380 key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS; 381 key &= slb_v; 382 383 /* Calculate permissions */ 384 gpte->may_read = hpte_read_permission(pp, key); 385 gpte->may_write = hpte_write_permission(pp, key); 386 gpte->may_execute = gpte->may_read && !(gr & (HPTE_R_N | HPTE_R_G)); 387 388 /* Storage key permission check for POWER7 */ 389 if (data && virtmode) { 390 int amrfield = hpte_get_skey_perm(gr, vcpu->arch.amr); 391 if (amrfield & 1) 392 gpte->may_read = 0; 393 if (amrfield & 2) 394 gpte->may_write = 0; 395 } 396 397 /* Get the guest physical address */ 398 gpte->raddr = kvmppc_mmu_get_real_addr(v, gr, eaddr); 399 return 0; 400 } 401 402 /* 403 * Quick test for whether an instruction is a load or a store. 404 * If the instruction is a load or a store, then this will indicate 405 * which it is, at least on server processors. (Embedded processors 406 * have some external PID instructions that don't follow the rule 407 * embodied here.) If the instruction isn't a load or store, then 408 * this doesn't return anything useful. 409 */ 410 static int instruction_is_store(unsigned int instr) 411 { 412 unsigned int mask; 413 414 mask = 0x10000000; 415 if ((instr & 0xfc000000) == 0x7c000000) 416 mask = 0x100; /* major opcode 31 */ 417 return (instr & mask) != 0; 418 } 419 420 int kvmppc_hv_emulate_mmio(struct kvm_vcpu *vcpu, 421 unsigned long gpa, gva_t ea, int is_store) 422 { 423 u32 last_inst; 424 425 /* 426 * Fast path - check if the guest physical address corresponds to a 427 * device on the FAST_MMIO_BUS, if so we can avoid loading the 428 * instruction all together, then we can just handle it and return. 429 */ 430 if (is_store) { 431 int idx, ret; 432 433 idx = srcu_read_lock(&vcpu->kvm->srcu); 434 ret = kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, (gpa_t) gpa, 0, 435 NULL); 436 srcu_read_unlock(&vcpu->kvm->srcu, idx); 437 if (!ret) { 438 kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) + 4); 439 return RESUME_GUEST; 440 } 441 } 442 443 /* 444 * If we fail, we just return to the guest and try executing it again. 445 */ 446 if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst) != 447 EMULATE_DONE) 448 return RESUME_GUEST; 449 450 /* 451 * WARNING: We do not know for sure whether the instruction we just 452 * read from memory is the same that caused the fault in the first 453 * place. 
	 * If the instruction we read is neither a load nor a store,
	 * then it can't access memory, so we don't need to worry about
	 * enforcing access permissions.  So, assuming it is a load or
	 * store, we just check that its direction (load or store) is
	 * consistent with the original fault, since that's what we
	 * checked the access permissions against.  If there is a mismatch
	 * we just return and retry the instruction.
	 */

	if (instruction_is_store(last_inst) != !!is_store)
		return RESUME_GUEST;

	/*
	 * Emulated accesses are emulated by looking at the hash for
	 * translation once, then performing the access later. The
	 * translation could be invalidated in the meantime, at which
	 * point performing the subsequent memory access on the old
	 * physical address could possibly be a security hole for the
	 * guest (but not the host).
	 *
	 * This is less of an issue for MMIO stores since they aren't
	 * globally visible. It could be an issue for MMIO loads to
	 * a certain extent but we'll ignore it for now.
	 */

	vcpu->arch.paddr_accessed = gpa;
	vcpu->arch.vaddr_accessed = ea;
	return kvmppc_emulate_mmio(vcpu);
}

int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu,
				unsigned long ea, unsigned long dsisr)
{
	struct kvm *kvm = vcpu->kvm;
	unsigned long hpte[3], r;
	unsigned long hnow_v, hnow_r;
	__be64 *hptep;
	unsigned long mmu_seq, psize, pte_size;
	unsigned long gpa_base, gfn_base;
	unsigned long gpa, gfn, hva, pfn, hpa;
	struct kvm_memory_slot *memslot;
	unsigned long *rmap;
	struct revmap_entry *rev;
	struct page *page;
	long index, ret;
	bool is_ci;
	bool writing, write_ok;
	unsigned int shift;
	unsigned long rcbits;
	long mmio_update;
	pte_t pte, *ptep;

	if (kvm_is_radix(kvm))
		return kvmppc_book3s_radix_page_fault(vcpu, ea, dsisr);

	/*
	 * Real-mode code has already searched the HPT and found the
	 * entry we're interested in. Lock the entry and check that
	 * it hasn't changed. If it has, just return and re-execute the
	 * instruction.
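	 * (The vcpu->arch.pgfault_addr, pgfault_index and pgfault_hpte[]
	 *  values used below are what the real-mode fault handler recorded
	 *  when it found that entry.)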
513 */ 514 if (ea != vcpu->arch.pgfault_addr) 515 return RESUME_GUEST; 516 517 if (vcpu->arch.pgfault_cache) { 518 mmio_update = atomic64_read(&kvm->arch.mmio_update); 519 if (mmio_update == vcpu->arch.pgfault_cache->mmio_update) { 520 r = vcpu->arch.pgfault_cache->rpte; 521 psize = kvmppc_actual_pgsz(vcpu->arch.pgfault_hpte[0], 522 r); 523 gpa_base = r & HPTE_R_RPN & ~(psize - 1); 524 gfn_base = gpa_base >> PAGE_SHIFT; 525 gpa = gpa_base | (ea & (psize - 1)); 526 return kvmppc_hv_emulate_mmio(vcpu, gpa, ea, 527 dsisr & DSISR_ISSTORE); 528 } 529 } 530 index = vcpu->arch.pgfault_index; 531 hptep = (__be64 *)(kvm->arch.hpt.virt + (index << 4)); 532 rev = &kvm->arch.hpt.rev[index]; 533 preempt_disable(); 534 while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) 535 cpu_relax(); 536 hpte[0] = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK; 537 hpte[1] = be64_to_cpu(hptep[1]); 538 hpte[2] = r = rev->guest_rpte; 539 unlock_hpte(hptep, hpte[0]); 540 preempt_enable(); 541 542 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 543 hpte[0] = hpte_new_to_old_v(hpte[0], hpte[1]); 544 hpte[1] = hpte_new_to_old_r(hpte[1]); 545 } 546 if (hpte[0] != vcpu->arch.pgfault_hpte[0] || 547 hpte[1] != vcpu->arch.pgfault_hpte[1]) 548 return RESUME_GUEST; 549 550 /* Translate the logical address and get the page */ 551 psize = kvmppc_actual_pgsz(hpte[0], r); 552 gpa_base = r & HPTE_R_RPN & ~(psize - 1); 553 gfn_base = gpa_base >> PAGE_SHIFT; 554 gpa = gpa_base | (ea & (psize - 1)); 555 gfn = gpa >> PAGE_SHIFT; 556 memslot = gfn_to_memslot(kvm, gfn); 557 558 trace_kvm_page_fault_enter(vcpu, hpte, memslot, ea, dsisr); 559 560 /* No memslot means it's an emulated MMIO region */ 561 if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) 562 return kvmppc_hv_emulate_mmio(vcpu, gpa, ea, 563 dsisr & DSISR_ISSTORE); 564 565 /* 566 * This should never happen, because of the slot_is_aligned() 567 * check in kvmppc_do_h_enter(). 568 */ 569 if (gfn_base < memslot->base_gfn) 570 return -EFAULT; 571 572 /* used to check for invalidations in progress */ 573 mmu_seq = kvm->mmu_notifier_seq; 574 smp_rmb(); 575 576 ret = -EFAULT; 577 page = NULL; 578 writing = (dsisr & DSISR_ISSTORE) != 0; 579 /* If writing != 0, then the HPTE must allow writing, if we get here */ 580 write_ok = writing; 581 hva = gfn_to_hva_memslot(memslot, gfn); 582 583 /* 584 * Do a fast check first, since __gfn_to_pfn_memslot doesn't 585 * do it with !atomic && !async, which is how we call it. 586 * We always ask for write permission since the common case 587 * is that the page is writable. 588 */ 589 if (get_user_page_fast_only(hva, FOLL_WRITE, &page)) { 590 write_ok = true; 591 } else { 592 /* Call KVM generic code to do the slow-path check */ 593 pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, 594 writing, &write_ok, NULL); 595 if (is_error_noslot_pfn(pfn)) 596 return -EFAULT; 597 page = NULL; 598 if (pfn_valid(pfn)) { 599 page = pfn_to_page(pfn); 600 if (PageReserved(page)) 601 page = NULL; 602 } 603 } 604 605 /* 606 * Read the PTE from the process' radix tree and use that 607 * so we get the shift and attribute bits. 608 */ 609 spin_lock(&kvm->mmu_lock); 610 ptep = find_kvm_host_pte(kvm, mmu_seq, hva, &shift); 611 pte = __pte(0); 612 if (ptep) 613 pte = READ_ONCE(*ptep); 614 spin_unlock(&kvm->mmu_lock); 615 /* 616 * If the PTE disappeared temporarily due to a THP 617 * collapse, just return and let the guest try again. 
618 */ 619 if (!pte_present(pte)) { 620 if (page) 621 put_page(page); 622 return RESUME_GUEST; 623 } 624 hpa = pte_pfn(pte) << PAGE_SHIFT; 625 pte_size = PAGE_SIZE; 626 if (shift) 627 pte_size = 1ul << shift; 628 is_ci = pte_ci(pte); 629 630 if (psize > pte_size) 631 goto out_put; 632 if (pte_size > psize) 633 hpa |= hva & (pte_size - psize); 634 635 /* Check WIMG vs. the actual page we're accessing */ 636 if (!hpte_cache_flags_ok(r, is_ci)) { 637 if (is_ci) 638 goto out_put; 639 /* 640 * Allow guest to map emulated device memory as 641 * uncacheable, but actually make it cacheable. 642 */ 643 r = (r & ~(HPTE_R_W|HPTE_R_I|HPTE_R_G)) | HPTE_R_M; 644 } 645 646 /* 647 * Set the HPTE to point to hpa. 648 * Since the hpa is at PAGE_SIZE granularity, make sure we 649 * don't mask out lower-order bits if psize < PAGE_SIZE. 650 */ 651 if (psize < PAGE_SIZE) 652 psize = PAGE_SIZE; 653 r = (r & HPTE_R_KEY_HI) | (r & ~(HPTE_R_PP0 - psize)) | hpa; 654 if (hpte_is_writable(r) && !write_ok) 655 r = hpte_make_readonly(r); 656 ret = RESUME_GUEST; 657 preempt_disable(); 658 while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) 659 cpu_relax(); 660 hnow_v = be64_to_cpu(hptep[0]); 661 hnow_r = be64_to_cpu(hptep[1]); 662 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 663 hnow_v = hpte_new_to_old_v(hnow_v, hnow_r); 664 hnow_r = hpte_new_to_old_r(hnow_r); 665 } 666 667 /* 668 * If the HPT is being resized, don't update the HPTE, 669 * instead let the guest retry after the resize operation is complete. 670 * The synchronization for mmu_ready test vs. set is provided 671 * by the HPTE lock. 672 */ 673 if (!kvm->arch.mmu_ready) 674 goto out_unlock; 675 676 if ((hnow_v & ~HPTE_V_HVLOCK) != hpte[0] || hnow_r != hpte[1] || 677 rev->guest_rpte != hpte[2]) 678 /* HPTE has been changed under us; let the guest retry */ 679 goto out_unlock; 680 hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID; 681 682 /* Always put the HPTE in the rmap chain for the page base address */ 683 rmap = &memslot->arch.rmap[gfn_base - memslot->base_gfn]; 684 lock_rmap(rmap); 685 686 /* Check if we might have been invalidated; let the guest retry if so */ 687 ret = RESUME_GUEST; 688 if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) { 689 unlock_rmap(rmap); 690 goto out_unlock; 691 } 692 693 /* Only set R/C in real HPTE if set in both *rmap and guest_rpte */ 694 rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT; 695 r &= rcbits | ~(HPTE_R_R | HPTE_R_C); 696 697 if (be64_to_cpu(hptep[0]) & HPTE_V_VALID) { 698 /* HPTE was previously valid, so we need to invalidate it */ 699 unlock_rmap(rmap); 700 hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); 701 kvmppc_invalidate_hpte(kvm, hptep, index); 702 /* don't lose previous R and C bits */ 703 r |= be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C); 704 } else { 705 kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0); 706 } 707 708 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 709 r = hpte_old_to_new_r(hpte[0], r); 710 hpte[0] = hpte_old_to_new_v(hpte[0]); 711 } 712 hptep[1] = cpu_to_be64(r); 713 eieio(); 714 __unlock_hpte(hptep, hpte[0]); 715 asm volatile("ptesync" : : : "memory"); 716 preempt_enable(); 717 if (page && hpte_is_writable(r)) 718 set_page_dirty_lock(page); 719 720 out_put: 721 trace_kvm_page_fault_exit(vcpu, hpte, ret); 722 723 if (page) 724 put_page(page); 725 return ret; 726 727 out_unlock: 728 __unlock_hpte(hptep, be64_to_cpu(hptep[0])); 729 preempt_enable(); 730 goto out_put; 731 } 732 733 void kvmppc_rmap_reset(struct kvm *kvm) 734 { 735 struct kvm_memslots *slots; 736 struct kvm_memory_slot *memslot; 737 int srcu_idx; 738 739 srcu_idx 
= srcu_read_lock(&kvm->srcu); 740 slots = kvm_memslots(kvm); 741 kvm_for_each_memslot(memslot, slots) { 742 /* Mutual exclusion with kvm_unmap_hva_range etc. */ 743 spin_lock(&kvm->mmu_lock); 744 /* 745 * This assumes it is acceptable to lose reference and 746 * change bits across a reset. 747 */ 748 memset(memslot->arch.rmap, 0, 749 memslot->npages * sizeof(*memslot->arch.rmap)); 750 spin_unlock(&kvm->mmu_lock); 751 } 752 srcu_read_unlock(&kvm->srcu, srcu_idx); 753 } 754 755 /* Must be called with both HPTE and rmap locked */ 756 static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i, 757 struct kvm_memory_slot *memslot, 758 unsigned long *rmapp, unsigned long gfn) 759 { 760 __be64 *hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4)); 761 struct revmap_entry *rev = kvm->arch.hpt.rev; 762 unsigned long j, h; 763 unsigned long ptel, psize, rcbits; 764 765 j = rev[i].forw; 766 if (j == i) { 767 /* chain is now empty */ 768 *rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX); 769 } else { 770 /* remove i from chain */ 771 h = rev[i].back; 772 rev[h].forw = j; 773 rev[j].back = h; 774 rev[i].forw = rev[i].back = i; 775 *rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j; 776 } 777 778 /* Now check and modify the HPTE */ 779 ptel = rev[i].guest_rpte; 780 psize = kvmppc_actual_pgsz(be64_to_cpu(hptep[0]), ptel); 781 if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) && 782 hpte_rpn(ptel, psize) == gfn) { 783 hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); 784 kvmppc_invalidate_hpte(kvm, hptep, i); 785 hptep[1] &= ~cpu_to_be64(HPTE_R_KEY_HI | HPTE_R_KEY_LO); 786 /* Harvest R and C */ 787 rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C); 788 *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT; 789 if ((rcbits & HPTE_R_C) && memslot->dirty_bitmap) 790 kvmppc_update_dirty_map(memslot, gfn, psize); 791 if (rcbits & ~rev[i].guest_rpte) { 792 rev[i].guest_rpte = ptel | rcbits; 793 note_hpte_modification(kvm, &rev[i]); 794 } 795 } 796 } 797 798 static void kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, 799 unsigned long gfn) 800 { 801 unsigned long i; 802 __be64 *hptep; 803 unsigned long *rmapp; 804 805 rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; 806 for (;;) { 807 lock_rmap(rmapp); 808 if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { 809 unlock_rmap(rmapp); 810 break; 811 } 812 813 /* 814 * To avoid an ABBA deadlock with the HPTE lock bit, 815 * we can't spin on the HPTE lock while holding the 816 * rmap chain lock. 
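		 * (The page fault path takes the HPTE lock first and the
		 *  rmap lock second; here the rmap lock is already held,
		 *  so we only try-lock the HPTE and, if that fails, drop
		 *  the rmap lock and wait before retrying.)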
817 */ 818 i = *rmapp & KVMPPC_RMAP_INDEX; 819 hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4)); 820 if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { 821 /* unlock rmap before spinning on the HPTE lock */ 822 unlock_rmap(rmapp); 823 while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK) 824 cpu_relax(); 825 continue; 826 } 827 828 kvmppc_unmap_hpte(kvm, i, memslot, rmapp, gfn); 829 unlock_rmap(rmapp); 830 __unlock_hpte(hptep, be64_to_cpu(hptep[0])); 831 } 832 } 833 834 bool kvm_unmap_gfn_range_hv(struct kvm *kvm, struct kvm_gfn_range *range) 835 { 836 gfn_t gfn; 837 838 if (kvm_is_radix(kvm)) { 839 for (gfn = range->start; gfn < range->end; gfn++) 840 kvm_unmap_radix(kvm, range->slot, gfn); 841 } else { 842 for (gfn = range->start; gfn < range->end; gfn++) 843 kvm_unmap_rmapp(kvm, range->slot, gfn); 844 } 845 846 return false; 847 } 848 849 void kvmppc_core_flush_memslot_hv(struct kvm *kvm, 850 struct kvm_memory_slot *memslot) 851 { 852 unsigned long gfn; 853 unsigned long n; 854 unsigned long *rmapp; 855 856 gfn = memslot->base_gfn; 857 rmapp = memslot->arch.rmap; 858 if (kvm_is_radix(kvm)) { 859 kvmppc_radix_flush_memslot(kvm, memslot); 860 return; 861 } 862 863 for (n = memslot->npages; n; --n, ++gfn) { 864 /* 865 * Testing the present bit without locking is OK because 866 * the memslot has been marked invalid already, and hence 867 * no new HPTEs referencing this page can be created, 868 * thus the present bit can't go from 0 to 1. 869 */ 870 if (*rmapp & KVMPPC_RMAP_PRESENT) 871 kvm_unmap_rmapp(kvm, memslot, gfn); 872 ++rmapp; 873 } 874 } 875 876 static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, 877 unsigned long gfn) 878 { 879 struct revmap_entry *rev = kvm->arch.hpt.rev; 880 unsigned long head, i, j; 881 __be64 *hptep; 882 int ret = 0; 883 unsigned long *rmapp; 884 885 rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; 886 retry: 887 lock_rmap(rmapp); 888 if (*rmapp & KVMPPC_RMAP_REFERENCED) { 889 *rmapp &= ~KVMPPC_RMAP_REFERENCED; 890 ret = 1; 891 } 892 if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { 893 unlock_rmap(rmapp); 894 return ret; 895 } 896 897 i = head = *rmapp & KVMPPC_RMAP_INDEX; 898 do { 899 hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4)); 900 j = rev[i].forw; 901 902 /* If this HPTE isn't referenced, ignore it */ 903 if (!(be64_to_cpu(hptep[1]) & HPTE_R_R)) 904 continue; 905 906 if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { 907 /* unlock rmap before spinning on the HPTE lock */ 908 unlock_rmap(rmapp); 909 while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK) 910 cpu_relax(); 911 goto retry; 912 } 913 914 /* Now check and modify the HPTE */ 915 if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) && 916 (be64_to_cpu(hptep[1]) & HPTE_R_R)) { 917 kvmppc_clear_ref_hpte(kvm, hptep, i); 918 if (!(rev[i].guest_rpte & HPTE_R_R)) { 919 rev[i].guest_rpte |= HPTE_R_R; 920 note_hpte_modification(kvm, &rev[i]); 921 } 922 ret = 1; 923 } 924 __unlock_hpte(hptep, be64_to_cpu(hptep[0])); 925 } while ((i = j) != head); 926 927 unlock_rmap(rmapp); 928 return ret; 929 } 930 931 bool kvm_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range) 932 { 933 gfn_t gfn; 934 bool ret = false; 935 936 if (kvm_is_radix(kvm)) { 937 for (gfn = range->start; gfn < range->end; gfn++) 938 ret |= kvm_age_radix(kvm, range->slot, gfn); 939 } else { 940 for (gfn = range->start; gfn < range->end; gfn++) 941 ret |= kvm_age_rmapp(kvm, range->slot, gfn); 942 } 943 944 return ret; 945 } 946 947 static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, 948 unsigned long gfn) 949 { 950 struct 
revmap_entry *rev = kvm->arch.hpt.rev; 951 unsigned long head, i, j; 952 unsigned long *hp; 953 bool ret = true; 954 unsigned long *rmapp; 955 956 rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; 957 if (*rmapp & KVMPPC_RMAP_REFERENCED) 958 return true; 959 960 lock_rmap(rmapp); 961 if (*rmapp & KVMPPC_RMAP_REFERENCED) 962 goto out; 963 964 if (*rmapp & KVMPPC_RMAP_PRESENT) { 965 i = head = *rmapp & KVMPPC_RMAP_INDEX; 966 do { 967 hp = (unsigned long *)(kvm->arch.hpt.virt + (i << 4)); 968 j = rev[i].forw; 969 if (be64_to_cpu(hp[1]) & HPTE_R_R) 970 goto out; 971 } while ((i = j) != head); 972 } 973 ret = false; 974 975 out: 976 unlock_rmap(rmapp); 977 return ret; 978 } 979 980 bool kvm_test_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range) 981 { 982 WARN_ON(range->start + 1 != range->end); 983 984 if (kvm_is_radix(kvm)) 985 return kvm_test_age_radix(kvm, range->slot, range->start); 986 else 987 return kvm_test_age_rmapp(kvm, range->slot, range->start); 988 } 989 990 bool kvm_set_spte_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range) 991 { 992 WARN_ON(range->start + 1 != range->end); 993 994 if (kvm_is_radix(kvm)) 995 kvm_unmap_radix(kvm, range->slot, range->start); 996 else 997 kvm_unmap_rmapp(kvm, range->slot, range->start); 998 999 return false; 1000 } 1001 1002 static int vcpus_running(struct kvm *kvm) 1003 { 1004 return atomic_read(&kvm->arch.vcpus_running) != 0; 1005 } 1006 1007 /* 1008 * Returns the number of system pages that are dirty. 1009 * This can be more than 1 if we find a huge-page HPTE. 1010 */ 1011 static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp) 1012 { 1013 struct revmap_entry *rev = kvm->arch.hpt.rev; 1014 unsigned long head, i, j; 1015 unsigned long n; 1016 unsigned long v, r; 1017 __be64 *hptep; 1018 int npages_dirty = 0; 1019 1020 retry: 1021 lock_rmap(rmapp); 1022 if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { 1023 unlock_rmap(rmapp); 1024 return npages_dirty; 1025 } 1026 1027 i = head = *rmapp & KVMPPC_RMAP_INDEX; 1028 do { 1029 unsigned long hptep1; 1030 hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4)); 1031 j = rev[i].forw; 1032 1033 /* 1034 * Checking the C (changed) bit here is racy since there 1035 * is no guarantee about when the hardware writes it back. 1036 * If the HPTE is not writable then it is stable since the 1037 * page can't be written to, and we would have done a tlbie 1038 * (which forces the hardware to complete any writeback) 1039 * when making the HPTE read-only. 1040 * If vcpus are running then this call is racy anyway 1041 * since the page could get dirtied subsequently, so we 1042 * expect there to be a further call which would pick up 1043 * any delayed C bit writeback. 1044 * Otherwise we need to do the tlbie even if C==0 in 1045 * order to pick up any delayed writeback of C. 
1046 */ 1047 hptep1 = be64_to_cpu(hptep[1]); 1048 if (!(hptep1 & HPTE_R_C) && 1049 (!hpte_is_writable(hptep1) || vcpus_running(kvm))) 1050 continue; 1051 1052 if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { 1053 /* unlock rmap before spinning on the HPTE lock */ 1054 unlock_rmap(rmapp); 1055 while (hptep[0] & cpu_to_be64(HPTE_V_HVLOCK)) 1056 cpu_relax(); 1057 goto retry; 1058 } 1059 1060 /* Now check and modify the HPTE */ 1061 if (!(hptep[0] & cpu_to_be64(HPTE_V_VALID))) { 1062 __unlock_hpte(hptep, be64_to_cpu(hptep[0])); 1063 continue; 1064 } 1065 1066 /* need to make it temporarily absent so C is stable */ 1067 hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); 1068 kvmppc_invalidate_hpte(kvm, hptep, i); 1069 v = be64_to_cpu(hptep[0]); 1070 r = be64_to_cpu(hptep[1]); 1071 if (r & HPTE_R_C) { 1072 hptep[1] = cpu_to_be64(r & ~HPTE_R_C); 1073 if (!(rev[i].guest_rpte & HPTE_R_C)) { 1074 rev[i].guest_rpte |= HPTE_R_C; 1075 note_hpte_modification(kvm, &rev[i]); 1076 } 1077 n = kvmppc_actual_pgsz(v, r); 1078 n = (n + PAGE_SIZE - 1) >> PAGE_SHIFT; 1079 if (n > npages_dirty) 1080 npages_dirty = n; 1081 eieio(); 1082 } 1083 v &= ~HPTE_V_ABSENT; 1084 v |= HPTE_V_VALID; 1085 __unlock_hpte(hptep, v); 1086 } while ((i = j) != head); 1087 1088 unlock_rmap(rmapp); 1089 return npages_dirty; 1090 } 1091 1092 void kvmppc_harvest_vpa_dirty(struct kvmppc_vpa *vpa, 1093 struct kvm_memory_slot *memslot, 1094 unsigned long *map) 1095 { 1096 unsigned long gfn; 1097 1098 if (!vpa->dirty || !vpa->pinned_addr) 1099 return; 1100 gfn = vpa->gpa >> PAGE_SHIFT; 1101 if (gfn < memslot->base_gfn || 1102 gfn >= memslot->base_gfn + memslot->npages) 1103 return; 1104 1105 vpa->dirty = false; 1106 if (map) 1107 __set_bit_le(gfn - memslot->base_gfn, map); 1108 } 1109 1110 long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm, 1111 struct kvm_memory_slot *memslot, unsigned long *map) 1112 { 1113 unsigned long i; 1114 unsigned long *rmapp; 1115 1116 preempt_disable(); 1117 rmapp = memslot->arch.rmap; 1118 for (i = 0; i < memslot->npages; ++i) { 1119 int npages = kvm_test_clear_dirty_npages(kvm, rmapp); 1120 /* 1121 * Note that if npages > 0 then i must be a multiple of npages, 1122 * since we always put huge-page HPTEs in the rmap chain 1123 * corresponding to their page base address. 
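		 * (For example, with a 64kB base page size a dirty 16MB
		 *  HPTE yields npages = 256, and its rmap chain sits at
		 *  the slot for the first of those 256 gfns, so i is a
		 *  multiple of 256.)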
1124 */ 1125 if (npages) 1126 set_dirty_bits(map, i, npages); 1127 ++rmapp; 1128 } 1129 preempt_enable(); 1130 return 0; 1131 } 1132 1133 void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa, 1134 unsigned long *nb_ret) 1135 { 1136 struct kvm_memory_slot *memslot; 1137 unsigned long gfn = gpa >> PAGE_SHIFT; 1138 struct page *page, *pages[1]; 1139 int npages; 1140 unsigned long hva, offset; 1141 int srcu_idx; 1142 1143 srcu_idx = srcu_read_lock(&kvm->srcu); 1144 memslot = gfn_to_memslot(kvm, gfn); 1145 if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) 1146 goto err; 1147 hva = gfn_to_hva_memslot(memslot, gfn); 1148 npages = get_user_pages_fast(hva, 1, FOLL_WRITE, pages); 1149 if (npages < 1) 1150 goto err; 1151 page = pages[0]; 1152 srcu_read_unlock(&kvm->srcu, srcu_idx); 1153 1154 offset = gpa & (PAGE_SIZE - 1); 1155 if (nb_ret) 1156 *nb_ret = PAGE_SIZE - offset; 1157 return page_address(page) + offset; 1158 1159 err: 1160 srcu_read_unlock(&kvm->srcu, srcu_idx); 1161 return NULL; 1162 } 1163 1164 void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa, 1165 bool dirty) 1166 { 1167 struct page *page = virt_to_page(va); 1168 struct kvm_memory_slot *memslot; 1169 unsigned long gfn; 1170 int srcu_idx; 1171 1172 put_page(page); 1173 1174 if (!dirty) 1175 return; 1176 1177 /* We need to mark this page dirty in the memslot dirty_bitmap, if any */ 1178 gfn = gpa >> PAGE_SHIFT; 1179 srcu_idx = srcu_read_lock(&kvm->srcu); 1180 memslot = gfn_to_memslot(kvm, gfn); 1181 if (memslot && memslot->dirty_bitmap) 1182 set_bit_le(gfn - memslot->base_gfn, memslot->dirty_bitmap); 1183 srcu_read_unlock(&kvm->srcu, srcu_idx); 1184 } 1185 1186 /* 1187 * HPT resizing 1188 */ 1189 static int resize_hpt_allocate(struct kvm_resize_hpt *resize) 1190 { 1191 int rc; 1192 1193 rc = kvmppc_allocate_hpt(&resize->hpt, resize->order); 1194 if (rc < 0) 1195 return rc; 1196 1197 resize_hpt_debug(resize, "resize_hpt_allocate(): HPT @ 0x%lx\n", 1198 resize->hpt.virt); 1199 1200 return 0; 1201 } 1202 1203 static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize, 1204 unsigned long idx) 1205 { 1206 struct kvm *kvm = resize->kvm; 1207 struct kvm_hpt_info *old = &kvm->arch.hpt; 1208 struct kvm_hpt_info *new = &resize->hpt; 1209 unsigned long old_hash_mask = (1ULL << (old->order - 7)) - 1; 1210 unsigned long new_hash_mask = (1ULL << (new->order - 7)) - 1; 1211 __be64 *hptep, *new_hptep; 1212 unsigned long vpte, rpte, guest_rpte; 1213 int ret; 1214 struct revmap_entry *rev; 1215 unsigned long apsize, avpn, pteg, hash; 1216 unsigned long new_idx, new_pteg, replace_vpte; 1217 int pshift; 1218 1219 hptep = (__be64 *)(old->virt + (idx << 4)); 1220 1221 /* Guest is stopped, so new HPTEs can't be added or faulted 1222 * in, only unmapped or altered by host actions. 
So, it's 1223 * safe to check this before we take the HPTE lock */ 1224 vpte = be64_to_cpu(hptep[0]); 1225 if (!(vpte & HPTE_V_VALID) && !(vpte & HPTE_V_ABSENT)) 1226 return 0; /* nothing to do */ 1227 1228 while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) 1229 cpu_relax(); 1230 1231 vpte = be64_to_cpu(hptep[0]); 1232 1233 ret = 0; 1234 if (!(vpte & HPTE_V_VALID) && !(vpte & HPTE_V_ABSENT)) 1235 /* Nothing to do */ 1236 goto out; 1237 1238 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 1239 rpte = be64_to_cpu(hptep[1]); 1240 vpte = hpte_new_to_old_v(vpte, rpte); 1241 } 1242 1243 /* Unmap */ 1244 rev = &old->rev[idx]; 1245 guest_rpte = rev->guest_rpte; 1246 1247 ret = -EIO; 1248 apsize = kvmppc_actual_pgsz(vpte, guest_rpte); 1249 if (!apsize) 1250 goto out; 1251 1252 if (vpte & HPTE_V_VALID) { 1253 unsigned long gfn = hpte_rpn(guest_rpte, apsize); 1254 int srcu_idx = srcu_read_lock(&kvm->srcu); 1255 struct kvm_memory_slot *memslot = 1256 __gfn_to_memslot(kvm_memslots(kvm), gfn); 1257 1258 if (memslot) { 1259 unsigned long *rmapp; 1260 rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; 1261 1262 lock_rmap(rmapp); 1263 kvmppc_unmap_hpte(kvm, idx, memslot, rmapp, gfn); 1264 unlock_rmap(rmapp); 1265 } 1266 1267 srcu_read_unlock(&kvm->srcu, srcu_idx); 1268 } 1269 1270 /* Reload PTE after unmap */ 1271 vpte = be64_to_cpu(hptep[0]); 1272 BUG_ON(vpte & HPTE_V_VALID); 1273 BUG_ON(!(vpte & HPTE_V_ABSENT)); 1274 1275 ret = 0; 1276 if (!(vpte & HPTE_V_BOLTED)) 1277 goto out; 1278 1279 rpte = be64_to_cpu(hptep[1]); 1280 1281 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 1282 vpte = hpte_new_to_old_v(vpte, rpte); 1283 rpte = hpte_new_to_old_r(rpte); 1284 } 1285 1286 pshift = kvmppc_hpte_base_page_shift(vpte, rpte); 1287 avpn = HPTE_V_AVPN_VAL(vpte) & ~(((1ul << pshift) - 1) >> 23); 1288 pteg = idx / HPTES_PER_GROUP; 1289 if (vpte & HPTE_V_SECONDARY) 1290 pteg = ~pteg; 1291 1292 if (!(vpte & HPTE_V_1TB_SEG)) { 1293 unsigned long offset, vsid; 1294 1295 /* We only have 28 - 23 bits of offset in avpn */ 1296 offset = (avpn & 0x1f) << 23; 1297 vsid = avpn >> 5; 1298 /* We can find more bits from the pteg value */ 1299 if (pshift < 23) 1300 offset |= ((vsid ^ pteg) & old_hash_mask) << pshift; 1301 1302 hash = vsid ^ (offset >> pshift); 1303 } else { 1304 unsigned long offset, vsid; 1305 1306 /* We only have 40 - 23 bits of seg_off in avpn */ 1307 offset = (avpn & 0x1ffff) << 23; 1308 vsid = avpn >> 17; 1309 if (pshift < 23) 1310 offset |= ((vsid ^ (vsid << 25) ^ pteg) & old_hash_mask) << pshift; 1311 1312 hash = vsid ^ (vsid << 25) ^ (offset >> pshift); 1313 } 1314 1315 new_pteg = hash & new_hash_mask; 1316 if (vpte & HPTE_V_SECONDARY) 1317 new_pteg = ~hash & new_hash_mask; 1318 1319 new_idx = new_pteg * HPTES_PER_GROUP + (idx % HPTES_PER_GROUP); 1320 new_hptep = (__be64 *)(new->virt + (new_idx << 4)); 1321 1322 replace_vpte = be64_to_cpu(new_hptep[0]); 1323 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 1324 unsigned long replace_rpte = be64_to_cpu(new_hptep[1]); 1325 replace_vpte = hpte_new_to_old_v(replace_vpte, replace_rpte); 1326 } 1327 1328 if (replace_vpte & (HPTE_V_VALID | HPTE_V_ABSENT)) { 1329 BUG_ON(new->order >= old->order); 1330 1331 if (replace_vpte & HPTE_V_BOLTED) { 1332 if (vpte & HPTE_V_BOLTED) 1333 /* Bolted collision, nothing we can do */ 1334 ret = -ENOSPC; 1335 /* Discard the new HPTE */ 1336 goto out; 1337 } 1338 1339 /* Discard the previous HPTE */ 1340 } 1341 1342 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 1343 rpte = hpte_old_to_new_r(vpte, rpte); 1344 vpte = hpte_old_to_new_v(vpte); 1345 } 1346 1347 
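	/* All checks passed: install the rehashed entry (and its guest
	 * view) at new_idx, which keeps the same slot within the HPTEG
	 * while the group itself was recomputed for the new hash mask.
	 */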
new_hptep[1] = cpu_to_be64(rpte); 1348 new->rev[new_idx].guest_rpte = guest_rpte; 1349 /* No need for a barrier, since new HPT isn't active */ 1350 new_hptep[0] = cpu_to_be64(vpte); 1351 unlock_hpte(new_hptep, vpte); 1352 1353 out: 1354 unlock_hpte(hptep, vpte); 1355 return ret; 1356 } 1357 1358 static int resize_hpt_rehash(struct kvm_resize_hpt *resize) 1359 { 1360 struct kvm *kvm = resize->kvm; 1361 unsigned long i; 1362 int rc; 1363 1364 for (i = 0; i < kvmppc_hpt_npte(&kvm->arch.hpt); i++) { 1365 rc = resize_hpt_rehash_hpte(resize, i); 1366 if (rc != 0) 1367 return rc; 1368 } 1369 1370 return 0; 1371 } 1372 1373 static void resize_hpt_pivot(struct kvm_resize_hpt *resize) 1374 { 1375 struct kvm *kvm = resize->kvm; 1376 struct kvm_hpt_info hpt_tmp; 1377 1378 /* Exchange the pending tables in the resize structure with 1379 * the active tables */ 1380 1381 resize_hpt_debug(resize, "resize_hpt_pivot()\n"); 1382 1383 spin_lock(&kvm->mmu_lock); 1384 asm volatile("ptesync" : : : "memory"); 1385 1386 hpt_tmp = kvm->arch.hpt; 1387 kvmppc_set_hpt(kvm, &resize->hpt); 1388 resize->hpt = hpt_tmp; 1389 1390 spin_unlock(&kvm->mmu_lock); 1391 1392 synchronize_srcu_expedited(&kvm->srcu); 1393 1394 if (cpu_has_feature(CPU_FTR_ARCH_300)) 1395 kvmppc_setup_partition_table(kvm); 1396 1397 resize_hpt_debug(resize, "resize_hpt_pivot() done\n"); 1398 } 1399 1400 static void resize_hpt_release(struct kvm *kvm, struct kvm_resize_hpt *resize) 1401 { 1402 if (WARN_ON(!mutex_is_locked(&kvm->arch.mmu_setup_lock))) 1403 return; 1404 1405 if (!resize) 1406 return; 1407 1408 if (resize->error != -EBUSY) { 1409 if (resize->hpt.virt) 1410 kvmppc_free_hpt(&resize->hpt); 1411 kfree(resize); 1412 } 1413 1414 if (kvm->arch.resize_hpt == resize) 1415 kvm->arch.resize_hpt = NULL; 1416 } 1417 1418 static void resize_hpt_prepare_work(struct work_struct *work) 1419 { 1420 struct kvm_resize_hpt *resize = container_of(work, 1421 struct kvm_resize_hpt, 1422 work); 1423 struct kvm *kvm = resize->kvm; 1424 int err = 0; 1425 1426 if (WARN_ON(resize->error != -EBUSY)) 1427 return; 1428 1429 mutex_lock(&kvm->arch.mmu_setup_lock); 1430 1431 /* Request is still current? */ 1432 if (kvm->arch.resize_hpt == resize) { 1433 /* We may request large allocations here: 1434 * do not sleep with kvm->arch.mmu_setup_lock held for a while. 1435 */ 1436 mutex_unlock(&kvm->arch.mmu_setup_lock); 1437 1438 resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n", 1439 resize->order); 1440 1441 err = resize_hpt_allocate(resize); 1442 1443 /* We have strict assumption about -EBUSY 1444 * when preparing for HPT resize. 1445 */ 1446 if (WARN_ON(err == -EBUSY)) 1447 err = -EINPROGRESS; 1448 1449 mutex_lock(&kvm->arch.mmu_setup_lock); 1450 /* It is possible that kvm->arch.resize_hpt != resize 1451 * after we grab kvm->arch.mmu_setup_lock again. 
1452 */ 1453 } 1454 1455 resize->error = err; 1456 1457 if (kvm->arch.resize_hpt != resize) 1458 resize_hpt_release(kvm, resize); 1459 1460 mutex_unlock(&kvm->arch.mmu_setup_lock); 1461 } 1462 1463 long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm, 1464 struct kvm_ppc_resize_hpt *rhpt) 1465 { 1466 unsigned long flags = rhpt->flags; 1467 unsigned long shift = rhpt->shift; 1468 struct kvm_resize_hpt *resize; 1469 int ret; 1470 1471 if (flags != 0 || kvm_is_radix(kvm)) 1472 return -EINVAL; 1473 1474 if (shift && ((shift < 18) || (shift > 46))) 1475 return -EINVAL; 1476 1477 mutex_lock(&kvm->arch.mmu_setup_lock); 1478 1479 resize = kvm->arch.resize_hpt; 1480 1481 if (resize) { 1482 if (resize->order == shift) { 1483 /* Suitable resize in progress? */ 1484 ret = resize->error; 1485 if (ret == -EBUSY) 1486 ret = 100; /* estimated time in ms */ 1487 else if (ret) 1488 resize_hpt_release(kvm, resize); 1489 1490 goto out; 1491 } 1492 1493 /* not suitable, cancel it */ 1494 resize_hpt_release(kvm, resize); 1495 } 1496 1497 ret = 0; 1498 if (!shift) 1499 goto out; /* nothing to do */ 1500 1501 /* start new resize */ 1502 1503 resize = kzalloc(sizeof(*resize), GFP_KERNEL); 1504 if (!resize) { 1505 ret = -ENOMEM; 1506 goto out; 1507 } 1508 1509 resize->error = -EBUSY; 1510 resize->order = shift; 1511 resize->kvm = kvm; 1512 INIT_WORK(&resize->work, resize_hpt_prepare_work); 1513 kvm->arch.resize_hpt = resize; 1514 1515 schedule_work(&resize->work); 1516 1517 ret = 100; /* estimated time in ms */ 1518 1519 out: 1520 mutex_unlock(&kvm->arch.mmu_setup_lock); 1521 return ret; 1522 } 1523 1524 static void resize_hpt_boot_vcpu(void *opaque) 1525 { 1526 /* Nothing to do, just force a KVM exit */ 1527 } 1528 1529 long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm, 1530 struct kvm_ppc_resize_hpt *rhpt) 1531 { 1532 unsigned long flags = rhpt->flags; 1533 unsigned long shift = rhpt->shift; 1534 struct kvm_resize_hpt *resize; 1535 long ret; 1536 1537 if (flags != 0 || kvm_is_radix(kvm)) 1538 return -EINVAL; 1539 1540 if (shift && ((shift < 18) || (shift > 46))) 1541 return -EINVAL; 1542 1543 mutex_lock(&kvm->arch.mmu_setup_lock); 1544 1545 resize = kvm->arch.resize_hpt; 1546 1547 /* This shouldn't be possible */ 1548 ret = -EIO; 1549 if (WARN_ON(!kvm->arch.mmu_ready)) 1550 goto out_no_hpt; 1551 1552 /* Stop VCPUs from running while we mess with the HPT */ 1553 kvm->arch.mmu_ready = 0; 1554 smp_mb(); 1555 1556 /* Boot all CPUs out of the guest so they re-read 1557 * mmu_ready */ 1558 on_each_cpu(resize_hpt_boot_vcpu, NULL, 1); 1559 1560 ret = -ENXIO; 1561 if (!resize || (resize->order != shift)) 1562 goto out; 1563 1564 ret = resize->error; 1565 if (ret) 1566 goto out; 1567 1568 ret = resize_hpt_rehash(resize); 1569 if (ret) 1570 goto out; 1571 1572 resize_hpt_pivot(resize); 1573 1574 out: 1575 /* Let VCPUs run again */ 1576 kvm->arch.mmu_ready = 1; 1577 smp_mb(); 1578 out_no_hpt: 1579 resize_hpt_release(kvm, resize); 1580 mutex_unlock(&kvm->arch.mmu_setup_lock); 1581 return ret; 1582 } 1583 1584 /* 1585 * Functions for reading and writing the hash table via reads and 1586 * writes on a file descriptor. 1587 * 1588 * Reads return the guest view of the hash table, which has to be 1589 * pieced together from the real hash table and the guest_rpte 1590 * values in the revmap array. 1591 * 1592 * On writes, each HPTE written is considered in turn, and if it 1593 * is valid, it is written to the HPT as if an H_ENTER with the 1594 * exact flag set was done. 
When the invalid count is non-zero 1595 * in the header written to the stream, the kernel will make 1596 * sure that that many HPTEs are invalid, and invalidate them 1597 * if not. 1598 */ 1599 1600 struct kvm_htab_ctx { 1601 unsigned long index; 1602 unsigned long flags; 1603 struct kvm *kvm; 1604 int first_pass; 1605 }; 1606 1607 #define HPTE_SIZE (2 * sizeof(unsigned long)) 1608 1609 /* 1610 * Returns 1 if this HPT entry has been modified or has pending 1611 * R/C bit changes. 1612 */ 1613 static int hpte_dirty(struct revmap_entry *revp, __be64 *hptp) 1614 { 1615 unsigned long rcbits_unset; 1616 1617 if (revp->guest_rpte & HPTE_GR_MODIFIED) 1618 return 1; 1619 1620 /* Also need to consider changes in reference and changed bits */ 1621 rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C); 1622 if ((be64_to_cpu(hptp[0]) & HPTE_V_VALID) && 1623 (be64_to_cpu(hptp[1]) & rcbits_unset)) 1624 return 1; 1625 1626 return 0; 1627 } 1628 1629 static long record_hpte(unsigned long flags, __be64 *hptp, 1630 unsigned long *hpte, struct revmap_entry *revp, 1631 int want_valid, int first_pass) 1632 { 1633 unsigned long v, r, hr; 1634 unsigned long rcbits_unset; 1635 int ok = 1; 1636 int valid, dirty; 1637 1638 /* Unmodified entries are uninteresting except on the first pass */ 1639 dirty = hpte_dirty(revp, hptp); 1640 if (!first_pass && !dirty) 1641 return 0; 1642 1643 valid = 0; 1644 if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) { 1645 valid = 1; 1646 if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && 1647 !(be64_to_cpu(hptp[0]) & HPTE_V_BOLTED)) 1648 valid = 0; 1649 } 1650 if (valid != want_valid) 1651 return 0; 1652 1653 v = r = 0; 1654 if (valid || dirty) { 1655 /* lock the HPTE so it's stable and read it */ 1656 preempt_disable(); 1657 while (!try_lock_hpte(hptp, HPTE_V_HVLOCK)) 1658 cpu_relax(); 1659 v = be64_to_cpu(hptp[0]); 1660 hr = be64_to_cpu(hptp[1]); 1661 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 1662 v = hpte_new_to_old_v(v, hr); 1663 hr = hpte_new_to_old_r(hr); 1664 } 1665 1666 /* re-evaluate valid and dirty from synchronized HPTE value */ 1667 valid = !!(v & HPTE_V_VALID); 1668 dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED); 1669 1670 /* Harvest R and C into guest view if necessary */ 1671 rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C); 1672 if (valid && (rcbits_unset & hr)) { 1673 revp->guest_rpte |= (hr & 1674 (HPTE_R_R | HPTE_R_C)) | HPTE_GR_MODIFIED; 1675 dirty = 1; 1676 } 1677 1678 if (v & HPTE_V_ABSENT) { 1679 v &= ~HPTE_V_ABSENT; 1680 v |= HPTE_V_VALID; 1681 valid = 1; 1682 } 1683 if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && !(v & HPTE_V_BOLTED)) 1684 valid = 0; 1685 1686 r = revp->guest_rpte; 1687 /* only clear modified if this is the right sort of entry */ 1688 if (valid == want_valid && dirty) { 1689 r &= ~HPTE_GR_MODIFIED; 1690 revp->guest_rpte = r; 1691 } 1692 unlock_hpte(hptp, be64_to_cpu(hptp[0])); 1693 preempt_enable(); 1694 if (!(valid == want_valid && (first_pass || dirty))) 1695 ok = 0; 1696 } 1697 hpte[0] = cpu_to_be64(v); 1698 hpte[1] = cpu_to_be64(r); 1699 return ok; 1700 } 1701 1702 static ssize_t kvm_htab_read(struct file *file, char __user *buf, 1703 size_t count, loff_t *ppos) 1704 { 1705 struct kvm_htab_ctx *ctx = file->private_data; 1706 struct kvm *kvm = ctx->kvm; 1707 struct kvm_get_htab_header hdr; 1708 __be64 *hptp; 1709 struct revmap_entry *revp; 1710 unsigned long i, nb, nw; 1711 unsigned long __user *lbuf; 1712 struct kvm_get_htab_header __user *hptr; 1713 unsigned long flags; 1714 int first_pass; 1715 unsigned long hpte[2]; 1716 1717 if 
(!access_ok(buf, count)) 1718 return -EFAULT; 1719 if (kvm_is_radix(kvm)) 1720 return 0; 1721 1722 first_pass = ctx->first_pass; 1723 flags = ctx->flags; 1724 1725 i = ctx->index; 1726 hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE)); 1727 revp = kvm->arch.hpt.rev + i; 1728 lbuf = (unsigned long __user *)buf; 1729 1730 nb = 0; 1731 while (nb + sizeof(hdr) + HPTE_SIZE < count) { 1732 /* Initialize header */ 1733 hptr = (struct kvm_get_htab_header __user *)buf; 1734 hdr.n_valid = 0; 1735 hdr.n_invalid = 0; 1736 nw = nb; 1737 nb += sizeof(hdr); 1738 lbuf = (unsigned long __user *)(buf + sizeof(hdr)); 1739 1740 /* Skip uninteresting entries, i.e. clean on not-first pass */ 1741 if (!first_pass) { 1742 while (i < kvmppc_hpt_npte(&kvm->arch.hpt) && 1743 !hpte_dirty(revp, hptp)) { 1744 ++i; 1745 hptp += 2; 1746 ++revp; 1747 } 1748 } 1749 hdr.index = i; 1750 1751 /* Grab a series of valid entries */ 1752 while (i < kvmppc_hpt_npte(&kvm->arch.hpt) && 1753 hdr.n_valid < 0xffff && 1754 nb + HPTE_SIZE < count && 1755 record_hpte(flags, hptp, hpte, revp, 1, first_pass)) { 1756 /* valid entry, write it out */ 1757 ++hdr.n_valid; 1758 if (__put_user(hpte[0], lbuf) || 1759 __put_user(hpte[1], lbuf + 1)) 1760 return -EFAULT; 1761 nb += HPTE_SIZE; 1762 lbuf += 2; 1763 ++i; 1764 hptp += 2; 1765 ++revp; 1766 } 1767 /* Now skip invalid entries while we can */ 1768 while (i < kvmppc_hpt_npte(&kvm->arch.hpt) && 1769 hdr.n_invalid < 0xffff && 1770 record_hpte(flags, hptp, hpte, revp, 0, first_pass)) { 1771 /* found an invalid entry */ 1772 ++hdr.n_invalid; 1773 ++i; 1774 hptp += 2; 1775 ++revp; 1776 } 1777 1778 if (hdr.n_valid || hdr.n_invalid) { 1779 /* write back the header */ 1780 if (__copy_to_user(hptr, &hdr, sizeof(hdr))) 1781 return -EFAULT; 1782 nw = nb; 1783 buf = (char __user *)lbuf; 1784 } else { 1785 nb = nw; 1786 } 1787 1788 /* Check if we've wrapped around the hash table */ 1789 if (i >= kvmppc_hpt_npte(&kvm->arch.hpt)) { 1790 i = 0; 1791 ctx->first_pass = 0; 1792 break; 1793 } 1794 } 1795 1796 ctx->index = i; 1797 1798 return nb; 1799 } 1800 1801 static ssize_t kvm_htab_write(struct file *file, const char __user *buf, 1802 size_t count, loff_t *ppos) 1803 { 1804 struct kvm_htab_ctx *ctx = file->private_data; 1805 struct kvm *kvm = ctx->kvm; 1806 struct kvm_get_htab_header hdr; 1807 unsigned long i, j; 1808 unsigned long v, r; 1809 unsigned long __user *lbuf; 1810 __be64 *hptp; 1811 unsigned long tmp[2]; 1812 ssize_t nb; 1813 long int err, ret; 1814 int mmu_ready; 1815 int pshift; 1816 1817 if (!access_ok(buf, count)) 1818 return -EFAULT; 1819 if (kvm_is_radix(kvm)) 1820 return -EINVAL; 1821 1822 /* lock out vcpus from running while we're doing this */ 1823 mutex_lock(&kvm->arch.mmu_setup_lock); 1824 mmu_ready = kvm->arch.mmu_ready; 1825 if (mmu_ready) { 1826 kvm->arch.mmu_ready = 0; /* temporarily */ 1827 /* order mmu_ready vs. 
vcpus_running */ 1828 smp_mb(); 1829 if (atomic_read(&kvm->arch.vcpus_running)) { 1830 kvm->arch.mmu_ready = 1; 1831 mutex_unlock(&kvm->arch.mmu_setup_lock); 1832 return -EBUSY; 1833 } 1834 } 1835 1836 err = 0; 1837 for (nb = 0; nb + sizeof(hdr) <= count; ) { 1838 err = -EFAULT; 1839 if (__copy_from_user(&hdr, buf, sizeof(hdr))) 1840 break; 1841 1842 err = 0; 1843 if (nb + hdr.n_valid * HPTE_SIZE > count) 1844 break; 1845 1846 nb += sizeof(hdr); 1847 buf += sizeof(hdr); 1848 1849 err = -EINVAL; 1850 i = hdr.index; 1851 if (i >= kvmppc_hpt_npte(&kvm->arch.hpt) || 1852 i + hdr.n_valid + hdr.n_invalid > kvmppc_hpt_npte(&kvm->arch.hpt)) 1853 break; 1854 1855 hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE)); 1856 lbuf = (unsigned long __user *)buf; 1857 for (j = 0; j < hdr.n_valid; ++j) { 1858 __be64 hpte_v; 1859 __be64 hpte_r; 1860 1861 err = -EFAULT; 1862 if (__get_user(hpte_v, lbuf) || 1863 __get_user(hpte_r, lbuf + 1)) 1864 goto out; 1865 v = be64_to_cpu(hpte_v); 1866 r = be64_to_cpu(hpte_r); 1867 err = -EINVAL; 1868 if (!(v & HPTE_V_VALID)) 1869 goto out; 1870 pshift = kvmppc_hpte_base_page_shift(v, r); 1871 if (pshift <= 0) 1872 goto out; 1873 lbuf += 2; 1874 nb += HPTE_SIZE; 1875 1876 if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) 1877 kvmppc_do_h_remove(kvm, 0, i, 0, tmp); 1878 err = -EIO; 1879 ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, i, v, r, 1880 tmp); 1881 if (ret != H_SUCCESS) { 1882 pr_err("kvm_htab_write ret %ld i=%ld v=%lx " 1883 "r=%lx\n", ret, i, v, r); 1884 goto out; 1885 } 1886 if (!mmu_ready && is_vrma_hpte(v)) { 1887 unsigned long senc, lpcr; 1888 1889 senc = slb_pgsize_encoding(1ul << pshift); 1890 kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T | 1891 (VRMA_VSID << SLB_VSID_SHIFT_1T); 1892 if (!cpu_has_feature(CPU_FTR_ARCH_300)) { 1893 lpcr = senc << (LPCR_VRMASD_SH - 4); 1894 kvmppc_update_lpcr(kvm, lpcr, 1895 LPCR_VRMASD); 1896 } else { 1897 kvmppc_setup_partition_table(kvm); 1898 } 1899 mmu_ready = 1; 1900 } 1901 ++i; 1902 hptp += 2; 1903 } 1904 1905 for (j = 0; j < hdr.n_invalid; ++j) { 1906 if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) 1907 kvmppc_do_h_remove(kvm, 0, i, 0, tmp); 1908 ++i; 1909 hptp += 2; 1910 } 1911 err = 0; 1912 } 1913 1914 out: 1915 /* Order HPTE updates vs. mmu_ready */ 1916 smp_wmb(); 1917 kvm->arch.mmu_ready = mmu_ready; 1918 mutex_unlock(&kvm->arch.mmu_setup_lock); 1919 1920 if (err) 1921 return err; 1922 return nb; 1923 } 1924 1925 static int kvm_htab_release(struct inode *inode, struct file *filp) 1926 { 1927 struct kvm_htab_ctx *ctx = filp->private_data; 1928 1929 filp->private_data = NULL; 1930 if (!(ctx->flags & KVM_GET_HTAB_WRITE)) 1931 atomic_dec(&ctx->kvm->arch.hpte_mod_interest); 1932 kvm_put_kvm(ctx->kvm); 1933 kfree(ctx); 1934 return 0; 1935 } 1936 1937 static const struct file_operations kvm_htab_fops = { 1938 .read = kvm_htab_read, 1939 .write = kvm_htab_write, 1940 .llseek = default_llseek, 1941 .release = kvm_htab_release, 1942 }; 1943 1944 int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf) 1945 { 1946 int ret; 1947 struct kvm_htab_ctx *ctx; 1948 int rwflag; 1949 1950 /* reject flags we don't recognize */ 1951 if (ghf->flags & ~(KVM_GET_HTAB_BOLTED_ONLY | KVM_GET_HTAB_WRITE)) 1952 return -EINVAL; 1953 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); 1954 if (!ctx) 1955 return -ENOMEM; 1956 kvm_get_kvm(kvm); 1957 ctx->kvm = kvm; 1958 ctx->index = ghf->start_index; 1959 ctx->flags = ghf->flags; 1960 ctx->first_pass = 1; 1961 1962 rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? 
O_WRONLY : O_RDONLY; 1963 ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag | O_CLOEXEC); 1964 if (ret < 0) { 1965 kfree(ctx); 1966 kvm_put_kvm_no_destroy(kvm); 1967 return ret; 1968 } 1969 1970 if (rwflag == O_RDONLY) { 1971 mutex_lock(&kvm->slots_lock); 1972 atomic_inc(&kvm->arch.hpte_mod_interest); 1973 /* make sure kvmppc_do_h_enter etc. see the increment */ 1974 synchronize_srcu_expedited(&kvm->srcu); 1975 mutex_unlock(&kvm->slots_lock); 1976 } 1977 1978 return ret; 1979 } 1980 1981 struct debugfs_htab_state { 1982 struct kvm *kvm; 1983 struct mutex mutex; 1984 unsigned long hpt_index; 1985 int chars_left; 1986 int buf_index; 1987 char buf[64]; 1988 }; 1989 1990 static int debugfs_htab_open(struct inode *inode, struct file *file) 1991 { 1992 struct kvm *kvm = inode->i_private; 1993 struct debugfs_htab_state *p; 1994 1995 p = kzalloc(sizeof(*p), GFP_KERNEL); 1996 if (!p) 1997 return -ENOMEM; 1998 1999 kvm_get_kvm(kvm); 2000 p->kvm = kvm; 2001 mutex_init(&p->mutex); 2002 file->private_data = p; 2003 2004 return nonseekable_open(inode, file); 2005 } 2006 2007 static int debugfs_htab_release(struct inode *inode, struct file *file) 2008 { 2009 struct debugfs_htab_state *p = file->private_data; 2010 2011 kvm_put_kvm(p->kvm); 2012 kfree(p); 2013 return 0; 2014 } 2015 2016 static ssize_t debugfs_htab_read(struct file *file, char __user *buf, 2017 size_t len, loff_t *ppos) 2018 { 2019 struct debugfs_htab_state *p = file->private_data; 2020 ssize_t ret, r; 2021 unsigned long i, n; 2022 unsigned long v, hr, gr; 2023 struct kvm *kvm; 2024 __be64 *hptp; 2025 2026 kvm = p->kvm; 2027 if (kvm_is_radix(kvm)) 2028 return 0; 2029 2030 ret = mutex_lock_interruptible(&p->mutex); 2031 if (ret) 2032 return ret; 2033 2034 if (p->chars_left) { 2035 n = p->chars_left; 2036 if (n > len) 2037 n = len; 2038 r = copy_to_user(buf, p->buf + p->buf_index, n); 2039 n -= r; 2040 p->chars_left -= n; 2041 p->buf_index += n; 2042 buf += n; 2043 len -= n; 2044 ret = n; 2045 if (r) { 2046 if (!n) 2047 ret = -EFAULT; 2048 goto out; 2049 } 2050 } 2051 2052 i = p->hpt_index; 2053 hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE)); 2054 for (; len != 0 && i < kvmppc_hpt_npte(&kvm->arch.hpt); 2055 ++i, hptp += 2) { 2056 if (!(be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT))) 2057 continue; 2058 2059 /* lock the HPTE so it's stable and read it */ 2060 preempt_disable(); 2061 while (!try_lock_hpte(hptp, HPTE_V_HVLOCK)) 2062 cpu_relax(); 2063 v = be64_to_cpu(hptp[0]) & ~HPTE_V_HVLOCK; 2064 hr = be64_to_cpu(hptp[1]); 2065 gr = kvm->arch.hpt.rev[i].guest_rpte; 2066 unlock_hpte(hptp, v); 2067 preempt_enable(); 2068 2069 if (!(v & (HPTE_V_VALID | HPTE_V_ABSENT))) 2070 continue; 2071 2072 n = scnprintf(p->buf, sizeof(p->buf), 2073 "%6lx %.16lx %.16lx %.16lx\n", 2074 i, v, hr, gr); 2075 p->chars_left = n; 2076 if (n > len) 2077 n = len; 2078 r = copy_to_user(buf, p->buf, n); 2079 n -= r; 2080 p->chars_left -= n; 2081 p->buf_index = n; 2082 buf += n; 2083 len -= n; 2084 ret += n; 2085 if (r) { 2086 if (!ret) 2087 ret = -EFAULT; 2088 goto out; 2089 } 2090 } 2091 p->hpt_index = i; 2092 2093 out: 2094 mutex_unlock(&p->mutex); 2095 return ret; 2096 } 2097 2098 static ssize_t debugfs_htab_write(struct file *file, const char __user *buf, 2099 size_t len, loff_t *ppos) 2100 { 2101 return -EACCES; 2102 } 2103 2104 static const struct file_operations debugfs_htab_fops = { 2105 .owner = THIS_MODULE, 2106 .open = debugfs_htab_open, 2107 .release = debugfs_htab_release, 2108 .read = debugfs_htab_read, 2109 .write = 
debugfs_htab_write, 2110 .llseek = generic_file_llseek, 2111 }; 2112 2113 void kvmppc_mmu_debugfs_init(struct kvm *kvm) 2114 { 2115 debugfs_create_file("htab", 0400, kvm->arch.debugfs_dir, kvm, 2116 &debugfs_htab_fops); 2117 } 2118 2119 void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu) 2120 { 2121 struct kvmppc_mmu *mmu = &vcpu->arch.mmu; 2122 2123 vcpu->arch.slb_nr = 32; /* POWER7/POWER8 */ 2124 2125 mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate; 2126 2127 vcpu->arch.hflags |= BOOK3S_HFLAG_SLB; 2128 } 2129
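/*
 * Illustrative sketch (not part of the kernel build): how a userspace
 * VMM might drive the two-phase HPT resize implemented above.  The
 * KVM_PPC_RESIZE_HPT_PREPARE ioctl returns a positive estimate in
 * milliseconds while the new HPT is still being allocated, 0 once it
 * is ready, or a negative error; KVM_PPC_RESIZE_HPT_COMMIT then
 * rehashes into the new table and pivots to it.  The field names follow
 * struct kvm_ppc_resize_hpt as used in this file; the polling loop and
 * the chosen shift are assumptions about typical VMM usage, not a
 * prescription.
 *
 *	struct kvm_ppc_resize_hpt rhpt = { .flags = 0, .shift = 30 };
 *	int rc;
 *
 *	do {
 *		rc = ioctl(vm_fd, KVM_PPC_RESIZE_HPT_PREPARE, &rhpt);
 *		if (rc > 0)
 *			usleep(rc * 1000);	// estimated time in ms
 *	} while (rc > 0);
 *	if (rc == 0)
 *		rc = ioctl(vm_fd, KVM_PPC_RESIZE_HPT_COMMIT, &rhpt);
 */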