1 /* 2 * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> 3 * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved. 4 * 5 * Authors: 6 * Paul Mackerras <paulus@au1.ibm.com> 7 * Alexander Graf <agraf@suse.de> 8 * Kevin Wolf <mail@kevin-wolf.de> 9 * 10 * Description: KVM functions specific to running on Book 3S 11 * processors in hypervisor mode (specifically POWER7 and later). 12 * 13 * This file is derived from arch/powerpc/kvm/book3s.c, 14 * by Alexander Graf <agraf@suse.de>. 15 * 16 * This program is free software; you can redistribute it and/or modify 17 * it under the terms of the GNU General Public License, version 2, as 18 * published by the Free Software Foundation. 19 */ 20 21 #include <linux/kvm_host.h> 22 #include <linux/err.h> 23 #include <linux/slab.h> 24 #include <linux/preempt.h> 25 #include <linux/sched/signal.h> 26 #include <linux/sched/stat.h> 27 #include <linux/delay.h> 28 #include <linux/export.h> 29 #include <linux/fs.h> 30 #include <linux/anon_inodes.h> 31 #include <linux/cpu.h> 32 #include <linux/cpumask.h> 33 #include <linux/spinlock.h> 34 #include <linux/page-flags.h> 35 #include <linux/srcu.h> 36 #include <linux/miscdevice.h> 37 #include <linux/debugfs.h> 38 39 #include <asm/reg.h> 40 #include <asm/cputable.h> 41 #include <asm/cacheflush.h> 42 #include <asm/tlbflush.h> 43 #include <linux/uaccess.h> 44 #include <asm/io.h> 45 #include <asm/kvm_ppc.h> 46 #include <asm/kvm_book3s.h> 47 #include <asm/mmu_context.h> 48 #include <asm/lppaca.h> 49 #include <asm/processor.h> 50 #include <asm/cputhreads.h> 51 #include <asm/page.h> 52 #include <asm/hvcall.h> 53 #include <asm/switch_to.h> 54 #include <asm/smp.h> 55 #include <asm/dbell.h> 56 #include <asm/hmi.h> 57 #include <asm/pnv-pci.h> 58 #include <asm/mmu.h> 59 #include <asm/opal.h> 60 #include <asm/xics.h> 61 #include <linux/gfp.h> 62 #include <linux/vmalloc.h> 63 #include <linux/highmem.h> 64 #include <linux/hugetlb.h> 65 #include <linux/kvm_irqfd.h> 66 #include <linux/irqbypass.h> 67 #include <linux/module.h> 68 #include <linux/compiler.h> 69 #include <linux/of.h> 70 71 #include "book3s.h" 72 73 #define CREATE_TRACE_POINTS 74 #include "trace_hv.h" 75 76 /* #define EXIT_DEBUG */ 77 /* #define EXIT_DEBUG_SIMPLE */ 78 /* #define EXIT_DEBUG_INT */ 79 80 /* Used to indicate that a guest page fault needs to be handled */ 81 #define RESUME_PAGE_FAULT (RESUME_GUEST | RESUME_FLAG_ARCH1) 82 /* Used to indicate that a guest passthrough interrupt needs to be handled */ 83 #define RESUME_PASSTHROUGH (RESUME_GUEST | RESUME_FLAG_ARCH2) 84 85 /* Used as a "null" value for timebase values */ 86 #define TB_NIL (~(u64)0) 87 88 static DECLARE_BITMAP(default_enabled_hcalls, MAX_HCALL_OPCODE/4 + 1); 89 90 static int dynamic_mt_modes = 6; 91 module_param(dynamic_mt_modes, int, S_IRUGO | S_IWUSR); 92 MODULE_PARM_DESC(dynamic_mt_modes, "Set of allowed dynamic micro-threading modes: 0 (= none), 2, 4, or 6 (= 2 or 4)"); 93 static int target_smt_mode; 94 module_param(target_smt_mode, int, S_IRUGO | S_IWUSR); 95 MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)"); 96 97 #ifdef CONFIG_KVM_XICS 98 static struct kernel_param_ops module_param_ops = { 99 .set = param_set_int, 100 .get = param_get_int, 101 }; 102 103 module_param_cb(kvm_irq_bypass, &module_param_ops, &kvm_irq_bypass, 104 S_IRUGO | S_IWUSR); 105 MODULE_PARM_DESC(kvm_irq_bypass, "Bypass passthrough interrupt optimization"); 106 107 module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect, 108 S_IRUGO | S_IWUSR); 109 
MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core"); 110 #endif 111 112 static void kvmppc_end_cede(struct kvm_vcpu *vcpu); 113 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); 114 115 static inline struct kvm_vcpu *next_runnable_thread(struct kvmppc_vcore *vc, 116 int *ip) 117 { 118 int i = *ip; 119 struct kvm_vcpu *vcpu; 120 121 while (++i < MAX_SMT_THREADS) { 122 vcpu = READ_ONCE(vc->runnable_threads[i]); 123 if (vcpu) { 124 *ip = i; 125 return vcpu; 126 } 127 } 128 return NULL; 129 } 130 131 /* Used to traverse the list of runnable threads for a given vcore */ 132 #define for_each_runnable_thread(i, vcpu, vc) \ 133 for (i = -1; (vcpu = next_runnable_thread(vc, &i)); ) 134 135 static bool kvmppc_ipi_thread(int cpu) 136 { 137 unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER); 138 139 /* On POWER9 we can use msgsnd to IPI any cpu */ 140 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 141 msg |= get_hard_smp_processor_id(cpu); 142 smp_mb(); 143 __asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg)); 144 return true; 145 } 146 147 /* On POWER8 for IPIs to threads in the same core, use msgsnd */ 148 if (cpu_has_feature(CPU_FTR_ARCH_207S)) { 149 preempt_disable(); 150 if (cpu_first_thread_sibling(cpu) == 151 cpu_first_thread_sibling(smp_processor_id())) { 152 msg |= cpu_thread_in_core(cpu); 153 smp_mb(); 154 __asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg)); 155 preempt_enable(); 156 return true; 157 } 158 preempt_enable(); 159 } 160 161 #if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP) 162 if (cpu >= 0 && cpu < nr_cpu_ids) { 163 if (paca[cpu].kvm_hstate.xics_phys) { 164 xics_wake_cpu(cpu); 165 return true; 166 } 167 opal_int_set_mfrr(get_hard_smp_processor_id(cpu), IPI_PRIORITY); 168 return true; 169 } 170 #endif 171 172 return false; 173 } 174 175 static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu) 176 { 177 int cpu; 178 struct swait_queue_head *wqp; 179 180 wqp = kvm_arch_vcpu_wq(vcpu); 181 if (swait_active(wqp)) { 182 swake_up(wqp); 183 ++vcpu->stat.halt_wakeup; 184 } 185 186 cpu = READ_ONCE(vcpu->arch.thread_cpu); 187 if (cpu >= 0 && kvmppc_ipi_thread(cpu)) 188 return; 189 190 /* CPU points to the first thread of the core */ 191 cpu = vcpu->cpu; 192 if (cpu >= 0 && cpu < nr_cpu_ids && cpu_online(cpu)) 193 smp_send_reschedule(cpu); 194 } 195 196 /* 197 * We use the vcpu_load/put functions to measure stolen time. 198 * Stolen time is counted as time when either the vcpu is able to 199 * run as part of a virtual core, but the task running the vcore 200 * is preempted or sleeping, or when the vcpu needs something done 201 * in the kernel by the task running the vcpu, but that task is 202 * preempted or sleeping. Those two things have to be counted 203 * separately, since one of the vcpu tasks will take on the job 204 * of running the core, and the other vcpu tasks in the vcore will 205 * sleep waiting for it to do that, but that sleep shouldn't count 206 * as stolen time. 207 * 208 * Hence we accumulate stolen time when the vcpu can run as part of 209 * a vcore using vc->stolen_tb, and the stolen time when the vcpu 210 * needs its task to do other things in the kernel (for example, 211 * service a page fault) in busy_stolen. We don't accumulate 212 * stolen time for a vcore when it is inactive, or for a vcpu 213 * when it is in state RUNNING or NOTREADY. NOTREADY is a bit of 214 * a misnomer; it means that the vcpu task is not executing in 215 * the KVM_VCPU_RUN ioctl, i.e. it is in userspace or elsewhere in 216 * the kernel. 
We don't have any way of dividing up that time 217 * between time that the vcpu is genuinely stopped, time that 218 * the task is actively working on behalf of the vcpu, and time 219 * that the task is preempted, so we don't count any of it as 220 * stolen. 221 * 222 * Updates to busy_stolen are protected by arch.tbacct_lock; 223 * updates to vc->stolen_tb are protected by the vcore->stoltb_lock 224 * lock. The stolen times are measured in units of timebase ticks. 225 * (Note that the != TB_NIL checks below are purely defensive; 226 * they should never fail.) 227 */ 228 229 static void kvmppc_core_start_stolen(struct kvmppc_vcore *vc) 230 { 231 unsigned long flags; 232 233 spin_lock_irqsave(&vc->stoltb_lock, flags); 234 vc->preempt_tb = mftb(); 235 spin_unlock_irqrestore(&vc->stoltb_lock, flags); 236 } 237 238 static void kvmppc_core_end_stolen(struct kvmppc_vcore *vc) 239 { 240 unsigned long flags; 241 242 spin_lock_irqsave(&vc->stoltb_lock, flags); 243 if (vc->preempt_tb != TB_NIL) { 244 vc->stolen_tb += mftb() - vc->preempt_tb; 245 vc->preempt_tb = TB_NIL; 246 } 247 spin_unlock_irqrestore(&vc->stoltb_lock, flags); 248 } 249 250 static void kvmppc_core_vcpu_load_hv(struct kvm_vcpu *vcpu, int cpu) 251 { 252 struct kvmppc_vcore *vc = vcpu->arch.vcore; 253 unsigned long flags; 254 255 /* 256 * We can test vc->runner without taking the vcore lock, 257 * because only this task ever sets vc->runner to this 258 * vcpu, and once it is set to this vcpu, only this task 259 * ever sets it to NULL. 260 */ 261 if (vc->runner == vcpu && vc->vcore_state >= VCORE_SLEEPING) 262 kvmppc_core_end_stolen(vc); 263 264 spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags); 265 if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST && 266 vcpu->arch.busy_preempt != TB_NIL) { 267 vcpu->arch.busy_stolen += mftb() - vcpu->arch.busy_preempt; 268 vcpu->arch.busy_preempt = TB_NIL; 269 } 270 spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags); 271 } 272 273 static void kvmppc_core_vcpu_put_hv(struct kvm_vcpu *vcpu) 274 { 275 struct kvmppc_vcore *vc = vcpu->arch.vcore; 276 unsigned long flags; 277 278 if (vc->runner == vcpu && vc->vcore_state >= VCORE_SLEEPING) 279 kvmppc_core_start_stolen(vc); 280 281 spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags); 282 if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST) 283 vcpu->arch.busy_preempt = mftb(); 284 spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags); 285 } 286 287 static void kvmppc_set_msr_hv(struct kvm_vcpu *vcpu, u64 msr) 288 { 289 /* 290 * Check for illegal transactional state bit combination 291 * and if we find it, force the TS field to a safe state. 
292 */ 293 if ((msr & MSR_TS_MASK) == MSR_TS_MASK) 294 msr &= ~MSR_TS_MASK; 295 vcpu->arch.shregs.msr = msr; 296 kvmppc_end_cede(vcpu); 297 } 298 299 static void kvmppc_set_pvr_hv(struct kvm_vcpu *vcpu, u32 pvr) 300 { 301 vcpu->arch.pvr = pvr; 302 } 303 304 /* Dummy value used in computing PCR value below */ 305 #define PCR_ARCH_300 (PCR_ARCH_207 << 1) 306 307 static int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat) 308 { 309 unsigned long host_pcr_bit = 0, guest_pcr_bit = 0; 310 struct kvmppc_vcore *vc = vcpu->arch.vcore; 311 312 /* We can (emulate) our own architecture version and anything older */ 313 if (cpu_has_feature(CPU_FTR_ARCH_300)) 314 host_pcr_bit = PCR_ARCH_300; 315 else if (cpu_has_feature(CPU_FTR_ARCH_207S)) 316 host_pcr_bit = PCR_ARCH_207; 317 else if (cpu_has_feature(CPU_FTR_ARCH_206)) 318 host_pcr_bit = PCR_ARCH_206; 319 else 320 host_pcr_bit = PCR_ARCH_205; 321 322 /* Determine lowest PCR bit needed to run guest in given PVR level */ 323 guest_pcr_bit = host_pcr_bit; 324 if (arch_compat) { 325 switch (arch_compat) { 326 case PVR_ARCH_205: 327 guest_pcr_bit = PCR_ARCH_205; 328 break; 329 case PVR_ARCH_206: 330 case PVR_ARCH_206p: 331 guest_pcr_bit = PCR_ARCH_206; 332 break; 333 case PVR_ARCH_207: 334 guest_pcr_bit = PCR_ARCH_207; 335 break; 336 case PVR_ARCH_300: 337 guest_pcr_bit = PCR_ARCH_300; 338 break; 339 default: 340 return -EINVAL; 341 } 342 } 343 344 /* Check requested PCR bits don't exceed our capabilities */ 345 if (guest_pcr_bit > host_pcr_bit) 346 return -EINVAL; 347 348 spin_lock(&vc->lock); 349 vc->arch_compat = arch_compat; 350 /* Set all PCR bits for which guest_pcr_bit <= bit < host_pcr_bit */ 351 vc->pcr = host_pcr_bit - guest_pcr_bit; 352 spin_unlock(&vc->lock); 353 354 return 0; 355 } 356 357 static void kvmppc_dump_regs(struct kvm_vcpu *vcpu) 358 { 359 int r; 360 361 pr_err("vcpu %p (%d):\n", vcpu, vcpu->vcpu_id); 362 pr_err("pc = %.16lx msr = %.16llx trap = %x\n", 363 vcpu->arch.pc, vcpu->arch.shregs.msr, vcpu->arch.trap); 364 for (r = 0; r < 16; ++r) 365 pr_err("r%2d = %.16lx r%d = %.16lx\n", 366 r, kvmppc_get_gpr(vcpu, r), 367 r+16, kvmppc_get_gpr(vcpu, r+16)); 368 pr_err("ctr = %.16lx lr = %.16lx\n", 369 vcpu->arch.ctr, vcpu->arch.lr); 370 pr_err("srr0 = %.16llx srr1 = %.16llx\n", 371 vcpu->arch.shregs.srr0, vcpu->arch.shregs.srr1); 372 pr_err("sprg0 = %.16llx sprg1 = %.16llx\n", 373 vcpu->arch.shregs.sprg0, vcpu->arch.shregs.sprg1); 374 pr_err("sprg2 = %.16llx sprg3 = %.16llx\n", 375 vcpu->arch.shregs.sprg2, vcpu->arch.shregs.sprg3); 376 pr_err("cr = %.8x xer = %.16lx dsisr = %.8x\n", 377 vcpu->arch.cr, vcpu->arch.xer, vcpu->arch.shregs.dsisr); 378 pr_err("dar = %.16llx\n", vcpu->arch.shregs.dar); 379 pr_err("fault dar = %.16lx dsisr = %.8x\n", 380 vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); 381 pr_err("SLB (%d entries):\n", vcpu->arch.slb_max); 382 for (r = 0; r < vcpu->arch.slb_max; ++r) 383 pr_err(" ESID = %.16llx VSID = %.16llx\n", 384 vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv); 385 pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.8x\n", 386 vcpu->arch.vcore->lpcr, vcpu->kvm->arch.sdr1, 387 vcpu->arch.last_inst); 388 } 389 390 static struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id) 391 { 392 struct kvm_vcpu *ret; 393 394 mutex_lock(&kvm->lock); 395 ret = kvm_get_vcpu_by_id(kvm, id); 396 mutex_unlock(&kvm->lock); 397 return ret; 398 } 399 400 static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa) 401 { 402 vpa->__old_status |= LPPACA_OLD_SHARED_PROC; 403 vpa->yield_count = cpu_to_be32(1); 404 } 
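/*
 * Note: set_vpa() below only validates alignment and records the
 * requested guest physical address and length (setting update_pending);
 * the guest page itself is pinned later by kvmppc_update_vpa(), since
 * pinning cannot be done while vpa_update_lock is held.
 */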
405 406 static int set_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *v, 407 unsigned long addr, unsigned long len) 408 { 409 /* check address is cacheline aligned */ 410 if (addr & (L1_CACHE_BYTES - 1)) 411 return -EINVAL; 412 spin_lock(&vcpu->arch.vpa_update_lock); 413 if (v->next_gpa != addr || v->len != len) { 414 v->next_gpa = addr; 415 v->len = addr ? len : 0; 416 v->update_pending = 1; 417 } 418 spin_unlock(&vcpu->arch.vpa_update_lock); 419 return 0; 420 } 421 422 /* Length for a per-processor buffer is passed in at offset 4 in the buffer */ 423 struct reg_vpa { 424 u32 dummy; 425 union { 426 __be16 hword; 427 __be32 word; 428 } length; 429 }; 430 431 static int vpa_is_registered(struct kvmppc_vpa *vpap) 432 { 433 if (vpap->update_pending) 434 return vpap->next_gpa != 0; 435 return vpap->pinned_addr != NULL; 436 } 437 438 static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu, 439 unsigned long flags, 440 unsigned long vcpuid, unsigned long vpa) 441 { 442 struct kvm *kvm = vcpu->kvm; 443 unsigned long len, nb; 444 void *va; 445 struct kvm_vcpu *tvcpu; 446 int err; 447 int subfunc; 448 struct kvmppc_vpa *vpap; 449 450 tvcpu = kvmppc_find_vcpu(kvm, vcpuid); 451 if (!tvcpu) 452 return H_PARAMETER; 453 454 subfunc = (flags >> H_VPA_FUNC_SHIFT) & H_VPA_FUNC_MASK; 455 if (subfunc == H_VPA_REG_VPA || subfunc == H_VPA_REG_DTL || 456 subfunc == H_VPA_REG_SLB) { 457 /* Registering new area - address must be cache-line aligned */ 458 if ((vpa & (L1_CACHE_BYTES - 1)) || !vpa) 459 return H_PARAMETER; 460 461 /* convert logical addr to kernel addr and read length */ 462 va = kvmppc_pin_guest_page(kvm, vpa, &nb); 463 if (va == NULL) 464 return H_PARAMETER; 465 if (subfunc == H_VPA_REG_VPA) 466 len = be16_to_cpu(((struct reg_vpa *)va)->length.hword); 467 else 468 len = be32_to_cpu(((struct reg_vpa *)va)->length.word); 469 kvmppc_unpin_guest_page(kvm, va, vpa, false); 470 471 /* Check length */ 472 if (len > nb || len < sizeof(struct reg_vpa)) 473 return H_PARAMETER; 474 } else { 475 vpa = 0; 476 len = 0; 477 } 478 479 err = H_PARAMETER; 480 vpap = NULL; 481 spin_lock(&tvcpu->arch.vpa_update_lock); 482 483 switch (subfunc) { 484 case H_VPA_REG_VPA: /* register VPA */ 485 if (len < sizeof(struct lppaca)) 486 break; 487 vpap = &tvcpu->arch.vpa; 488 err = 0; 489 break; 490 491 case H_VPA_REG_DTL: /* register DTL */ 492 if (len < sizeof(struct dtl_entry)) 493 break; 494 len -= len % sizeof(struct dtl_entry); 495 496 /* Check that they have previously registered a VPA */ 497 err = H_RESOURCE; 498 if (!vpa_is_registered(&tvcpu->arch.vpa)) 499 break; 500 501 vpap = &tvcpu->arch.dtl; 502 err = 0; 503 break; 504 505 case H_VPA_REG_SLB: /* register SLB shadow buffer */ 506 /* Check that they have previously registered a VPA */ 507 err = H_RESOURCE; 508 if (!vpa_is_registered(&tvcpu->arch.vpa)) 509 break; 510 511 vpap = &tvcpu->arch.slb_shadow; 512 err = 0; 513 break; 514 515 case H_VPA_DEREG_VPA: /* deregister VPA */ 516 /* Check they don't still have a DTL or SLB buf registered */ 517 err = H_RESOURCE; 518 if (vpa_is_registered(&tvcpu->arch.dtl) || 519 vpa_is_registered(&tvcpu->arch.slb_shadow)) 520 break; 521 522 vpap = &tvcpu->arch.vpa; 523 err = 0; 524 break; 525 526 case H_VPA_DEREG_DTL: /* deregister DTL */ 527 vpap = &tvcpu->arch.dtl; 528 err = 0; 529 break; 530 531 case H_VPA_DEREG_SLB: /* deregister SLB shadow buffer */ 532 vpap = &tvcpu->arch.slb_shadow; 533 err = 0; 534 break; 535 } 536 537 if (vpap) { 538 vpap->next_gpa = vpa; 539 vpap->len = len; 540 vpap->update_pending = 1; 541 } 542 543 
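/*
 * The registration is only recorded here; kvmppc_update_vpa() does the
 * actual pinning of the new area (and unpinning of any previous one)
 * the next time the vcpu's VPAs are updated.
 */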
spin_unlock(&tvcpu->arch.vpa_update_lock); 544 545 return err; 546 } 547 548 static void kvmppc_update_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *vpap) 549 { 550 struct kvm *kvm = vcpu->kvm; 551 void *va; 552 unsigned long nb; 553 unsigned long gpa; 554 555 /* 556 * We need to pin the page pointed to by vpap->next_gpa, 557 * but we can't call kvmppc_pin_guest_page under the lock 558 * as it does get_user_pages() and down_read(). So we 559 * have to drop the lock, pin the page, then get the lock 560 * again and check that a new area didn't get registered 561 * in the meantime. 562 */ 563 for (;;) { 564 gpa = vpap->next_gpa; 565 spin_unlock(&vcpu->arch.vpa_update_lock); 566 va = NULL; 567 nb = 0; 568 if (gpa) 569 va = kvmppc_pin_guest_page(kvm, gpa, &nb); 570 spin_lock(&vcpu->arch.vpa_update_lock); 571 if (gpa == vpap->next_gpa) 572 break; 573 /* sigh... unpin that one and try again */ 574 if (va) 575 kvmppc_unpin_guest_page(kvm, va, gpa, false); 576 } 577 578 vpap->update_pending = 0; 579 if (va && nb < vpap->len) { 580 /* 581 * If it's now too short, it must be that userspace 582 * has changed the mappings underlying guest memory, 583 * so unregister the region. 584 */ 585 kvmppc_unpin_guest_page(kvm, va, gpa, false); 586 va = NULL; 587 } 588 if (vpap->pinned_addr) 589 kvmppc_unpin_guest_page(kvm, vpap->pinned_addr, vpap->gpa, 590 vpap->dirty); 591 vpap->gpa = gpa; 592 vpap->pinned_addr = va; 593 vpap->dirty = false; 594 if (va) 595 vpap->pinned_end = va + vpap->len; 596 } 597 598 static void kvmppc_update_vpas(struct kvm_vcpu *vcpu) 599 { 600 if (!(vcpu->arch.vpa.update_pending || 601 vcpu->arch.slb_shadow.update_pending || 602 vcpu->arch.dtl.update_pending)) 603 return; 604 605 spin_lock(&vcpu->arch.vpa_update_lock); 606 if (vcpu->arch.vpa.update_pending) { 607 kvmppc_update_vpa(vcpu, &vcpu->arch.vpa); 608 if (vcpu->arch.vpa.pinned_addr) 609 init_vpa(vcpu, vcpu->arch.vpa.pinned_addr); 610 } 611 if (vcpu->arch.dtl.update_pending) { 612 kvmppc_update_vpa(vcpu, &vcpu->arch.dtl); 613 vcpu->arch.dtl_ptr = vcpu->arch.dtl.pinned_addr; 614 vcpu->arch.dtl_index = 0; 615 } 616 if (vcpu->arch.slb_shadow.update_pending) 617 kvmppc_update_vpa(vcpu, &vcpu->arch.slb_shadow); 618 spin_unlock(&vcpu->arch.vpa_update_lock); 619 } 620 621 /* 622 * Return the accumulated stolen time for the vcore up until `now'. 623 * The caller should hold the vcore lock. 
624 */ 625 static u64 vcore_stolen_time(struct kvmppc_vcore *vc, u64 now) 626 { 627 u64 p; 628 unsigned long flags; 629 630 spin_lock_irqsave(&vc->stoltb_lock, flags); 631 p = vc->stolen_tb; 632 if (vc->vcore_state != VCORE_INACTIVE && 633 vc->preempt_tb != TB_NIL) 634 p += now - vc->preempt_tb; 635 spin_unlock_irqrestore(&vc->stoltb_lock, flags); 636 return p; 637 } 638 639 static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu, 640 struct kvmppc_vcore *vc) 641 { 642 struct dtl_entry *dt; 643 struct lppaca *vpa; 644 unsigned long stolen; 645 unsigned long core_stolen; 646 u64 now; 647 648 dt = vcpu->arch.dtl_ptr; 649 vpa = vcpu->arch.vpa.pinned_addr; 650 now = mftb(); 651 core_stolen = vcore_stolen_time(vc, now); 652 stolen = core_stolen - vcpu->arch.stolen_logged; 653 vcpu->arch.stolen_logged = core_stolen; 654 spin_lock_irq(&vcpu->arch.tbacct_lock); 655 stolen += vcpu->arch.busy_stolen; 656 vcpu->arch.busy_stolen = 0; 657 spin_unlock_irq(&vcpu->arch.tbacct_lock); 658 if (!dt || !vpa) 659 return; 660 memset(dt, 0, sizeof(struct dtl_entry)); 661 dt->dispatch_reason = 7; 662 dt->processor_id = cpu_to_be16(vc->pcpu + vcpu->arch.ptid); 663 dt->timebase = cpu_to_be64(now + vc->tb_offset); 664 dt->enqueue_to_dispatch_time = cpu_to_be32(stolen); 665 dt->srr0 = cpu_to_be64(kvmppc_get_pc(vcpu)); 666 dt->srr1 = cpu_to_be64(vcpu->arch.shregs.msr); 667 ++dt; 668 if (dt == vcpu->arch.dtl.pinned_end) 669 dt = vcpu->arch.dtl.pinned_addr; 670 vcpu->arch.dtl_ptr = dt; 671 /* order writing *dt vs. writing vpa->dtl_idx */ 672 smp_wmb(); 673 vpa->dtl_idx = cpu_to_be64(++vcpu->arch.dtl_index); 674 vcpu->arch.dtl.dirty = true; 675 } 676 677 static bool kvmppc_power8_compatible(struct kvm_vcpu *vcpu) 678 { 679 if (vcpu->arch.vcore->arch_compat >= PVR_ARCH_207) 680 return true; 681 if ((!vcpu->arch.vcore->arch_compat) && 682 cpu_has_feature(CPU_FTR_ARCH_207S)) 683 return true; 684 return false; 685 } 686 687 static int kvmppc_h_set_mode(struct kvm_vcpu *vcpu, unsigned long mflags, 688 unsigned long resource, unsigned long value1, 689 unsigned long value2) 690 { 691 switch (resource) { 692 case H_SET_MODE_RESOURCE_SET_CIABR: 693 if (!kvmppc_power8_compatible(vcpu)) 694 return H_P2; 695 if (value2) 696 return H_P4; 697 if (mflags) 698 return H_UNSUPPORTED_FLAG_START; 699 /* Guests can't breakpoint the hypervisor */ 700 if ((value1 & CIABR_PRIV) == CIABR_PRIV_HYPER) 701 return H_P3; 702 vcpu->arch.ciabr = value1; 703 return H_SUCCESS; 704 case H_SET_MODE_RESOURCE_SET_DAWR: 705 if (!kvmppc_power8_compatible(vcpu)) 706 return H_P2; 707 if (mflags) 708 return H_UNSUPPORTED_FLAG_START; 709 if (value2 & DABRX_HYP) 710 return H_P4; 711 vcpu->arch.dawr = value1; 712 vcpu->arch.dawrx = value2; 713 return H_SUCCESS; 714 default: 715 return H_TOO_HARD; 716 } 717 } 718 719 static int kvm_arch_vcpu_yield_to(struct kvm_vcpu *target) 720 { 721 struct kvmppc_vcore *vcore = target->arch.vcore; 722 723 /* 724 * We expect to have been called by the real mode handler 725 * (kvmppc_rm_h_confer()) which would have directly returned 726 * H_SUCCESS if the source vcore wasn't idle (e.g. if it may 727 * have useful work to do and should not confer) so we don't 728 * recheck that here. 
729 */ 730 731 spin_lock(&vcore->lock); 732 if (target->arch.state == KVMPPC_VCPU_RUNNABLE && 733 vcore->vcore_state != VCORE_INACTIVE && 734 vcore->runner) 735 target = vcore->runner; 736 spin_unlock(&vcore->lock); 737 738 return kvm_vcpu_yield_to(target); 739 } 740 741 static int kvmppc_get_yield_count(struct kvm_vcpu *vcpu) 742 { 743 int yield_count = 0; 744 struct lppaca *lppaca; 745 746 spin_lock(&vcpu->arch.vpa_update_lock); 747 lppaca = (struct lppaca *)vcpu->arch.vpa.pinned_addr; 748 if (lppaca) 749 yield_count = be32_to_cpu(lppaca->yield_count); 750 spin_unlock(&vcpu->arch.vpa_update_lock); 751 return yield_count; 752 } 753 754 int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) 755 { 756 unsigned long req = kvmppc_get_gpr(vcpu, 3); 757 unsigned long target, ret = H_SUCCESS; 758 int yield_count; 759 struct kvm_vcpu *tvcpu; 760 int idx, rc; 761 762 if (req <= MAX_HCALL_OPCODE && 763 !test_bit(req/4, vcpu->kvm->arch.enabled_hcalls)) 764 return RESUME_HOST; 765 766 switch (req) { 767 case H_CEDE: 768 break; 769 case H_PROD: 770 target = kvmppc_get_gpr(vcpu, 4); 771 tvcpu = kvmppc_find_vcpu(vcpu->kvm, target); 772 if (!tvcpu) { 773 ret = H_PARAMETER; 774 break; 775 } 776 tvcpu->arch.prodded = 1; 777 smp_mb(); 778 if (tvcpu->arch.ceded) 779 kvmppc_fast_vcpu_kick_hv(tvcpu); 780 break; 781 case H_CONFER: 782 target = kvmppc_get_gpr(vcpu, 4); 783 if (target == -1) 784 break; 785 tvcpu = kvmppc_find_vcpu(vcpu->kvm, target); 786 if (!tvcpu) { 787 ret = H_PARAMETER; 788 break; 789 } 790 yield_count = kvmppc_get_gpr(vcpu, 5); 791 if (kvmppc_get_yield_count(tvcpu) != yield_count) 792 break; 793 kvm_arch_vcpu_yield_to(tvcpu); 794 break; 795 case H_REGISTER_VPA: 796 ret = do_h_register_vpa(vcpu, kvmppc_get_gpr(vcpu, 4), 797 kvmppc_get_gpr(vcpu, 5), 798 kvmppc_get_gpr(vcpu, 6)); 799 break; 800 case H_RTAS: 801 if (list_empty(&vcpu->kvm->arch.rtas_tokens)) 802 return RESUME_HOST; 803 804 idx = srcu_read_lock(&vcpu->kvm->srcu); 805 rc = kvmppc_rtas_hcall(vcpu); 806 srcu_read_unlock(&vcpu->kvm->srcu, idx); 807 808 if (rc == -ENOENT) 809 return RESUME_HOST; 810 else if (rc == 0) 811 break; 812 813 /* Send the error out to userspace via KVM_RUN */ 814 return rc; 815 case H_LOGICAL_CI_LOAD: 816 ret = kvmppc_h_logical_ci_load(vcpu); 817 if (ret == H_TOO_HARD) 818 return RESUME_HOST; 819 break; 820 case H_LOGICAL_CI_STORE: 821 ret = kvmppc_h_logical_ci_store(vcpu); 822 if (ret == H_TOO_HARD) 823 return RESUME_HOST; 824 break; 825 case H_SET_MODE: 826 ret = kvmppc_h_set_mode(vcpu, kvmppc_get_gpr(vcpu, 4), 827 kvmppc_get_gpr(vcpu, 5), 828 kvmppc_get_gpr(vcpu, 6), 829 kvmppc_get_gpr(vcpu, 7)); 830 if (ret == H_TOO_HARD) 831 return RESUME_HOST; 832 break; 833 case H_XIRR: 834 case H_CPPR: 835 case H_EOI: 836 case H_IPI: 837 case H_IPOLL: 838 case H_XIRR_X: 839 if (kvmppc_xics_enabled(vcpu)) { 840 ret = kvmppc_xics_hcall(vcpu, req); 841 break; 842 } 843 return RESUME_HOST; 844 case H_PUT_TCE: 845 ret = kvmppc_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4), 846 kvmppc_get_gpr(vcpu, 5), 847 kvmppc_get_gpr(vcpu, 6)); 848 if (ret == H_TOO_HARD) 849 return RESUME_HOST; 850 break; 851 case H_PUT_TCE_INDIRECT: 852 ret = kvmppc_h_put_tce_indirect(vcpu, kvmppc_get_gpr(vcpu, 4), 853 kvmppc_get_gpr(vcpu, 5), 854 kvmppc_get_gpr(vcpu, 6), 855 kvmppc_get_gpr(vcpu, 7)); 856 if (ret == H_TOO_HARD) 857 return RESUME_HOST; 858 break; 859 case H_STUFF_TCE: 860 ret = kvmppc_h_stuff_tce(vcpu, kvmppc_get_gpr(vcpu, 4), 861 kvmppc_get_gpr(vcpu, 5), 862 kvmppc_get_gpr(vcpu, 6), 863 kvmppc_get_gpr(vcpu, 7)); 864 if (ret == H_TOO_HARD) 865 
return RESUME_HOST; 866 break; 867 default: 868 return RESUME_HOST; 869 } 870 kvmppc_set_gpr(vcpu, 3, ret); 871 vcpu->arch.hcall_needed = 0; 872 return RESUME_GUEST; 873 } 874 875 static int kvmppc_hcall_impl_hv(unsigned long cmd) 876 { 877 switch (cmd) { 878 case H_CEDE: 879 case H_PROD: 880 case H_CONFER: 881 case H_REGISTER_VPA: 882 case H_SET_MODE: 883 case H_LOGICAL_CI_LOAD: 884 case H_LOGICAL_CI_STORE: 885 #ifdef CONFIG_KVM_XICS 886 case H_XIRR: 887 case H_CPPR: 888 case H_EOI: 889 case H_IPI: 890 case H_IPOLL: 891 case H_XIRR_X: 892 #endif 893 return 1; 894 } 895 896 /* See if it's in the real-mode table */ 897 return kvmppc_hcall_impl_hv_realmode(cmd); 898 } 899 900 static int kvmppc_emulate_debug_inst(struct kvm_run *run, 901 struct kvm_vcpu *vcpu) 902 { 903 u32 last_inst; 904 905 if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst) != 906 EMULATE_DONE) { 907 /* 908 * Fetch failed, so return to guest and 909 * try executing it again. 910 */ 911 return RESUME_GUEST; 912 } 913 914 if (last_inst == KVMPPC_INST_SW_BREAKPOINT) { 915 run->exit_reason = KVM_EXIT_DEBUG; 916 run->debug.arch.address = kvmppc_get_pc(vcpu); 917 return RESUME_HOST; 918 } else { 919 kvmppc_core_queue_program(vcpu, SRR1_PROGILL); 920 return RESUME_GUEST; 921 } 922 } 923 924 static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, 925 struct task_struct *tsk) 926 { 927 int r = RESUME_HOST; 928 929 vcpu->stat.sum_exits++; 930 931 /* 932 * This can happen if an interrupt occurs in the last stages 933 * of guest entry or the first stages of guest exit (i.e. after 934 * setting paca->kvm_hstate.in_guest to KVM_GUEST_MODE_GUEST_HV 935 * and before setting it to KVM_GUEST_MODE_HOST_HV). 936 * That can happen due to a bug, or due to a machine check 937 * occurring at just the wrong time. 938 */ 939 if (vcpu->arch.shregs.msr & MSR_HV) { 940 printk(KERN_EMERG "KVM trap in HV mode!\n"); 941 printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n", 942 vcpu->arch.trap, kvmppc_get_pc(vcpu), 943 vcpu->arch.shregs.msr); 944 kvmppc_dump_regs(vcpu); 945 run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 946 run->hw.hardware_exit_reason = vcpu->arch.trap; 947 return RESUME_HOST; 948 } 949 run->exit_reason = KVM_EXIT_UNKNOWN; 950 run->ready_for_interrupt_injection = 1; 951 switch (vcpu->arch.trap) { 952 /* We're good on these - the host merely wanted to get our attention */ 953 case BOOK3S_INTERRUPT_HV_DECREMENTER: 954 vcpu->stat.dec_exits++; 955 r = RESUME_GUEST; 956 break; 957 case BOOK3S_INTERRUPT_EXTERNAL: 958 case BOOK3S_INTERRUPT_H_DOORBELL: 959 case BOOK3S_INTERRUPT_H_VIRT: 960 vcpu->stat.ext_intr_exits++; 961 r = RESUME_GUEST; 962 break; 963 /* HMI is hypervisor interrupt and host has handled it. Resume guest.*/ 964 case BOOK3S_INTERRUPT_HMI: 965 case BOOK3S_INTERRUPT_PERFMON: 966 r = RESUME_GUEST; 967 break; 968 case BOOK3S_INTERRUPT_MACHINE_CHECK: 969 /* 970 * Deliver a machine check interrupt to the guest. 971 * We have to do this, even if the host has handled the 972 * machine check, because machine checks use SRR0/1 and 973 * the interrupt might have trashed guest state in them. 974 */ 975 kvmppc_book3s_queue_irqprio(vcpu, 976 BOOK3S_INTERRUPT_MACHINE_CHECK); 977 r = RESUME_GUEST; 978 break; 979 case BOOK3S_INTERRUPT_PROGRAM: 980 { 981 ulong flags; 982 /* 983 * Normally program interrupts are delivered directly 984 * to the guest by the hardware, but we can get here 985 * as a result of a hypervisor emulation interrupt 986 * (e40) getting turned into a 700 by BML RTAS. 
987 */ 988 flags = vcpu->arch.shregs.msr & 0x1f0000ull; 989 kvmppc_core_queue_program(vcpu, flags); 990 r = RESUME_GUEST; 991 break; 992 } 993 case BOOK3S_INTERRUPT_SYSCALL: 994 { 995 /* hcall - punt to userspace */ 996 int i; 997 998 /* hypercall with MSR_PR has already been handled in real mode, 999 * and never reaches here. 1000 */ 1001 1002 run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3); 1003 for (i = 0; i < 9; ++i) 1004 run->papr_hcall.args[i] = kvmppc_get_gpr(vcpu, 4 + i); 1005 run->exit_reason = KVM_EXIT_PAPR_HCALL; 1006 vcpu->arch.hcall_needed = 1; 1007 r = RESUME_HOST; 1008 break; 1009 } 1010 /* 1011 * We get these next two if the guest accesses a page which it thinks 1012 * it has mapped but which is not actually present, either because 1013 * it is for an emulated I/O device or because the corresponding 1014 * host page has been paged out. Any other HDSI/HISI interrupts 1015 * have been handled already. 1016 */ 1017 case BOOK3S_INTERRUPT_H_DATA_STORAGE: 1018 r = RESUME_PAGE_FAULT; 1019 break; 1020 case BOOK3S_INTERRUPT_H_INST_STORAGE: 1021 vcpu->arch.fault_dar = kvmppc_get_pc(vcpu); 1022 vcpu->arch.fault_dsisr = 0; 1023 r = RESUME_PAGE_FAULT; 1024 break; 1025 /* 1026 * This occurs if the guest executes an illegal instruction. 1027 * If guest debug is disabled, generate a program interrupt 1028 * to the guest. If guest debug is enabled, we need to check 1029 * whether the instruction is a software breakpoint instruction and 1030 * return to the guest or the host accordingly. 1031 */ 1032 case BOOK3S_INTERRUPT_H_EMUL_ASSIST: 1033 if (vcpu->arch.emul_inst != KVM_INST_FETCH_FAILED) 1034 vcpu->arch.last_inst = kvmppc_need_byteswap(vcpu) ? 1035 swab32(vcpu->arch.emul_inst) : 1036 vcpu->arch.emul_inst; 1037 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) { 1038 r = kvmppc_emulate_debug_inst(run, vcpu); 1039 } else { 1040 kvmppc_core_queue_program(vcpu, SRR1_PROGILL); 1041 r = RESUME_GUEST; 1042 } 1043 break; 1044 /* 1045 * This occurs if the guest (kernel or userspace) does something that 1046 * is prohibited by HFSCR. We just generate a program interrupt to 1047 * the guest.
1048 */ 1049 case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: 1050 kvmppc_core_queue_program(vcpu, SRR1_PROGILL); 1051 r = RESUME_GUEST; 1052 break; 1053 case BOOK3S_INTERRUPT_HV_RM_HARD: 1054 r = RESUME_PASSTHROUGH; 1055 break; 1056 default: 1057 kvmppc_dump_regs(vcpu); 1058 printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n", 1059 vcpu->arch.trap, kvmppc_get_pc(vcpu), 1060 vcpu->arch.shregs.msr); 1061 run->hw.hardware_exit_reason = vcpu->arch.trap; 1062 r = RESUME_HOST; 1063 break; 1064 } 1065 1066 return r; 1067 } 1068 1069 static int kvm_arch_vcpu_ioctl_get_sregs_hv(struct kvm_vcpu *vcpu, 1070 struct kvm_sregs *sregs) 1071 { 1072 int i; 1073 1074 memset(sregs, 0, sizeof(struct kvm_sregs)); 1075 sregs->pvr = vcpu->arch.pvr; 1076 for (i = 0; i < vcpu->arch.slb_max; i++) { 1077 sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige; 1078 sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv; 1079 } 1080 1081 return 0; 1082 } 1083 1084 static int kvm_arch_vcpu_ioctl_set_sregs_hv(struct kvm_vcpu *vcpu, 1085 struct kvm_sregs *sregs) 1086 { 1087 int i, j; 1088 1089 /* Only accept the same PVR as the host's, since we can't spoof it */ 1090 if (sregs->pvr != vcpu->arch.pvr) 1091 return -EINVAL; 1092 1093 j = 0; 1094 for (i = 0; i < vcpu->arch.slb_nr; i++) { 1095 if (sregs->u.s.ppc64.slb[i].slbe & SLB_ESID_V) { 1096 vcpu->arch.slb[j].orige = sregs->u.s.ppc64.slb[i].slbe; 1097 vcpu->arch.slb[j].origv = sregs->u.s.ppc64.slb[i].slbv; 1098 ++j; 1099 } 1100 } 1101 vcpu->arch.slb_max = j; 1102 1103 return 0; 1104 } 1105 1106 static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr, 1107 bool preserve_top32) 1108 { 1109 struct kvm *kvm = vcpu->kvm; 1110 struct kvmppc_vcore *vc = vcpu->arch.vcore; 1111 u64 mask; 1112 1113 mutex_lock(&kvm->lock); 1114 spin_lock(&vc->lock); 1115 /* 1116 * If ILE (interrupt little-endian) has changed, update the 1117 * MSR_LE bit in the intr_msr for each vcpu in this vcore. 1118 */ 1119 if ((new_lpcr & LPCR_ILE) != (vc->lpcr & LPCR_ILE)) { 1120 struct kvm_vcpu *vcpu; 1121 int i; 1122 1123 kvm_for_each_vcpu(i, vcpu, kvm) { 1124 if (vcpu->arch.vcore != vc) 1125 continue; 1126 if (new_lpcr & LPCR_ILE) 1127 vcpu->arch.intr_msr |= MSR_LE; 1128 else 1129 vcpu->arch.intr_msr &= ~MSR_LE; 1130 } 1131 } 1132 1133 /* 1134 * Userspace can only modify DPFD (default prefetch depth), 1135 * ILE (interrupt little-endian) and TC (translation control). 1136 * On POWER8 and POWER9 userspace can also modify AIL (alt. interrupt loc.). 
1137 */ 1138 mask = LPCR_DPFD | LPCR_ILE | LPCR_TC; 1139 if (cpu_has_feature(CPU_FTR_ARCH_207S)) 1140 mask |= LPCR_AIL; 1141 1142 /* Broken 32-bit version of LPCR must not clear top bits */ 1143 if (preserve_top32) 1144 mask &= 0xFFFFFFFF; 1145 vc->lpcr = (vc->lpcr & ~mask) | (new_lpcr & mask); 1146 spin_unlock(&vc->lock); 1147 mutex_unlock(&kvm->lock); 1148 } 1149 1150 static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, 1151 union kvmppc_one_reg *val) 1152 { 1153 int r = 0; 1154 long int i; 1155 1156 switch (id) { 1157 case KVM_REG_PPC_DEBUG_INST: 1158 *val = get_reg_val(id, KVMPPC_INST_SW_BREAKPOINT); 1159 break; 1160 case KVM_REG_PPC_HIOR: 1161 *val = get_reg_val(id, 0); 1162 break; 1163 case KVM_REG_PPC_DABR: 1164 *val = get_reg_val(id, vcpu->arch.dabr); 1165 break; 1166 case KVM_REG_PPC_DABRX: 1167 *val = get_reg_val(id, vcpu->arch.dabrx); 1168 break; 1169 case KVM_REG_PPC_DSCR: 1170 *val = get_reg_val(id, vcpu->arch.dscr); 1171 break; 1172 case KVM_REG_PPC_PURR: 1173 *val = get_reg_val(id, vcpu->arch.purr); 1174 break; 1175 case KVM_REG_PPC_SPURR: 1176 *val = get_reg_val(id, vcpu->arch.spurr); 1177 break; 1178 case KVM_REG_PPC_AMR: 1179 *val = get_reg_val(id, vcpu->arch.amr); 1180 break; 1181 case KVM_REG_PPC_UAMOR: 1182 *val = get_reg_val(id, vcpu->arch.uamor); 1183 break; 1184 case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRS: 1185 i = id - KVM_REG_PPC_MMCR0; 1186 *val = get_reg_val(id, vcpu->arch.mmcr[i]); 1187 break; 1188 case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8: 1189 i = id - KVM_REG_PPC_PMC1; 1190 *val = get_reg_val(id, vcpu->arch.pmc[i]); 1191 break; 1192 case KVM_REG_PPC_SPMC1 ... KVM_REG_PPC_SPMC2: 1193 i = id - KVM_REG_PPC_SPMC1; 1194 *val = get_reg_val(id, vcpu->arch.spmc[i]); 1195 break; 1196 case KVM_REG_PPC_SIAR: 1197 *val = get_reg_val(id, vcpu->arch.siar); 1198 break; 1199 case KVM_REG_PPC_SDAR: 1200 *val = get_reg_val(id, vcpu->arch.sdar); 1201 break; 1202 case KVM_REG_PPC_SIER: 1203 *val = get_reg_val(id, vcpu->arch.sier); 1204 break; 1205 case KVM_REG_PPC_IAMR: 1206 *val = get_reg_val(id, vcpu->arch.iamr); 1207 break; 1208 case KVM_REG_PPC_PSPB: 1209 *val = get_reg_val(id, vcpu->arch.pspb); 1210 break; 1211 case KVM_REG_PPC_DPDES: 1212 *val = get_reg_val(id, vcpu->arch.vcore->dpdes); 1213 break; 1214 case KVM_REG_PPC_VTB: 1215 *val = get_reg_val(id, vcpu->arch.vcore->vtb); 1216 break; 1217 case KVM_REG_PPC_DAWR: 1218 *val = get_reg_val(id, vcpu->arch.dawr); 1219 break; 1220 case KVM_REG_PPC_DAWRX: 1221 *val = get_reg_val(id, vcpu->arch.dawrx); 1222 break; 1223 case KVM_REG_PPC_CIABR: 1224 *val = get_reg_val(id, vcpu->arch.ciabr); 1225 break; 1226 case KVM_REG_PPC_CSIGR: 1227 *val = get_reg_val(id, vcpu->arch.csigr); 1228 break; 1229 case KVM_REG_PPC_TACR: 1230 *val = get_reg_val(id, vcpu->arch.tacr); 1231 break; 1232 case KVM_REG_PPC_TCSCR: 1233 *val = get_reg_val(id, vcpu->arch.tcscr); 1234 break; 1235 case KVM_REG_PPC_PID: 1236 *val = get_reg_val(id, vcpu->arch.pid); 1237 break; 1238 case KVM_REG_PPC_ACOP: 1239 *val = get_reg_val(id, vcpu->arch.acop); 1240 break; 1241 case KVM_REG_PPC_WORT: 1242 *val = get_reg_val(id, vcpu->arch.wort); 1243 break; 1244 case KVM_REG_PPC_TIDR: 1245 *val = get_reg_val(id, vcpu->arch.tid); 1246 break; 1247 case KVM_REG_PPC_PSSCR: 1248 *val = get_reg_val(id, vcpu->arch.psscr); 1249 break; 1250 case KVM_REG_PPC_VPA_ADDR: 1251 spin_lock(&vcpu->arch.vpa_update_lock); 1252 *val = get_reg_val(id, vcpu->arch.vpa.next_gpa); 1253 spin_unlock(&vcpu->arch.vpa_update_lock); 1254 break; 1255 case KVM_REG_PPC_VPA_SLB: 1256 
spin_lock(&vcpu->arch.vpa_update_lock); 1257 val->vpaval.addr = vcpu->arch.slb_shadow.next_gpa; 1258 val->vpaval.length = vcpu->arch.slb_shadow.len; 1259 spin_unlock(&vcpu->arch.vpa_update_lock); 1260 break; 1261 case KVM_REG_PPC_VPA_DTL: 1262 spin_lock(&vcpu->arch.vpa_update_lock); 1263 val->vpaval.addr = vcpu->arch.dtl.next_gpa; 1264 val->vpaval.length = vcpu->arch.dtl.len; 1265 spin_unlock(&vcpu->arch.vpa_update_lock); 1266 break; 1267 case KVM_REG_PPC_TB_OFFSET: 1268 *val = get_reg_val(id, vcpu->arch.vcore->tb_offset); 1269 break; 1270 case KVM_REG_PPC_LPCR: 1271 case KVM_REG_PPC_LPCR_64: 1272 *val = get_reg_val(id, vcpu->arch.vcore->lpcr); 1273 break; 1274 case KVM_REG_PPC_PPR: 1275 *val = get_reg_val(id, vcpu->arch.ppr); 1276 break; 1277 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM 1278 case KVM_REG_PPC_TFHAR: 1279 *val = get_reg_val(id, vcpu->arch.tfhar); 1280 break; 1281 case KVM_REG_PPC_TFIAR: 1282 *val = get_reg_val(id, vcpu->arch.tfiar); 1283 break; 1284 case KVM_REG_PPC_TEXASR: 1285 *val = get_reg_val(id, vcpu->arch.texasr); 1286 break; 1287 case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31: 1288 i = id - KVM_REG_PPC_TM_GPR0; 1289 *val = get_reg_val(id, vcpu->arch.gpr_tm[i]); 1290 break; 1291 case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63: 1292 { 1293 int j; 1294 i = id - KVM_REG_PPC_TM_VSR0; 1295 if (i < 32) 1296 for (j = 0; j < TS_FPRWIDTH; j++) 1297 val->vsxval[j] = vcpu->arch.fp_tm.fpr[i][j]; 1298 else { 1299 if (cpu_has_feature(CPU_FTR_ALTIVEC)) 1300 val->vval = vcpu->arch.vr_tm.vr[i-32]; 1301 else 1302 r = -ENXIO; 1303 } 1304 break; 1305 } 1306 case KVM_REG_PPC_TM_CR: 1307 *val = get_reg_val(id, vcpu->arch.cr_tm); 1308 break; 1309 case KVM_REG_PPC_TM_XER: 1310 *val = get_reg_val(id, vcpu->arch.xer_tm); 1311 break; 1312 case KVM_REG_PPC_TM_LR: 1313 *val = get_reg_val(id, vcpu->arch.lr_tm); 1314 break; 1315 case KVM_REG_PPC_TM_CTR: 1316 *val = get_reg_val(id, vcpu->arch.ctr_tm); 1317 break; 1318 case KVM_REG_PPC_TM_FPSCR: 1319 *val = get_reg_val(id, vcpu->arch.fp_tm.fpscr); 1320 break; 1321 case KVM_REG_PPC_TM_AMR: 1322 *val = get_reg_val(id, vcpu->arch.amr_tm); 1323 break; 1324 case KVM_REG_PPC_TM_PPR: 1325 *val = get_reg_val(id, vcpu->arch.ppr_tm); 1326 break; 1327 case KVM_REG_PPC_TM_VRSAVE: 1328 *val = get_reg_val(id, vcpu->arch.vrsave_tm); 1329 break; 1330 case KVM_REG_PPC_TM_VSCR: 1331 if (cpu_has_feature(CPU_FTR_ALTIVEC)) 1332 *val = get_reg_val(id, vcpu->arch.vr_tm.vscr.u[3]); 1333 else 1334 r = -ENXIO; 1335 break; 1336 case KVM_REG_PPC_TM_DSCR: 1337 *val = get_reg_val(id, vcpu->arch.dscr_tm); 1338 break; 1339 case KVM_REG_PPC_TM_TAR: 1340 *val = get_reg_val(id, vcpu->arch.tar_tm); 1341 break; 1342 #endif 1343 case KVM_REG_PPC_ARCH_COMPAT: 1344 *val = get_reg_val(id, vcpu->arch.vcore->arch_compat); 1345 break; 1346 default: 1347 r = -EINVAL; 1348 break; 1349 } 1350 1351 return r; 1352 } 1353 1354 static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, 1355 union kvmppc_one_reg *val) 1356 { 1357 int r = 0; 1358 long int i; 1359 unsigned long addr, len; 1360 1361 switch (id) { 1362 case KVM_REG_PPC_HIOR: 1363 /* Only allow this to be set to zero */ 1364 if (set_reg_val(id, *val)) 1365 r = -EINVAL; 1366 break; 1367 case KVM_REG_PPC_DABR: 1368 vcpu->arch.dabr = set_reg_val(id, *val); 1369 break; 1370 case KVM_REG_PPC_DABRX: 1371 vcpu->arch.dabrx = set_reg_val(id, *val) & ~DABRX_HYP; 1372 break; 1373 case KVM_REG_PPC_DSCR: 1374 vcpu->arch.dscr = set_reg_val(id, *val); 1375 break; 1376 case KVM_REG_PPC_PURR: 1377 vcpu->arch.purr = set_reg_val(id, *val); 1378 break; 1379 
case KVM_REG_PPC_SPURR: 1380 vcpu->arch.spurr = set_reg_val(id, *val); 1381 break; 1382 case KVM_REG_PPC_AMR: 1383 vcpu->arch.amr = set_reg_val(id, *val); 1384 break; 1385 case KVM_REG_PPC_UAMOR: 1386 vcpu->arch.uamor = set_reg_val(id, *val); 1387 break; 1388 case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRS: 1389 i = id - KVM_REG_PPC_MMCR0; 1390 vcpu->arch.mmcr[i] = set_reg_val(id, *val); 1391 break; 1392 case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8: 1393 i = id - KVM_REG_PPC_PMC1; 1394 vcpu->arch.pmc[i] = set_reg_val(id, *val); 1395 break; 1396 case KVM_REG_PPC_SPMC1 ... KVM_REG_PPC_SPMC2: 1397 i = id - KVM_REG_PPC_SPMC1; 1398 vcpu->arch.spmc[i] = set_reg_val(id, *val); 1399 break; 1400 case KVM_REG_PPC_SIAR: 1401 vcpu->arch.siar = set_reg_val(id, *val); 1402 break; 1403 case KVM_REG_PPC_SDAR: 1404 vcpu->arch.sdar = set_reg_val(id, *val); 1405 break; 1406 case KVM_REG_PPC_SIER: 1407 vcpu->arch.sier = set_reg_val(id, *val); 1408 break; 1409 case KVM_REG_PPC_IAMR: 1410 vcpu->arch.iamr = set_reg_val(id, *val); 1411 break; 1412 case KVM_REG_PPC_PSPB: 1413 vcpu->arch.pspb = set_reg_val(id, *val); 1414 break; 1415 case KVM_REG_PPC_DPDES: 1416 vcpu->arch.vcore->dpdes = set_reg_val(id, *val); 1417 break; 1418 case KVM_REG_PPC_VTB: 1419 vcpu->arch.vcore->vtb = set_reg_val(id, *val); 1420 break; 1421 case KVM_REG_PPC_DAWR: 1422 vcpu->arch.dawr = set_reg_val(id, *val); 1423 break; 1424 case KVM_REG_PPC_DAWRX: 1425 vcpu->arch.dawrx = set_reg_val(id, *val) & ~DAWRX_HYP; 1426 break; 1427 case KVM_REG_PPC_CIABR: 1428 vcpu->arch.ciabr = set_reg_val(id, *val); 1429 /* Don't allow setting breakpoints in hypervisor code */ 1430 if ((vcpu->arch.ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER) 1431 vcpu->arch.ciabr &= ~CIABR_PRIV; /* disable */ 1432 break; 1433 case KVM_REG_PPC_CSIGR: 1434 vcpu->arch.csigr = set_reg_val(id, *val); 1435 break; 1436 case KVM_REG_PPC_TACR: 1437 vcpu->arch.tacr = set_reg_val(id, *val); 1438 break; 1439 case KVM_REG_PPC_TCSCR: 1440 vcpu->arch.tcscr = set_reg_val(id, *val); 1441 break; 1442 case KVM_REG_PPC_PID: 1443 vcpu->arch.pid = set_reg_val(id, *val); 1444 break; 1445 case KVM_REG_PPC_ACOP: 1446 vcpu->arch.acop = set_reg_val(id, *val); 1447 break; 1448 case KVM_REG_PPC_WORT: 1449 vcpu->arch.wort = set_reg_val(id, *val); 1450 break; 1451 case KVM_REG_PPC_TIDR: 1452 vcpu->arch.tid = set_reg_val(id, *val); 1453 break; 1454 case KVM_REG_PPC_PSSCR: 1455 vcpu->arch.psscr = set_reg_val(id, *val) & PSSCR_GUEST_VIS; 1456 break; 1457 case KVM_REG_PPC_VPA_ADDR: 1458 addr = set_reg_val(id, *val); 1459 r = -EINVAL; 1460 if (!addr && (vcpu->arch.slb_shadow.next_gpa || 1461 vcpu->arch.dtl.next_gpa)) 1462 break; 1463 r = set_vpa(vcpu, &vcpu->arch.vpa, addr, sizeof(struct lppaca)); 1464 break; 1465 case KVM_REG_PPC_VPA_SLB: 1466 addr = val->vpaval.addr; 1467 len = val->vpaval.length; 1468 r = -EINVAL; 1469 if (addr && !vcpu->arch.vpa.next_gpa) 1470 break; 1471 r = set_vpa(vcpu, &vcpu->arch.slb_shadow, addr, len); 1472 break; 1473 case KVM_REG_PPC_VPA_DTL: 1474 addr = val->vpaval.addr; 1475 len = val->vpaval.length; 1476 r = -EINVAL; 1477 if (addr && (len < sizeof(struct dtl_entry) || 1478 !vcpu->arch.vpa.next_gpa)) 1479 break; 1480 len -= len % sizeof(struct dtl_entry); 1481 r = set_vpa(vcpu, &vcpu->arch.dtl, addr, len); 1482 break; 1483 case KVM_REG_PPC_TB_OFFSET: 1484 /* round up to multiple of 2^24 */ 1485 vcpu->arch.vcore->tb_offset = 1486 ALIGN(set_reg_val(id, *val), 1UL << 24); 1487 break; 1488 case KVM_REG_PPC_LPCR: 1489 kvmppc_set_lpcr(vcpu, set_reg_val(id, *val), true); 1490 break; 1491 case 
KVM_REG_PPC_LPCR_64: 1492 kvmppc_set_lpcr(vcpu, set_reg_val(id, *val), false); 1493 break; 1494 case KVM_REG_PPC_PPR: 1495 vcpu->arch.ppr = set_reg_val(id, *val); 1496 break; 1497 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM 1498 case KVM_REG_PPC_TFHAR: 1499 vcpu->arch.tfhar = set_reg_val(id, *val); 1500 break; 1501 case KVM_REG_PPC_TFIAR: 1502 vcpu->arch.tfiar = set_reg_val(id, *val); 1503 break; 1504 case KVM_REG_PPC_TEXASR: 1505 vcpu->arch.texasr = set_reg_val(id, *val); 1506 break; 1507 case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31: 1508 i = id - KVM_REG_PPC_TM_GPR0; 1509 vcpu->arch.gpr_tm[i] = set_reg_val(id, *val); 1510 break; 1511 case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63: 1512 { 1513 int j; 1514 i = id - KVM_REG_PPC_TM_VSR0; 1515 if (i < 32) 1516 for (j = 0; j < TS_FPRWIDTH; j++) 1517 vcpu->arch.fp_tm.fpr[i][j] = val->vsxval[j]; 1518 else 1519 if (cpu_has_feature(CPU_FTR_ALTIVEC)) 1520 vcpu->arch.vr_tm.vr[i-32] = val->vval; 1521 else 1522 r = -ENXIO; 1523 break; 1524 } 1525 case KVM_REG_PPC_TM_CR: 1526 vcpu->arch.cr_tm = set_reg_val(id, *val); 1527 break; 1528 case KVM_REG_PPC_TM_XER: 1529 vcpu->arch.xer_tm = set_reg_val(id, *val); 1530 break; 1531 case KVM_REG_PPC_TM_LR: 1532 vcpu->arch.lr_tm = set_reg_val(id, *val); 1533 break; 1534 case KVM_REG_PPC_TM_CTR: 1535 vcpu->arch.ctr_tm = set_reg_val(id, *val); 1536 break; 1537 case KVM_REG_PPC_TM_FPSCR: 1538 vcpu->arch.fp_tm.fpscr = set_reg_val(id, *val); 1539 break; 1540 case KVM_REG_PPC_TM_AMR: 1541 vcpu->arch.amr_tm = set_reg_val(id, *val); 1542 break; 1543 case KVM_REG_PPC_TM_PPR: 1544 vcpu->arch.ppr_tm = set_reg_val(id, *val); 1545 break; 1546 case KVM_REG_PPC_TM_VRSAVE: 1547 vcpu->arch.vrsave_tm = set_reg_val(id, *val); 1548 break; 1549 case KVM_REG_PPC_TM_VSCR: 1550 if (cpu_has_feature(CPU_FTR_ALTIVEC)) 1551 vcpu->arch.vr_tm.vscr.u[3] = set_reg_val(id, *val); 1552 else 1553 r = -ENXIO; 1554 break; 1555 case KVM_REG_PPC_TM_DSCR: 1556 vcpu->arch.dscr_tm = set_reg_val(id, *val); 1557 break; 1558 case KVM_REG_PPC_TM_TAR: 1559 vcpu->arch.tar_tm = set_reg_val(id, *val); 1560 break; 1561 #endif 1562 case KVM_REG_PPC_ARCH_COMPAT: 1563 r = kvmppc_set_arch_compat(vcpu, set_reg_val(id, *val)); 1564 break; 1565 default: 1566 r = -EINVAL; 1567 break; 1568 } 1569 1570 return r; 1571 } 1572 1573 /* 1574 * On POWER9, threads are independent and can be in different partitions. 1575 * Therefore we consider each thread to be a subcore. 1576 * There is a restriction that all threads have to be in the same 1577 * MMU mode (radix or HPT), unfortunately, but since we only support 1578 * HPT guests on an HPT host so far, that isn't an impediment yet.
1579 */ 1580 static int threads_per_vcore(void) 1581 { 1582 if (cpu_has_feature(CPU_FTR_ARCH_300)) 1583 return 1; 1584 return threads_per_subcore; 1585 } 1586 1587 static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core) 1588 { 1589 struct kvmppc_vcore *vcore; 1590 1591 vcore = kzalloc(sizeof(struct kvmppc_vcore), GFP_KERNEL); 1592 1593 if (vcore == NULL) 1594 return NULL; 1595 1596 spin_lock_init(&vcore->lock); 1597 spin_lock_init(&vcore->stoltb_lock); 1598 init_swait_queue_head(&vcore->wq); 1599 vcore->preempt_tb = TB_NIL; 1600 vcore->lpcr = kvm->arch.lpcr; 1601 vcore->first_vcpuid = core * threads_per_vcore(); 1602 vcore->kvm = kvm; 1603 INIT_LIST_HEAD(&vcore->preempt_list); 1604 1605 return vcore; 1606 } 1607 1608 #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING 1609 static struct debugfs_timings_element { 1610 const char *name; 1611 size_t offset; 1612 } timings[] = { 1613 {"rm_entry", offsetof(struct kvm_vcpu, arch.rm_entry)}, 1614 {"rm_intr", offsetof(struct kvm_vcpu, arch.rm_intr)}, 1615 {"rm_exit", offsetof(struct kvm_vcpu, arch.rm_exit)}, 1616 {"guest", offsetof(struct kvm_vcpu, arch.guest_time)}, 1617 {"cede", offsetof(struct kvm_vcpu, arch.cede_time)}, 1618 }; 1619 1620 #define N_TIMINGS (sizeof(timings) / sizeof(timings[0])) 1621 1622 struct debugfs_timings_state { 1623 struct kvm_vcpu *vcpu; 1624 unsigned int buflen; 1625 char buf[N_TIMINGS * 100]; 1626 }; 1627 1628 static int debugfs_timings_open(struct inode *inode, struct file *file) 1629 { 1630 struct kvm_vcpu *vcpu = inode->i_private; 1631 struct debugfs_timings_state *p; 1632 1633 p = kzalloc(sizeof(*p), GFP_KERNEL); 1634 if (!p) 1635 return -ENOMEM; 1636 1637 kvm_get_kvm(vcpu->kvm); 1638 p->vcpu = vcpu; 1639 file->private_data = p; 1640 1641 return nonseekable_open(inode, file); 1642 } 1643 1644 static int debugfs_timings_release(struct inode *inode, struct file *file) 1645 { 1646 struct debugfs_timings_state *p = file->private_data; 1647 1648 kvm_put_kvm(p->vcpu->kvm); 1649 kfree(p); 1650 return 0; 1651 } 1652 1653 static ssize_t debugfs_timings_read(struct file *file, char __user *buf, 1654 size_t len, loff_t *ppos) 1655 { 1656 struct debugfs_timings_state *p = file->private_data; 1657 struct kvm_vcpu *vcpu = p->vcpu; 1658 char *s, *buf_end; 1659 struct kvmhv_tb_accumulator tb; 1660 u64 count; 1661 loff_t pos; 1662 ssize_t n; 1663 int i, loops; 1664 bool ok; 1665 1666 if (!p->buflen) { 1667 s = p->buf; 1668 buf_end = s + sizeof(p->buf); 1669 for (i = 0; i < N_TIMINGS; ++i) { 1670 struct kvmhv_tb_accumulator *acc; 1671 1672 acc = (struct kvmhv_tb_accumulator *) 1673 ((unsigned long)vcpu + timings[i].offset); 1674 ok = false; 1675 for (loops = 0; loops < 1000; ++loops) { 1676 count = acc->seqcount; 1677 if (!(count & 1)) { 1678 smp_rmb(); 1679 tb = *acc; 1680 smp_rmb(); 1681 if (count == acc->seqcount) { 1682 ok = true; 1683 break; 1684 } 1685 } 1686 udelay(1); 1687 } 1688 if (!ok) 1689 snprintf(s, buf_end - s, "%s: stuck\n", 1690 timings[i].name); 1691 else 1692 snprintf(s, buf_end - s, 1693 "%s: %llu %llu %llu %llu\n", 1694 timings[i].name, count / 2, 1695 tb_to_ns(tb.tb_total), 1696 tb_to_ns(tb.tb_min), 1697 tb_to_ns(tb.tb_max)); 1698 s += strlen(s); 1699 } 1700 p->buflen = s - p->buf; 1701 } 1702 1703 pos = *ppos; 1704 if (pos >= p->buflen) 1705 return 0; 1706 if (len > p->buflen - pos) 1707 len = p->buflen - pos; 1708 n = copy_to_user(buf, p->buf + pos, len); 1709 if (n) { 1710 if (n == len) 1711 return -EFAULT; 1712 len -= n; 1713 } 1714 *ppos = pos + len; 1715 return len; 1716 } 1717 1718 static ssize_t 
debugfs_timings_write(struct file *file, const char __user *buf, 1719 size_t len, loff_t *ppos) 1720 { 1721 return -EACCES; 1722 } 1723 1724 static const struct file_operations debugfs_timings_ops = { 1725 .owner = THIS_MODULE, 1726 .open = debugfs_timings_open, 1727 .release = debugfs_timings_release, 1728 .read = debugfs_timings_read, 1729 .write = debugfs_timings_write, 1730 .llseek = generic_file_llseek, 1731 }; 1732 1733 /* Create a debugfs directory for the vcpu */ 1734 static void debugfs_vcpu_init(struct kvm_vcpu *vcpu, unsigned int id) 1735 { 1736 char buf[16]; 1737 struct kvm *kvm = vcpu->kvm; 1738 1739 snprintf(buf, sizeof(buf), "vcpu%u", id); 1740 if (IS_ERR_OR_NULL(kvm->arch.debugfs_dir)) 1741 return; 1742 vcpu->arch.debugfs_dir = debugfs_create_dir(buf, kvm->arch.debugfs_dir); 1743 if (IS_ERR_OR_NULL(vcpu->arch.debugfs_dir)) 1744 return; 1745 vcpu->arch.debugfs_timings = 1746 debugfs_create_file("timings", 0444, vcpu->arch.debugfs_dir, 1747 vcpu, &debugfs_timings_ops); 1748 } 1749 1750 #else /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */ 1751 static void debugfs_vcpu_init(struct kvm_vcpu *vcpu, unsigned int id) 1752 { 1753 } 1754 #endif /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */ 1755 1756 static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, 1757 unsigned int id) 1758 { 1759 struct kvm_vcpu *vcpu; 1760 int err = -EINVAL; 1761 int core; 1762 struct kvmppc_vcore *vcore; 1763 1764 core = id / threads_per_vcore(); 1765 if (core >= KVM_MAX_VCORES) 1766 goto out; 1767 1768 err = -ENOMEM; 1769 vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); 1770 if (!vcpu) 1771 goto out; 1772 1773 err = kvm_vcpu_init(vcpu, kvm, id); 1774 if (err) 1775 goto free_vcpu; 1776 1777 vcpu->arch.shared = &vcpu->arch.shregs; 1778 #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE 1779 /* 1780 * The shared struct is never shared on HV, 1781 * so we can always use host endianness 1782 */ 1783 #ifdef __BIG_ENDIAN__ 1784 vcpu->arch.shared_big_endian = true; 1785 #else 1786 vcpu->arch.shared_big_endian = false; 1787 #endif 1788 #endif 1789 vcpu->arch.mmcr[0] = MMCR0_FC; 1790 vcpu->arch.ctrl = CTRL_RUNLATCH; 1791 /* default to host PVR, since we can't spoof it */ 1792 kvmppc_set_pvr_hv(vcpu, mfspr(SPRN_PVR)); 1793 spin_lock_init(&vcpu->arch.vpa_update_lock); 1794 spin_lock_init(&vcpu->arch.tbacct_lock); 1795 vcpu->arch.busy_preempt = TB_NIL; 1796 vcpu->arch.intr_msr = MSR_SF | MSR_ME; 1797 1798 kvmppc_mmu_book3s_hv_init(vcpu); 1799 1800 vcpu->arch.state = KVMPPC_VCPU_NOTREADY; 1801 1802 init_waitqueue_head(&vcpu->arch.cpu_run); 1803 1804 mutex_lock(&kvm->lock); 1805 vcore = kvm->arch.vcores[core]; 1806 if (!vcore) { 1807 vcore = kvmppc_vcore_create(kvm, core); 1808 kvm->arch.vcores[core] = vcore; 1809 kvm->arch.online_vcores++; 1810 } 1811 mutex_unlock(&kvm->lock); 1812 1813 if (!vcore) 1814 goto free_vcpu; 1815 1816 spin_lock(&vcore->lock); 1817 ++vcore->num_threads; 1818 spin_unlock(&vcore->lock); 1819 vcpu->arch.vcore = vcore; 1820 vcpu->arch.ptid = vcpu->vcpu_id - vcore->first_vcpuid; 1821 vcpu->arch.thread_cpu = -1; 1822 vcpu->arch.prev_cpu = -1; 1823 1824 vcpu->arch.cpu_type = KVM_CPU_3S_64; 1825 kvmppc_sanity_check(vcpu); 1826 1827 debugfs_vcpu_init(vcpu, id); 1828 1829 return vcpu; 1830 1831 free_vcpu: 1832 kmem_cache_free(kvm_vcpu_cache, vcpu); 1833 out: 1834 return ERR_PTR(err); 1835 } 1836 1837 static void unpin_vpa(struct kvm *kvm, struct kvmppc_vpa *vpa) 1838 { 1839 if (vpa->pinned_addr) 1840 kvmppc_unpin_guest_page(kvm, vpa->pinned_addr, vpa->gpa, 1841 vpa->dirty); 1842 } 1843 1844 static void 
kvmppc_core_vcpu_free_hv(struct kvm_vcpu *vcpu) 1845 { 1846 spin_lock(&vcpu->arch.vpa_update_lock); 1847 unpin_vpa(vcpu->kvm, &vcpu->arch.dtl); 1848 unpin_vpa(vcpu->kvm, &vcpu->arch.slb_shadow); 1849 unpin_vpa(vcpu->kvm, &vcpu->arch.vpa); 1850 spin_unlock(&vcpu->arch.vpa_update_lock); 1851 kvm_vcpu_uninit(vcpu); 1852 kmem_cache_free(kvm_vcpu_cache, vcpu); 1853 } 1854 1855 static int kvmppc_core_check_requests_hv(struct kvm_vcpu *vcpu) 1856 { 1857 /* Indicate we want to get back into the guest */ 1858 return 1; 1859 } 1860 1861 static void kvmppc_set_timer(struct kvm_vcpu *vcpu) 1862 { 1863 unsigned long dec_nsec, now; 1864 1865 now = get_tb(); 1866 if (now > vcpu->arch.dec_expires) { 1867 /* decrementer has already gone negative */ 1868 kvmppc_core_queue_dec(vcpu); 1869 kvmppc_core_prepare_to_enter(vcpu); 1870 return; 1871 } 1872 dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC 1873 / tb_ticks_per_sec; 1874 hrtimer_start(&vcpu->arch.dec_timer, dec_nsec, HRTIMER_MODE_REL); 1875 vcpu->arch.timer_running = 1; 1876 } 1877 1878 static void kvmppc_end_cede(struct kvm_vcpu *vcpu) 1879 { 1880 vcpu->arch.ceded = 0; 1881 if (vcpu->arch.timer_running) { 1882 hrtimer_try_to_cancel(&vcpu->arch.dec_timer); 1883 vcpu->arch.timer_running = 0; 1884 } 1885 } 1886 1887 extern void __kvmppc_vcore_entry(void); 1888 1889 static void kvmppc_remove_runnable(struct kvmppc_vcore *vc, 1890 struct kvm_vcpu *vcpu) 1891 { 1892 u64 now; 1893 1894 if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE) 1895 return; 1896 spin_lock_irq(&vcpu->arch.tbacct_lock); 1897 now = mftb(); 1898 vcpu->arch.busy_stolen += vcore_stolen_time(vc, now) - 1899 vcpu->arch.stolen_logged; 1900 vcpu->arch.busy_preempt = now; 1901 vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; 1902 spin_unlock_irq(&vcpu->arch.tbacct_lock); 1903 --vc->n_runnable; 1904 WRITE_ONCE(vc->runnable_threads[vcpu->arch.ptid], NULL); 1905 } 1906 1907 static int kvmppc_grab_hwthread(int cpu) 1908 { 1909 struct paca_struct *tpaca; 1910 long timeout = 10000; 1911 1912 tpaca = &paca[cpu]; 1913 1914 /* Ensure the thread won't go into the kernel if it wakes */ 1915 tpaca->kvm_hstate.kvm_vcpu = NULL; 1916 tpaca->kvm_hstate.kvm_vcore = NULL; 1917 tpaca->kvm_hstate.napping = 0; 1918 smp_wmb(); 1919 tpaca->kvm_hstate.hwthread_req = 1; 1920 1921 /* 1922 * If the thread is already executing in the kernel (e.g. handling 1923 * a stray interrupt), wait for it to get back to nap mode. 1924 * The smp_mb() is to ensure that our setting of hwthread_req 1925 * is visible before we look at hwthread_state, so if this 1926 * races with the code at system_reset_pSeries and the thread 1927 * misses our setting of hwthread_req, we are sure to see its 1928 * setting of hwthread_state, and vice versa. 
1929 */ 1930 smp_mb(); 1931 while (tpaca->kvm_hstate.hwthread_state == KVM_HWTHREAD_IN_KERNEL) { 1932 if (--timeout <= 0) { 1933 pr_err("KVM: couldn't grab cpu %d\n", cpu); 1934 return -EBUSY; 1935 } 1936 udelay(1); 1937 } 1938 return 0; 1939 } 1940 1941 static void kvmppc_release_hwthread(int cpu) 1942 { 1943 struct paca_struct *tpaca; 1944 1945 tpaca = &paca[cpu]; 1946 tpaca->kvm_hstate.hwthread_req = 0; 1947 tpaca->kvm_hstate.kvm_vcpu = NULL; 1948 tpaca->kvm_hstate.kvm_vcore = NULL; 1949 tpaca->kvm_hstate.kvm_split_mode = NULL; 1950 } 1951 1952 static void do_nothing(void *x) 1953 { 1954 } 1955 1956 static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu) 1957 { 1958 int i; 1959 1960 cpu = cpu_first_thread_sibling(cpu); 1961 cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush); 1962 /* 1963 * Make sure setting of bit in need_tlb_flush precedes 1964 * testing of cpu_in_guest bits. The matching barrier on 1965 * the other side is the first smp_mb() in kvmppc_run_core(). 1966 */ 1967 smp_mb(); 1968 for (i = 0; i < threads_per_core; ++i) 1969 if (cpumask_test_cpu(cpu + i, &kvm->arch.cpu_in_guest)) 1970 smp_call_function_single(cpu + i, do_nothing, NULL, 1); 1971 } 1972 1973 static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc) 1974 { 1975 int cpu; 1976 struct paca_struct *tpaca; 1977 struct kvmppc_vcore *mvc = vc->master_vcore; 1978 struct kvm *kvm = vc->kvm; 1979 1980 cpu = vc->pcpu; 1981 if (vcpu) { 1982 if (vcpu->arch.timer_running) { 1983 hrtimer_try_to_cancel(&vcpu->arch.dec_timer); 1984 vcpu->arch.timer_running = 0; 1985 } 1986 cpu += vcpu->arch.ptid; 1987 vcpu->cpu = mvc->pcpu; 1988 vcpu->arch.thread_cpu = cpu; 1989 1990 /* 1991 * With radix, the guest can do TLB invalidations itself, 1992 * and it could choose to use the local form (tlbiel) if 1993 * it is invalidating a translation that has only ever been 1994 * used on one vcpu. However, that doesn't mean it has 1995 * only ever been used on one physical cpu, since vcpus 1996 * can move around between pcpus. To cope with this, when 1997 * a vcpu moves from one pcpu to another, we need to tell 1998 * any vcpus running on the same core as this vcpu previously 1999 * ran to flush the TLB. The TLB is shared between threads, 2000 * so we use a single bit in .need_tlb_flush for all 4 threads. 2001 */ 2002 if (kvm_is_radix(kvm) && vcpu->arch.prev_cpu != cpu) { 2003 if (vcpu->arch.prev_cpu >= 0 && 2004 cpu_first_thread_sibling(vcpu->arch.prev_cpu) != 2005 cpu_first_thread_sibling(cpu)) 2006 radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu); 2007 vcpu->arch.prev_cpu = cpu; 2008 } 2009 cpumask_set_cpu(cpu, &kvm->arch.cpu_in_guest); 2010 } 2011 tpaca = &paca[cpu]; 2012 tpaca->kvm_hstate.kvm_vcpu = vcpu; 2013 tpaca->kvm_hstate.ptid = cpu - mvc->pcpu; 2014 /* Order stores to hstate.kvm_vcpu etc. before store to kvm_vcore */ 2015 smp_wmb(); 2016 tpaca->kvm_hstate.kvm_vcore = mvc; 2017 if (cpu != smp_processor_id()) 2018 kvmppc_ipi_thread(cpu); 2019 } 2020 2021 static void kvmppc_wait_for_nap(void) 2022 { 2023 int cpu = smp_processor_id(); 2024 int i, loops; 2025 int n_threads = threads_per_vcore(); 2026 2027 if (n_threads <= 1) 2028 return; 2029 for (loops = 0; loops < 1000000; ++loops) { 2030 /* 2031 * Check if all threads are finished. 2032 * We set the vcore pointer when starting a thread 2033 * and the thread clears it when finished, so we look 2034 * for any threads that still have a non-NULL vcore ptr. 
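		 * We spin at low SMT priority for a bounded number of
		 * iterations (about a million) rather than forever; any
		 * thread that still shows a vcore pointer after that is
		 * reported as stuck below instead of being waited on.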
2035 */ 2036 for (i = 1; i < n_threads; ++i) 2037 if (paca[cpu + i].kvm_hstate.kvm_vcore) 2038 break; 2039 if (i == n_threads) { 2040 HMT_medium(); 2041 return; 2042 } 2043 HMT_low(); 2044 } 2045 HMT_medium(); 2046 for (i = 1; i < n_threads; ++i) 2047 if (paca[cpu + i].kvm_hstate.kvm_vcore) 2048 pr_err("KVM: CPU %d seems to be stuck\n", cpu + i); 2049 } 2050 2051 /* 2052 * Check that we are on thread 0 and that any other threads in 2053 * this core are off-line. Then grab the threads so they can't 2054 * enter the kernel. 2055 */ 2056 static int on_primary_thread(void) 2057 { 2058 int cpu = smp_processor_id(); 2059 int thr; 2060 2061 /* Are we on a primary subcore? */ 2062 if (cpu_thread_in_subcore(cpu)) 2063 return 0; 2064 2065 thr = 0; 2066 while (++thr < threads_per_subcore) 2067 if (cpu_online(cpu + thr)) 2068 return 0; 2069 2070 /* Grab all hw threads so they can't go into the kernel */ 2071 for (thr = 1; thr < threads_per_subcore; ++thr) { 2072 if (kvmppc_grab_hwthread(cpu + thr)) { 2073 /* Couldn't grab one; let the others go */ 2074 do { 2075 kvmppc_release_hwthread(cpu + thr); 2076 } while (--thr > 0); 2077 return 0; 2078 } 2079 } 2080 return 1; 2081 } 2082 2083 /* 2084 * A list of virtual cores for each physical CPU. 2085 * These are vcores that could run but their runner VCPU tasks are 2086 * (or may be) preempted. 2087 */ 2088 struct preempted_vcore_list { 2089 struct list_head list; 2090 spinlock_t lock; 2091 }; 2092 2093 static DEFINE_PER_CPU(struct preempted_vcore_list, preempted_vcores); 2094 2095 static void init_vcore_lists(void) 2096 { 2097 int cpu; 2098 2099 for_each_possible_cpu(cpu) { 2100 struct preempted_vcore_list *lp = &per_cpu(preempted_vcores, cpu); 2101 spin_lock_init(&lp->lock); 2102 INIT_LIST_HEAD(&lp->list); 2103 } 2104 } 2105 2106 static void kvmppc_vcore_preempt(struct kvmppc_vcore *vc) 2107 { 2108 struct preempted_vcore_list *lp = this_cpu_ptr(&preempted_vcores); 2109 2110 vc->vcore_state = VCORE_PREEMPT; 2111 vc->pcpu = smp_processor_id(); 2112 if (vc->num_threads < threads_per_vcore()) { 2113 spin_lock(&lp->lock); 2114 list_add_tail(&vc->preempt_list, &lp->list); 2115 spin_unlock(&lp->lock); 2116 } 2117 2118 /* Start accumulating stolen time */ 2119 kvmppc_core_start_stolen(vc); 2120 } 2121 2122 static void kvmppc_vcore_end_preempt(struct kvmppc_vcore *vc) 2123 { 2124 struct preempted_vcore_list *lp; 2125 2126 kvmppc_core_end_stolen(vc); 2127 if (!list_empty(&vc->preempt_list)) { 2128 lp = &per_cpu(preempted_vcores, vc->pcpu); 2129 spin_lock(&lp->lock); 2130 list_del_init(&vc->preempt_list); 2131 spin_unlock(&lp->lock); 2132 } 2133 vc->vcore_state = VCORE_INACTIVE; 2134 } 2135 2136 /* 2137 * This stores information about the virtual cores currently 2138 * assigned to a physical core. 2139 */ 2140 struct core_info { 2141 int n_subcores; 2142 int max_subcore_threads; 2143 int total_threads; 2144 int subcore_threads[MAX_SUBCORES]; 2145 struct kvm *subcore_vm[MAX_SUBCORES]; 2146 struct list_head vcs[MAX_SUBCORES]; 2147 }; 2148 2149 /* 2150 * This mapping means subcores 0 and 1 can use threads 0-3 and 4-7 2151 * respectively in 2-way micro-threading (split-core) mode. 
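 * In 4-way micro-threading each subcore gets two threads: subcores
 * 0-3 start at threads 0, 4, 2 and 6 respectively.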
2152 */ 2153 static int subcore_thread_map[MAX_SUBCORES] = { 0, 4, 2, 6 }; 2154 2155 static void init_core_info(struct core_info *cip, struct kvmppc_vcore *vc) 2156 { 2157 int sub; 2158 2159 memset(cip, 0, sizeof(*cip)); 2160 cip->n_subcores = 1; 2161 cip->max_subcore_threads = vc->num_threads; 2162 cip->total_threads = vc->num_threads; 2163 cip->subcore_threads[0] = vc->num_threads; 2164 cip->subcore_vm[0] = vc->kvm; 2165 for (sub = 0; sub < MAX_SUBCORES; ++sub) 2166 INIT_LIST_HEAD(&cip->vcs[sub]); 2167 list_add_tail(&vc->preempt_list, &cip->vcs[0]); 2168 } 2169 2170 static bool subcore_config_ok(int n_subcores, int n_threads) 2171 { 2172 /* Can only dynamically split if unsplit to begin with */ 2173 if (n_subcores > 1 && threads_per_subcore < MAX_SMT_THREADS) 2174 return false; 2175 if (n_subcores > MAX_SUBCORES) 2176 return false; 2177 if (n_subcores > 1) { 2178 if (!(dynamic_mt_modes & 2)) 2179 n_subcores = 4; 2180 if (n_subcores > 2 && !(dynamic_mt_modes & 4)) 2181 return false; 2182 } 2183 2184 return n_subcores * roundup_pow_of_two(n_threads) <= MAX_SMT_THREADS; 2185 } 2186 2187 static void init_master_vcore(struct kvmppc_vcore *vc) 2188 { 2189 vc->master_vcore = vc; 2190 vc->entry_exit_map = 0; 2191 vc->in_guest = 0; 2192 vc->napping_threads = 0; 2193 vc->conferring_threads = 0; 2194 } 2195 2196 static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip) 2197 { 2198 int n_threads = vc->num_threads; 2199 int sub; 2200 2201 if (!cpu_has_feature(CPU_FTR_ARCH_207S)) 2202 return false; 2203 2204 if (n_threads < cip->max_subcore_threads) 2205 n_threads = cip->max_subcore_threads; 2206 if (!subcore_config_ok(cip->n_subcores + 1, n_threads)) 2207 return false; 2208 cip->max_subcore_threads = n_threads; 2209 2210 sub = cip->n_subcores; 2211 ++cip->n_subcores; 2212 cip->total_threads += vc->num_threads; 2213 cip->subcore_threads[sub] = vc->num_threads; 2214 cip->subcore_vm[sub] = vc->kvm; 2215 init_master_vcore(vc); 2216 list_move_tail(&vc->preempt_list, &cip->vcs[sub]); 2217 2218 return true; 2219 } 2220 2221 /* 2222 * Work out whether it is possible to piggyback the execution of 2223 * vcore *pvc onto the execution of the other vcores described in *cip. 
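 * This requires that the combined thread count stays within
 * target_threads and that adding one more subcore still gives a
 * configuration accepted by subcore_config_ok(), i.e. one allowed
 * by the dynamic_mt_modes module parameter.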
2224 */ 2225 static bool can_piggyback(struct kvmppc_vcore *pvc, struct core_info *cip, 2226 int target_threads) 2227 { 2228 if (cip->total_threads + pvc->num_threads > target_threads) 2229 return false; 2230 2231 return can_dynamic_split(pvc, cip); 2232 } 2233 2234 static void prepare_threads(struct kvmppc_vcore *vc) 2235 { 2236 int i; 2237 struct kvm_vcpu *vcpu; 2238 2239 for_each_runnable_thread(i, vcpu, vc) { 2240 if (signal_pending(vcpu->arch.run_task)) 2241 vcpu->arch.ret = -EINTR; 2242 else if (vcpu->arch.vpa.update_pending || 2243 vcpu->arch.slb_shadow.update_pending || 2244 vcpu->arch.dtl.update_pending) 2245 vcpu->arch.ret = RESUME_GUEST; 2246 else 2247 continue; 2248 kvmppc_remove_runnable(vc, vcpu); 2249 wake_up(&vcpu->arch.cpu_run); 2250 } 2251 } 2252 2253 static void collect_piggybacks(struct core_info *cip, int target_threads) 2254 { 2255 struct preempted_vcore_list *lp = this_cpu_ptr(&preempted_vcores); 2256 struct kvmppc_vcore *pvc, *vcnext; 2257 2258 spin_lock(&lp->lock); 2259 list_for_each_entry_safe(pvc, vcnext, &lp->list, preempt_list) { 2260 if (!spin_trylock(&pvc->lock)) 2261 continue; 2262 prepare_threads(pvc); 2263 if (!pvc->n_runnable) { 2264 list_del_init(&pvc->preempt_list); 2265 if (pvc->runner == NULL) { 2266 pvc->vcore_state = VCORE_INACTIVE; 2267 kvmppc_core_end_stolen(pvc); 2268 } 2269 spin_unlock(&pvc->lock); 2270 continue; 2271 } 2272 if (!can_piggyback(pvc, cip, target_threads)) { 2273 spin_unlock(&pvc->lock); 2274 continue; 2275 } 2276 kvmppc_core_end_stolen(pvc); 2277 pvc->vcore_state = VCORE_PIGGYBACK; 2278 if (cip->total_threads >= target_threads) 2279 break; 2280 } 2281 spin_unlock(&lp->lock); 2282 } 2283 2284 static void post_guest_process(struct kvmppc_vcore *vc, bool is_master) 2285 { 2286 int still_running = 0, i; 2287 u64 now; 2288 long ret; 2289 struct kvm_vcpu *vcpu; 2290 2291 spin_lock(&vc->lock); 2292 now = get_tb(); 2293 for_each_runnable_thread(i, vcpu, vc) { 2294 /* cancel pending dec exception if dec is positive */ 2295 if (now < vcpu->arch.dec_expires && 2296 kvmppc_core_pending_dec(vcpu)) 2297 kvmppc_core_dequeue_dec(vcpu); 2298 2299 trace_kvm_guest_exit(vcpu); 2300 2301 ret = RESUME_GUEST; 2302 if (vcpu->arch.trap) 2303 ret = kvmppc_handle_exit_hv(vcpu->arch.kvm_run, vcpu, 2304 vcpu->arch.run_task); 2305 2306 vcpu->arch.ret = ret; 2307 vcpu->arch.trap = 0; 2308 2309 if (is_kvmppc_resume_guest(vcpu->arch.ret)) { 2310 if (vcpu->arch.pending_exceptions) 2311 kvmppc_core_prepare_to_enter(vcpu); 2312 if (vcpu->arch.ceded) 2313 kvmppc_set_timer(vcpu); 2314 else 2315 ++still_running; 2316 } else { 2317 kvmppc_remove_runnable(vc, vcpu); 2318 wake_up(&vcpu->arch.cpu_run); 2319 } 2320 } 2321 list_del_init(&vc->preempt_list); 2322 if (!is_master) { 2323 if (still_running > 0) { 2324 kvmppc_vcore_preempt(vc); 2325 } else if (vc->runner) { 2326 vc->vcore_state = VCORE_PREEMPT; 2327 kvmppc_core_start_stolen(vc); 2328 } else { 2329 vc->vcore_state = VCORE_INACTIVE; 2330 } 2331 if (vc->n_runnable > 0 && vc->runner == NULL) { 2332 /* make sure there's a candidate runner awake */ 2333 i = -1; 2334 vcpu = next_runnable_thread(vc, &i); 2335 wake_up(&vcpu->arch.cpu_run); 2336 } 2337 } 2338 spin_unlock(&vc->lock); 2339 } 2340 2341 /* 2342 * Clear core from the list of active host cores as we are about to 2343 * enter the guest. Only do this if it is the primary thread of the 2344 * core (not if a subcore) that is entering the guest. 
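 * The rm_state.in_host flag cleared here is what the real-mode
 * H_IPI redirection code consults when it looks for a core that is
 * still running host code, so it should be clear before any thread
 * of this core enters the guest.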
2345 */ 2346 static inline int kvmppc_clear_host_core(unsigned int cpu) 2347 { 2348 int core; 2349 2350 if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu)) 2351 return 0; 2352 /* 2353 * Memory barrier can be omitted here as we will do a smp_wmb() 2354 * later in kvmppc_start_thread and we need ensure that state is 2355 * visible to other CPUs only after we enter guest. 2356 */ 2357 core = cpu >> threads_shift; 2358 kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 0; 2359 return 0; 2360 } 2361 2362 /* 2363 * Advertise this core as an active host core since we exited the guest 2364 * Only need to do this if it is the primary thread of the core that is 2365 * exiting. 2366 */ 2367 static inline int kvmppc_set_host_core(unsigned int cpu) 2368 { 2369 int core; 2370 2371 if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu)) 2372 return 0; 2373 2374 /* 2375 * Memory barrier can be omitted here because we do a spin_unlock 2376 * immediately after this which provides the memory barrier. 2377 */ 2378 core = cpu >> threads_shift; 2379 kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 1; 2380 return 0; 2381 } 2382 2383 /* 2384 * Run a set of guest threads on a physical core. 2385 * Called with vc->lock held. 2386 */ 2387 static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) 2388 { 2389 struct kvm_vcpu *vcpu; 2390 int i; 2391 int srcu_idx; 2392 struct core_info core_info; 2393 struct kvmppc_vcore *pvc, *vcnext; 2394 struct kvm_split_mode split_info, *sip; 2395 int split, subcore_size, active; 2396 int sub; 2397 bool thr0_done; 2398 unsigned long cmd_bit, stat_bit; 2399 int pcpu, thr; 2400 int target_threads; 2401 int controlled_threads; 2402 2403 /* 2404 * Remove from the list any threads that have a signal pending 2405 * or need a VPA update done 2406 */ 2407 prepare_threads(vc); 2408 2409 /* if the runner is no longer runnable, let the caller pick a new one */ 2410 if (vc->runner->arch.state != KVMPPC_VCPU_RUNNABLE) 2411 return; 2412 2413 /* 2414 * Initialize *vc. 2415 */ 2416 init_master_vcore(vc); 2417 vc->preempt_tb = TB_NIL; 2418 2419 /* 2420 * Number of threads that we will be controlling: the same as 2421 * the number of threads per subcore, except on POWER9, 2422 * where it's 1 because the threads are (mostly) independent. 2423 */ 2424 controlled_threads = threads_per_vcore(); 2425 2426 /* 2427 * Make sure we are running on primary threads, and that secondary 2428 * threads are offline. Also check if the number of threads in this 2429 * guest are greater than the current system threads per guest. 2430 */ 2431 if ((controlled_threads > 1) && 2432 ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) { 2433 for_each_runnable_thread(i, vcpu, vc) { 2434 vcpu->arch.ret = -EBUSY; 2435 kvmppc_remove_runnable(vc, vcpu); 2436 wake_up(&vcpu->arch.cpu_run); 2437 } 2438 goto out; 2439 } 2440 2441 /* 2442 * See if we could run any other vcores on the physical core 2443 * along with this one. 
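	 * Candidate vcores come from this CPU's preempted-vcore list and
	 * are piggybacked until the total thread count reaches
	 * target_threads, which target_smt_mode (if non-zero) may cap
	 * below the hardware thread count.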
2444 */ 2445 init_core_info(&core_info, vc); 2446 pcpu = smp_processor_id(); 2447 target_threads = controlled_threads; 2448 if (target_smt_mode && target_smt_mode < target_threads) 2449 target_threads = target_smt_mode; 2450 if (vc->num_threads < target_threads) 2451 collect_piggybacks(&core_info, target_threads); 2452 2453 /* Decide on micro-threading (split-core) mode */ 2454 subcore_size = threads_per_subcore; 2455 cmd_bit = stat_bit = 0; 2456 split = core_info.n_subcores; 2457 sip = NULL; 2458 if (split > 1) { 2459 /* threads_per_subcore must be MAX_SMT_THREADS (8) here */ 2460 if (split == 2 && (dynamic_mt_modes & 2)) { 2461 cmd_bit = HID0_POWER8_1TO2LPAR; 2462 stat_bit = HID0_POWER8_2LPARMODE; 2463 } else { 2464 split = 4; 2465 cmd_bit = HID0_POWER8_1TO4LPAR; 2466 stat_bit = HID0_POWER8_4LPARMODE; 2467 } 2468 subcore_size = MAX_SMT_THREADS / split; 2469 sip = &split_info; 2470 memset(&split_info, 0, sizeof(split_info)); 2471 split_info.rpr = mfspr(SPRN_RPR); 2472 split_info.pmmar = mfspr(SPRN_PMMAR); 2473 split_info.ldbar = mfspr(SPRN_LDBAR); 2474 split_info.subcore_size = subcore_size; 2475 for (sub = 0; sub < core_info.n_subcores; ++sub) 2476 split_info.master_vcs[sub] = 2477 list_first_entry(&core_info.vcs[sub], 2478 struct kvmppc_vcore, preempt_list); 2479 /* order writes to split_info before kvm_split_mode pointer */ 2480 smp_wmb(); 2481 } 2482 pcpu = smp_processor_id(); 2483 for (thr = 0; thr < controlled_threads; ++thr) 2484 paca[pcpu + thr].kvm_hstate.kvm_split_mode = sip; 2485 2486 /* Initiate micro-threading (split-core) if required */ 2487 if (cmd_bit) { 2488 unsigned long hid0 = mfspr(SPRN_HID0); 2489 2490 hid0 |= cmd_bit | HID0_POWER8_DYNLPARDIS; 2491 mb(); 2492 mtspr(SPRN_HID0, hid0); 2493 isync(); 2494 for (;;) { 2495 hid0 = mfspr(SPRN_HID0); 2496 if (hid0 & stat_bit) 2497 break; 2498 cpu_relax(); 2499 } 2500 } 2501 2502 kvmppc_clear_host_core(pcpu); 2503 2504 /* Start all the threads */ 2505 active = 0; 2506 for (sub = 0; sub < core_info.n_subcores; ++sub) { 2507 thr = subcore_thread_map[sub]; 2508 thr0_done = false; 2509 active |= 1 << thr; 2510 list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list) { 2511 pvc->pcpu = pcpu + thr; 2512 for_each_runnable_thread(i, vcpu, pvc) { 2513 kvmppc_start_thread(vcpu, pvc); 2514 kvmppc_create_dtl_entry(vcpu, pvc); 2515 trace_kvm_guest_enter(vcpu); 2516 if (!vcpu->arch.ptid) 2517 thr0_done = true; 2518 active |= 1 << (thr + vcpu->arch.ptid); 2519 } 2520 /* 2521 * We need to start the first thread of each subcore 2522 * even if it doesn't have a vcpu. 2523 */ 2524 if (pvc->master_vcore == pvc && !thr0_done) 2525 kvmppc_start_thread(NULL, pvc); 2526 thr += pvc->num_threads; 2527 } 2528 } 2529 2530 /* 2531 * Ensure that split_info.do_nap is set after setting 2532 * the vcore pointer in the PACA of the secondaries. 2533 */ 2534 smp_mb(); 2535 if (cmd_bit) 2536 split_info.do_nap = 1; /* ask secondaries to nap when done */ 2537 2538 /* 2539 * When doing micro-threading, poke the inactive threads as well. 2540 * This gets them to the nap instruction after kvm_do_nap, 2541 * which reduces the time taken to unsplit later. 
2542 */ 2543 if (split > 1) 2544 for (thr = 1; thr < threads_per_subcore; ++thr) 2545 if (!(active & (1 << thr))) 2546 kvmppc_ipi_thread(pcpu + thr); 2547 2548 vc->vcore_state = VCORE_RUNNING; 2549 preempt_disable(); 2550 2551 trace_kvmppc_run_core(vc, 0); 2552 2553 for (sub = 0; sub < core_info.n_subcores; ++sub) 2554 list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list) 2555 spin_unlock(&pvc->lock); 2556 2557 guest_enter(); 2558 2559 srcu_idx = srcu_read_lock(&vc->kvm->srcu); 2560 2561 __kvmppc_vcore_entry(); 2562 2563 srcu_read_unlock(&vc->kvm->srcu, srcu_idx); 2564 2565 spin_lock(&vc->lock); 2566 /* prevent other vcpu threads from doing kvmppc_start_thread() now */ 2567 vc->vcore_state = VCORE_EXITING; 2568 2569 /* wait for secondary threads to finish writing their state to memory */ 2570 kvmppc_wait_for_nap(); 2571 2572 /* Return to whole-core mode if we split the core earlier */ 2573 if (split > 1) { 2574 unsigned long hid0 = mfspr(SPRN_HID0); 2575 unsigned long loops = 0; 2576 2577 hid0 &= ~HID0_POWER8_DYNLPARDIS; 2578 stat_bit = HID0_POWER8_2LPARMODE | HID0_POWER8_4LPARMODE; 2579 mb(); 2580 mtspr(SPRN_HID0, hid0); 2581 isync(); 2582 for (;;) { 2583 hid0 = mfspr(SPRN_HID0); 2584 if (!(hid0 & stat_bit)) 2585 break; 2586 cpu_relax(); 2587 ++loops; 2588 } 2589 split_info.do_nap = 0; 2590 } 2591 2592 /* Let secondaries go back to the offline loop */ 2593 for (i = 0; i < controlled_threads; ++i) { 2594 kvmppc_release_hwthread(pcpu + i); 2595 if (sip && sip->napped[i]) 2596 kvmppc_ipi_thread(pcpu + i); 2597 cpumask_clear_cpu(pcpu + i, &vc->kvm->arch.cpu_in_guest); 2598 } 2599 2600 kvmppc_set_host_core(pcpu); 2601 2602 spin_unlock(&vc->lock); 2603 2604 /* make sure updates to secondary vcpu structs are visible now */ 2605 smp_mb(); 2606 guest_exit(); 2607 2608 for (sub = 0; sub < core_info.n_subcores; ++sub) 2609 list_for_each_entry_safe(pvc, vcnext, &core_info.vcs[sub], 2610 preempt_list) 2611 post_guest_process(pvc, pvc == vc); 2612 2613 spin_lock(&vc->lock); 2614 preempt_enable(); 2615 2616 out: 2617 vc->vcore_state = VCORE_INACTIVE; 2618 trace_kvmppc_run_core(vc, 1); 2619 } 2620 2621 /* 2622 * Wait for some other vcpu thread to execute us, and 2623 * wake us up when we need to handle something in the host. 
2624 */ 2625 static void kvmppc_wait_for_exec(struct kvmppc_vcore *vc, 2626 struct kvm_vcpu *vcpu, int wait_state) 2627 { 2628 DEFINE_WAIT(wait); 2629 2630 prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state); 2631 if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) { 2632 spin_unlock(&vc->lock); 2633 schedule(); 2634 spin_lock(&vc->lock); 2635 } 2636 finish_wait(&vcpu->arch.cpu_run, &wait); 2637 } 2638 2639 static void grow_halt_poll_ns(struct kvmppc_vcore *vc) 2640 { 2641 /* 10us base */ 2642 if (vc->halt_poll_ns == 0 && halt_poll_ns_grow) 2643 vc->halt_poll_ns = 10000; 2644 else 2645 vc->halt_poll_ns *= halt_poll_ns_grow; 2646 } 2647 2648 static void shrink_halt_poll_ns(struct kvmppc_vcore *vc) 2649 { 2650 if (halt_poll_ns_shrink == 0) 2651 vc->halt_poll_ns = 0; 2652 else 2653 vc->halt_poll_ns /= halt_poll_ns_shrink; 2654 } 2655 2656 /* 2657 * Check to see if any of the runnable vcpus on the vcore have pending 2658 * exceptions or are no longer ceded 2659 */ 2660 static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc) 2661 { 2662 struct kvm_vcpu *vcpu; 2663 int i; 2664 2665 for_each_runnable_thread(i, vcpu, vc) { 2666 if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded || 2667 vcpu->arch.prodded) 2668 return 1; 2669 } 2670 2671 return 0; 2672 } 2673 2674 /* 2675 * All the vcpus in this vcore are idle, so wait for a decrementer 2676 * or external interrupt to one of the vcpus. vc->lock is held. 2677 */ 2678 static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) 2679 { 2680 ktime_t cur, start_poll, start_wait; 2681 int do_sleep = 1; 2682 u64 block_ns; 2683 DECLARE_SWAITQUEUE(wait); 2684 2685 /* Poll for pending exceptions and ceded state */ 2686 cur = start_poll = ktime_get(); 2687 if (vc->halt_poll_ns) { 2688 ktime_t stop = ktime_add_ns(start_poll, vc->halt_poll_ns); 2689 ++vc->runner->stat.halt_attempted_poll; 2690 2691 vc->vcore_state = VCORE_POLLING; 2692 spin_unlock(&vc->lock); 2693 2694 do { 2695 if (kvmppc_vcore_check_block(vc)) { 2696 do_sleep = 0; 2697 break; 2698 } 2699 cur = ktime_get(); 2700 } while (single_task_running() && ktime_before(cur, stop)); 2701 2702 spin_lock(&vc->lock); 2703 vc->vcore_state = VCORE_INACTIVE; 2704 2705 if (!do_sleep) { 2706 ++vc->runner->stat.halt_successful_poll; 2707 goto out; 2708 } 2709 } 2710 2711 prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE); 2712 2713 if (kvmppc_vcore_check_block(vc)) { 2714 finish_swait(&vc->wq, &wait); 2715 do_sleep = 0; 2716 /* If we polled, count this as a successful poll */ 2717 if (vc->halt_poll_ns) 2718 ++vc->runner->stat.halt_successful_poll; 2719 goto out; 2720 } 2721 2722 start_wait = ktime_get(); 2723 2724 vc->vcore_state = VCORE_SLEEPING; 2725 trace_kvmppc_vcore_blocked(vc, 0); 2726 spin_unlock(&vc->lock); 2727 schedule(); 2728 finish_swait(&vc->wq, &wait); 2729 spin_lock(&vc->lock); 2730 vc->vcore_state = VCORE_INACTIVE; 2731 trace_kvmppc_vcore_blocked(vc, 1); 2732 ++vc->runner->stat.halt_successful_wait; 2733 2734 cur = ktime_get(); 2735 2736 out: 2737 block_ns = ktime_to_ns(cur) - ktime_to_ns(start_poll); 2738 2739 /* Attribute wait time */ 2740 if (do_sleep) { 2741 vc->runner->stat.halt_wait_ns += 2742 ktime_to_ns(cur) - ktime_to_ns(start_wait); 2743 /* Attribute failed poll time */ 2744 if (vc->halt_poll_ns) 2745 vc->runner->stat.halt_poll_fail_ns += 2746 ktime_to_ns(start_wait) - 2747 ktime_to_ns(start_poll); 2748 } else { 2749 /* Attribute successful poll time */ 2750 if (vc->halt_poll_ns) 2751 vc->runner->stat.halt_poll_success_ns += 2752 ktime_to_ns(cur) - 2753 ktime_to_ns(start_poll); 2754 
} 2755 2756 /* Adjust poll time */ 2757 if (halt_poll_ns) { 2758 if (block_ns <= vc->halt_poll_ns) 2759 ; 2760 /* We slept and blocked for longer than the max halt time */ 2761 else if (vc->halt_poll_ns && block_ns > halt_poll_ns) 2762 shrink_halt_poll_ns(vc); 2763 /* We slept and our poll time is too small */ 2764 else if (vc->halt_poll_ns < halt_poll_ns && 2765 block_ns < halt_poll_ns) 2766 grow_halt_poll_ns(vc); 2767 if (vc->halt_poll_ns > halt_poll_ns) 2768 vc->halt_poll_ns = halt_poll_ns; 2769 } else 2770 vc->halt_poll_ns = 0; 2771 2772 trace_kvmppc_vcore_wakeup(do_sleep, block_ns); 2773 } 2774 2775 static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) 2776 { 2777 int n_ceded, i; 2778 struct kvmppc_vcore *vc; 2779 struct kvm_vcpu *v; 2780 2781 trace_kvmppc_run_vcpu_enter(vcpu); 2782 2783 kvm_run->exit_reason = 0; 2784 vcpu->arch.ret = RESUME_GUEST; 2785 vcpu->arch.trap = 0; 2786 kvmppc_update_vpas(vcpu); 2787 2788 /* 2789 * Synchronize with other threads in this virtual core 2790 */ 2791 vc = vcpu->arch.vcore; 2792 spin_lock(&vc->lock); 2793 vcpu->arch.ceded = 0; 2794 vcpu->arch.run_task = current; 2795 vcpu->arch.kvm_run = kvm_run; 2796 vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb()); 2797 vcpu->arch.state = KVMPPC_VCPU_RUNNABLE; 2798 vcpu->arch.busy_preempt = TB_NIL; 2799 WRITE_ONCE(vc->runnable_threads[vcpu->arch.ptid], vcpu); 2800 ++vc->n_runnable; 2801 2802 /* 2803 * This happens the first time this is called for a vcpu. 2804 * If the vcore is already running, we may be able to start 2805 * this thread straight away and have it join in. 2806 */ 2807 if (!signal_pending(current)) { 2808 if (vc->vcore_state == VCORE_PIGGYBACK) { 2809 struct kvmppc_vcore *mvc = vc->master_vcore; 2810 if (spin_trylock(&mvc->lock)) { 2811 if (mvc->vcore_state == VCORE_RUNNING && 2812 !VCORE_IS_EXITING(mvc)) { 2813 kvmppc_create_dtl_entry(vcpu, vc); 2814 kvmppc_start_thread(vcpu, vc); 2815 trace_kvm_guest_enter(vcpu); 2816 } 2817 spin_unlock(&mvc->lock); 2818 } 2819 } else if (vc->vcore_state == VCORE_RUNNING && 2820 !VCORE_IS_EXITING(vc)) { 2821 kvmppc_create_dtl_entry(vcpu, vc); 2822 kvmppc_start_thread(vcpu, vc); 2823 trace_kvm_guest_enter(vcpu); 2824 } else if (vc->vcore_state == VCORE_SLEEPING) { 2825 swake_up(&vc->wq); 2826 } 2827 2828 } 2829 2830 while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE && 2831 !signal_pending(current)) { 2832 if (vc->vcore_state == VCORE_PREEMPT && vc->runner == NULL) 2833 kvmppc_vcore_end_preempt(vc); 2834 2835 if (vc->vcore_state != VCORE_INACTIVE) { 2836 kvmppc_wait_for_exec(vc, vcpu, TASK_INTERRUPTIBLE); 2837 continue; 2838 } 2839 for_each_runnable_thread(i, v, vc) { 2840 kvmppc_core_prepare_to_enter(v); 2841 if (signal_pending(v->arch.run_task)) { 2842 kvmppc_remove_runnable(vc, v); 2843 v->stat.signal_exits++; 2844 v->arch.kvm_run->exit_reason = KVM_EXIT_INTR; 2845 v->arch.ret = -EINTR; 2846 wake_up(&v->arch.cpu_run); 2847 } 2848 } 2849 if (!vc->n_runnable || vcpu->arch.state != KVMPPC_VCPU_RUNNABLE) 2850 break; 2851 n_ceded = 0; 2852 for_each_runnable_thread(i, v, vc) { 2853 if (!v->arch.pending_exceptions && !v->arch.prodded) 2854 n_ceded += v->arch.ceded; 2855 else 2856 v->arch.ceded = 0; 2857 } 2858 vc->runner = vcpu; 2859 if (n_ceded == vc->n_runnable) { 2860 kvmppc_vcore_blocked(vc); 2861 } else if (need_resched()) { 2862 kvmppc_vcore_preempt(vc); 2863 /* Let something else run */ 2864 cond_resched_lock(&vc->lock); 2865 if (vc->vcore_state == VCORE_PREEMPT) 2866 kvmppc_vcore_end_preempt(vc); 2867 } else { 2868 
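			/*
			 * Not all runnable vcpus have ceded and no reschedule
			 * is pending, so this task acts as the runner and
			 * takes the whole virtual core into the guest.
			 */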
kvmppc_run_core(vc); 2869 } 2870 vc->runner = NULL; 2871 } 2872 2873 while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE && 2874 (vc->vcore_state == VCORE_RUNNING || 2875 vc->vcore_state == VCORE_EXITING || 2876 vc->vcore_state == VCORE_PIGGYBACK)) 2877 kvmppc_wait_for_exec(vc, vcpu, TASK_UNINTERRUPTIBLE); 2878 2879 if (vc->vcore_state == VCORE_PREEMPT && vc->runner == NULL) 2880 kvmppc_vcore_end_preempt(vc); 2881 2882 if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) { 2883 kvmppc_remove_runnable(vc, vcpu); 2884 vcpu->stat.signal_exits++; 2885 kvm_run->exit_reason = KVM_EXIT_INTR; 2886 vcpu->arch.ret = -EINTR; 2887 } 2888 2889 if (vc->n_runnable && vc->vcore_state == VCORE_INACTIVE) { 2890 /* Wake up some vcpu to run the core */ 2891 i = -1; 2892 v = next_runnable_thread(vc, &i); 2893 wake_up(&v->arch.cpu_run); 2894 } 2895 2896 trace_kvmppc_run_vcpu_exit(vcpu, kvm_run); 2897 spin_unlock(&vc->lock); 2898 return vcpu->arch.ret; 2899 } 2900 2901 static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) 2902 { 2903 int r; 2904 int srcu_idx; 2905 2906 if (!vcpu->arch.sane) { 2907 run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 2908 return -EINVAL; 2909 } 2910 2911 kvmppc_core_prepare_to_enter(vcpu); 2912 2913 /* No need to go into the guest when all we'll do is come back out */ 2914 if (signal_pending(current)) { 2915 run->exit_reason = KVM_EXIT_INTR; 2916 return -EINTR; 2917 } 2918 2919 atomic_inc(&vcpu->kvm->arch.vcpus_running); 2920 /* Order vcpus_running vs. hpte_setup_done, see kvmppc_alloc_reset_hpt */ 2921 smp_mb(); 2922 2923 /* On the first time here, set up HTAB and VRMA */ 2924 if (!kvm_is_radix(vcpu->kvm) && !vcpu->kvm->arch.hpte_setup_done) { 2925 r = kvmppc_hv_setup_htab_rma(vcpu); 2926 if (r) 2927 goto out; 2928 } 2929 2930 flush_all_to_thread(current); 2931 2932 vcpu->arch.wqp = &vcpu->arch.vcore->wq; 2933 vcpu->arch.pgdir = current->mm->pgd; 2934 vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; 2935 2936 do { 2937 r = kvmppc_run_vcpu(run, vcpu); 2938 2939 if (run->exit_reason == KVM_EXIT_PAPR_HCALL && 2940 !(vcpu->arch.shregs.msr & MSR_PR)) { 2941 trace_kvm_hcall_enter(vcpu); 2942 r = kvmppc_pseries_do_hcall(vcpu); 2943 trace_kvm_hcall_exit(vcpu, r); 2944 kvmppc_core_prepare_to_enter(vcpu); 2945 } else if (r == RESUME_PAGE_FAULT) { 2946 srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 2947 r = kvmppc_book3s_hv_page_fault(run, vcpu, 2948 vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); 2949 srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); 2950 } else if (r == RESUME_PASSTHROUGH) 2951 r = kvmppc_xics_rm_complete(vcpu, 0); 2952 } while (is_kvmppc_resume_guest(r)); 2953 2954 out: 2955 vcpu->arch.state = KVMPPC_VCPU_NOTREADY; 2956 atomic_dec(&vcpu->kvm->arch.vcpus_running); 2957 return r; 2958 } 2959 2960 static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps, 2961 int linux_psize) 2962 { 2963 struct mmu_psize_def *def = &mmu_psize_defs[linux_psize]; 2964 2965 if (!def->shift) 2966 return; 2967 (*sps)->page_shift = def->shift; 2968 (*sps)->slb_enc = def->sllp; 2969 (*sps)->enc[0].page_shift = def->shift; 2970 (*sps)->enc[0].pte_enc = def->penc[linux_psize]; 2971 /* 2972 * Add 16MB MPSS support if host supports it 2973 */ 2974 if (linux_psize != MMU_PAGE_16M && def->penc[MMU_PAGE_16M] != -1) { 2975 (*sps)->enc[1].page_shift = 24; 2976 (*sps)->enc[1].pte_enc = def->penc[MMU_PAGE_16M]; 2977 } 2978 (*sps)++; 2979 } 2980 2981 static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm, 2982 struct kvm_ppc_smmu_info *info) 2983 { 2984 struct kvm_ppc_one_seg_page_size *sps; 2985 
	/*
	 * Since we don't yet support HPT guests on a radix host,
	 * return an error if the host uses radix.
	 */
	if (radix_enabled())
		return -EINVAL;

	info->flags = KVM_PPC_PAGE_SIZES_REAL;
	if (mmu_has_feature(MMU_FTR_1T_SEGMENT))
		info->flags |= KVM_PPC_1T_SEGMENTS;
	info->slb_size = mmu_slb_size;

	/* We only support these sizes for now, and no multi-size segments */
	sps = &info->sps[0];
	kvmppc_add_seg_page_size(&sps, MMU_PAGE_4K);
	kvmppc_add_seg_page_size(&sps, MMU_PAGE_64K);
	kvmppc_add_seg_page_size(&sps, MMU_PAGE_16M);

	return 0;
}

/*
 * Get (and clear) the dirty memory log for a memory slot.
 */
static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
					 struct kvm_dirty_log *log)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int i, r;
	unsigned long n;
	unsigned long *buf;
	struct kvm_vcpu *vcpu;

	mutex_lock(&kvm->slots_lock);

	r = -EINVAL;
	if (log->slot >= KVM_USER_MEM_SLOTS)
		goto out;

	slots = kvm_memslots(kvm);
	memslot = id_to_memslot(slots, log->slot);
	r = -ENOENT;
	if (!memslot->dirty_bitmap)
		goto out;

	/*
	 * Use second half of bitmap area because radix accumulates
	 * bits in the first half.
	 */
	n = kvm_dirty_bitmap_bytes(memslot);
	buf = memslot->dirty_bitmap + n / sizeof(long);
	memset(buf, 0, n);

	if (kvm_is_radix(kvm))
		r = kvmppc_hv_get_dirty_log_radix(kvm, memslot, buf);
	else
		r = kvmppc_hv_get_dirty_log_hpt(kvm, memslot, buf);
	if (r)
		goto out;

	/* Harvest dirty bits from VPA and DTL updates */
	/* Note: we never modify the SLB shadow buffer areas */
	kvm_for_each_vcpu(i, vcpu, kvm) {
		spin_lock(&vcpu->arch.vpa_update_lock);
		kvmppc_harvest_vpa_dirty(&vcpu->arch.vpa, memslot, buf);
		kvmppc_harvest_vpa_dirty(&vcpu->arch.dtl, memslot, buf);
		spin_unlock(&vcpu->arch.vpa_update_lock);
	}

	r = -EFAULT;
	if (copy_to_user(log->dirty_bitmap, buf, n))
		goto out;

	r = 0;
out:
	mutex_unlock(&kvm->slots_lock);
	return r;
}

static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *free,
					struct kvm_memory_slot *dont)
{
	if (!dont || free->arch.rmap != dont->arch.rmap) {
		vfree(free->arch.rmap);
		free->arch.rmap = NULL;
	}
}

static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot,
					 unsigned long npages)
{
	/*
	 * For now, if radix_enabled() then we only support radix guests,
	 * and in that case we don't need the rmap array.
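	 * For an HPT guest the array allocated below has one unsigned long
	 * per guest page, used to track the HPTEs mapping that page and
	 * its referenced/changed state.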
3081 */ 3082 if (radix_enabled()) { 3083 slot->arch.rmap = NULL; 3084 return 0; 3085 } 3086 3087 slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap)); 3088 if (!slot->arch.rmap) 3089 return -ENOMEM; 3090 3091 return 0; 3092 } 3093 3094 static int kvmppc_core_prepare_memory_region_hv(struct kvm *kvm, 3095 struct kvm_memory_slot *memslot, 3096 const struct kvm_userspace_memory_region *mem) 3097 { 3098 return 0; 3099 } 3100 3101 static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm, 3102 const struct kvm_userspace_memory_region *mem, 3103 const struct kvm_memory_slot *old, 3104 const struct kvm_memory_slot *new) 3105 { 3106 unsigned long npages = mem->memory_size >> PAGE_SHIFT; 3107 struct kvm_memslots *slots; 3108 struct kvm_memory_slot *memslot; 3109 3110 /* 3111 * If we are making a new memslot, it might make 3112 * some address that was previously cached as emulated 3113 * MMIO be no longer emulated MMIO, so invalidate 3114 * all the caches of emulated MMIO translations. 3115 */ 3116 if (npages) 3117 atomic64_inc(&kvm->arch.mmio_update); 3118 3119 if (npages && old->npages && !kvm_is_radix(kvm)) { 3120 /* 3121 * If modifying a memslot, reset all the rmap dirty bits. 3122 * If this is a new memslot, we don't need to do anything 3123 * since the rmap array starts out as all zeroes, 3124 * i.e. no pages are dirty. 3125 */ 3126 slots = kvm_memslots(kvm); 3127 memslot = id_to_memslot(slots, mem->slot); 3128 kvmppc_hv_get_dirty_log_hpt(kvm, memslot, NULL); 3129 } 3130 } 3131 3132 /* 3133 * Update LPCR values in kvm->arch and in vcores. 3134 * Caller must hold kvm->lock. 3135 */ 3136 void kvmppc_update_lpcr(struct kvm *kvm, unsigned long lpcr, unsigned long mask) 3137 { 3138 long int i; 3139 u32 cores_done = 0; 3140 3141 if ((kvm->arch.lpcr & mask) == lpcr) 3142 return; 3143 3144 kvm->arch.lpcr = (kvm->arch.lpcr & ~mask) | lpcr; 3145 3146 for (i = 0; i < KVM_MAX_VCORES; ++i) { 3147 struct kvmppc_vcore *vc = kvm->arch.vcores[i]; 3148 if (!vc) 3149 continue; 3150 spin_lock(&vc->lock); 3151 vc->lpcr = (vc->lpcr & ~mask) | lpcr; 3152 spin_unlock(&vc->lock); 3153 if (++cores_done >= kvm->arch.online_vcores) 3154 break; 3155 } 3156 } 3157 3158 static void kvmppc_mmu_destroy_hv(struct kvm_vcpu *vcpu) 3159 { 3160 return; 3161 } 3162 3163 static void kvmppc_setup_partition_table(struct kvm *kvm) 3164 { 3165 unsigned long dw0, dw1; 3166 3167 if (!kvm_is_radix(kvm)) { 3168 /* PS field - page size for VRMA */ 3169 dw0 = ((kvm->arch.vrma_slb_v & SLB_VSID_L) >> 1) | 3170 ((kvm->arch.vrma_slb_v & SLB_VSID_LP) << 1); 3171 /* HTABSIZE and HTABORG fields */ 3172 dw0 |= kvm->arch.sdr1; 3173 3174 /* Second dword as set by userspace */ 3175 dw1 = kvm->arch.process_table; 3176 } else { 3177 dw0 = PATB_HR | radix__get_tree_size() | 3178 __pa(kvm->arch.pgtable) | RADIX_PGD_INDEX_SIZE; 3179 dw1 = PATB_GR | kvm->arch.process_table; 3180 } 3181 3182 mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1); 3183 } 3184 3185 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) 3186 { 3187 int err = 0; 3188 struct kvm *kvm = vcpu->kvm; 3189 unsigned long hva; 3190 struct kvm_memory_slot *memslot; 3191 struct vm_area_struct *vma; 3192 unsigned long lpcr = 0, senc; 3193 unsigned long psize, porder; 3194 int srcu_idx; 3195 3196 mutex_lock(&kvm->lock); 3197 if (kvm->arch.hpte_setup_done) 3198 goto out; /* another vcpu beat us to it */ 3199 3200 /* Allocate hashed page table (if not done already) and reset it */ 3201 if (!kvm->arch.hpt.virt) { 3202 int order = KVM_DEFAULT_HPT_ORDER; 3203 struct kvm_hpt_info 
info; 3204 3205 err = kvmppc_allocate_hpt(&info, order); 3206 /* If we get here, it means userspace didn't specify a 3207 * size explicitly. So, try successively smaller 3208 * sizes if the default failed. */ 3209 while ((err == -ENOMEM) && --order >= PPC_MIN_HPT_ORDER) 3210 err = kvmppc_allocate_hpt(&info, order); 3211 3212 if (err < 0) { 3213 pr_err("KVM: Couldn't alloc HPT\n"); 3214 goto out; 3215 } 3216 3217 kvmppc_set_hpt(kvm, &info); 3218 } 3219 3220 /* Look up the memslot for guest physical address 0 */ 3221 srcu_idx = srcu_read_lock(&kvm->srcu); 3222 memslot = gfn_to_memslot(kvm, 0); 3223 3224 /* We must have some memory at 0 by now */ 3225 err = -EINVAL; 3226 if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) 3227 goto out_srcu; 3228 3229 /* Look up the VMA for the start of this memory slot */ 3230 hva = memslot->userspace_addr; 3231 down_read(¤t->mm->mmap_sem); 3232 vma = find_vma(current->mm, hva); 3233 if (!vma || vma->vm_start > hva || (vma->vm_flags & VM_IO)) 3234 goto up_out; 3235 3236 psize = vma_kernel_pagesize(vma); 3237 porder = __ilog2(psize); 3238 3239 up_read(¤t->mm->mmap_sem); 3240 3241 /* We can handle 4k, 64k or 16M pages in the VRMA */ 3242 err = -EINVAL; 3243 if (!(psize == 0x1000 || psize == 0x10000 || 3244 psize == 0x1000000)) 3245 goto out_srcu; 3246 3247 senc = slb_pgsize_encoding(psize); 3248 kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T | 3249 (VRMA_VSID << SLB_VSID_SHIFT_1T); 3250 /* Create HPTEs in the hash page table for the VRMA */ 3251 kvmppc_map_vrma(vcpu, memslot, porder); 3252 3253 /* Update VRMASD field in the LPCR */ 3254 if (!cpu_has_feature(CPU_FTR_ARCH_300)) { 3255 /* the -4 is to account for senc values starting at 0x10 */ 3256 lpcr = senc << (LPCR_VRMASD_SH - 4); 3257 kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD); 3258 } else { 3259 kvmppc_setup_partition_table(kvm); 3260 } 3261 3262 /* Order updates to kvm->arch.lpcr etc. vs. hpte_setup_done */ 3263 smp_wmb(); 3264 kvm->arch.hpte_setup_done = 1; 3265 err = 0; 3266 out_srcu: 3267 srcu_read_unlock(&kvm->srcu, srcu_idx); 3268 out: 3269 mutex_unlock(&kvm->lock); 3270 return err; 3271 3272 up_out: 3273 up_read(¤t->mm->mmap_sem); 3274 goto out_srcu; 3275 } 3276 3277 #ifdef CONFIG_KVM_XICS 3278 /* 3279 * Allocate a per-core structure for managing state about which cores are 3280 * running in the host versus the guest and for exchanging data between 3281 * real mode KVM and CPU running in the host. 3282 * This is only done for the first VM. 3283 * The allocated structure stays even if all VMs have stopped. 3284 * It is only freed when the kvm-hv module is unloaded. 3285 * It's OK for this routine to fail, we just don't support host 3286 * core operations like redirecting H_IPI wakeups. 3287 */ 3288 void kvmppc_alloc_host_rm_ops(void) 3289 { 3290 struct kvmppc_host_rm_ops *ops; 3291 unsigned long l_ops; 3292 int cpu, core; 3293 int size; 3294 3295 /* Not the first time here ? 
*/ 3296 if (kvmppc_host_rm_ops_hv != NULL) 3297 return; 3298 3299 ops = kzalloc(sizeof(struct kvmppc_host_rm_ops), GFP_KERNEL); 3300 if (!ops) 3301 return; 3302 3303 size = cpu_nr_cores() * sizeof(struct kvmppc_host_rm_core); 3304 ops->rm_core = kzalloc(size, GFP_KERNEL); 3305 3306 if (!ops->rm_core) { 3307 kfree(ops); 3308 return; 3309 } 3310 3311 get_online_cpus(); 3312 3313 for (cpu = 0; cpu < nr_cpu_ids; cpu += threads_per_core) { 3314 if (!cpu_online(cpu)) 3315 continue; 3316 3317 core = cpu >> threads_shift; 3318 ops->rm_core[core].rm_state.in_host = 1; 3319 } 3320 3321 ops->vcpu_kick = kvmppc_fast_vcpu_kick_hv; 3322 3323 /* 3324 * Make the contents of the kvmppc_host_rm_ops structure visible 3325 * to other CPUs before we assign it to the global variable. 3326 * Do an atomic assignment (no locks used here), but if someone 3327 * beats us to it, just free our copy and return. 3328 */ 3329 smp_wmb(); 3330 l_ops = (unsigned long) ops; 3331 3332 if (cmpxchg64((unsigned long *)&kvmppc_host_rm_ops_hv, 0, l_ops)) { 3333 put_online_cpus(); 3334 kfree(ops->rm_core); 3335 kfree(ops); 3336 return; 3337 } 3338 3339 cpuhp_setup_state_nocalls(CPUHP_KVM_PPC_BOOK3S_PREPARE, 3340 "ppc/kvm_book3s:prepare", 3341 kvmppc_set_host_core, 3342 kvmppc_clear_host_core); 3343 put_online_cpus(); 3344 } 3345 3346 void kvmppc_free_host_rm_ops(void) 3347 { 3348 if (kvmppc_host_rm_ops_hv) { 3349 cpuhp_remove_state_nocalls(CPUHP_KVM_PPC_BOOK3S_PREPARE); 3350 kfree(kvmppc_host_rm_ops_hv->rm_core); 3351 kfree(kvmppc_host_rm_ops_hv); 3352 kvmppc_host_rm_ops_hv = NULL; 3353 } 3354 } 3355 #endif 3356 3357 static int kvmppc_core_init_vm_hv(struct kvm *kvm) 3358 { 3359 unsigned long lpcr, lpid; 3360 char buf[32]; 3361 int ret; 3362 3363 /* Allocate the guest's logical partition ID */ 3364 3365 lpid = kvmppc_alloc_lpid(); 3366 if ((long)lpid < 0) 3367 return -ENOMEM; 3368 kvm->arch.lpid = lpid; 3369 3370 kvmppc_alloc_host_rm_ops(); 3371 3372 /* 3373 * Since we don't flush the TLB when tearing down a VM, 3374 * and this lpid might have previously been used, 3375 * make sure we flush on each core before running the new VM. 3376 * On POWER9, the tlbie in mmu_partition_table_set_entry() 3377 * does this flush for us. 3378 */ 3379 if (!cpu_has_feature(CPU_FTR_ARCH_300)) 3380 cpumask_setall(&kvm->arch.need_tlb_flush); 3381 3382 /* Start out with the default set of hcalls enabled */ 3383 memcpy(kvm->arch.enabled_hcalls, default_enabled_hcalls, 3384 sizeof(kvm->arch.enabled_hcalls)); 3385 3386 if (!cpu_has_feature(CPU_FTR_ARCH_300)) 3387 kvm->arch.host_sdr1 = mfspr(SPRN_SDR1); 3388 3389 /* Init LPCR for virtual RMA mode */ 3390 kvm->arch.host_lpid = mfspr(SPRN_LPID); 3391 kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR); 3392 lpcr &= LPCR_PECE | LPCR_LPES; 3393 lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE | 3394 LPCR_VPM0 | LPCR_VPM1; 3395 kvm->arch.vrma_slb_v = SLB_VSID_B_1T | 3396 (VRMA_VSID << SLB_VSID_SHIFT_1T); 3397 /* On POWER8 turn on online bit to enable PURR/SPURR */ 3398 if (cpu_has_feature(CPU_FTR_ARCH_207S)) 3399 lpcr |= LPCR_ONL; 3400 /* 3401 * On POWER9, VPM0 bit is reserved (VPM0=1 behaviour is assumed) 3402 * Set HVICE bit to enable hypervisor virtualization interrupts. 3403 */ 3404 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 3405 lpcr &= ~LPCR_VPM0; 3406 lpcr |= LPCR_HVICE; 3407 } 3408 3409 /* 3410 * For now, if the host uses radix, the guest must be radix. 
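	 * In that case the code below marks the VM as radix, sets the
	 * radix LPCR bits (UPRT, GTSE, HR, with VPM1 cleared), initializes
	 * the guest radix page tables and points the partition table at
	 * them.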
3411 */ 3412 if (radix_enabled()) { 3413 kvm->arch.radix = 1; 3414 lpcr &= ~LPCR_VPM1; 3415 lpcr |= LPCR_UPRT | LPCR_GTSE | LPCR_HR; 3416 ret = kvmppc_init_vm_radix(kvm); 3417 if (ret) { 3418 kvmppc_free_lpid(kvm->arch.lpid); 3419 return ret; 3420 } 3421 kvmppc_setup_partition_table(kvm); 3422 } 3423 3424 kvm->arch.lpcr = lpcr; 3425 3426 /* Initialization for future HPT resizes */ 3427 kvm->arch.resize_hpt = NULL; 3428 3429 /* 3430 * Work out how many sets the TLB has, for the use of 3431 * the TLB invalidation loop in book3s_hv_rmhandlers.S. 3432 */ 3433 if (kvm_is_radix(kvm)) 3434 kvm->arch.tlb_sets = POWER9_TLB_SETS_RADIX; /* 128 */ 3435 else if (cpu_has_feature(CPU_FTR_ARCH_300)) 3436 kvm->arch.tlb_sets = POWER9_TLB_SETS_HASH; /* 256 */ 3437 else if (cpu_has_feature(CPU_FTR_ARCH_207S)) 3438 kvm->arch.tlb_sets = POWER8_TLB_SETS; /* 512 */ 3439 else 3440 kvm->arch.tlb_sets = POWER7_TLB_SETS; /* 128 */ 3441 3442 /* 3443 * Track that we now have a HV mode VM active. This blocks secondary 3444 * CPU threads from coming online. 3445 * On POWER9, we only need to do this for HPT guests on a radix 3446 * host, which is not yet supported. 3447 */ 3448 if (!cpu_has_feature(CPU_FTR_ARCH_300)) 3449 kvm_hv_vm_activated(); 3450 3451 /* 3452 * Create a debugfs directory for the VM 3453 */ 3454 snprintf(buf, sizeof(buf), "vm%d", current->pid); 3455 kvm->arch.debugfs_dir = debugfs_create_dir(buf, kvm_debugfs_dir); 3456 if (!IS_ERR_OR_NULL(kvm->arch.debugfs_dir)) 3457 kvmppc_mmu_debugfs_init(kvm); 3458 3459 return 0; 3460 } 3461 3462 static void kvmppc_free_vcores(struct kvm *kvm) 3463 { 3464 long int i; 3465 3466 for (i = 0; i < KVM_MAX_VCORES; ++i) 3467 kfree(kvm->arch.vcores[i]); 3468 kvm->arch.online_vcores = 0; 3469 } 3470 3471 static void kvmppc_core_destroy_vm_hv(struct kvm *kvm) 3472 { 3473 debugfs_remove_recursive(kvm->arch.debugfs_dir); 3474 3475 if (!cpu_has_feature(CPU_FTR_ARCH_300)) 3476 kvm_hv_vm_deactivated(); 3477 3478 kvmppc_free_vcores(kvm); 3479 3480 kvmppc_free_lpid(kvm->arch.lpid); 3481 3482 if (kvm_is_radix(kvm)) 3483 kvmppc_free_radix(kvm); 3484 else 3485 kvmppc_free_hpt(&kvm->arch.hpt); 3486 3487 kvmppc_free_pimap(kvm); 3488 } 3489 3490 /* We don't need to emulate any privileged instructions or dcbz */ 3491 static int kvmppc_core_emulate_op_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, 3492 unsigned int inst, int *advance) 3493 { 3494 return EMULATE_FAIL; 3495 } 3496 3497 static int kvmppc_core_emulate_mtspr_hv(struct kvm_vcpu *vcpu, int sprn, 3498 ulong spr_val) 3499 { 3500 return EMULATE_FAIL; 3501 } 3502 3503 static int kvmppc_core_emulate_mfspr_hv(struct kvm_vcpu *vcpu, int sprn, 3504 ulong *spr_val) 3505 { 3506 return EMULATE_FAIL; 3507 } 3508 3509 static int kvmppc_core_check_processor_compat_hv(void) 3510 { 3511 if (!cpu_has_feature(CPU_FTR_HVMODE) || 3512 !cpu_has_feature(CPU_FTR_ARCH_206)) 3513 return -EIO; 3514 3515 return 0; 3516 } 3517 3518 #ifdef CONFIG_KVM_XICS 3519 3520 void kvmppc_free_pimap(struct kvm *kvm) 3521 { 3522 kfree(kvm->arch.pimap); 3523 } 3524 3525 static struct kvmppc_passthru_irqmap *kvmppc_alloc_pimap(void) 3526 { 3527 return kzalloc(sizeof(struct kvmppc_passthru_irqmap), GFP_KERNEL); 3528 } 3529 3530 static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) 3531 { 3532 struct irq_desc *desc; 3533 struct kvmppc_irq_map *irq_map; 3534 struct kvmppc_passthru_irqmap *pimap; 3535 struct irq_chip *chip; 3536 int i; 3537 3538 if (!kvm_irq_bypass) 3539 return 1; 3540 3541 desc = irq_to_desc(host_irq); 3542 if (!desc) 3543 return 
-EIO; 3544 3545 mutex_lock(&kvm->lock); 3546 3547 pimap = kvm->arch.pimap; 3548 if (pimap == NULL) { 3549 /* First call, allocate structure to hold IRQ map */ 3550 pimap = kvmppc_alloc_pimap(); 3551 if (pimap == NULL) { 3552 mutex_unlock(&kvm->lock); 3553 return -ENOMEM; 3554 } 3555 kvm->arch.pimap = pimap; 3556 } 3557 3558 /* 3559 * For now, we only support interrupts for which the EOI operation 3560 * is an OPAL call followed by a write to XIRR, since that's 3561 * what our real-mode EOI code does. 3562 */ 3563 chip = irq_data_get_irq_chip(&desc->irq_data); 3564 if (!chip || !is_pnv_opal_msi(chip)) { 3565 pr_warn("kvmppc_set_passthru_irq_hv: Could not assign IRQ map for (%d,%d)\n", 3566 host_irq, guest_gsi); 3567 mutex_unlock(&kvm->lock); 3568 return -ENOENT; 3569 } 3570 3571 /* 3572 * See if we already have an entry for this guest IRQ number. 3573 * If it's mapped to a hardware IRQ number, that's an error, 3574 * otherwise re-use this entry. 3575 */ 3576 for (i = 0; i < pimap->n_mapped; i++) { 3577 if (guest_gsi == pimap->mapped[i].v_hwirq) { 3578 if (pimap->mapped[i].r_hwirq) { 3579 mutex_unlock(&kvm->lock); 3580 return -EINVAL; 3581 } 3582 break; 3583 } 3584 } 3585 3586 if (i == KVMPPC_PIRQ_MAPPED) { 3587 mutex_unlock(&kvm->lock); 3588 return -EAGAIN; /* table is full */ 3589 } 3590 3591 irq_map = &pimap->mapped[i]; 3592 3593 irq_map->v_hwirq = guest_gsi; 3594 irq_map->desc = desc; 3595 3596 /* 3597 * Order the above two stores before the next to serialize with 3598 * the KVM real mode handler. 3599 */ 3600 smp_wmb(); 3601 irq_map->r_hwirq = desc->irq_data.hwirq; 3602 3603 if (i == pimap->n_mapped) 3604 pimap->n_mapped++; 3605 3606 kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq); 3607 3608 mutex_unlock(&kvm->lock); 3609 3610 return 0; 3611 } 3612 3613 static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) 3614 { 3615 struct irq_desc *desc; 3616 struct kvmppc_passthru_irqmap *pimap; 3617 int i; 3618 3619 if (!kvm_irq_bypass) 3620 return 0; 3621 3622 desc = irq_to_desc(host_irq); 3623 if (!desc) 3624 return -EIO; 3625 3626 mutex_lock(&kvm->lock); 3627 3628 if (kvm->arch.pimap == NULL) { 3629 mutex_unlock(&kvm->lock); 3630 return 0; 3631 } 3632 pimap = kvm->arch.pimap; 3633 3634 for (i = 0; i < pimap->n_mapped; i++) { 3635 if (guest_gsi == pimap->mapped[i].v_hwirq) 3636 break; 3637 } 3638 3639 if (i == pimap->n_mapped) { 3640 mutex_unlock(&kvm->lock); 3641 return -ENODEV; 3642 } 3643 3644 kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq); 3645 3646 /* invalidate the entry */ 3647 pimap->mapped[i].r_hwirq = 0; 3648 3649 /* 3650 * We don't free this structure even when the count goes to 3651 * zero. The structure is freed when we destroy the VM. 
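	 * n_mapped is likewise never decremented; a slot whose r_hwirq
	 * has been cleared above is simply re-used the next time the
	 * same guest IRQ is mapped.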
3652 */ 3653 3654 mutex_unlock(&kvm->lock); 3655 return 0; 3656 } 3657 3658 static int kvmppc_irq_bypass_add_producer_hv(struct irq_bypass_consumer *cons, 3659 struct irq_bypass_producer *prod) 3660 { 3661 int ret = 0; 3662 struct kvm_kernel_irqfd *irqfd = 3663 container_of(cons, struct kvm_kernel_irqfd, consumer); 3664 3665 irqfd->producer = prod; 3666 3667 ret = kvmppc_set_passthru_irq(irqfd->kvm, prod->irq, irqfd->gsi); 3668 if (ret) 3669 pr_info("kvmppc_set_passthru_irq (irq %d, gsi %d) fails: %d\n", 3670 prod->irq, irqfd->gsi, ret); 3671 3672 return ret; 3673 } 3674 3675 static void kvmppc_irq_bypass_del_producer_hv(struct irq_bypass_consumer *cons, 3676 struct irq_bypass_producer *prod) 3677 { 3678 int ret; 3679 struct kvm_kernel_irqfd *irqfd = 3680 container_of(cons, struct kvm_kernel_irqfd, consumer); 3681 3682 irqfd->producer = NULL; 3683 3684 /* 3685 * When producer of consumer is unregistered, we change back to 3686 * default external interrupt handling mode - KVM real mode 3687 * will switch back to host. 3688 */ 3689 ret = kvmppc_clr_passthru_irq(irqfd->kvm, prod->irq, irqfd->gsi); 3690 if (ret) 3691 pr_warn("kvmppc_clr_passthru_irq (irq %d, gsi %d) fails: %d\n", 3692 prod->irq, irqfd->gsi, ret); 3693 } 3694 #endif 3695 3696 static long kvm_arch_vm_ioctl_hv(struct file *filp, 3697 unsigned int ioctl, unsigned long arg) 3698 { 3699 struct kvm *kvm __maybe_unused = filp->private_data; 3700 void __user *argp = (void __user *)arg; 3701 long r; 3702 3703 switch (ioctl) { 3704 3705 case KVM_PPC_ALLOCATE_HTAB: { 3706 u32 htab_order; 3707 3708 r = -EFAULT; 3709 if (get_user(htab_order, (u32 __user *)argp)) 3710 break; 3711 r = kvmppc_alloc_reset_hpt(kvm, htab_order); 3712 if (r) 3713 break; 3714 r = 0; 3715 break; 3716 } 3717 3718 case KVM_PPC_GET_HTAB_FD: { 3719 struct kvm_get_htab_fd ghf; 3720 3721 r = -EFAULT; 3722 if (copy_from_user(&ghf, argp, sizeof(ghf))) 3723 break; 3724 r = kvm_vm_ioctl_get_htab_fd(kvm, &ghf); 3725 break; 3726 } 3727 3728 case KVM_PPC_RESIZE_HPT_PREPARE: { 3729 struct kvm_ppc_resize_hpt rhpt; 3730 3731 r = -EFAULT; 3732 if (copy_from_user(&rhpt, argp, sizeof(rhpt))) 3733 break; 3734 3735 r = kvm_vm_ioctl_resize_hpt_prepare(kvm, &rhpt); 3736 break; 3737 } 3738 3739 case KVM_PPC_RESIZE_HPT_COMMIT: { 3740 struct kvm_ppc_resize_hpt rhpt; 3741 3742 r = -EFAULT; 3743 if (copy_from_user(&rhpt, argp, sizeof(rhpt))) 3744 break; 3745 3746 r = kvm_vm_ioctl_resize_hpt_commit(kvm, &rhpt); 3747 break; 3748 } 3749 3750 default: 3751 r = -ENOTTY; 3752 } 3753 3754 return r; 3755 } 3756 3757 /* 3758 * List of hcall numbers to enable by default. 3759 * For compatibility with old userspace, we enable by default 3760 * all hcalls that were implemented before the hcall-enabling 3761 * facility was added. Note this list should not include H_RTAS. 
3762 */ 3763 static unsigned int default_hcall_list[] = { 3764 H_REMOVE, 3765 H_ENTER, 3766 H_READ, 3767 H_PROTECT, 3768 H_BULK_REMOVE, 3769 H_GET_TCE, 3770 H_PUT_TCE, 3771 H_SET_DABR, 3772 H_SET_XDABR, 3773 H_CEDE, 3774 H_PROD, 3775 H_CONFER, 3776 H_REGISTER_VPA, 3777 #ifdef CONFIG_KVM_XICS 3778 H_EOI, 3779 H_CPPR, 3780 H_IPI, 3781 H_IPOLL, 3782 H_XIRR, 3783 H_XIRR_X, 3784 #endif 3785 0 3786 }; 3787 3788 static void init_default_hcalls(void) 3789 { 3790 int i; 3791 unsigned int hcall; 3792 3793 for (i = 0; default_hcall_list[i]; ++i) { 3794 hcall = default_hcall_list[i]; 3795 WARN_ON(!kvmppc_hcall_impl_hv(hcall)); 3796 __set_bit(hcall / 4, default_enabled_hcalls); 3797 } 3798 } 3799 3800 static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg) 3801 { 3802 unsigned long lpcr; 3803 int radix; 3804 3805 /* If not on a POWER9, reject it */ 3806 if (!cpu_has_feature(CPU_FTR_ARCH_300)) 3807 return -ENODEV; 3808 3809 /* If any unknown flags set, reject it */ 3810 if (cfg->flags & ~(KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE)) 3811 return -EINVAL; 3812 3813 /* We can't change a guest to/from radix yet */ 3814 radix = !!(cfg->flags & KVM_PPC_MMUV3_RADIX); 3815 if (radix != kvm_is_radix(kvm)) 3816 return -EINVAL; 3817 3818 /* GR (guest radix) bit in process_table field must match */ 3819 if (!!(cfg->process_table & PATB_GR) != radix) 3820 return -EINVAL; 3821 3822 /* Process table size field must be reasonable, i.e. <= 24 */ 3823 if ((cfg->process_table & PRTS_MASK) > 24) 3824 return -EINVAL; 3825 3826 kvm->arch.process_table = cfg->process_table; 3827 kvmppc_setup_partition_table(kvm); 3828 3829 lpcr = (cfg->flags & KVM_PPC_MMUV3_GTSE) ? LPCR_GTSE : 0; 3830 kvmppc_update_lpcr(kvm, lpcr, LPCR_GTSE); 3831 3832 return 0; 3833 } 3834 3835 static struct kvmppc_ops kvm_ops_hv = { 3836 .get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv, 3837 .set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv, 3838 .get_one_reg = kvmppc_get_one_reg_hv, 3839 .set_one_reg = kvmppc_set_one_reg_hv, 3840 .vcpu_load = kvmppc_core_vcpu_load_hv, 3841 .vcpu_put = kvmppc_core_vcpu_put_hv, 3842 .set_msr = kvmppc_set_msr_hv, 3843 .vcpu_run = kvmppc_vcpu_run_hv, 3844 .vcpu_create = kvmppc_core_vcpu_create_hv, 3845 .vcpu_free = kvmppc_core_vcpu_free_hv, 3846 .check_requests = kvmppc_core_check_requests_hv, 3847 .get_dirty_log = kvm_vm_ioctl_get_dirty_log_hv, 3848 .flush_memslot = kvmppc_core_flush_memslot_hv, 3849 .prepare_memory_region = kvmppc_core_prepare_memory_region_hv, 3850 .commit_memory_region = kvmppc_core_commit_memory_region_hv, 3851 .unmap_hva = kvm_unmap_hva_hv, 3852 .unmap_hva_range = kvm_unmap_hva_range_hv, 3853 .age_hva = kvm_age_hva_hv, 3854 .test_age_hva = kvm_test_age_hva_hv, 3855 .set_spte_hva = kvm_set_spte_hva_hv, 3856 .mmu_destroy = kvmppc_mmu_destroy_hv, 3857 .free_memslot = kvmppc_core_free_memslot_hv, 3858 .create_memslot = kvmppc_core_create_memslot_hv, 3859 .init_vm = kvmppc_core_init_vm_hv, 3860 .destroy_vm = kvmppc_core_destroy_vm_hv, 3861 .get_smmu_info = kvm_vm_ioctl_get_smmu_info_hv, 3862 .emulate_op = kvmppc_core_emulate_op_hv, 3863 .emulate_mtspr = kvmppc_core_emulate_mtspr_hv, 3864 .emulate_mfspr = kvmppc_core_emulate_mfspr_hv, 3865 .fast_vcpu_kick = kvmppc_fast_vcpu_kick_hv, 3866 .arch_vm_ioctl = kvm_arch_vm_ioctl_hv, 3867 .hcall_implemented = kvmppc_hcall_impl_hv, 3868 #ifdef CONFIG_KVM_XICS 3869 .irq_bypass_add_producer = kvmppc_irq_bypass_add_producer_hv, 3870 .irq_bypass_del_producer = kvmppc_irq_bypass_del_producer_hv, 3871 #endif 3872 .configure_mmu = kvmhv_configure_mmu, 3873 
.get_rmmu_info = kvmhv_get_rmmu_info, 3874 }; 3875 3876 static int kvm_init_subcore_bitmap(void) 3877 { 3878 int i, j; 3879 int nr_cores = cpu_nr_cores(); 3880 struct sibling_subcore_state *sibling_subcore_state; 3881 3882 for (i = 0; i < nr_cores; i++) { 3883 int first_cpu = i * threads_per_core; 3884 int node = cpu_to_node(first_cpu); 3885 3886 /* Ignore if it is already allocated. */ 3887 if (paca[first_cpu].sibling_subcore_state) 3888 continue; 3889 3890 sibling_subcore_state = 3891 kmalloc_node(sizeof(struct sibling_subcore_state), 3892 GFP_KERNEL, node); 3893 if (!sibling_subcore_state) 3894 return -ENOMEM; 3895 3896 memset(sibling_subcore_state, 0, 3897 sizeof(struct sibling_subcore_state)); 3898 3899 for (j = 0; j < threads_per_core; j++) { 3900 int cpu = first_cpu + j; 3901 3902 paca[cpu].sibling_subcore_state = sibling_subcore_state; 3903 } 3904 } 3905 return 0; 3906 } 3907 3908 static int kvmppc_radix_possible(void) 3909 { 3910 return cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled(); 3911 } 3912 3913 static int kvmppc_book3s_init_hv(void) 3914 { 3915 int r; 3916 /* 3917 * FIXME!! Do we need to check on all cpus ? 3918 */ 3919 r = kvmppc_core_check_processor_compat_hv(); 3920 if (r < 0) 3921 return -ENODEV; 3922 3923 r = kvm_init_subcore_bitmap(); 3924 if (r) 3925 return r; 3926 3927 /* 3928 * We need a way of accessing the XICS interrupt controller, 3929 * either directly, via paca[cpu].kvm_hstate.xics_phys, or 3930 * indirectly, via OPAL. 3931 */ 3932 #ifdef CONFIG_SMP 3933 if (!get_paca()->kvm_hstate.xics_phys) { 3934 struct device_node *np; 3935 3936 np = of_find_compatible_node(NULL, NULL, "ibm,opal-intc"); 3937 if (!np) { 3938 pr_err("KVM-HV: Cannot determine method for accessing XICS\n"); 3939 return -ENODEV; 3940 } 3941 } 3942 #endif 3943 3944 kvm_ops_hv.owner = THIS_MODULE; 3945 kvmppc_hv_ops = &kvm_ops_hv; 3946 3947 init_default_hcalls(); 3948 3949 init_vcore_lists(); 3950 3951 r = kvmppc_mmu_hv_init(); 3952 if (r) 3953 return r; 3954 3955 if (kvmppc_radix_possible()) 3956 r = kvmppc_radix_init(); 3957 return r; 3958 } 3959 3960 static void kvmppc_book3s_exit_hv(void) 3961 { 3962 kvmppc_free_host_rm_ops(); 3963 if (kvmppc_radix_possible()) 3964 kvmppc_radix_exit(); 3965 kvmppc_hv_ops = NULL; 3966 } 3967 3968 module_init(kvmppc_book3s_init_hv); 3969 module_exit(kvmppc_book3s_exit_hv); 3970 MODULE_LICENSE("GPL"); 3971 MODULE_ALIAS_MISCDEV(KVM_MINOR); 3972 MODULE_ALIAS("devname:kvm"); 3973 3974