1 /* 2 * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> 3 * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved. 4 * 5 * Authors: 6 * Paul Mackerras <paulus@au1.ibm.com> 7 * Alexander Graf <agraf@suse.de> 8 * Kevin Wolf <mail@kevin-wolf.de> 9 * 10 * Description: KVM functions specific to running on Book 3S 11 * processors in hypervisor mode (specifically POWER7 and later). 12 * 13 * This file is derived from arch/powerpc/kvm/book3s.c, 14 * by Alexander Graf <agraf@suse.de>. 15 * 16 * This program is free software; you can redistribute it and/or modify 17 * it under the terms of the GNU General Public License, version 2, as 18 * published by the Free Software Foundation. 19 */ 20 21 #include <linux/kvm_host.h> 22 #include <linux/kernel.h> 23 #include <linux/err.h> 24 #include <linux/slab.h> 25 #include <linux/preempt.h> 26 #include <linux/sched/signal.h> 27 #include <linux/sched/stat.h> 28 #include <linux/delay.h> 29 #include <linux/export.h> 30 #include <linux/fs.h> 31 #include <linux/anon_inodes.h> 32 #include <linux/cpu.h> 33 #include <linux/cpumask.h> 34 #include <linux/spinlock.h> 35 #include <linux/page-flags.h> 36 #include <linux/srcu.h> 37 #include <linux/miscdevice.h> 38 #include <linux/debugfs.h> 39 #include <linux/gfp.h> 40 #include <linux/vmalloc.h> 41 #include <linux/highmem.h> 42 #include <linux/hugetlb.h> 43 #include <linux/kvm_irqfd.h> 44 #include <linux/irqbypass.h> 45 #include <linux/module.h> 46 #include <linux/compiler.h> 47 #include <linux/of.h> 48 49 #include <asm/reg.h> 50 #include <asm/ppc-opcode.h> 51 #include <asm/asm-prototypes.h> 52 #include <asm/disassemble.h> 53 #include <asm/cputable.h> 54 #include <asm/cacheflush.h> 55 #include <asm/tlbflush.h> 56 #include <linux/uaccess.h> 57 #include <asm/io.h> 58 #include <asm/kvm_ppc.h> 59 #include <asm/kvm_book3s.h> 60 #include <asm/mmu_context.h> 61 #include <asm/lppaca.h> 62 #include <asm/processor.h> 63 #include <asm/cputhreads.h> 64 #include <asm/page.h> 65 #include <asm/hvcall.h> 66 #include <asm/switch_to.h> 67 #include <asm/smp.h> 68 #include <asm/dbell.h> 69 #include <asm/hmi.h> 70 #include <asm/pnv-pci.h> 71 #include <asm/mmu.h> 72 #include <asm/opal.h> 73 #include <asm/xics.h> 74 #include <asm/xive.h> 75 76 #include "book3s.h" 77 78 #define CREATE_TRACE_POINTS 79 #include "trace_hv.h" 80 81 /* #define EXIT_DEBUG */ 82 /* #define EXIT_DEBUG_SIMPLE */ 83 /* #define EXIT_DEBUG_INT */ 84 85 /* Used to indicate that a guest page fault needs to be handled */ 86 #define RESUME_PAGE_FAULT (RESUME_GUEST | RESUME_FLAG_ARCH1) 87 /* Used to indicate that a guest passthrough interrupt needs to be handled */ 88 #define RESUME_PASSTHROUGH (RESUME_GUEST | RESUME_FLAG_ARCH2) 89 90 /* Used as a "null" value for timebase values */ 91 #define TB_NIL (~(u64)0) 92 93 static DECLARE_BITMAP(default_enabled_hcalls, MAX_HCALL_OPCODE/4 + 1); 94 95 static int dynamic_mt_modes = 6; 96 module_param(dynamic_mt_modes, int, S_IRUGO | S_IWUSR); 97 MODULE_PARM_DESC(dynamic_mt_modes, "Set of allowed dynamic micro-threading modes: 0 (= none), 2, 4, or 6 (= 2 or 4)"); 98 static int target_smt_mode; 99 module_param(target_smt_mode, int, S_IRUGO | S_IWUSR); 100 MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)"); 101 102 static bool indep_threads_mode = true; 103 module_param(indep_threads_mode, bool, S_IRUGO | S_IWUSR); 104 MODULE_PARM_DESC(indep_threads_mode, "Independent-threads mode (only on POWER9)"); 105 106 #ifdef CONFIG_KVM_XICS 107 static struct kernel_param_ops module_param_ops = { 108 .set = 
param_set_int, 109 .get = param_get_int, 110 }; 111 112 module_param_cb(kvm_irq_bypass, &module_param_ops, &kvm_irq_bypass, 113 S_IRUGO | S_IWUSR); 114 MODULE_PARM_DESC(kvm_irq_bypass, "Bypass passthrough interrupt optimization"); 115 116 module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect, 117 S_IRUGO | S_IWUSR); 118 MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core"); 119 #endif 120 121 static void kvmppc_end_cede(struct kvm_vcpu *vcpu); 122 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); 123 124 static inline struct kvm_vcpu *next_runnable_thread(struct kvmppc_vcore *vc, 125 int *ip) 126 { 127 int i = *ip; 128 struct kvm_vcpu *vcpu; 129 130 while (++i < MAX_SMT_THREADS) { 131 vcpu = READ_ONCE(vc->runnable_threads[i]); 132 if (vcpu) { 133 *ip = i; 134 return vcpu; 135 } 136 } 137 return NULL; 138 } 139 140 /* Used to traverse the list of runnable threads for a given vcore */ 141 #define for_each_runnable_thread(i, vcpu, vc) \ 142 for (i = -1; (vcpu = next_runnable_thread(vc, &i)); ) 143 144 static bool kvmppc_ipi_thread(int cpu) 145 { 146 unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER); 147 148 /* On POWER9 we can use msgsnd to IPI any cpu */ 149 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 150 msg |= get_hard_smp_processor_id(cpu); 151 smp_mb(); 152 __asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg)); 153 return true; 154 } 155 156 /* On POWER8 for IPIs to threads in the same core, use msgsnd */ 157 if (cpu_has_feature(CPU_FTR_ARCH_207S)) { 158 preempt_disable(); 159 if (cpu_first_thread_sibling(cpu) == 160 cpu_first_thread_sibling(smp_processor_id())) { 161 msg |= cpu_thread_in_core(cpu); 162 smp_mb(); 163 __asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg)); 164 preempt_enable(); 165 return true; 166 } 167 preempt_enable(); 168 } 169 170 #if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP) 171 if (cpu >= 0 && cpu < nr_cpu_ids) { 172 if (paca[cpu].kvm_hstate.xics_phys) { 173 xics_wake_cpu(cpu); 174 return true; 175 } 176 opal_int_set_mfrr(get_hard_smp_processor_id(cpu), IPI_PRIORITY); 177 return true; 178 } 179 #endif 180 181 return false; 182 } 183 184 static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu) 185 { 186 int cpu; 187 struct swait_queue_head *wqp; 188 189 wqp = kvm_arch_vcpu_wq(vcpu); 190 if (swq_has_sleeper(wqp)) { 191 swake_up(wqp); 192 ++vcpu->stat.halt_wakeup; 193 } 194 195 cpu = READ_ONCE(vcpu->arch.thread_cpu); 196 if (cpu >= 0 && kvmppc_ipi_thread(cpu)) 197 return; 198 199 /* CPU points to the first thread of the core */ 200 cpu = vcpu->cpu; 201 if (cpu >= 0 && cpu < nr_cpu_ids && cpu_online(cpu)) 202 smp_send_reschedule(cpu); 203 } 204 205 /* 206 * We use the vcpu_load/put functions to measure stolen time. 207 * Stolen time is counted as time when either the vcpu is able to 208 * run as part of a virtual core, but the task running the vcore 209 * is preempted or sleeping, or when the vcpu needs something done 210 * in the kernel by the task running the vcpu, but that task is 211 * preempted or sleeping. Those two things have to be counted 212 * separately, since one of the vcpu tasks will take on the job 213 * of running the core, and the other vcpu tasks in the vcore will 214 * sleep waiting for it to do that, but that sleep shouldn't count 215 * as stolen time. 
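 * (Concretely: if the task that is running the whole vcore is preempted,
 * that time is stolen from every vcpu in the vcore; if instead the task for
 * one particular vcpu is off in the kernel, e.g. servicing a page fault,
 * and gets preempted, that time is stolen from that vcpu alone.)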
216 * 217 * Hence we accumulate stolen time when the vcpu can run as part of 218 * a vcore using vc->stolen_tb, and the stolen time when the vcpu 219 * needs its task to do other things in the kernel (for example, 220 * service a page fault) in busy_stolen. We don't accumulate 221 * stolen time for a vcore when it is inactive, or for a vcpu 222 * when it is in state RUNNING or NOTREADY. NOTREADY is a bit of 223 * a misnomer; it means that the vcpu task is not executing in 224 * the KVM_VCPU_RUN ioctl, i.e. it is in userspace or elsewhere in 225 * the kernel. We don't have any way of dividing up that time 226 * between time that the vcpu is genuinely stopped, time that 227 * the task is actively working on behalf of the vcpu, and time 228 * that the task is preempted, so we don't count any of it as 229 * stolen. 230 * 231 * Updates to busy_stolen are protected by arch.tbacct_lock; 232 * updates to vc->stolen_tb are protected by the vcore->stoltb_lock 233 * lock. The stolen times are measured in units of timebase ticks. 234 * (Note that the != TB_NIL checks below are purely defensive; 235 * they should never fail.) 236 */ 237 238 static void kvmppc_core_start_stolen(struct kvmppc_vcore *vc) 239 { 240 unsigned long flags; 241 242 spin_lock_irqsave(&vc->stoltb_lock, flags); 243 vc->preempt_tb = mftb(); 244 spin_unlock_irqrestore(&vc->stoltb_lock, flags); 245 } 246 247 static void kvmppc_core_end_stolen(struct kvmppc_vcore *vc) 248 { 249 unsigned long flags; 250 251 spin_lock_irqsave(&vc->stoltb_lock, flags); 252 if (vc->preempt_tb != TB_NIL) { 253 vc->stolen_tb += mftb() - vc->preempt_tb; 254 vc->preempt_tb = TB_NIL; 255 } 256 spin_unlock_irqrestore(&vc->stoltb_lock, flags); 257 } 258 259 static void kvmppc_core_vcpu_load_hv(struct kvm_vcpu *vcpu, int cpu) 260 { 261 struct kvmppc_vcore *vc = vcpu->arch.vcore; 262 unsigned long flags; 263 264 /* 265 * We can test vc->runner without taking the vcore lock, 266 * because only this task ever sets vc->runner to this 267 * vcpu, and once it is set to this vcpu, only this task 268 * ever sets it to NULL. 269 */ 270 if (vc->runner == vcpu && vc->vcore_state >= VCORE_SLEEPING) 271 kvmppc_core_end_stolen(vc); 272 273 spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags); 274 if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST && 275 vcpu->arch.busy_preempt != TB_NIL) { 276 vcpu->arch.busy_stolen += mftb() - vcpu->arch.busy_preempt; 277 vcpu->arch.busy_preempt = TB_NIL; 278 } 279 spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags); 280 } 281 282 static void kvmppc_core_vcpu_put_hv(struct kvm_vcpu *vcpu) 283 { 284 struct kvmppc_vcore *vc = vcpu->arch.vcore; 285 unsigned long flags; 286 287 if (vc->runner == vcpu && vc->vcore_state >= VCORE_SLEEPING) 288 kvmppc_core_start_stolen(vc); 289 290 spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags); 291 if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST) 292 vcpu->arch.busy_preempt = mftb(); 293 spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags); 294 } 295 296 static void kvmppc_set_msr_hv(struct kvm_vcpu *vcpu, u64 msr) 297 { 298 /* 299 * Check for illegal transactional state bit combination 300 * and if we find it, force the TS field to a safe state. 
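 * (Both TS bits set is the reserved/illegal encoding, so clearing the
 * field below leaves the vcpu in the non-transactional state.)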
301 */ 302 if ((msr & MSR_TS_MASK) == MSR_TS_MASK) 303 msr &= ~MSR_TS_MASK; 304 vcpu->arch.shregs.msr = msr; 305 kvmppc_end_cede(vcpu); 306 } 307 308 static void kvmppc_set_pvr_hv(struct kvm_vcpu *vcpu, u32 pvr) 309 { 310 vcpu->arch.pvr = pvr; 311 } 312 313 /* Dummy value used in computing PCR value below */ 314 #define PCR_ARCH_300 (PCR_ARCH_207 << 1) 315 316 static int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat) 317 { 318 unsigned long host_pcr_bit = 0, guest_pcr_bit = 0; 319 struct kvmppc_vcore *vc = vcpu->arch.vcore; 320 321 /* We can (emulate) our own architecture version and anything older */ 322 if (cpu_has_feature(CPU_FTR_ARCH_300)) 323 host_pcr_bit = PCR_ARCH_300; 324 else if (cpu_has_feature(CPU_FTR_ARCH_207S)) 325 host_pcr_bit = PCR_ARCH_207; 326 else if (cpu_has_feature(CPU_FTR_ARCH_206)) 327 host_pcr_bit = PCR_ARCH_206; 328 else 329 host_pcr_bit = PCR_ARCH_205; 330 331 /* Determine lowest PCR bit needed to run guest in given PVR level */ 332 guest_pcr_bit = host_pcr_bit; 333 if (arch_compat) { 334 switch (arch_compat) { 335 case PVR_ARCH_205: 336 guest_pcr_bit = PCR_ARCH_205; 337 break; 338 case PVR_ARCH_206: 339 case PVR_ARCH_206p: 340 guest_pcr_bit = PCR_ARCH_206; 341 break; 342 case PVR_ARCH_207: 343 guest_pcr_bit = PCR_ARCH_207; 344 break; 345 case PVR_ARCH_300: 346 guest_pcr_bit = PCR_ARCH_300; 347 break; 348 default: 349 return -EINVAL; 350 } 351 } 352 353 /* Check requested PCR bits don't exceed our capabilities */ 354 if (guest_pcr_bit > host_pcr_bit) 355 return -EINVAL; 356 357 spin_lock(&vc->lock); 358 vc->arch_compat = arch_compat; 359 /* Set all PCR bits for which guest_pcr_bit <= bit < host_pcr_bit */ 360 vc->pcr = host_pcr_bit - guest_pcr_bit; 361 spin_unlock(&vc->lock); 362 363 return 0; 364 } 365 366 static void kvmppc_dump_regs(struct kvm_vcpu *vcpu) 367 { 368 int r; 369 370 pr_err("vcpu %p (%d):\n", vcpu, vcpu->vcpu_id); 371 pr_err("pc = %.16lx msr = %.16llx trap = %x\n", 372 vcpu->arch.pc, vcpu->arch.shregs.msr, vcpu->arch.trap); 373 for (r = 0; r < 16; ++r) 374 pr_err("r%2d = %.16lx r%d = %.16lx\n", 375 r, kvmppc_get_gpr(vcpu, r), 376 r+16, kvmppc_get_gpr(vcpu, r+16)); 377 pr_err("ctr = %.16lx lr = %.16lx\n", 378 vcpu->arch.ctr, vcpu->arch.lr); 379 pr_err("srr0 = %.16llx srr1 = %.16llx\n", 380 vcpu->arch.shregs.srr0, vcpu->arch.shregs.srr1); 381 pr_err("sprg0 = %.16llx sprg1 = %.16llx\n", 382 vcpu->arch.shregs.sprg0, vcpu->arch.shregs.sprg1); 383 pr_err("sprg2 = %.16llx sprg3 = %.16llx\n", 384 vcpu->arch.shregs.sprg2, vcpu->arch.shregs.sprg3); 385 pr_err("cr = %.8x xer = %.16lx dsisr = %.8x\n", 386 vcpu->arch.cr, vcpu->arch.xer, vcpu->arch.shregs.dsisr); 387 pr_err("dar = %.16llx\n", vcpu->arch.shregs.dar); 388 pr_err("fault dar = %.16lx dsisr = %.8x\n", 389 vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); 390 pr_err("SLB (%d entries):\n", vcpu->arch.slb_max); 391 for (r = 0; r < vcpu->arch.slb_max; ++r) 392 pr_err(" ESID = %.16llx VSID = %.16llx\n", 393 vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv); 394 pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.8x\n", 395 vcpu->arch.vcore->lpcr, vcpu->kvm->arch.sdr1, 396 vcpu->arch.last_inst); 397 } 398 399 static struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id) 400 { 401 struct kvm_vcpu *ret; 402 403 mutex_lock(&kvm->lock); 404 ret = kvm_get_vcpu_by_id(kvm, id); 405 mutex_unlock(&kvm->lock); 406 return ret; 407 } 408 409 static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa) 410 { 411 vpa->__old_status |= LPPACA_OLD_SHARED_PROC; 412 vpa->yield_count = cpu_to_be32(1); 413 } 
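/*
 * Record a new guest-physical address and length for one of the per-vcpu
 * areas (VPA, SLB shadow buffer or DTL).  Nothing is pinned here: we just
 * note next_gpa/len and set update_pending, and kvmppc_update_vpas()
 * applies the change later.  The address must be cache-line aligned.
 * Userspace gets here via KVM_SET_ONE_REG on the vcpu fd (see
 * kvmppc_set_one_reg_hv() below), for example (illustrative only;
 * vpa_gpa and vcpu_fd are placeholders):
 *
 *	__u64 vpa_gpa = ...;	/- guest-physical address of the lppaca
 *	struct kvm_one_reg reg = {
 *		.id   = KVM_REG_PPC_VPA_ADDR,
 *		.addr = (__u64)&vpa_gpa,
 *	};
 *	ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);
 */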
414 415 static int set_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *v, 416 unsigned long addr, unsigned long len) 417 { 418 /* check address is cacheline aligned */ 419 if (addr & (L1_CACHE_BYTES - 1)) 420 return -EINVAL; 421 spin_lock(&vcpu->arch.vpa_update_lock); 422 if (v->next_gpa != addr || v->len != len) { 423 v->next_gpa = addr; 424 v->len = addr ? len : 0; 425 v->update_pending = 1; 426 } 427 spin_unlock(&vcpu->arch.vpa_update_lock); 428 return 0; 429 } 430 431 /* Length for a per-processor buffer is passed in at offset 4 in the buffer */ 432 struct reg_vpa { 433 u32 dummy; 434 union { 435 __be16 hword; 436 __be32 word; 437 } length; 438 }; 439 440 static int vpa_is_registered(struct kvmppc_vpa *vpap) 441 { 442 if (vpap->update_pending) 443 return vpap->next_gpa != 0; 444 return vpap->pinned_addr != NULL; 445 } 446 447 static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu, 448 unsigned long flags, 449 unsigned long vcpuid, unsigned long vpa) 450 { 451 struct kvm *kvm = vcpu->kvm; 452 unsigned long len, nb; 453 void *va; 454 struct kvm_vcpu *tvcpu; 455 int err; 456 int subfunc; 457 struct kvmppc_vpa *vpap; 458 459 tvcpu = kvmppc_find_vcpu(kvm, vcpuid); 460 if (!tvcpu) 461 return H_PARAMETER; 462 463 subfunc = (flags >> H_VPA_FUNC_SHIFT) & H_VPA_FUNC_MASK; 464 if (subfunc == H_VPA_REG_VPA || subfunc == H_VPA_REG_DTL || 465 subfunc == H_VPA_REG_SLB) { 466 /* Registering new area - address must be cache-line aligned */ 467 if ((vpa & (L1_CACHE_BYTES - 1)) || !vpa) 468 return H_PARAMETER; 469 470 /* convert logical addr to kernel addr and read length */ 471 va = kvmppc_pin_guest_page(kvm, vpa, &nb); 472 if (va == NULL) 473 return H_PARAMETER; 474 if (subfunc == H_VPA_REG_VPA) 475 len = be16_to_cpu(((struct reg_vpa *)va)->length.hword); 476 else 477 len = be32_to_cpu(((struct reg_vpa *)va)->length.word); 478 kvmppc_unpin_guest_page(kvm, va, vpa, false); 479 480 /* Check length */ 481 if (len > nb || len < sizeof(struct reg_vpa)) 482 return H_PARAMETER; 483 } else { 484 vpa = 0; 485 len = 0; 486 } 487 488 err = H_PARAMETER; 489 vpap = NULL; 490 spin_lock(&tvcpu->arch.vpa_update_lock); 491 492 switch (subfunc) { 493 case H_VPA_REG_VPA: /* register VPA */ 494 /* 495 * The size of our lppaca is 1kB because of the way we align 496 * it for the guest to avoid crossing a 4kB boundary. We only 497 * use 640 bytes of the structure though, so we should accept 498 * clients that set a size of 640. 
499 */ 500 if (len < 640) 501 break; 502 vpap = &tvcpu->arch.vpa; 503 err = 0; 504 break; 505 506 case H_VPA_REG_DTL: /* register DTL */ 507 if (len < sizeof(struct dtl_entry)) 508 break; 509 len -= len % sizeof(struct dtl_entry); 510 511 /* Check that they have previously registered a VPA */ 512 err = H_RESOURCE; 513 if (!vpa_is_registered(&tvcpu->arch.vpa)) 514 break; 515 516 vpap = &tvcpu->arch.dtl; 517 err = 0; 518 break; 519 520 case H_VPA_REG_SLB: /* register SLB shadow buffer */ 521 /* Check that they have previously registered a VPA */ 522 err = H_RESOURCE; 523 if (!vpa_is_registered(&tvcpu->arch.vpa)) 524 break; 525 526 vpap = &tvcpu->arch.slb_shadow; 527 err = 0; 528 break; 529 530 case H_VPA_DEREG_VPA: /* deregister VPA */ 531 /* Check they don't still have a DTL or SLB buf registered */ 532 err = H_RESOURCE; 533 if (vpa_is_registered(&tvcpu->arch.dtl) || 534 vpa_is_registered(&tvcpu->arch.slb_shadow)) 535 break; 536 537 vpap = &tvcpu->arch.vpa; 538 err = 0; 539 break; 540 541 case H_VPA_DEREG_DTL: /* deregister DTL */ 542 vpap = &tvcpu->arch.dtl; 543 err = 0; 544 break; 545 546 case H_VPA_DEREG_SLB: /* deregister SLB shadow buffer */ 547 vpap = &tvcpu->arch.slb_shadow; 548 err = 0; 549 break; 550 } 551 552 if (vpap) { 553 vpap->next_gpa = vpa; 554 vpap->len = len; 555 vpap->update_pending = 1; 556 } 557 558 spin_unlock(&tvcpu->arch.vpa_update_lock); 559 560 return err; 561 } 562 563 static void kvmppc_update_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *vpap) 564 { 565 struct kvm *kvm = vcpu->kvm; 566 void *va; 567 unsigned long nb; 568 unsigned long gpa; 569 570 /* 571 * We need to pin the page pointed to by vpap->next_gpa, 572 * but we can't call kvmppc_pin_guest_page under the lock 573 * as it does get_user_pages() and down_read(). So we 574 * have to drop the lock, pin the page, then get the lock 575 * again and check that a new area didn't get registered 576 * in the meantime. 577 */ 578 for (;;) { 579 gpa = vpap->next_gpa; 580 spin_unlock(&vcpu->arch.vpa_update_lock); 581 va = NULL; 582 nb = 0; 583 if (gpa) 584 va = kvmppc_pin_guest_page(kvm, gpa, &nb); 585 spin_lock(&vcpu->arch.vpa_update_lock); 586 if (gpa == vpap->next_gpa) 587 break; 588 /* sigh... unpin that one and try again */ 589 if (va) 590 kvmppc_unpin_guest_page(kvm, va, gpa, false); 591 } 592 593 vpap->update_pending = 0; 594 if (va && nb < vpap->len) { 595 /* 596 * If it's now too short, it must be that userspace 597 * has changed the mappings underlying guest memory, 598 * so unregister the region. 
599 */ 600 kvmppc_unpin_guest_page(kvm, va, gpa, false); 601 va = NULL; 602 } 603 if (vpap->pinned_addr) 604 kvmppc_unpin_guest_page(kvm, vpap->pinned_addr, vpap->gpa, 605 vpap->dirty); 606 vpap->gpa = gpa; 607 vpap->pinned_addr = va; 608 vpap->dirty = false; 609 if (va) 610 vpap->pinned_end = va + vpap->len; 611 } 612 613 static void kvmppc_update_vpas(struct kvm_vcpu *vcpu) 614 { 615 if (!(vcpu->arch.vpa.update_pending || 616 vcpu->arch.slb_shadow.update_pending || 617 vcpu->arch.dtl.update_pending)) 618 return; 619 620 spin_lock(&vcpu->arch.vpa_update_lock); 621 if (vcpu->arch.vpa.update_pending) { 622 kvmppc_update_vpa(vcpu, &vcpu->arch.vpa); 623 if (vcpu->arch.vpa.pinned_addr) 624 init_vpa(vcpu, vcpu->arch.vpa.pinned_addr); 625 } 626 if (vcpu->arch.dtl.update_pending) { 627 kvmppc_update_vpa(vcpu, &vcpu->arch.dtl); 628 vcpu->arch.dtl_ptr = vcpu->arch.dtl.pinned_addr; 629 vcpu->arch.dtl_index = 0; 630 } 631 if (vcpu->arch.slb_shadow.update_pending) 632 kvmppc_update_vpa(vcpu, &vcpu->arch.slb_shadow); 633 spin_unlock(&vcpu->arch.vpa_update_lock); 634 } 635 636 /* 637 * Return the accumulated stolen time for the vcore up until `now'. 638 * The caller should hold the vcore lock. 639 */ 640 static u64 vcore_stolen_time(struct kvmppc_vcore *vc, u64 now) 641 { 642 u64 p; 643 unsigned long flags; 644 645 spin_lock_irqsave(&vc->stoltb_lock, flags); 646 p = vc->stolen_tb; 647 if (vc->vcore_state != VCORE_INACTIVE && 648 vc->preempt_tb != TB_NIL) 649 p += now - vc->preempt_tb; 650 spin_unlock_irqrestore(&vc->stoltb_lock, flags); 651 return p; 652 } 653 654 static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu, 655 struct kvmppc_vcore *vc) 656 { 657 struct dtl_entry *dt; 658 struct lppaca *vpa; 659 unsigned long stolen; 660 unsigned long core_stolen; 661 u64 now; 662 unsigned long flags; 663 664 dt = vcpu->arch.dtl_ptr; 665 vpa = vcpu->arch.vpa.pinned_addr; 666 now = mftb(); 667 core_stolen = vcore_stolen_time(vc, now); 668 stolen = core_stolen - vcpu->arch.stolen_logged; 669 vcpu->arch.stolen_logged = core_stolen; 670 spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags); 671 stolen += vcpu->arch.busy_stolen; 672 vcpu->arch.busy_stolen = 0; 673 spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags); 674 if (!dt || !vpa) 675 return; 676 memset(dt, 0, sizeof(struct dtl_entry)); 677 dt->dispatch_reason = 7; 678 dt->processor_id = cpu_to_be16(vc->pcpu + vcpu->arch.ptid); 679 dt->timebase = cpu_to_be64(now + vc->tb_offset); 680 dt->enqueue_to_dispatch_time = cpu_to_be32(stolen); 681 dt->srr0 = cpu_to_be64(kvmppc_get_pc(vcpu)); 682 dt->srr1 = cpu_to_be64(vcpu->arch.shregs.msr); 683 ++dt; 684 if (dt == vcpu->arch.dtl.pinned_end) 685 dt = vcpu->arch.dtl.pinned_addr; 686 vcpu->arch.dtl_ptr = dt; 687 /* order writing *dt vs. writing vpa->dtl_idx */ 688 smp_wmb(); 689 vpa->dtl_idx = cpu_to_be64(++vcpu->arch.dtl_index); 690 vcpu->arch.dtl.dirty = true; 691 } 692 693 /* See if there is a doorbell interrupt pending for a vcpu */ 694 static bool kvmppc_doorbell_pending(struct kvm_vcpu *vcpu) 695 { 696 int thr; 697 struct kvmppc_vcore *vc; 698 699 if (vcpu->arch.doorbell_request) 700 return true; 701 /* 702 * Ensure that the read of vcore->dpdes comes after the read 703 * of vcpu->doorbell_request. This barrier matches the 704 * lwsync in book3s_hv_rmhandlers.S just before the 705 * fast_guest_return label. 
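 * A doorbell can be pending either because doorbell_request is set
 * (e.g. by the msgsndp emulation) or because the corresponding thread
 * bit is set in the vcore's dpdes image, so both are checked below.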
706 */ 707 smp_rmb(); 708 vc = vcpu->arch.vcore; 709 thr = vcpu->vcpu_id - vc->first_vcpuid; 710 return !!(vc->dpdes & (1 << thr)); 711 } 712 713 static bool kvmppc_power8_compatible(struct kvm_vcpu *vcpu) 714 { 715 if (vcpu->arch.vcore->arch_compat >= PVR_ARCH_207) 716 return true; 717 if ((!vcpu->arch.vcore->arch_compat) && 718 cpu_has_feature(CPU_FTR_ARCH_207S)) 719 return true; 720 return false; 721 } 722 723 static int kvmppc_h_set_mode(struct kvm_vcpu *vcpu, unsigned long mflags, 724 unsigned long resource, unsigned long value1, 725 unsigned long value2) 726 { 727 switch (resource) { 728 case H_SET_MODE_RESOURCE_SET_CIABR: 729 if (!kvmppc_power8_compatible(vcpu)) 730 return H_P2; 731 if (value2) 732 return H_P4; 733 if (mflags) 734 return H_UNSUPPORTED_FLAG_START; 735 /* Guests can't breakpoint the hypervisor */ 736 if ((value1 & CIABR_PRIV) == CIABR_PRIV_HYPER) 737 return H_P3; 738 vcpu->arch.ciabr = value1; 739 return H_SUCCESS; 740 case H_SET_MODE_RESOURCE_SET_DAWR: 741 if (!kvmppc_power8_compatible(vcpu)) 742 return H_P2; 743 if (mflags) 744 return H_UNSUPPORTED_FLAG_START; 745 if (value2 & DABRX_HYP) 746 return H_P4; 747 vcpu->arch.dawr = value1; 748 vcpu->arch.dawrx = value2; 749 return H_SUCCESS; 750 default: 751 return H_TOO_HARD; 752 } 753 } 754 755 static int kvm_arch_vcpu_yield_to(struct kvm_vcpu *target) 756 { 757 struct kvmppc_vcore *vcore = target->arch.vcore; 758 759 /* 760 * We expect to have been called by the real mode handler 761 * (kvmppc_rm_h_confer()) which would have directly returned 762 * H_SUCCESS if the source vcore wasn't idle (e.g. if it may 763 * have useful work to do and should not confer) so we don't 764 * recheck that here. 765 */ 766 767 spin_lock(&vcore->lock); 768 if (target->arch.state == KVMPPC_VCPU_RUNNABLE && 769 vcore->vcore_state != VCORE_INACTIVE && 770 vcore->runner) 771 target = vcore->runner; 772 spin_unlock(&vcore->lock); 773 774 return kvm_vcpu_yield_to(target); 775 } 776 777 static int kvmppc_get_yield_count(struct kvm_vcpu *vcpu) 778 { 779 int yield_count = 0; 780 struct lppaca *lppaca; 781 782 spin_lock(&vcpu->arch.vpa_update_lock); 783 lppaca = (struct lppaca *)vcpu->arch.vpa.pinned_addr; 784 if (lppaca) 785 yield_count = be32_to_cpu(lppaca->yield_count); 786 spin_unlock(&vcpu->arch.vpa_update_lock); 787 return yield_count; 788 } 789 790 int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) 791 { 792 unsigned long req = kvmppc_get_gpr(vcpu, 3); 793 unsigned long target, ret = H_SUCCESS; 794 int yield_count; 795 struct kvm_vcpu *tvcpu; 796 int idx, rc; 797 798 if (req <= MAX_HCALL_OPCODE && 799 !test_bit(req/4, vcpu->kvm->arch.enabled_hcalls)) 800 return RESUME_HOST; 801 802 switch (req) { 803 case H_CEDE: 804 break; 805 case H_PROD: 806 target = kvmppc_get_gpr(vcpu, 4); 807 tvcpu = kvmppc_find_vcpu(vcpu->kvm, target); 808 if (!tvcpu) { 809 ret = H_PARAMETER; 810 break; 811 } 812 tvcpu->arch.prodded = 1; 813 smp_mb(); 814 if (tvcpu->arch.ceded) 815 kvmppc_fast_vcpu_kick_hv(tvcpu); 816 break; 817 case H_CONFER: 818 target = kvmppc_get_gpr(vcpu, 4); 819 if (target == -1) 820 break; 821 tvcpu = kvmppc_find_vcpu(vcpu->kvm, target); 822 if (!tvcpu) { 823 ret = H_PARAMETER; 824 break; 825 } 826 yield_count = kvmppc_get_gpr(vcpu, 5); 827 if (kvmppc_get_yield_count(tvcpu) != yield_count) 828 break; 829 kvm_arch_vcpu_yield_to(tvcpu); 830 break; 831 case H_REGISTER_VPA: 832 ret = do_h_register_vpa(vcpu, kvmppc_get_gpr(vcpu, 4), 833 kvmppc_get_gpr(vcpu, 5), 834 kvmppc_get_gpr(vcpu, 6)); 835 break; 836 case H_RTAS: 837 if 
(list_empty(&vcpu->kvm->arch.rtas_tokens)) 838 return RESUME_HOST; 839 840 idx = srcu_read_lock(&vcpu->kvm->srcu); 841 rc = kvmppc_rtas_hcall(vcpu); 842 srcu_read_unlock(&vcpu->kvm->srcu, idx); 843 844 if (rc == -ENOENT) 845 return RESUME_HOST; 846 else if (rc == 0) 847 break; 848 849 /* Send the error out to userspace via KVM_RUN */ 850 return rc; 851 case H_LOGICAL_CI_LOAD: 852 ret = kvmppc_h_logical_ci_load(vcpu); 853 if (ret == H_TOO_HARD) 854 return RESUME_HOST; 855 break; 856 case H_LOGICAL_CI_STORE: 857 ret = kvmppc_h_logical_ci_store(vcpu); 858 if (ret == H_TOO_HARD) 859 return RESUME_HOST; 860 break; 861 case H_SET_MODE: 862 ret = kvmppc_h_set_mode(vcpu, kvmppc_get_gpr(vcpu, 4), 863 kvmppc_get_gpr(vcpu, 5), 864 kvmppc_get_gpr(vcpu, 6), 865 kvmppc_get_gpr(vcpu, 7)); 866 if (ret == H_TOO_HARD) 867 return RESUME_HOST; 868 break; 869 case H_XIRR: 870 case H_CPPR: 871 case H_EOI: 872 case H_IPI: 873 case H_IPOLL: 874 case H_XIRR_X: 875 if (kvmppc_xics_enabled(vcpu)) { 876 if (xive_enabled()) { 877 ret = H_NOT_AVAILABLE; 878 return RESUME_GUEST; 879 } 880 ret = kvmppc_xics_hcall(vcpu, req); 881 break; 882 } 883 return RESUME_HOST; 884 case H_PUT_TCE: 885 ret = kvmppc_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4), 886 kvmppc_get_gpr(vcpu, 5), 887 kvmppc_get_gpr(vcpu, 6)); 888 if (ret == H_TOO_HARD) 889 return RESUME_HOST; 890 break; 891 case H_PUT_TCE_INDIRECT: 892 ret = kvmppc_h_put_tce_indirect(vcpu, kvmppc_get_gpr(vcpu, 4), 893 kvmppc_get_gpr(vcpu, 5), 894 kvmppc_get_gpr(vcpu, 6), 895 kvmppc_get_gpr(vcpu, 7)); 896 if (ret == H_TOO_HARD) 897 return RESUME_HOST; 898 break; 899 case H_STUFF_TCE: 900 ret = kvmppc_h_stuff_tce(vcpu, kvmppc_get_gpr(vcpu, 4), 901 kvmppc_get_gpr(vcpu, 5), 902 kvmppc_get_gpr(vcpu, 6), 903 kvmppc_get_gpr(vcpu, 7)); 904 if (ret == H_TOO_HARD) 905 return RESUME_HOST; 906 break; 907 default: 908 return RESUME_HOST; 909 } 910 kvmppc_set_gpr(vcpu, 3, ret); 911 vcpu->arch.hcall_needed = 0; 912 return RESUME_GUEST; 913 } 914 915 static int kvmppc_hcall_impl_hv(unsigned long cmd) 916 { 917 switch (cmd) { 918 case H_CEDE: 919 case H_PROD: 920 case H_CONFER: 921 case H_REGISTER_VPA: 922 case H_SET_MODE: 923 case H_LOGICAL_CI_LOAD: 924 case H_LOGICAL_CI_STORE: 925 #ifdef CONFIG_KVM_XICS 926 case H_XIRR: 927 case H_CPPR: 928 case H_EOI: 929 case H_IPI: 930 case H_IPOLL: 931 case H_XIRR_X: 932 #endif 933 return 1; 934 } 935 936 /* See if it's in the real-mode table */ 937 return kvmppc_hcall_impl_hv_realmode(cmd); 938 } 939 940 static int kvmppc_emulate_debug_inst(struct kvm_run *run, 941 struct kvm_vcpu *vcpu) 942 { 943 u32 last_inst; 944 945 if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst) != 946 EMULATE_DONE) { 947 /* 948 * Fetch failed, so return to guest and 949 * try executing it again. 
950 */ 951 return RESUME_GUEST; 952 } 953 954 if (last_inst == KVMPPC_INST_SW_BREAKPOINT) { 955 run->exit_reason = KVM_EXIT_DEBUG; 956 run->debug.arch.address = kvmppc_get_pc(vcpu); 957 return RESUME_HOST; 958 } else { 959 kvmppc_core_queue_program(vcpu, SRR1_PROGILL); 960 return RESUME_GUEST; 961 } 962 } 963 964 static void do_nothing(void *x) 965 { 966 } 967 968 static unsigned long kvmppc_read_dpdes(struct kvm_vcpu *vcpu) 969 { 970 int thr, cpu, pcpu, nthreads; 971 struct kvm_vcpu *v; 972 unsigned long dpdes; 973 974 nthreads = vcpu->kvm->arch.emul_smt_mode; 975 dpdes = 0; 976 cpu = vcpu->vcpu_id & ~(nthreads - 1); 977 for (thr = 0; thr < nthreads; ++thr, ++cpu) { 978 v = kvmppc_find_vcpu(vcpu->kvm, cpu); 979 if (!v) 980 continue; 981 /* 982 * If the vcpu is currently running on a physical cpu thread, 983 * interrupt it in order to pull it out of the guest briefly, 984 * which will update its vcore->dpdes value. 985 */ 986 pcpu = READ_ONCE(v->cpu); 987 if (pcpu >= 0) 988 smp_call_function_single(pcpu, do_nothing, NULL, 1); 989 if (kvmppc_doorbell_pending(v)) 990 dpdes |= 1 << thr; 991 } 992 return dpdes; 993 } 994 995 /* 996 * On POWER9, emulate doorbell-related instructions in order to 997 * give the guest the illusion of running on a multi-threaded core. 998 * The instructions emulated are msgsndp, msgclrp, mfspr TIR, 999 * and mfspr DPDES. 1000 */ 1001 static int kvmppc_emulate_doorbell_instr(struct kvm_vcpu *vcpu) 1002 { 1003 u32 inst, rb, thr; 1004 unsigned long arg; 1005 struct kvm *kvm = vcpu->kvm; 1006 struct kvm_vcpu *tvcpu; 1007 1008 if (!cpu_has_feature(CPU_FTR_ARCH_300)) 1009 return EMULATE_FAIL; 1010 if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &inst) != EMULATE_DONE) 1011 return RESUME_GUEST; 1012 if (get_op(inst) != 31) 1013 return EMULATE_FAIL; 1014 rb = get_rb(inst); 1015 thr = vcpu->vcpu_id & (kvm->arch.emul_smt_mode - 1); 1016 switch (get_xop(inst)) { 1017 case OP_31_XOP_MSGSNDP: 1018 arg = kvmppc_get_gpr(vcpu, rb); 1019 if (((arg >> 27) & 0xf) != PPC_DBELL_SERVER) 1020 break; 1021 arg &= 0x3f; 1022 if (arg >= kvm->arch.emul_smt_mode) 1023 break; 1024 tvcpu = kvmppc_find_vcpu(kvm, vcpu->vcpu_id - thr + arg); 1025 if (!tvcpu) 1026 break; 1027 if (!tvcpu->arch.doorbell_request) { 1028 tvcpu->arch.doorbell_request = 1; 1029 kvmppc_fast_vcpu_kick_hv(tvcpu); 1030 } 1031 break; 1032 case OP_31_XOP_MSGCLRP: 1033 arg = kvmppc_get_gpr(vcpu, rb); 1034 if (((arg >> 27) & 0xf) != PPC_DBELL_SERVER) 1035 break; 1036 vcpu->arch.vcore->dpdes = 0; 1037 vcpu->arch.doorbell_request = 0; 1038 break; 1039 case OP_31_XOP_MFSPR: 1040 switch (get_sprn(inst)) { 1041 case SPRN_TIR: 1042 arg = thr; 1043 break; 1044 case SPRN_DPDES: 1045 arg = kvmppc_read_dpdes(vcpu); 1046 break; 1047 default: 1048 return EMULATE_FAIL; 1049 } 1050 kvmppc_set_gpr(vcpu, get_rt(inst), arg); 1051 break; 1052 default: 1053 return EMULATE_FAIL; 1054 } 1055 kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) + 4); 1056 return RESUME_GUEST; 1057 } 1058 1059 static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, 1060 struct task_struct *tsk) 1061 { 1062 int r = RESUME_HOST; 1063 1064 vcpu->stat.sum_exits++; 1065 1066 /* 1067 * This can happen if an interrupt occurs in the last stages 1068 * of guest entry or the first stages of guest exit (i.e. after 1069 * setting paca->kvm_hstate.in_guest to KVM_GUEST_MODE_GUEST_HV 1070 * and before setting it to KVM_GUEST_MODE_HOST_HV). 1071 * That can happen due to a bug, or due to a machine check 1072 * occurring at just the wrong time. 
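 * In that case we dump the vcpu state and report the exit to userspace
 * as KVM_EXIT_INTERNAL_ERROR below.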
1073 */ 1074 if (vcpu->arch.shregs.msr & MSR_HV) { 1075 printk(KERN_EMERG "KVM trap in HV mode!\n"); 1076 printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n", 1077 vcpu->arch.trap, kvmppc_get_pc(vcpu), 1078 vcpu->arch.shregs.msr); 1079 kvmppc_dump_regs(vcpu); 1080 run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 1081 run->hw.hardware_exit_reason = vcpu->arch.trap; 1082 return RESUME_HOST; 1083 } 1084 run->exit_reason = KVM_EXIT_UNKNOWN; 1085 run->ready_for_interrupt_injection = 1; 1086 switch (vcpu->arch.trap) { 1087 /* We're good on these - the host merely wanted to get our attention */ 1088 case BOOK3S_INTERRUPT_HV_DECREMENTER: 1089 vcpu->stat.dec_exits++; 1090 r = RESUME_GUEST; 1091 break; 1092 case BOOK3S_INTERRUPT_EXTERNAL: 1093 case BOOK3S_INTERRUPT_H_DOORBELL: 1094 case BOOK3S_INTERRUPT_H_VIRT: 1095 vcpu->stat.ext_intr_exits++; 1096 r = RESUME_GUEST; 1097 break; 1098 /* SR/HMI/PMI are HV interrupts that host has handled. Resume guest.*/ 1099 case BOOK3S_INTERRUPT_HMI: 1100 case BOOK3S_INTERRUPT_PERFMON: 1101 case BOOK3S_INTERRUPT_SYSTEM_RESET: 1102 r = RESUME_GUEST; 1103 break; 1104 case BOOK3S_INTERRUPT_MACHINE_CHECK: 1105 /* Exit to guest with KVM_EXIT_NMI as exit reason */ 1106 run->exit_reason = KVM_EXIT_NMI; 1107 run->hw.hardware_exit_reason = vcpu->arch.trap; 1108 /* Clear out the old NMI status from run->flags */ 1109 run->flags &= ~KVM_RUN_PPC_NMI_DISP_MASK; 1110 /* Now set the NMI status */ 1111 if (vcpu->arch.mce_evt.disposition == MCE_DISPOSITION_RECOVERED) 1112 run->flags |= KVM_RUN_PPC_NMI_DISP_FULLY_RECOV; 1113 else 1114 run->flags |= KVM_RUN_PPC_NMI_DISP_NOT_RECOV; 1115 1116 r = RESUME_HOST; 1117 /* Print the MCE event to host console. */ 1118 machine_check_print_event_info(&vcpu->arch.mce_evt, false); 1119 break; 1120 case BOOK3S_INTERRUPT_PROGRAM: 1121 { 1122 ulong flags; 1123 /* 1124 * Normally program interrupts are delivered directly 1125 * to the guest by the hardware, but we can get here 1126 * as a result of a hypervisor emulation interrupt 1127 * (e40) getting turned into a 700 by BML RTAS. 1128 */ 1129 flags = vcpu->arch.shregs.msr & 0x1f0000ull; 1130 kvmppc_core_queue_program(vcpu, flags); 1131 r = RESUME_GUEST; 1132 break; 1133 } 1134 case BOOK3S_INTERRUPT_SYSCALL: 1135 { 1136 /* hcall - punt to userspace */ 1137 int i; 1138 1139 /* hypercall with MSR_PR has already been handled in rmode, 1140 * and never reaches here. 1141 */ 1142 1143 run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3); 1144 for (i = 0; i < 9; ++i) 1145 run->papr_hcall.args[i] = kvmppc_get_gpr(vcpu, 4 + i); 1146 run->exit_reason = KVM_EXIT_PAPR_HCALL; 1147 vcpu->arch.hcall_needed = 1; 1148 r = RESUME_HOST; 1149 break; 1150 } 1151 /* 1152 * We get these next two if the guest accesses a page which it thinks 1153 * it has mapped but which is not actually present, either because 1154 * it is for an emulated I/O device or because the corresponding 1155 * host page has been paged out. Any other HDSI/HISI interrupts 1156 * have been handled already. 1157 */ 1158 case BOOK3S_INTERRUPT_H_DATA_STORAGE: 1159 r = RESUME_PAGE_FAULT; 1160 break; 1161 case BOOK3S_INTERRUPT_H_INST_STORAGE: 1162 vcpu->arch.fault_dar = kvmppc_get_pc(vcpu); 1163 vcpu->arch.fault_dsisr = 0; 1164 r = RESUME_PAGE_FAULT; 1165 break; 1166 /* 1167 * This occurs if the guest executes an illegal instruction. 1168 * If guest debug is disabled, generate a program interrupt 1169 * to the guest. If guest debug is enabled, we need to check 1170 * whether the instruction is a software breakpoint instruction.
1171 * Accordingly return to Guest or Host. 1172 */ 1173 case BOOK3S_INTERRUPT_H_EMUL_ASSIST: 1174 if (vcpu->arch.emul_inst != KVM_INST_FETCH_FAILED) 1175 vcpu->arch.last_inst = kvmppc_need_byteswap(vcpu) ? 1176 swab32(vcpu->arch.emul_inst) : 1177 vcpu->arch.emul_inst; 1178 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) { 1179 r = kvmppc_emulate_debug_inst(run, vcpu); 1180 } else { 1181 kvmppc_core_queue_program(vcpu, SRR1_PROGILL); 1182 r = RESUME_GUEST; 1183 } 1184 break; 1185 /* 1186 * This occurs if the guest (kernel or userspace), does something that 1187 * is prohibited by HFSCR. 1188 * On POWER9, this could be a doorbell instruction that we need 1189 * to emulate. 1190 * Otherwise, we just generate a program interrupt to the guest. 1191 */ 1192 case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: 1193 r = EMULATE_FAIL; 1194 if ((vcpu->arch.hfscr >> 56) == FSCR_MSGP_LG) 1195 r = kvmppc_emulate_doorbell_instr(vcpu); 1196 if (r == EMULATE_FAIL) { 1197 kvmppc_core_queue_program(vcpu, SRR1_PROGILL); 1198 r = RESUME_GUEST; 1199 } 1200 break; 1201 case BOOK3S_INTERRUPT_HV_RM_HARD: 1202 r = RESUME_PASSTHROUGH; 1203 break; 1204 default: 1205 kvmppc_dump_regs(vcpu); 1206 printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n", 1207 vcpu->arch.trap, kvmppc_get_pc(vcpu), 1208 vcpu->arch.shregs.msr); 1209 run->hw.hardware_exit_reason = vcpu->arch.trap; 1210 r = RESUME_HOST; 1211 break; 1212 } 1213 1214 return r; 1215 } 1216 1217 static int kvm_arch_vcpu_ioctl_get_sregs_hv(struct kvm_vcpu *vcpu, 1218 struct kvm_sregs *sregs) 1219 { 1220 int i; 1221 1222 memset(sregs, 0, sizeof(struct kvm_sregs)); 1223 sregs->pvr = vcpu->arch.pvr; 1224 for (i = 0; i < vcpu->arch.slb_max; i++) { 1225 sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige; 1226 sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv; 1227 } 1228 1229 return 0; 1230 } 1231 1232 static int kvm_arch_vcpu_ioctl_set_sregs_hv(struct kvm_vcpu *vcpu, 1233 struct kvm_sregs *sregs) 1234 { 1235 int i, j; 1236 1237 /* Only accept the same PVR as the host's, since we can't spoof it */ 1238 if (sregs->pvr != vcpu->arch.pvr) 1239 return -EINVAL; 1240 1241 j = 0; 1242 for (i = 0; i < vcpu->arch.slb_nr; i++) { 1243 if (sregs->u.s.ppc64.slb[i].slbe & SLB_ESID_V) { 1244 vcpu->arch.slb[j].orige = sregs->u.s.ppc64.slb[i].slbe; 1245 vcpu->arch.slb[j].origv = sregs->u.s.ppc64.slb[i].slbv; 1246 ++j; 1247 } 1248 } 1249 vcpu->arch.slb_max = j; 1250 1251 return 0; 1252 } 1253 1254 static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr, 1255 bool preserve_top32) 1256 { 1257 struct kvm *kvm = vcpu->kvm; 1258 struct kvmppc_vcore *vc = vcpu->arch.vcore; 1259 u64 mask; 1260 1261 mutex_lock(&kvm->lock); 1262 spin_lock(&vc->lock); 1263 /* 1264 * If ILE (interrupt little-endian) has changed, update the 1265 * MSR_LE bit in the intr_msr for each vcpu in this vcore. 1266 */ 1267 if ((new_lpcr & LPCR_ILE) != (vc->lpcr & LPCR_ILE)) { 1268 struct kvm_vcpu *vcpu; 1269 int i; 1270 1271 kvm_for_each_vcpu(i, vcpu, kvm) { 1272 if (vcpu->arch.vcore != vc) 1273 continue; 1274 if (new_lpcr & LPCR_ILE) 1275 vcpu->arch.intr_msr |= MSR_LE; 1276 else 1277 vcpu->arch.intr_msr &= ~MSR_LE; 1278 } 1279 } 1280 1281 /* 1282 * Userspace can only modify DPFD (default prefetch depth), 1283 * ILE (interrupt little-endian) and TC (translation control). 1284 * On POWER8 and POWER9 userspace can also modify AIL (alt. interrupt loc.). 
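 * Everything else in the new value is ignored: we build a mask of the
 * writable bits below and replace only those bits of vc->lpcr.  For the
 * old 32-bit LPCR register (preserve_top32) the mask is further limited
 * to the low 32 bits so that the write cannot clear the upper half.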
1285 */ 1286 mask = LPCR_DPFD | LPCR_ILE | LPCR_TC; 1287 if (cpu_has_feature(CPU_FTR_ARCH_207S)) 1288 mask |= LPCR_AIL; 1289 /* 1290 * On POWER9, allow userspace to enable large decrementer for the 1291 * guest, whether or not the host has it enabled. 1292 */ 1293 if (cpu_has_feature(CPU_FTR_ARCH_300)) 1294 mask |= LPCR_LD; 1295 1296 /* Broken 32-bit version of LPCR must not clear top bits */ 1297 if (preserve_top32) 1298 mask &= 0xFFFFFFFF; 1299 vc->lpcr = (vc->lpcr & ~mask) | (new_lpcr & mask); 1300 spin_unlock(&vc->lock); 1301 mutex_unlock(&kvm->lock); 1302 } 1303 1304 static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, 1305 union kvmppc_one_reg *val) 1306 { 1307 int r = 0; 1308 long int i; 1309 1310 switch (id) { 1311 case KVM_REG_PPC_DEBUG_INST: 1312 *val = get_reg_val(id, KVMPPC_INST_SW_BREAKPOINT); 1313 break; 1314 case KVM_REG_PPC_HIOR: 1315 *val = get_reg_val(id, 0); 1316 break; 1317 case KVM_REG_PPC_DABR: 1318 *val = get_reg_val(id, vcpu->arch.dabr); 1319 break; 1320 case KVM_REG_PPC_DABRX: 1321 *val = get_reg_val(id, vcpu->arch.dabrx); 1322 break; 1323 case KVM_REG_PPC_DSCR: 1324 *val = get_reg_val(id, vcpu->arch.dscr); 1325 break; 1326 case KVM_REG_PPC_PURR: 1327 *val = get_reg_val(id, vcpu->arch.purr); 1328 break; 1329 case KVM_REG_PPC_SPURR: 1330 *val = get_reg_val(id, vcpu->arch.spurr); 1331 break; 1332 case KVM_REG_PPC_AMR: 1333 *val = get_reg_val(id, vcpu->arch.amr); 1334 break; 1335 case KVM_REG_PPC_UAMOR: 1336 *val = get_reg_val(id, vcpu->arch.uamor); 1337 break; 1338 case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRS: 1339 i = id - KVM_REG_PPC_MMCR0; 1340 *val = get_reg_val(id, vcpu->arch.mmcr[i]); 1341 break; 1342 case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8: 1343 i = id - KVM_REG_PPC_PMC1; 1344 *val = get_reg_val(id, vcpu->arch.pmc[i]); 1345 break; 1346 case KVM_REG_PPC_SPMC1 ... 
KVM_REG_PPC_SPMC2: 1347 i = id - KVM_REG_PPC_SPMC1; 1348 *val = get_reg_val(id, vcpu->arch.spmc[i]); 1349 break; 1350 case KVM_REG_PPC_SIAR: 1351 *val = get_reg_val(id, vcpu->arch.siar); 1352 break; 1353 case KVM_REG_PPC_SDAR: 1354 *val = get_reg_val(id, vcpu->arch.sdar); 1355 break; 1356 case KVM_REG_PPC_SIER: 1357 *val = get_reg_val(id, vcpu->arch.sier); 1358 break; 1359 case KVM_REG_PPC_IAMR: 1360 *val = get_reg_val(id, vcpu->arch.iamr); 1361 break; 1362 case KVM_REG_PPC_PSPB: 1363 *val = get_reg_val(id, vcpu->arch.pspb); 1364 break; 1365 case KVM_REG_PPC_DPDES: 1366 *val = get_reg_val(id, vcpu->arch.vcore->dpdes); 1367 break; 1368 case KVM_REG_PPC_VTB: 1369 *val = get_reg_val(id, vcpu->arch.vcore->vtb); 1370 break; 1371 case KVM_REG_PPC_DAWR: 1372 *val = get_reg_val(id, vcpu->arch.dawr); 1373 break; 1374 case KVM_REG_PPC_DAWRX: 1375 *val = get_reg_val(id, vcpu->arch.dawrx); 1376 break; 1377 case KVM_REG_PPC_CIABR: 1378 *val = get_reg_val(id, vcpu->arch.ciabr); 1379 break; 1380 case KVM_REG_PPC_CSIGR: 1381 *val = get_reg_val(id, vcpu->arch.csigr); 1382 break; 1383 case KVM_REG_PPC_TACR: 1384 *val = get_reg_val(id, vcpu->arch.tacr); 1385 break; 1386 case KVM_REG_PPC_TCSCR: 1387 *val = get_reg_val(id, vcpu->arch.tcscr); 1388 break; 1389 case KVM_REG_PPC_PID: 1390 *val = get_reg_val(id, vcpu->arch.pid); 1391 break; 1392 case KVM_REG_PPC_ACOP: 1393 *val = get_reg_val(id, vcpu->arch.acop); 1394 break; 1395 case KVM_REG_PPC_WORT: 1396 *val = get_reg_val(id, vcpu->arch.wort); 1397 break; 1398 case KVM_REG_PPC_TIDR: 1399 *val = get_reg_val(id, vcpu->arch.tid); 1400 break; 1401 case KVM_REG_PPC_PSSCR: 1402 *val = get_reg_val(id, vcpu->arch.psscr); 1403 break; 1404 case KVM_REG_PPC_VPA_ADDR: 1405 spin_lock(&vcpu->arch.vpa_update_lock); 1406 *val = get_reg_val(id, vcpu->arch.vpa.next_gpa); 1407 spin_unlock(&vcpu->arch.vpa_update_lock); 1408 break; 1409 case KVM_REG_PPC_VPA_SLB: 1410 spin_lock(&vcpu->arch.vpa_update_lock); 1411 val->vpaval.addr = vcpu->arch.slb_shadow.next_gpa; 1412 val->vpaval.length = vcpu->arch.slb_shadow.len; 1413 spin_unlock(&vcpu->arch.vpa_update_lock); 1414 break; 1415 case KVM_REG_PPC_VPA_DTL: 1416 spin_lock(&vcpu->arch.vpa_update_lock); 1417 val->vpaval.addr = vcpu->arch.dtl.next_gpa; 1418 val->vpaval.length = vcpu->arch.dtl.len; 1419 spin_unlock(&vcpu->arch.vpa_update_lock); 1420 break; 1421 case KVM_REG_PPC_TB_OFFSET: 1422 *val = get_reg_val(id, vcpu->arch.vcore->tb_offset); 1423 break; 1424 case KVM_REG_PPC_LPCR: 1425 case KVM_REG_PPC_LPCR_64: 1426 *val = get_reg_val(id, vcpu->arch.vcore->lpcr); 1427 break; 1428 case KVM_REG_PPC_PPR: 1429 *val = get_reg_val(id, vcpu->arch.ppr); 1430 break; 1431 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM 1432 case KVM_REG_PPC_TFHAR: 1433 *val = get_reg_val(id, vcpu->arch.tfhar); 1434 break; 1435 case KVM_REG_PPC_TFIAR: 1436 *val = get_reg_val(id, vcpu->arch.tfiar); 1437 break; 1438 case KVM_REG_PPC_TEXASR: 1439 *val = get_reg_val(id, vcpu->arch.texasr); 1440 break; 1441 case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31: 1442 i = id - KVM_REG_PPC_TM_GPR0; 1443 *val = get_reg_val(id, vcpu->arch.gpr_tm[i]); 1444 break; 1445 case KVM_REG_PPC_TM_VSR0 ... 
KVM_REG_PPC_TM_VSR63: 1446 { 1447 int j; 1448 i = id - KVM_REG_PPC_TM_VSR0; 1449 if (i < 32) 1450 for (j = 0; j < TS_FPRWIDTH; j++) 1451 val->vsxval[j] = vcpu->arch.fp_tm.fpr[i][j]; 1452 else { 1453 if (cpu_has_feature(CPU_FTR_ALTIVEC)) 1454 val->vval = vcpu->arch.vr_tm.vr[i-32]; 1455 else 1456 r = -ENXIO; 1457 } 1458 break; 1459 } 1460 case KVM_REG_PPC_TM_CR: 1461 *val = get_reg_val(id, vcpu->arch.cr_tm); 1462 break; 1463 case KVM_REG_PPC_TM_XER: 1464 *val = get_reg_val(id, vcpu->arch.xer_tm); 1465 break; 1466 case KVM_REG_PPC_TM_LR: 1467 *val = get_reg_val(id, vcpu->arch.lr_tm); 1468 break; 1469 case KVM_REG_PPC_TM_CTR: 1470 *val = get_reg_val(id, vcpu->arch.ctr_tm); 1471 break; 1472 case KVM_REG_PPC_TM_FPSCR: 1473 *val = get_reg_val(id, vcpu->arch.fp_tm.fpscr); 1474 break; 1475 case KVM_REG_PPC_TM_AMR: 1476 *val = get_reg_val(id, vcpu->arch.amr_tm); 1477 break; 1478 case KVM_REG_PPC_TM_PPR: 1479 *val = get_reg_val(id, vcpu->arch.ppr_tm); 1480 break; 1481 case KVM_REG_PPC_TM_VRSAVE: 1482 *val = get_reg_val(id, vcpu->arch.vrsave_tm); 1483 break; 1484 case KVM_REG_PPC_TM_VSCR: 1485 if (cpu_has_feature(CPU_FTR_ALTIVEC)) 1486 *val = get_reg_val(id, vcpu->arch.vr_tm.vscr.u[3]); 1487 else 1488 r = -ENXIO; 1489 break; 1490 case KVM_REG_PPC_TM_DSCR: 1491 *val = get_reg_val(id, vcpu->arch.dscr_tm); 1492 break; 1493 case KVM_REG_PPC_TM_TAR: 1494 *val = get_reg_val(id, vcpu->arch.tar_tm); 1495 break; 1496 #endif 1497 case KVM_REG_PPC_ARCH_COMPAT: 1498 *val = get_reg_val(id, vcpu->arch.vcore->arch_compat); 1499 break; 1500 default: 1501 r = -EINVAL; 1502 break; 1503 } 1504 1505 return r; 1506 } 1507 1508 static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, 1509 union kvmppc_one_reg *val) 1510 { 1511 int r = 0; 1512 long int i; 1513 unsigned long addr, len; 1514 1515 switch (id) { 1516 case KVM_REG_PPC_HIOR: 1517 /* Only allow this to be set to zero */ 1518 if (set_reg_val(id, *val)) 1519 r = -EINVAL; 1520 break; 1521 case KVM_REG_PPC_DABR: 1522 vcpu->arch.dabr = set_reg_val(id, *val); 1523 break; 1524 case KVM_REG_PPC_DABRX: 1525 vcpu->arch.dabrx = set_reg_val(id, *val) & ~DABRX_HYP; 1526 break; 1527 case KVM_REG_PPC_DSCR: 1528 vcpu->arch.dscr = set_reg_val(id, *val); 1529 break; 1530 case KVM_REG_PPC_PURR: 1531 vcpu->arch.purr = set_reg_val(id, *val); 1532 break; 1533 case KVM_REG_PPC_SPURR: 1534 vcpu->arch.spurr = set_reg_val(id, *val); 1535 break; 1536 case KVM_REG_PPC_AMR: 1537 vcpu->arch.amr = set_reg_val(id, *val); 1538 break; 1539 case KVM_REG_PPC_UAMOR: 1540 vcpu->arch.uamor = set_reg_val(id, *val); 1541 break; 1542 case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRS: 1543 i = id - KVM_REG_PPC_MMCR0; 1544 vcpu->arch.mmcr[i] = set_reg_val(id, *val); 1545 break; 1546 case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8: 1547 i = id - KVM_REG_PPC_PMC1; 1548 vcpu->arch.pmc[i] = set_reg_val(id, *val); 1549 break; 1550 case KVM_REG_PPC_SPMC1 ... 
KVM_REG_PPC_SPMC2: 1551 i = id - KVM_REG_PPC_SPMC1; 1552 vcpu->arch.spmc[i] = set_reg_val(id, *val); 1553 break; 1554 case KVM_REG_PPC_SIAR: 1555 vcpu->arch.siar = set_reg_val(id, *val); 1556 break; 1557 case KVM_REG_PPC_SDAR: 1558 vcpu->arch.sdar = set_reg_val(id, *val); 1559 break; 1560 case KVM_REG_PPC_SIER: 1561 vcpu->arch.sier = set_reg_val(id, *val); 1562 break; 1563 case KVM_REG_PPC_IAMR: 1564 vcpu->arch.iamr = set_reg_val(id, *val); 1565 break; 1566 case KVM_REG_PPC_PSPB: 1567 vcpu->arch.pspb = set_reg_val(id, *val); 1568 break; 1569 case KVM_REG_PPC_DPDES: 1570 vcpu->arch.vcore->dpdes = set_reg_val(id, *val); 1571 break; 1572 case KVM_REG_PPC_VTB: 1573 vcpu->arch.vcore->vtb = set_reg_val(id, *val); 1574 break; 1575 case KVM_REG_PPC_DAWR: 1576 vcpu->arch.dawr = set_reg_val(id, *val); 1577 break; 1578 case KVM_REG_PPC_DAWRX: 1579 vcpu->arch.dawrx = set_reg_val(id, *val) & ~DAWRX_HYP; 1580 break; 1581 case KVM_REG_PPC_CIABR: 1582 vcpu->arch.ciabr = set_reg_val(id, *val); 1583 /* Don't allow setting breakpoints in hypervisor code */ 1584 if ((vcpu->arch.ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER) 1585 vcpu->arch.ciabr &= ~CIABR_PRIV; /* disable */ 1586 break; 1587 case KVM_REG_PPC_CSIGR: 1588 vcpu->arch.csigr = set_reg_val(id, *val); 1589 break; 1590 case KVM_REG_PPC_TACR: 1591 vcpu->arch.tacr = set_reg_val(id, *val); 1592 break; 1593 case KVM_REG_PPC_TCSCR: 1594 vcpu->arch.tcscr = set_reg_val(id, *val); 1595 break; 1596 case KVM_REG_PPC_PID: 1597 vcpu->arch.pid = set_reg_val(id, *val); 1598 break; 1599 case KVM_REG_PPC_ACOP: 1600 vcpu->arch.acop = set_reg_val(id, *val); 1601 break; 1602 case KVM_REG_PPC_WORT: 1603 vcpu->arch.wort = set_reg_val(id, *val); 1604 break; 1605 case KVM_REG_PPC_TIDR: 1606 vcpu->arch.tid = set_reg_val(id, *val); 1607 break; 1608 case KVM_REG_PPC_PSSCR: 1609 vcpu->arch.psscr = set_reg_val(id, *val) & PSSCR_GUEST_VIS; 1610 break; 1611 case KVM_REG_PPC_VPA_ADDR: 1612 addr = set_reg_val(id, *val); 1613 r = -EINVAL; 1614 if (!addr && (vcpu->arch.slb_shadow.next_gpa || 1615 vcpu->arch.dtl.next_gpa)) 1616 break; 1617 r = set_vpa(vcpu, &vcpu->arch.vpa, addr, sizeof(struct lppaca)); 1618 break; 1619 case KVM_REG_PPC_VPA_SLB: 1620 addr = val->vpaval.addr; 1621 len = val->vpaval.length; 1622 r = -EINVAL; 1623 if (addr && !vcpu->arch.vpa.next_gpa) 1624 break; 1625 r = set_vpa(vcpu, &vcpu->arch.slb_shadow, addr, len); 1626 break; 1627 case KVM_REG_PPC_VPA_DTL: 1628 addr = val->vpaval.addr; 1629 len = val->vpaval.length; 1630 r = -EINVAL; 1631 if (addr && (len < sizeof(struct dtl_entry) || 1632 !vcpu->arch.vpa.next_gpa)) 1633 break; 1634 len -= len % sizeof(struct dtl_entry); 1635 r = set_vpa(vcpu, &vcpu->arch.dtl, addr, len); 1636 break; 1637 case KVM_REG_PPC_TB_OFFSET: 1638 /* 1639 * POWER9 DD1 has an erratum where writing TBU40 causes 1640 * the timebase to lose ticks. So we don't let the 1641 * timebase offset be changed on P9 DD1. (It is 1642 * initialized to zero.) 
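 * (On other processors the offset is applied by writing TBU40, which only
 * covers the upper 40 bits of the timebase; hence the rounding to a
 * multiple of 2^24 below.)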
1643 */ 1644 if (cpu_has_feature(CPU_FTR_POWER9_DD1)) 1645 break; 1646 /* round up to multiple of 2^24 */ 1647 vcpu->arch.vcore->tb_offset = 1648 ALIGN(set_reg_val(id, *val), 1UL << 24); 1649 break; 1650 case KVM_REG_PPC_LPCR: 1651 kvmppc_set_lpcr(vcpu, set_reg_val(id, *val), true); 1652 break; 1653 case KVM_REG_PPC_LPCR_64: 1654 kvmppc_set_lpcr(vcpu, set_reg_val(id, *val), false); 1655 break; 1656 case KVM_REG_PPC_PPR: 1657 vcpu->arch.ppr = set_reg_val(id, *val); 1658 break; 1659 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM 1660 case KVM_REG_PPC_TFHAR: 1661 vcpu->arch.tfhar = set_reg_val(id, *val); 1662 break; 1663 case KVM_REG_PPC_TFIAR: 1664 vcpu->arch.tfiar = set_reg_val(id, *val); 1665 break; 1666 case KVM_REG_PPC_TEXASR: 1667 vcpu->arch.texasr = set_reg_val(id, *val); 1668 break; 1669 case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31: 1670 i = id - KVM_REG_PPC_TM_GPR0; 1671 vcpu->arch.gpr_tm[i] = set_reg_val(id, *val); 1672 break; 1673 case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63: 1674 { 1675 int j; 1676 i = id - KVM_REG_PPC_TM_VSR0; 1677 if (i < 32) 1678 for (j = 0; j < TS_FPRWIDTH; j++) 1679 vcpu->arch.fp_tm.fpr[i][j] = val->vsxval[j]; 1680 else 1681 if (cpu_has_feature(CPU_FTR_ALTIVEC)) 1682 vcpu->arch.vr_tm.vr[i-32] = val->vval; 1683 else 1684 r = -ENXIO; 1685 break; 1686 } 1687 case KVM_REG_PPC_TM_CR: 1688 vcpu->arch.cr_tm = set_reg_val(id, *val); 1689 break; 1690 case KVM_REG_PPC_TM_XER: 1691 vcpu->arch.xer_tm = set_reg_val(id, *val); 1692 break; 1693 case KVM_REG_PPC_TM_LR: 1694 vcpu->arch.lr_tm = set_reg_val(id, *val); 1695 break; 1696 case KVM_REG_PPC_TM_CTR: 1697 vcpu->arch.ctr_tm = set_reg_val(id, *val); 1698 break; 1699 case KVM_REG_PPC_TM_FPSCR: 1700 vcpu->arch.fp_tm.fpscr = set_reg_val(id, *val); 1701 break; 1702 case KVM_REG_PPC_TM_AMR: 1703 vcpu->arch.amr_tm = set_reg_val(id, *val); 1704 break; 1705 case KVM_REG_PPC_TM_PPR: 1706 vcpu->arch.ppr_tm = set_reg_val(id, *val); 1707 break; 1708 case KVM_REG_PPC_TM_VRSAVE: 1709 vcpu->arch.vrsave_tm = set_reg_val(id, *val); 1710 break; 1711 case KVM_REG_PPC_TM_VSCR: 1712 if (cpu_has_feature(CPU_FTR_ALTIVEC)) 1713 vcpu->arch.vr.vscr.u[3] = set_reg_val(id, *val); 1714 else 1715 r = - ENXIO; 1716 break; 1717 case KVM_REG_PPC_TM_DSCR: 1718 vcpu->arch.dscr_tm = set_reg_val(id, *val); 1719 break; 1720 case KVM_REG_PPC_TM_TAR: 1721 vcpu->arch.tar_tm = set_reg_val(id, *val); 1722 break; 1723 #endif 1724 case KVM_REG_PPC_ARCH_COMPAT: 1725 r = kvmppc_set_arch_compat(vcpu, set_reg_val(id, *val)); 1726 break; 1727 default: 1728 r = -EINVAL; 1729 break; 1730 } 1731 1732 return r; 1733 } 1734 1735 /* 1736 * On POWER9, threads are independent and can be in different partitions. 1737 * Therefore we consider each thread to be a subcore. 1738 * There is a restriction that all threads have to be in the same 1739 * MMU mode (radix or HPT), unfortunately, but since we only support 1740 * HPT guests on a HPT host so far, that isn't an impediment yet. 
1741 */ 1742 static int threads_per_vcore(struct kvm *kvm) 1743 { 1744 if (kvm->arch.threads_indep) 1745 return 1; 1746 return threads_per_subcore; 1747 } 1748 1749 static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core) 1750 { 1751 struct kvmppc_vcore *vcore; 1752 1753 vcore = kzalloc(sizeof(struct kvmppc_vcore), GFP_KERNEL); 1754 1755 if (vcore == NULL) 1756 return NULL; 1757 1758 spin_lock_init(&vcore->lock); 1759 spin_lock_init(&vcore->stoltb_lock); 1760 init_swait_queue_head(&vcore->wq); 1761 vcore->preempt_tb = TB_NIL; 1762 vcore->lpcr = kvm->arch.lpcr; 1763 vcore->first_vcpuid = core * kvm->arch.smt_mode; 1764 vcore->kvm = kvm; 1765 INIT_LIST_HEAD(&vcore->preempt_list); 1766 1767 return vcore; 1768 } 1769 1770 #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING 1771 static struct debugfs_timings_element { 1772 const char *name; 1773 size_t offset; 1774 } timings[] = { 1775 {"rm_entry", offsetof(struct kvm_vcpu, arch.rm_entry)}, 1776 {"rm_intr", offsetof(struct kvm_vcpu, arch.rm_intr)}, 1777 {"rm_exit", offsetof(struct kvm_vcpu, arch.rm_exit)}, 1778 {"guest", offsetof(struct kvm_vcpu, arch.guest_time)}, 1779 {"cede", offsetof(struct kvm_vcpu, arch.cede_time)}, 1780 }; 1781 1782 #define N_TIMINGS (ARRAY_SIZE(timings)) 1783 1784 struct debugfs_timings_state { 1785 struct kvm_vcpu *vcpu; 1786 unsigned int buflen; 1787 char buf[N_TIMINGS * 100]; 1788 }; 1789 1790 static int debugfs_timings_open(struct inode *inode, struct file *file) 1791 { 1792 struct kvm_vcpu *vcpu = inode->i_private; 1793 struct debugfs_timings_state *p; 1794 1795 p = kzalloc(sizeof(*p), GFP_KERNEL); 1796 if (!p) 1797 return -ENOMEM; 1798 1799 kvm_get_kvm(vcpu->kvm); 1800 p->vcpu = vcpu; 1801 file->private_data = p; 1802 1803 return nonseekable_open(inode, file); 1804 } 1805 1806 static int debugfs_timings_release(struct inode *inode, struct file *file) 1807 { 1808 struct debugfs_timings_state *p = file->private_data; 1809 1810 kvm_put_kvm(p->vcpu->kvm); 1811 kfree(p); 1812 return 0; 1813 } 1814 1815 static ssize_t debugfs_timings_read(struct file *file, char __user *buf, 1816 size_t len, loff_t *ppos) 1817 { 1818 struct debugfs_timings_state *p = file->private_data; 1819 struct kvm_vcpu *vcpu = p->vcpu; 1820 char *s, *buf_end; 1821 struct kvmhv_tb_accumulator tb; 1822 u64 count; 1823 loff_t pos; 1824 ssize_t n; 1825 int i, loops; 1826 bool ok; 1827 1828 if (!p->buflen) { 1829 s = p->buf; 1830 buf_end = s + sizeof(p->buf); 1831 for (i = 0; i < N_TIMINGS; ++i) { 1832 struct kvmhv_tb_accumulator *acc; 1833 1834 acc = (struct kvmhv_tb_accumulator *) 1835 ((unsigned long)vcpu + timings[i].offset); 1836 ok = false; 1837 for (loops = 0; loops < 1000; ++loops) { 1838 count = acc->seqcount; 1839 if (!(count & 1)) { 1840 smp_rmb(); 1841 tb = *acc; 1842 smp_rmb(); 1843 if (count == acc->seqcount) { 1844 ok = true; 1845 break; 1846 } 1847 } 1848 udelay(1); 1849 } 1850 if (!ok) 1851 snprintf(s, buf_end - s, "%s: stuck\n", 1852 timings[i].name); 1853 else 1854 snprintf(s, buf_end - s, 1855 "%s: %llu %llu %llu %llu\n", 1856 timings[i].name, count / 2, 1857 tb_to_ns(tb.tb_total), 1858 tb_to_ns(tb.tb_min), 1859 tb_to_ns(tb.tb_max)); 1860 s += strlen(s); 1861 } 1862 p->buflen = s - p->buf; 1863 } 1864 1865 pos = *ppos; 1866 if (pos >= p->buflen) 1867 return 0; 1868 if (len > p->buflen - pos) 1869 len = p->buflen - pos; 1870 n = copy_to_user(buf, p->buf + pos, len); 1871 if (n) { 1872 if (n == len) 1873 return -EFAULT; 1874 len -= n; 1875 } 1876 *ppos = pos + len; 1877 return len; 1878 } 1879 1880 static ssize_t 
debugfs_timings_write(struct file *file, const char __user *buf, 1881 size_t len, loff_t *ppos) 1882 { 1883 return -EACCES; 1884 } 1885 1886 static const struct file_operations debugfs_timings_ops = { 1887 .owner = THIS_MODULE, 1888 .open = debugfs_timings_open, 1889 .release = debugfs_timings_release, 1890 .read = debugfs_timings_read, 1891 .write = debugfs_timings_write, 1892 .llseek = generic_file_llseek, 1893 }; 1894 1895 /* Create a debugfs directory for the vcpu */ 1896 static void debugfs_vcpu_init(struct kvm_vcpu *vcpu, unsigned int id) 1897 { 1898 char buf[16]; 1899 struct kvm *kvm = vcpu->kvm; 1900 1901 snprintf(buf, sizeof(buf), "vcpu%u", id); 1902 if (IS_ERR_OR_NULL(kvm->arch.debugfs_dir)) 1903 return; 1904 vcpu->arch.debugfs_dir = debugfs_create_dir(buf, kvm->arch.debugfs_dir); 1905 if (IS_ERR_OR_NULL(vcpu->arch.debugfs_dir)) 1906 return; 1907 vcpu->arch.debugfs_timings = 1908 debugfs_create_file("timings", 0444, vcpu->arch.debugfs_dir, 1909 vcpu, &debugfs_timings_ops); 1910 } 1911 1912 #else /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */ 1913 static void debugfs_vcpu_init(struct kvm_vcpu *vcpu, unsigned int id) 1914 { 1915 } 1916 #endif /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */ 1917 1918 static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, 1919 unsigned int id) 1920 { 1921 struct kvm_vcpu *vcpu; 1922 int err; 1923 int core; 1924 struct kvmppc_vcore *vcore; 1925 1926 err = -ENOMEM; 1927 vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); 1928 if (!vcpu) 1929 goto out; 1930 1931 err = kvm_vcpu_init(vcpu, kvm, id); 1932 if (err) 1933 goto free_vcpu; 1934 1935 vcpu->arch.shared = &vcpu->arch.shregs; 1936 #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE 1937 /* 1938 * The shared struct is never shared on HV, 1939 * so we can always use host endianness 1940 */ 1941 #ifdef __BIG_ENDIAN__ 1942 vcpu->arch.shared_big_endian = true; 1943 #else 1944 vcpu->arch.shared_big_endian = false; 1945 #endif 1946 #endif 1947 vcpu->arch.mmcr[0] = MMCR0_FC; 1948 vcpu->arch.ctrl = CTRL_RUNLATCH; 1949 /* default to host PVR, since we can't spoof it */ 1950 kvmppc_set_pvr_hv(vcpu, mfspr(SPRN_PVR)); 1951 spin_lock_init(&vcpu->arch.vpa_update_lock); 1952 spin_lock_init(&vcpu->arch.tbacct_lock); 1953 vcpu->arch.busy_preempt = TB_NIL; 1954 vcpu->arch.intr_msr = MSR_SF | MSR_ME; 1955 1956 /* 1957 * Set the default HFSCR for the guest from the host value. 1958 * This value is only used on POWER9. 1959 * On POWER9 DD1, TM doesn't work, so we make sure to 1960 * prevent the guest from using it. 1961 * On POWER9, we want to virtualize the doorbell facility, so we 1962 * turn off the HFSCR bit, which causes those instructions to trap. 
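 * Those traps arrive as BOOK3S_INTERRUPT_H_FAC_UNAVAIL exits and are
 * handled by kvmppc_emulate_doorbell_instr() above.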
1963 */ 1964 vcpu->arch.hfscr = mfspr(SPRN_HFSCR); 1965 if (!cpu_has_feature(CPU_FTR_TM)) 1966 vcpu->arch.hfscr &= ~HFSCR_TM; 1967 if (cpu_has_feature(CPU_FTR_ARCH_300)) 1968 vcpu->arch.hfscr &= ~HFSCR_MSGP; 1969 1970 kvmppc_mmu_book3s_hv_init(vcpu); 1971 1972 vcpu->arch.state = KVMPPC_VCPU_NOTREADY; 1973 1974 init_waitqueue_head(&vcpu->arch.cpu_run); 1975 1976 mutex_lock(&kvm->lock); 1977 vcore = NULL; 1978 err = -EINVAL; 1979 core = id / kvm->arch.smt_mode; 1980 if (core < KVM_MAX_VCORES) { 1981 vcore = kvm->arch.vcores[core]; 1982 if (!vcore) { 1983 err = -ENOMEM; 1984 vcore = kvmppc_vcore_create(kvm, core); 1985 kvm->arch.vcores[core] = vcore; 1986 kvm->arch.online_vcores++; 1987 } 1988 } 1989 mutex_unlock(&kvm->lock); 1990 1991 if (!vcore) 1992 goto free_vcpu; 1993 1994 spin_lock(&vcore->lock); 1995 ++vcore->num_threads; 1996 spin_unlock(&vcore->lock); 1997 vcpu->arch.vcore = vcore; 1998 vcpu->arch.ptid = vcpu->vcpu_id - vcore->first_vcpuid; 1999 vcpu->arch.thread_cpu = -1; 2000 vcpu->arch.prev_cpu = -1; 2001 2002 vcpu->arch.cpu_type = KVM_CPU_3S_64; 2003 kvmppc_sanity_check(vcpu); 2004 2005 debugfs_vcpu_init(vcpu, id); 2006 2007 return vcpu; 2008 2009 free_vcpu: 2010 kmem_cache_free(kvm_vcpu_cache, vcpu); 2011 out: 2012 return ERR_PTR(err); 2013 } 2014 2015 static int kvmhv_set_smt_mode(struct kvm *kvm, unsigned long smt_mode, 2016 unsigned long flags) 2017 { 2018 int err; 2019 int esmt = 0; 2020 2021 if (flags) 2022 return -EINVAL; 2023 if (smt_mode > MAX_SMT_THREADS || !is_power_of_2(smt_mode)) 2024 return -EINVAL; 2025 if (!cpu_has_feature(CPU_FTR_ARCH_300)) { 2026 /* 2027 * On POWER8 (or POWER7), the threading mode is "strict", 2028 * so we pack smt_mode vcpus per vcore. 2029 */ 2030 if (smt_mode > threads_per_subcore) 2031 return -EINVAL; 2032 } else { 2033 /* 2034 * On POWER9, the threading mode is "loose", 2035 * so each vcpu gets its own vcore. 
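 *
 * A worked example of the assignments below: if userspace asks for
 * smt_mode = 4, then
 *
 *	POWER8:	smt_mode stays 4, so vcpu ids 0-3 share vcore 0, ids 4-7
 *		share vcore 1, and so on; this requires
 *		threads_per_subcore >= 4 (checked above).
 *	POWER9:	esmt = 4 and smt_mode = 1, so every vcpu gets its own
 *		vcore, and the requested 4-way topology is only recorded
 *		in kvm->arch.emul_smt_mode so that it can still be
 *		presented to the guest.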
2036 */ 2037 esmt = smt_mode; 2038 smt_mode = 1; 2039 } 2040 mutex_lock(&kvm->lock); 2041 err = -EBUSY; 2042 if (!kvm->arch.online_vcores) { 2043 kvm->arch.smt_mode = smt_mode; 2044 kvm->arch.emul_smt_mode = esmt; 2045 err = 0; 2046 } 2047 mutex_unlock(&kvm->lock); 2048 2049 return err; 2050 } 2051 2052 static void unpin_vpa(struct kvm *kvm, struct kvmppc_vpa *vpa) 2053 { 2054 if (vpa->pinned_addr) 2055 kvmppc_unpin_guest_page(kvm, vpa->pinned_addr, vpa->gpa, 2056 vpa->dirty); 2057 } 2058 2059 static void kvmppc_core_vcpu_free_hv(struct kvm_vcpu *vcpu) 2060 { 2061 spin_lock(&vcpu->arch.vpa_update_lock); 2062 unpin_vpa(vcpu->kvm, &vcpu->arch.dtl); 2063 unpin_vpa(vcpu->kvm, &vcpu->arch.slb_shadow); 2064 unpin_vpa(vcpu->kvm, &vcpu->arch.vpa); 2065 spin_unlock(&vcpu->arch.vpa_update_lock); 2066 kvm_vcpu_uninit(vcpu); 2067 kmem_cache_free(kvm_vcpu_cache, vcpu); 2068 } 2069 2070 static int kvmppc_core_check_requests_hv(struct kvm_vcpu *vcpu) 2071 { 2072 /* Indicate we want to get back into the guest */ 2073 return 1; 2074 } 2075 2076 static void kvmppc_set_timer(struct kvm_vcpu *vcpu) 2077 { 2078 unsigned long dec_nsec, now; 2079 2080 now = get_tb(); 2081 if (now > vcpu->arch.dec_expires) { 2082 /* decrementer has already gone negative */ 2083 kvmppc_core_queue_dec(vcpu); 2084 kvmppc_core_prepare_to_enter(vcpu); 2085 return; 2086 } 2087 dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC 2088 / tb_ticks_per_sec; 2089 hrtimer_start(&vcpu->arch.dec_timer, dec_nsec, HRTIMER_MODE_REL); 2090 vcpu->arch.timer_running = 1; 2091 } 2092 2093 static void kvmppc_end_cede(struct kvm_vcpu *vcpu) 2094 { 2095 vcpu->arch.ceded = 0; 2096 if (vcpu->arch.timer_running) { 2097 hrtimer_try_to_cancel(&vcpu->arch.dec_timer); 2098 vcpu->arch.timer_running = 0; 2099 } 2100 } 2101 2102 extern int __kvmppc_vcore_entry(void); 2103 2104 static void kvmppc_remove_runnable(struct kvmppc_vcore *vc, 2105 struct kvm_vcpu *vcpu) 2106 { 2107 u64 now; 2108 2109 if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE) 2110 return; 2111 spin_lock_irq(&vcpu->arch.tbacct_lock); 2112 now = mftb(); 2113 vcpu->arch.busy_stolen += vcore_stolen_time(vc, now) - 2114 vcpu->arch.stolen_logged; 2115 vcpu->arch.busy_preempt = now; 2116 vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; 2117 spin_unlock_irq(&vcpu->arch.tbacct_lock); 2118 --vc->n_runnable; 2119 WRITE_ONCE(vc->runnable_threads[vcpu->arch.ptid], NULL); 2120 } 2121 2122 static int kvmppc_grab_hwthread(int cpu) 2123 { 2124 struct paca_struct *tpaca; 2125 long timeout = 10000; 2126 2127 tpaca = &paca[cpu]; 2128 2129 /* Ensure the thread won't go into the kernel if it wakes */ 2130 tpaca->kvm_hstate.kvm_vcpu = NULL; 2131 tpaca->kvm_hstate.kvm_vcore = NULL; 2132 tpaca->kvm_hstate.napping = 0; 2133 smp_wmb(); 2134 tpaca->kvm_hstate.hwthread_req = 1; 2135 2136 /* 2137 * If the thread is already executing in the kernel (e.g. handling 2138 * a stray interrupt), wait for it to get back to nap mode. 2139 * The smp_mb() is to ensure that our setting of hwthread_req 2140 * is visible before we look at hwthread_state, so if this 2141 * races with the code at system_reset_pSeries and the thread 2142 * misses our setting of hwthread_req, we are sure to see its 2143 * setting of hwthread_state, and vice versa. 
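 *
 * Roughly, this is the usual two-sided store-then-read pattern; the other
 * half lives in the low-level idle/system-reset code, so the sketch below
 * is only schematic:
 *
 *	this code				offline hw thread
 *	--------------------------		--------------------------
 *	hwthread_req = 1;			update hwthread_state;
 *	smp_mb();				barrier in the idle code;
 *	poll hwthread_state;			check hwthread_req;
 *
 * At least one side must observe the other's store, so either we see the
 * thread still executing in the kernel and keep spinning below, or it
 * sees our request and goes (back) to nap.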
2144 */ 2145 smp_mb(); 2146 while (tpaca->kvm_hstate.hwthread_state == KVM_HWTHREAD_IN_KERNEL) { 2147 if (--timeout <= 0) { 2148 pr_err("KVM: couldn't grab cpu %d\n", cpu); 2149 return -EBUSY; 2150 } 2151 udelay(1); 2152 } 2153 return 0; 2154 } 2155 2156 static void kvmppc_release_hwthread(int cpu) 2157 { 2158 struct paca_struct *tpaca; 2159 2160 tpaca = &paca[cpu]; 2161 tpaca->kvm_hstate.hwthread_req = 0; 2162 tpaca->kvm_hstate.kvm_vcpu = NULL; 2163 tpaca->kvm_hstate.kvm_vcore = NULL; 2164 tpaca->kvm_hstate.kvm_split_mode = NULL; 2165 } 2166 2167 static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu) 2168 { 2169 int i; 2170 2171 cpu = cpu_first_thread_sibling(cpu); 2172 cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush); 2173 /* 2174 * Make sure setting of bit in need_tlb_flush precedes 2175 * testing of cpu_in_guest bits. The matching barrier on 2176 * the other side is the first smp_mb() in kvmppc_run_core(). 2177 */ 2178 smp_mb(); 2179 for (i = 0; i < threads_per_core; ++i) 2180 if (cpumask_test_cpu(cpu + i, &kvm->arch.cpu_in_guest)) 2181 smp_call_function_single(cpu + i, do_nothing, NULL, 1); 2182 } 2183 2184 static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu) 2185 { 2186 struct kvm *kvm = vcpu->kvm; 2187 2188 /* 2189 * With radix, the guest can do TLB invalidations itself, 2190 * and it could choose to use the local form (tlbiel) if 2191 * it is invalidating a translation that has only ever been 2192 * used on one vcpu. However, that doesn't mean it has 2193 * only ever been used on one physical cpu, since vcpus 2194 * can move around between pcpus. To cope with this, when 2195 * a vcpu moves from one pcpu to another, we need to tell 2196 * any vcpus running on the same core as this vcpu previously 2197 * ran to flush the TLB. The TLB is shared between threads, 2198 * so we use a single bit in .need_tlb_flush for all 4 threads. 2199 */ 2200 if (vcpu->arch.prev_cpu != pcpu) { 2201 if (vcpu->arch.prev_cpu >= 0 && 2202 cpu_first_thread_sibling(vcpu->arch.prev_cpu) != 2203 cpu_first_thread_sibling(pcpu)) 2204 radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu); 2205 vcpu->arch.prev_cpu = pcpu; 2206 } 2207 } 2208 2209 static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc) 2210 { 2211 int cpu; 2212 struct paca_struct *tpaca; 2213 struct kvm *kvm = vc->kvm; 2214 2215 cpu = vc->pcpu; 2216 if (vcpu) { 2217 if (vcpu->arch.timer_running) { 2218 hrtimer_try_to_cancel(&vcpu->arch.dec_timer); 2219 vcpu->arch.timer_running = 0; 2220 } 2221 cpu += vcpu->arch.ptid; 2222 vcpu->cpu = vc->pcpu; 2223 vcpu->arch.thread_cpu = cpu; 2224 cpumask_set_cpu(cpu, &kvm->arch.cpu_in_guest); 2225 } 2226 tpaca = &paca[cpu]; 2227 tpaca->kvm_hstate.kvm_vcpu = vcpu; 2228 tpaca->kvm_hstate.ptid = cpu - vc->pcpu; 2229 /* Order stores to hstate.kvm_vcpu etc. before store to kvm_vcore */ 2230 smp_wmb(); 2231 tpaca->kvm_hstate.kvm_vcore = vc; 2232 if (cpu != smp_processor_id()) 2233 kvmppc_ipi_thread(cpu); 2234 } 2235 2236 static void kvmppc_wait_for_nap(int n_threads) 2237 { 2238 int cpu = smp_processor_id(); 2239 int i, loops; 2240 2241 if (n_threads <= 1) 2242 return; 2243 for (loops = 0; loops < 1000000; ++loops) { 2244 /* 2245 * Check if all threads are finished. 2246 * We set the vcore pointer when starting a thread 2247 * and the thread clears it when finished, so we look 2248 * for any threads that still have a non-NULL vcore ptr. 
2249 */ 2250 for (i = 1; i < n_threads; ++i) 2251 if (paca[cpu + i].kvm_hstate.kvm_vcore) 2252 break; 2253 if (i == n_threads) { 2254 HMT_medium(); 2255 return; 2256 } 2257 HMT_low(); 2258 } 2259 HMT_medium(); 2260 for (i = 1; i < n_threads; ++i) 2261 if (paca[cpu + i].kvm_hstate.kvm_vcore) 2262 pr_err("KVM: CPU %d seems to be stuck\n", cpu + i); 2263 } 2264 2265 /* 2266 * Check that we are on thread 0 and that any other threads in 2267 * this core are off-line. Then grab the threads so they can't 2268 * enter the kernel. 2269 */ 2270 static int on_primary_thread(void) 2271 { 2272 int cpu = smp_processor_id(); 2273 int thr; 2274 2275 /* Are we on a primary subcore? */ 2276 if (cpu_thread_in_subcore(cpu)) 2277 return 0; 2278 2279 thr = 0; 2280 while (++thr < threads_per_subcore) 2281 if (cpu_online(cpu + thr)) 2282 return 0; 2283 2284 /* Grab all hw threads so they can't go into the kernel */ 2285 for (thr = 1; thr < threads_per_subcore; ++thr) { 2286 if (kvmppc_grab_hwthread(cpu + thr)) { 2287 /* Couldn't grab one; let the others go */ 2288 do { 2289 kvmppc_release_hwthread(cpu + thr); 2290 } while (--thr > 0); 2291 return 0; 2292 } 2293 } 2294 return 1; 2295 } 2296 2297 /* 2298 * A list of virtual cores for each physical CPU. 2299 * These are vcores that could run but their runner VCPU tasks are 2300 * (or may be) preempted. 2301 */ 2302 struct preempted_vcore_list { 2303 struct list_head list; 2304 spinlock_t lock; 2305 }; 2306 2307 static DEFINE_PER_CPU(struct preempted_vcore_list, preempted_vcores); 2308 2309 static void init_vcore_lists(void) 2310 { 2311 int cpu; 2312 2313 for_each_possible_cpu(cpu) { 2314 struct preempted_vcore_list *lp = &per_cpu(preempted_vcores, cpu); 2315 spin_lock_init(&lp->lock); 2316 INIT_LIST_HEAD(&lp->list); 2317 } 2318 } 2319 2320 static void kvmppc_vcore_preempt(struct kvmppc_vcore *vc) 2321 { 2322 struct preempted_vcore_list *lp = this_cpu_ptr(&preempted_vcores); 2323 2324 vc->vcore_state = VCORE_PREEMPT; 2325 vc->pcpu = smp_processor_id(); 2326 if (vc->num_threads < threads_per_vcore(vc->kvm)) { 2327 spin_lock(&lp->lock); 2328 list_add_tail(&vc->preempt_list, &lp->list); 2329 spin_unlock(&lp->lock); 2330 } 2331 2332 /* Start accumulating stolen time */ 2333 kvmppc_core_start_stolen(vc); 2334 } 2335 2336 static void kvmppc_vcore_end_preempt(struct kvmppc_vcore *vc) 2337 { 2338 struct preempted_vcore_list *lp; 2339 2340 kvmppc_core_end_stolen(vc); 2341 if (!list_empty(&vc->preempt_list)) { 2342 lp = &per_cpu(preempted_vcores, vc->pcpu); 2343 spin_lock(&lp->lock); 2344 list_del_init(&vc->preempt_list); 2345 spin_unlock(&lp->lock); 2346 } 2347 vc->vcore_state = VCORE_INACTIVE; 2348 } 2349 2350 /* 2351 * This stores information about the virtual cores currently 2352 * assigned to a physical core. 2353 */ 2354 struct core_info { 2355 int n_subcores; 2356 int max_subcore_threads; 2357 int total_threads; 2358 int subcore_threads[MAX_SUBCORES]; 2359 struct kvmppc_vcore *vc[MAX_SUBCORES]; 2360 }; 2361 2362 /* 2363 * This mapping means subcores 0 and 1 can use threads 0-3 and 4-7 2364 * respectively in 2-way micro-threading (split-core) mode on POWER8. 
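 *
 * Spelling the { 0, 4, 2, 6 } mapping below out informally:
 *
 *	2-way split: subcore 0 -> threads 0-3, subcore 1 -> threads 4-7
 *	4-way split: subcore 0 -> threads 0-1, subcore 1 -> threads 4-5,
 *		     subcore 2 -> threads 2-3, subcore 3 -> threads 6-7
 *
 * i.e. subcore_thread_map[sub] is the first hardware thread of subcore
 * "sub", and choosing 0, 4, 2, 6 makes the 2-way assignment a prefix of
 * the 4-way one, so the same table serves both split modes.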
2365 */ 2366 static int subcore_thread_map[MAX_SUBCORES] = { 0, 4, 2, 6 }; 2367 2368 static void init_core_info(struct core_info *cip, struct kvmppc_vcore *vc) 2369 { 2370 memset(cip, 0, sizeof(*cip)); 2371 cip->n_subcores = 1; 2372 cip->max_subcore_threads = vc->num_threads; 2373 cip->total_threads = vc->num_threads; 2374 cip->subcore_threads[0] = vc->num_threads; 2375 cip->vc[0] = vc; 2376 } 2377 2378 static bool subcore_config_ok(int n_subcores, int n_threads) 2379 { 2380 /* 2381 * POWER9 "SMT4" cores are permanently in what is effectively a 4-way split-core 2382 * mode, with one thread per subcore. 2383 */ 2384 if (cpu_has_feature(CPU_FTR_ARCH_300)) 2385 return n_subcores <= 4 && n_threads == 1; 2386 2387 /* On POWER8, can only dynamically split if unsplit to begin with */ 2388 if (n_subcores > 1 && threads_per_subcore < MAX_SMT_THREADS) 2389 return false; 2390 if (n_subcores > MAX_SUBCORES) 2391 return false; 2392 if (n_subcores > 1) { 2393 if (!(dynamic_mt_modes & 2)) 2394 n_subcores = 4; 2395 if (n_subcores > 2 && !(dynamic_mt_modes & 4)) 2396 return false; 2397 } 2398 2399 return n_subcores * roundup_pow_of_two(n_threads) <= MAX_SMT_THREADS; 2400 } 2401 2402 static void init_vcore_to_run(struct kvmppc_vcore *vc) 2403 { 2404 vc->entry_exit_map = 0; 2405 vc->in_guest = 0; 2406 vc->napping_threads = 0; 2407 vc->conferring_threads = 0; 2408 } 2409 2410 static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip) 2411 { 2412 int n_threads = vc->num_threads; 2413 int sub; 2414 2415 if (!cpu_has_feature(CPU_FTR_ARCH_207S)) 2416 return false; 2417 2418 /* POWER9 currently requires all threads to be in the same MMU mode */ 2419 if (cpu_has_feature(CPU_FTR_ARCH_300) && 2420 kvm_is_radix(vc->kvm) != kvm_is_radix(cip->vc[0]->kvm)) 2421 return false; 2422 2423 if (n_threads < cip->max_subcore_threads) 2424 n_threads = cip->max_subcore_threads; 2425 if (!subcore_config_ok(cip->n_subcores + 1, n_threads)) 2426 return false; 2427 cip->max_subcore_threads = n_threads; 2428 2429 sub = cip->n_subcores; 2430 ++cip->n_subcores; 2431 cip->total_threads += vc->num_threads; 2432 cip->subcore_threads[sub] = vc->num_threads; 2433 cip->vc[sub] = vc; 2434 init_vcore_to_run(vc); 2435 list_del_init(&vc->preempt_list); 2436 2437 return true; 2438 } 2439 2440 /* 2441 * Work out whether it is possible to piggyback the execution of 2442 * vcore *pvc onto the execution of the other vcores described in *cip. 
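 *
 * A hypothetical example of the check below: with target_threads = 8 and
 * a core_info already describing one subcore with 4 runnable threads, a
 * preempted vcore with 4 threads passes the total_threads test
 * (4 + 4 <= 8) and is then accepted only if can_dynamic_split() agrees
 * that a 2-subcore, 4-threads-per-subcore configuration is legal here
 * (e.g. dynamic_mt_modes permits 2-way splitting on a POWER8 host).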
2443 */ 2444 static bool can_piggyback(struct kvmppc_vcore *pvc, struct core_info *cip, 2445 int target_threads) 2446 { 2447 if (cip->total_threads + pvc->num_threads > target_threads) 2448 return false; 2449 2450 return can_dynamic_split(pvc, cip); 2451 } 2452 2453 static void prepare_threads(struct kvmppc_vcore *vc) 2454 { 2455 int i; 2456 struct kvm_vcpu *vcpu; 2457 2458 for_each_runnable_thread(i, vcpu, vc) { 2459 if (signal_pending(vcpu->arch.run_task)) 2460 vcpu->arch.ret = -EINTR; 2461 else if (vcpu->arch.vpa.update_pending || 2462 vcpu->arch.slb_shadow.update_pending || 2463 vcpu->arch.dtl.update_pending) 2464 vcpu->arch.ret = RESUME_GUEST; 2465 else 2466 continue; 2467 kvmppc_remove_runnable(vc, vcpu); 2468 wake_up(&vcpu->arch.cpu_run); 2469 } 2470 } 2471 2472 static void collect_piggybacks(struct core_info *cip, int target_threads) 2473 { 2474 struct preempted_vcore_list *lp = this_cpu_ptr(&preempted_vcores); 2475 struct kvmppc_vcore *pvc, *vcnext; 2476 2477 spin_lock(&lp->lock); 2478 list_for_each_entry_safe(pvc, vcnext, &lp->list, preempt_list) { 2479 if (!spin_trylock(&pvc->lock)) 2480 continue; 2481 prepare_threads(pvc); 2482 if (!pvc->n_runnable) { 2483 list_del_init(&pvc->preempt_list); 2484 if (pvc->runner == NULL) { 2485 pvc->vcore_state = VCORE_INACTIVE; 2486 kvmppc_core_end_stolen(pvc); 2487 } 2488 spin_unlock(&pvc->lock); 2489 continue; 2490 } 2491 if (!can_piggyback(pvc, cip, target_threads)) { 2492 spin_unlock(&pvc->lock); 2493 continue; 2494 } 2495 kvmppc_core_end_stolen(pvc); 2496 pvc->vcore_state = VCORE_PIGGYBACK; 2497 if (cip->total_threads >= target_threads) 2498 break; 2499 } 2500 spin_unlock(&lp->lock); 2501 } 2502 2503 static bool recheck_signals(struct core_info *cip) 2504 { 2505 int sub, i; 2506 struct kvm_vcpu *vcpu; 2507 2508 for (sub = 0; sub < cip->n_subcores; ++sub) 2509 for_each_runnable_thread(i, vcpu, cip->vc[sub]) 2510 if (signal_pending(vcpu->arch.run_task)) 2511 return true; 2512 return false; 2513 } 2514 2515 static void post_guest_process(struct kvmppc_vcore *vc, bool is_master) 2516 { 2517 int still_running = 0, i; 2518 u64 now; 2519 long ret; 2520 struct kvm_vcpu *vcpu; 2521 2522 spin_lock(&vc->lock); 2523 now = get_tb(); 2524 for_each_runnable_thread(i, vcpu, vc) { 2525 /* cancel pending dec exception if dec is positive */ 2526 if (now < vcpu->arch.dec_expires && 2527 kvmppc_core_pending_dec(vcpu)) 2528 kvmppc_core_dequeue_dec(vcpu); 2529 2530 trace_kvm_guest_exit(vcpu); 2531 2532 ret = RESUME_GUEST; 2533 if (vcpu->arch.trap) 2534 ret = kvmppc_handle_exit_hv(vcpu->arch.kvm_run, vcpu, 2535 vcpu->arch.run_task); 2536 2537 vcpu->arch.ret = ret; 2538 vcpu->arch.trap = 0; 2539 2540 if (is_kvmppc_resume_guest(vcpu->arch.ret)) { 2541 if (vcpu->arch.pending_exceptions) 2542 kvmppc_core_prepare_to_enter(vcpu); 2543 if (vcpu->arch.ceded) 2544 kvmppc_set_timer(vcpu); 2545 else 2546 ++still_running; 2547 } else { 2548 kvmppc_remove_runnable(vc, vcpu); 2549 wake_up(&vcpu->arch.cpu_run); 2550 } 2551 } 2552 if (!is_master) { 2553 if (still_running > 0) { 2554 kvmppc_vcore_preempt(vc); 2555 } else if (vc->runner) { 2556 vc->vcore_state = VCORE_PREEMPT; 2557 kvmppc_core_start_stolen(vc); 2558 } else { 2559 vc->vcore_state = VCORE_INACTIVE; 2560 } 2561 if (vc->n_runnable > 0 && vc->runner == NULL) { 2562 /* make sure there's a candidate runner awake */ 2563 i = -1; 2564 vcpu = next_runnable_thread(vc, &i); 2565 wake_up(&vcpu->arch.cpu_run); 2566 } 2567 } 2568 spin_unlock(&vc->lock); 2569 } 2570 2571 /* 2572 * Clear core from the list of active host cores as 
we are about to 2573 * enter the guest. Only do this if it is the primary thread of the 2574 * core (not if a subcore) that is entering the guest. 2575 */ 2576 static inline int kvmppc_clear_host_core(unsigned int cpu) 2577 { 2578 int core; 2579 2580 if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu)) 2581 return 0; 2582 /* 2583 * Memory barrier can be omitted here as we will do a smp_wmb() 2584 * later in kvmppc_start_thread and we need ensure that state is 2585 * visible to other CPUs only after we enter guest. 2586 */ 2587 core = cpu >> threads_shift; 2588 kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 0; 2589 return 0; 2590 } 2591 2592 /* 2593 * Advertise this core as an active host core since we exited the guest 2594 * Only need to do this if it is the primary thread of the core that is 2595 * exiting. 2596 */ 2597 static inline int kvmppc_set_host_core(unsigned int cpu) 2598 { 2599 int core; 2600 2601 if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu)) 2602 return 0; 2603 2604 /* 2605 * Memory barrier can be omitted here because we do a spin_unlock 2606 * immediately after this which provides the memory barrier. 2607 */ 2608 core = cpu >> threads_shift; 2609 kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 1; 2610 return 0; 2611 } 2612 2613 static void set_irq_happened(int trap) 2614 { 2615 switch (trap) { 2616 case BOOK3S_INTERRUPT_EXTERNAL: 2617 local_paca->irq_happened |= PACA_IRQ_EE; 2618 break; 2619 case BOOK3S_INTERRUPT_H_DOORBELL: 2620 local_paca->irq_happened |= PACA_IRQ_DBELL; 2621 break; 2622 case BOOK3S_INTERRUPT_HMI: 2623 local_paca->irq_happened |= PACA_IRQ_HMI; 2624 break; 2625 case BOOK3S_INTERRUPT_SYSTEM_RESET: 2626 replay_system_reset(); 2627 break; 2628 } 2629 } 2630 2631 /* 2632 * Run a set of guest threads on a physical core. 2633 * Called with vc->lock held. 2634 */ 2635 static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) 2636 { 2637 struct kvm_vcpu *vcpu; 2638 int i; 2639 int srcu_idx; 2640 struct core_info core_info; 2641 struct kvmppc_vcore *pvc; 2642 struct kvm_split_mode split_info, *sip; 2643 int split, subcore_size, active; 2644 int sub; 2645 bool thr0_done; 2646 unsigned long cmd_bit, stat_bit; 2647 int pcpu, thr; 2648 int target_threads; 2649 int controlled_threads; 2650 int trap; 2651 bool is_power8; 2652 bool hpt_on_radix; 2653 2654 /* 2655 * Remove from the list any threads that have a signal pending 2656 * or need a VPA update done 2657 */ 2658 prepare_threads(vc); 2659 2660 /* if the runner is no longer runnable, let the caller pick a new one */ 2661 if (vc->runner->arch.state != KVMPPC_VCPU_RUNNABLE) 2662 return; 2663 2664 /* 2665 * Initialize *vc. 2666 */ 2667 init_vcore_to_run(vc); 2668 vc->preempt_tb = TB_NIL; 2669 2670 /* 2671 * Number of threads that we will be controlling: the same as 2672 * the number of threads per subcore, except on POWER9, 2673 * where it's 1 because the threads are (mostly) independent. 2674 */ 2675 controlled_threads = threads_per_vcore(vc->kvm); 2676 2677 /* 2678 * Make sure we are running on primary threads, and that secondary 2679 * threads are offline. Also check if the number of threads in this 2680 * guest are greater than the current system threads per guest. 2681 * On POWER9, we need to be not in independent-threads mode if 2682 * this is a HPT guest on a radix host. 
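 *
 * Distilling the test just below (a restatement, not extra logic), the
 * whole vcore is bounced back to its vcpu tasks with -EBUSY when
 *
 *	(controlled_threads > 1 &&
 *	     (vc->num_threads > threads_per_subcore || !on_primary_thread()))
 *	|| (hpt_on_radix && independent-threads mode is in use)
 *
 * e.g. on a POWER8 host we must be on thread 0 of the subcore with the
 * sibling threads offline, and the vcore must not need more threads than
 * the subcore provides.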
2683 */ 2684 hpt_on_radix = radix_enabled() && !kvm_is_radix(vc->kvm); 2685 if (((controlled_threads > 1) && 2686 ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) || 2687 (hpt_on_radix && vc->kvm->arch.threads_indep)) { 2688 for_each_runnable_thread(i, vcpu, vc) { 2689 vcpu->arch.ret = -EBUSY; 2690 kvmppc_remove_runnable(vc, vcpu); 2691 wake_up(&vcpu->arch.cpu_run); 2692 } 2693 goto out; 2694 } 2695 2696 /* 2697 * See if we could run any other vcores on the physical core 2698 * along with this one. 2699 */ 2700 init_core_info(&core_info, vc); 2701 pcpu = smp_processor_id(); 2702 target_threads = controlled_threads; 2703 if (target_smt_mode && target_smt_mode < target_threads) 2704 target_threads = target_smt_mode; 2705 if (vc->num_threads < target_threads) 2706 collect_piggybacks(&core_info, target_threads); 2707 2708 /* 2709 * On radix, arrange for TLB flushing if necessary. 2710 * This has to be done before disabling interrupts since 2711 * it uses smp_call_function(). 2712 */ 2713 pcpu = smp_processor_id(); 2714 if (kvm_is_radix(vc->kvm)) { 2715 for (sub = 0; sub < core_info.n_subcores; ++sub) 2716 for_each_runnable_thread(i, vcpu, core_info.vc[sub]) 2717 kvmppc_prepare_radix_vcpu(vcpu, pcpu); 2718 } 2719 2720 /* 2721 * Hard-disable interrupts, and check resched flag and signals. 2722 * If we need to reschedule or deliver a signal, clean up 2723 * and return without going into the guest(s). 2724 * If the mmu_ready flag has been cleared, don't go into the 2725 * guest because that means a HPT resize operation is in progress. 2726 */ 2727 local_irq_disable(); 2728 hard_irq_disable(); 2729 if (lazy_irq_pending() || need_resched() || 2730 recheck_signals(&core_info) || !vc->kvm->arch.mmu_ready) { 2731 local_irq_enable(); 2732 vc->vcore_state = VCORE_INACTIVE; 2733 /* Unlock all except the primary vcore */ 2734 for (sub = 1; sub < core_info.n_subcores; ++sub) { 2735 pvc = core_info.vc[sub]; 2736 /* Put back on to the preempted vcores list */ 2737 kvmppc_vcore_preempt(pvc); 2738 spin_unlock(&pvc->lock); 2739 } 2740 for (i = 0; i < controlled_threads; ++i) 2741 kvmppc_release_hwthread(pcpu + i); 2742 return; 2743 } 2744 2745 kvmppc_clear_host_core(pcpu); 2746 2747 /* Decide on micro-threading (split-core) mode */ 2748 subcore_size = threads_per_subcore; 2749 cmd_bit = stat_bit = 0; 2750 split = core_info.n_subcores; 2751 sip = NULL; 2752 is_power8 = cpu_has_feature(CPU_FTR_ARCH_207S) 2753 && !cpu_has_feature(CPU_FTR_ARCH_300); 2754 2755 if (split > 1 || hpt_on_radix) { 2756 sip = &split_info; 2757 memset(&split_info, 0, sizeof(split_info)); 2758 for (sub = 0; sub < core_info.n_subcores; ++sub) 2759 split_info.vc[sub] = core_info.vc[sub]; 2760 2761 if (is_power8) { 2762 if (split == 2 && (dynamic_mt_modes & 2)) { 2763 cmd_bit = HID0_POWER8_1TO2LPAR; 2764 stat_bit = HID0_POWER8_2LPARMODE; 2765 } else { 2766 split = 4; 2767 cmd_bit = HID0_POWER8_1TO4LPAR; 2768 stat_bit = HID0_POWER8_4LPARMODE; 2769 } 2770 subcore_size = MAX_SMT_THREADS / split; 2771 split_info.rpr = mfspr(SPRN_RPR); 2772 split_info.pmmar = mfspr(SPRN_PMMAR); 2773 split_info.ldbar = mfspr(SPRN_LDBAR); 2774 split_info.subcore_size = subcore_size; 2775 } else { 2776 split_info.subcore_size = 1; 2777 if (hpt_on_radix) { 2778 /* Use the split_info for LPCR/LPIDR changes */ 2779 split_info.lpcr_req = vc->lpcr; 2780 split_info.lpidr_req = vc->kvm->arch.lpid; 2781 split_info.host_lpcr = vc->kvm->arch.host_lpcr; 2782 split_info.do_set = 1; 2783 } 2784 } 2785 2786 /* order writes to split_info before kvm_split_mode 
pointer */ 2787 smp_wmb(); 2788 } 2789 2790 for (thr = 0; thr < controlled_threads; ++thr) { 2791 paca[pcpu + thr].kvm_hstate.tid = thr; 2792 paca[pcpu + thr].kvm_hstate.napping = 0; 2793 paca[pcpu + thr].kvm_hstate.kvm_split_mode = sip; 2794 } 2795 2796 /* Initiate micro-threading (split-core) on POWER8 if required */ 2797 if (cmd_bit) { 2798 unsigned long hid0 = mfspr(SPRN_HID0); 2799 2800 hid0 |= cmd_bit | HID0_POWER8_DYNLPARDIS; 2801 mb(); 2802 mtspr(SPRN_HID0, hid0); 2803 isync(); 2804 for (;;) { 2805 hid0 = mfspr(SPRN_HID0); 2806 if (hid0 & stat_bit) 2807 break; 2808 cpu_relax(); 2809 } 2810 } 2811 2812 /* Start all the threads */ 2813 active = 0; 2814 for (sub = 0; sub < core_info.n_subcores; ++sub) { 2815 thr = is_power8 ? subcore_thread_map[sub] : sub; 2816 thr0_done = false; 2817 active |= 1 << thr; 2818 pvc = core_info.vc[sub]; 2819 pvc->pcpu = pcpu + thr; 2820 for_each_runnable_thread(i, vcpu, pvc) { 2821 kvmppc_start_thread(vcpu, pvc); 2822 kvmppc_create_dtl_entry(vcpu, pvc); 2823 trace_kvm_guest_enter(vcpu); 2824 if (!vcpu->arch.ptid) 2825 thr0_done = true; 2826 active |= 1 << (thr + vcpu->arch.ptid); 2827 } 2828 /* 2829 * We need to start the first thread of each subcore 2830 * even if it doesn't have a vcpu. 2831 */ 2832 if (!thr0_done) 2833 kvmppc_start_thread(NULL, pvc); 2834 thr += pvc->num_threads; 2835 } 2836 2837 /* 2838 * Ensure that split_info.do_nap is set after setting 2839 * the vcore pointer in the PACA of the secondaries. 2840 */ 2841 smp_mb(); 2842 2843 /* 2844 * When doing micro-threading, poke the inactive threads as well. 2845 * This gets them to the nap instruction after kvm_do_nap, 2846 * which reduces the time taken to unsplit later. 2847 * For POWER9 HPT guest on radix host, we need all the secondary 2848 * threads woken up so they can do the LPCR/LPIDR change. 2849 */ 2850 if (cmd_bit || hpt_on_radix) { 2851 split_info.do_nap = 1; /* ask secondaries to nap when done */ 2852 for (thr = 1; thr < threads_per_subcore; ++thr) 2853 if (!(active & (1 << thr))) 2854 kvmppc_ipi_thread(pcpu + thr); 2855 } 2856 2857 vc->vcore_state = VCORE_RUNNING; 2858 preempt_disable(); 2859 2860 trace_kvmppc_run_core(vc, 0); 2861 2862 for (sub = 0; sub < core_info.n_subcores; ++sub) 2863 spin_unlock(&core_info.vc[sub]->lock); 2864 2865 /* 2866 * Interrupts will be enabled once we get into the guest, 2867 * so tell lockdep that we're about to enable interrupts. 
2868 */ 2869 trace_hardirqs_on(); 2870 2871 guest_enter(); 2872 2873 srcu_idx = srcu_read_lock(&vc->kvm->srcu); 2874 2875 trap = __kvmppc_vcore_entry(); 2876 2877 srcu_read_unlock(&vc->kvm->srcu, srcu_idx); 2878 2879 guest_exit(); 2880 2881 trace_hardirqs_off(); 2882 set_irq_happened(trap); 2883 2884 spin_lock(&vc->lock); 2885 /* prevent other vcpu threads from doing kvmppc_start_thread() now */ 2886 vc->vcore_state = VCORE_EXITING; 2887 2888 /* wait for secondary threads to finish writing their state to memory */ 2889 kvmppc_wait_for_nap(controlled_threads); 2890 2891 /* Return to whole-core mode if we split the core earlier */ 2892 if (cmd_bit) { 2893 unsigned long hid0 = mfspr(SPRN_HID0); 2894 unsigned long loops = 0; 2895 2896 hid0 &= ~HID0_POWER8_DYNLPARDIS; 2897 stat_bit = HID0_POWER8_2LPARMODE | HID0_POWER8_4LPARMODE; 2898 mb(); 2899 mtspr(SPRN_HID0, hid0); 2900 isync(); 2901 for (;;) { 2902 hid0 = mfspr(SPRN_HID0); 2903 if (!(hid0 & stat_bit)) 2904 break; 2905 cpu_relax(); 2906 ++loops; 2907 } 2908 } else if (hpt_on_radix) { 2909 /* Wait for all threads to have seen final sync */ 2910 for (thr = 1; thr < controlled_threads; ++thr) { 2911 while (paca[pcpu + thr].kvm_hstate.kvm_split_mode) { 2912 HMT_low(); 2913 barrier(); 2914 } 2915 HMT_medium(); 2916 } 2917 } 2918 split_info.do_nap = 0; 2919 2920 kvmppc_set_host_core(pcpu); 2921 2922 local_irq_enable(); 2923 2924 /* Let secondaries go back to the offline loop */ 2925 for (i = 0; i < controlled_threads; ++i) { 2926 kvmppc_release_hwthread(pcpu + i); 2927 if (sip && sip->napped[i]) 2928 kvmppc_ipi_thread(pcpu + i); 2929 cpumask_clear_cpu(pcpu + i, &vc->kvm->arch.cpu_in_guest); 2930 } 2931 2932 spin_unlock(&vc->lock); 2933 2934 /* make sure updates to secondary vcpu structs are visible now */ 2935 smp_mb(); 2936 2937 for (sub = 0; sub < core_info.n_subcores; ++sub) { 2938 pvc = core_info.vc[sub]; 2939 post_guest_process(pvc, pvc == vc); 2940 } 2941 2942 spin_lock(&vc->lock); 2943 preempt_enable(); 2944 2945 out: 2946 vc->vcore_state = VCORE_INACTIVE; 2947 trace_kvmppc_run_core(vc, 1); 2948 } 2949 2950 /* 2951 * Wait for some other vcpu thread to execute us, and 2952 * wake us up when we need to handle something in the host. 
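 *
 * For context (see kvmppc_run_vcpu() below): only one vcpu task per vcore
 * acts as the "runner" and calls kvmppc_run_core() on behalf of the whole
 * vcore; the other vcpu tasks park themselves here.  Schematically:
 *
 *	runner task			other vcpu tasks
 *	-----------------------		----------------------------
 *	vc->runner = vcpu;		kvmppc_wait_for_exec(vc, ...);
 *	kvmppc_run_core(vc);		  sleep in schedule() until
 *	post_guest_process() wakes	  woken, then re-check the
 *	  any vcpus that must go	  vcpu and vcore state
 *	  back out to the host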
2953 */ 2954 static void kvmppc_wait_for_exec(struct kvmppc_vcore *vc, 2955 struct kvm_vcpu *vcpu, int wait_state) 2956 { 2957 DEFINE_WAIT(wait); 2958 2959 prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state); 2960 if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) { 2961 spin_unlock(&vc->lock); 2962 schedule(); 2963 spin_lock(&vc->lock); 2964 } 2965 finish_wait(&vcpu->arch.cpu_run, &wait); 2966 } 2967 2968 static void grow_halt_poll_ns(struct kvmppc_vcore *vc) 2969 { 2970 /* 10us base */ 2971 if (vc->halt_poll_ns == 0 && halt_poll_ns_grow) 2972 vc->halt_poll_ns = 10000; 2973 else 2974 vc->halt_poll_ns *= halt_poll_ns_grow; 2975 } 2976 2977 static void shrink_halt_poll_ns(struct kvmppc_vcore *vc) 2978 { 2979 if (halt_poll_ns_shrink == 0) 2980 vc->halt_poll_ns = 0; 2981 else 2982 vc->halt_poll_ns /= halt_poll_ns_shrink; 2983 } 2984 2985 #ifdef CONFIG_KVM_XICS 2986 static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu) 2987 { 2988 if (!xive_enabled()) 2989 return false; 2990 return vcpu->arch.xive_saved_state.pipr < 2991 vcpu->arch.xive_saved_state.cppr; 2992 } 2993 #else 2994 static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu) 2995 { 2996 return false; 2997 } 2998 #endif /* CONFIG_KVM_XICS */ 2999 3000 static bool kvmppc_vcpu_woken(struct kvm_vcpu *vcpu) 3001 { 3002 if (vcpu->arch.pending_exceptions || vcpu->arch.prodded || 3003 kvmppc_doorbell_pending(vcpu) || xive_interrupt_pending(vcpu)) 3004 return true; 3005 3006 return false; 3007 } 3008 3009 /* 3010 * Check to see if any of the runnable vcpus on the vcore have pending 3011 * exceptions or are no longer ceded 3012 */ 3013 static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc) 3014 { 3015 struct kvm_vcpu *vcpu; 3016 int i; 3017 3018 for_each_runnable_thread(i, vcpu, vc) { 3019 if (!vcpu->arch.ceded || kvmppc_vcpu_woken(vcpu)) 3020 return 1; 3021 } 3022 3023 return 0; 3024 } 3025 3026 /* 3027 * All the vcpus in this vcore are idle, so wait for a decrementer 3028 * or external interrupt to one of the vcpus. vc->lock is held. 
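 *
 * The poll window vc->halt_poll_ns adapts in the "Adjust poll time" block
 * at the end of this function: if we ended up sleeping but the total block
 * time was still below the halt_poll_ns limit, the window grows via
 * grow_halt_poll_ns(); if we blocked for longer than that limit, it
 * shrinks via shrink_halt_poll_ns().  A hypothetical sequence, assuming
 * halt_poll_ns_grow = 2 and a non-zero halt_poll_ns limit:
 *
 *	a run of short blocks:	0 -> 10us -> 20us -> 40us -> ...
 *				(capped at halt_poll_ns)
 *	one long block:		reset to 0, or divided by
 *				halt_poll_ns_shrink if that is non-zero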
3029 */ 3030 static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) 3031 { 3032 ktime_t cur, start_poll, start_wait; 3033 int do_sleep = 1; 3034 u64 block_ns; 3035 DECLARE_SWAITQUEUE(wait); 3036 3037 /* Poll for pending exceptions and ceded state */ 3038 cur = start_poll = ktime_get(); 3039 if (vc->halt_poll_ns) { 3040 ktime_t stop = ktime_add_ns(start_poll, vc->halt_poll_ns); 3041 ++vc->runner->stat.halt_attempted_poll; 3042 3043 vc->vcore_state = VCORE_POLLING; 3044 spin_unlock(&vc->lock); 3045 3046 do { 3047 if (kvmppc_vcore_check_block(vc)) { 3048 do_sleep = 0; 3049 break; 3050 } 3051 cur = ktime_get(); 3052 } while (single_task_running() && ktime_before(cur, stop)); 3053 3054 spin_lock(&vc->lock); 3055 vc->vcore_state = VCORE_INACTIVE; 3056 3057 if (!do_sleep) { 3058 ++vc->runner->stat.halt_successful_poll; 3059 goto out; 3060 } 3061 } 3062 3063 prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE); 3064 3065 if (kvmppc_vcore_check_block(vc)) { 3066 finish_swait(&vc->wq, &wait); 3067 do_sleep = 0; 3068 /* If we polled, count this as a successful poll */ 3069 if (vc->halt_poll_ns) 3070 ++vc->runner->stat.halt_successful_poll; 3071 goto out; 3072 } 3073 3074 start_wait = ktime_get(); 3075 3076 vc->vcore_state = VCORE_SLEEPING; 3077 trace_kvmppc_vcore_blocked(vc, 0); 3078 spin_unlock(&vc->lock); 3079 schedule(); 3080 finish_swait(&vc->wq, &wait); 3081 spin_lock(&vc->lock); 3082 vc->vcore_state = VCORE_INACTIVE; 3083 trace_kvmppc_vcore_blocked(vc, 1); 3084 ++vc->runner->stat.halt_successful_wait; 3085 3086 cur = ktime_get(); 3087 3088 out: 3089 block_ns = ktime_to_ns(cur) - ktime_to_ns(start_poll); 3090 3091 /* Attribute wait time */ 3092 if (do_sleep) { 3093 vc->runner->stat.halt_wait_ns += 3094 ktime_to_ns(cur) - ktime_to_ns(start_wait); 3095 /* Attribute failed poll time */ 3096 if (vc->halt_poll_ns) 3097 vc->runner->stat.halt_poll_fail_ns += 3098 ktime_to_ns(start_wait) - 3099 ktime_to_ns(start_poll); 3100 } else { 3101 /* Attribute successful poll time */ 3102 if (vc->halt_poll_ns) 3103 vc->runner->stat.halt_poll_success_ns += 3104 ktime_to_ns(cur) - 3105 ktime_to_ns(start_poll); 3106 } 3107 3108 /* Adjust poll time */ 3109 if (halt_poll_ns) { 3110 if (block_ns <= vc->halt_poll_ns) 3111 ; 3112 /* We slept and blocked for longer than the max halt time */ 3113 else if (vc->halt_poll_ns && block_ns > halt_poll_ns) 3114 shrink_halt_poll_ns(vc); 3115 /* We slept and our poll time is too small */ 3116 else if (vc->halt_poll_ns < halt_poll_ns && 3117 block_ns < halt_poll_ns) 3118 grow_halt_poll_ns(vc); 3119 if (vc->halt_poll_ns > halt_poll_ns) 3120 vc->halt_poll_ns = halt_poll_ns; 3121 } else 3122 vc->halt_poll_ns = 0; 3123 3124 trace_kvmppc_vcore_wakeup(do_sleep, block_ns); 3125 } 3126 3127 static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu) 3128 { 3129 int r = 0; 3130 struct kvm *kvm = vcpu->kvm; 3131 3132 mutex_lock(&kvm->lock); 3133 if (!kvm->arch.mmu_ready) { 3134 if (!kvm_is_radix(kvm)) 3135 r = kvmppc_hv_setup_htab_rma(vcpu); 3136 if (!r) { 3137 if (cpu_has_feature(CPU_FTR_ARCH_300)) 3138 kvmppc_setup_partition_table(kvm); 3139 kvm->arch.mmu_ready = 1; 3140 } 3141 } 3142 mutex_unlock(&kvm->lock); 3143 return r; 3144 } 3145 3146 static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) 3147 { 3148 int n_ceded, i, r; 3149 struct kvmppc_vcore *vc; 3150 struct kvm_vcpu *v; 3151 3152 trace_kvmppc_run_vcpu_enter(vcpu); 3153 3154 kvm_run->exit_reason = 0; 3155 vcpu->arch.ret = RESUME_GUEST; 3156 vcpu->arch.trap = 0; 3157 kvmppc_update_vpas(vcpu); 3158 3159 /* 3160 * 
Synchronize with other threads in this virtual core 3161 */ 3162 vc = vcpu->arch.vcore; 3163 spin_lock(&vc->lock); 3164 vcpu->arch.ceded = 0; 3165 vcpu->arch.run_task = current; 3166 vcpu->arch.kvm_run = kvm_run; 3167 vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb()); 3168 vcpu->arch.state = KVMPPC_VCPU_RUNNABLE; 3169 vcpu->arch.busy_preempt = TB_NIL; 3170 WRITE_ONCE(vc->runnable_threads[vcpu->arch.ptid], vcpu); 3171 ++vc->n_runnable; 3172 3173 /* 3174 * This happens the first time this is called for a vcpu. 3175 * If the vcore is already running, we may be able to start 3176 * this thread straight away and have it join in. 3177 */ 3178 if (!signal_pending(current)) { 3179 if (vc->vcore_state == VCORE_PIGGYBACK) { 3180 if (spin_trylock(&vc->lock)) { 3181 if (vc->vcore_state == VCORE_RUNNING && 3182 !VCORE_IS_EXITING(vc)) { 3183 kvmppc_create_dtl_entry(vcpu, vc); 3184 kvmppc_start_thread(vcpu, vc); 3185 trace_kvm_guest_enter(vcpu); 3186 } 3187 spin_unlock(&vc->lock); 3188 } 3189 } else if (vc->vcore_state == VCORE_RUNNING && 3190 !VCORE_IS_EXITING(vc)) { 3191 kvmppc_create_dtl_entry(vcpu, vc); 3192 kvmppc_start_thread(vcpu, vc); 3193 trace_kvm_guest_enter(vcpu); 3194 } else if (vc->vcore_state == VCORE_SLEEPING) { 3195 swake_up(&vc->wq); 3196 } 3197 3198 } 3199 3200 while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE && 3201 !signal_pending(current)) { 3202 /* See if the MMU is ready to go */ 3203 if (!vcpu->kvm->arch.mmu_ready) { 3204 spin_unlock(&vc->lock); 3205 r = kvmhv_setup_mmu(vcpu); 3206 spin_lock(&vc->lock); 3207 if (r) { 3208 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; 3209 kvm_run->fail_entry. 3210 hardware_entry_failure_reason = 0; 3211 vcpu->arch.ret = r; 3212 break; 3213 } 3214 } 3215 3216 if (vc->vcore_state == VCORE_PREEMPT && vc->runner == NULL) 3217 kvmppc_vcore_end_preempt(vc); 3218 3219 if (vc->vcore_state != VCORE_INACTIVE) { 3220 kvmppc_wait_for_exec(vc, vcpu, TASK_INTERRUPTIBLE); 3221 continue; 3222 } 3223 for_each_runnable_thread(i, v, vc) { 3224 kvmppc_core_prepare_to_enter(v); 3225 if (signal_pending(v->arch.run_task)) { 3226 kvmppc_remove_runnable(vc, v); 3227 v->stat.signal_exits++; 3228 v->arch.kvm_run->exit_reason = KVM_EXIT_INTR; 3229 v->arch.ret = -EINTR; 3230 wake_up(&v->arch.cpu_run); 3231 } 3232 } 3233 if (!vc->n_runnable || vcpu->arch.state != KVMPPC_VCPU_RUNNABLE) 3234 break; 3235 n_ceded = 0; 3236 for_each_runnable_thread(i, v, vc) { 3237 if (!kvmppc_vcpu_woken(v)) 3238 n_ceded += v->arch.ceded; 3239 else 3240 v->arch.ceded = 0; 3241 } 3242 vc->runner = vcpu; 3243 if (n_ceded == vc->n_runnable) { 3244 kvmppc_vcore_blocked(vc); 3245 } else if (need_resched()) { 3246 kvmppc_vcore_preempt(vc); 3247 /* Let something else run */ 3248 cond_resched_lock(&vc->lock); 3249 if (vc->vcore_state == VCORE_PREEMPT) 3250 kvmppc_vcore_end_preempt(vc); 3251 } else { 3252 kvmppc_run_core(vc); 3253 } 3254 vc->runner = NULL; 3255 } 3256 3257 while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE && 3258 (vc->vcore_state == VCORE_RUNNING || 3259 vc->vcore_state == VCORE_EXITING || 3260 vc->vcore_state == VCORE_PIGGYBACK)) 3261 kvmppc_wait_for_exec(vc, vcpu, TASK_UNINTERRUPTIBLE); 3262 3263 if (vc->vcore_state == VCORE_PREEMPT && vc->runner == NULL) 3264 kvmppc_vcore_end_preempt(vc); 3265 3266 if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) { 3267 kvmppc_remove_runnable(vc, vcpu); 3268 vcpu->stat.signal_exits++; 3269 kvm_run->exit_reason = KVM_EXIT_INTR; 3270 vcpu->arch.ret = -EINTR; 3271 } 3272 3273 if (vc->n_runnable && vc->vcore_state == VCORE_INACTIVE) { 3274 /* Wake up some 
vcpu to run the core */ 3275 i = -1; 3276 v = next_runnable_thread(vc, &i); 3277 wake_up(&v->arch.cpu_run); 3278 } 3279 3280 trace_kvmppc_run_vcpu_exit(vcpu, kvm_run); 3281 spin_unlock(&vc->lock); 3282 return vcpu->arch.ret; 3283 } 3284 3285 static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) 3286 { 3287 int r; 3288 int srcu_idx; 3289 unsigned long ebb_regs[3] = {}; /* shut up GCC */ 3290 unsigned long user_tar = 0; 3291 unsigned int user_vrsave; 3292 struct kvm *kvm; 3293 3294 if (!vcpu->arch.sane) { 3295 run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3296 return -EINVAL; 3297 } 3298 3299 /* 3300 * Don't allow entry with a suspended transaction, because 3301 * the guest entry/exit code will lose it. 3302 * If the guest has TM enabled, save away their TM-related SPRs 3303 * (they will get restored by the TM unavailable interrupt). 3304 */ 3305 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM 3306 if (cpu_has_feature(CPU_FTR_TM) && current->thread.regs && 3307 (current->thread.regs->msr & MSR_TM)) { 3308 if (MSR_TM_ACTIVE(current->thread.regs->msr)) { 3309 run->exit_reason = KVM_EXIT_FAIL_ENTRY; 3310 run->fail_entry.hardware_entry_failure_reason = 0; 3311 return -EINVAL; 3312 } 3313 /* Enable TM so we can read the TM SPRs */ 3314 mtmsr(mfmsr() | MSR_TM); 3315 current->thread.tm_tfhar = mfspr(SPRN_TFHAR); 3316 current->thread.tm_tfiar = mfspr(SPRN_TFIAR); 3317 current->thread.tm_texasr = mfspr(SPRN_TEXASR); 3318 current->thread.regs->msr &= ~MSR_TM; 3319 } 3320 #endif 3321 3322 kvmppc_core_prepare_to_enter(vcpu); 3323 3324 /* No need to go into the guest when all we'll do is come back out */ 3325 if (signal_pending(current)) { 3326 run->exit_reason = KVM_EXIT_INTR; 3327 return -EINTR; 3328 } 3329 3330 kvm = vcpu->kvm; 3331 atomic_inc(&kvm->arch.vcpus_running); 3332 /* Order vcpus_running vs. 
mmu_ready, see kvmppc_alloc_reset_hpt */ 3333 smp_mb(); 3334 3335 flush_all_to_thread(current); 3336 3337 /* Save userspace EBB and other register values */ 3338 if (cpu_has_feature(CPU_FTR_ARCH_207S)) { 3339 ebb_regs[0] = mfspr(SPRN_EBBHR); 3340 ebb_regs[1] = mfspr(SPRN_EBBRR); 3341 ebb_regs[2] = mfspr(SPRN_BESCR); 3342 user_tar = mfspr(SPRN_TAR); 3343 } 3344 user_vrsave = mfspr(SPRN_VRSAVE); 3345 3346 vcpu->arch.wqp = &vcpu->arch.vcore->wq; 3347 vcpu->arch.pgdir = current->mm->pgd; 3348 vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; 3349 3350 do { 3351 r = kvmppc_run_vcpu(run, vcpu); 3352 3353 if (run->exit_reason == KVM_EXIT_PAPR_HCALL && 3354 !(vcpu->arch.shregs.msr & MSR_PR)) { 3355 trace_kvm_hcall_enter(vcpu); 3356 r = kvmppc_pseries_do_hcall(vcpu); 3357 trace_kvm_hcall_exit(vcpu, r); 3358 kvmppc_core_prepare_to_enter(vcpu); 3359 } else if (r == RESUME_PAGE_FAULT) { 3360 srcu_idx = srcu_read_lock(&kvm->srcu); 3361 r = kvmppc_book3s_hv_page_fault(run, vcpu, 3362 vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); 3363 srcu_read_unlock(&kvm->srcu, srcu_idx); 3364 } else if (r == RESUME_PASSTHROUGH) { 3365 if (WARN_ON(xive_enabled())) 3366 r = H_SUCCESS; 3367 else 3368 r = kvmppc_xics_rm_complete(vcpu, 0); 3369 } 3370 } while (is_kvmppc_resume_guest(r)); 3371 3372 /* Restore userspace EBB and other register values */ 3373 if (cpu_has_feature(CPU_FTR_ARCH_207S)) { 3374 mtspr(SPRN_EBBHR, ebb_regs[0]); 3375 mtspr(SPRN_EBBRR, ebb_regs[1]); 3376 mtspr(SPRN_BESCR, ebb_regs[2]); 3377 mtspr(SPRN_TAR, user_tar); 3378 mtspr(SPRN_FSCR, current->thread.fscr); 3379 } 3380 mtspr(SPRN_VRSAVE, user_vrsave); 3381 3382 vcpu->arch.state = KVMPPC_VCPU_NOTREADY; 3383 atomic_dec(&kvm->arch.vcpus_running); 3384 return r; 3385 } 3386 3387 static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps, 3388 int shift, int sllp) 3389 { 3390 (*sps)->page_shift = shift; 3391 (*sps)->slb_enc = sllp; 3392 (*sps)->enc[0].page_shift = shift; 3393 (*sps)->enc[0].pte_enc = kvmppc_pgsize_lp_encoding(shift, shift); 3394 /* 3395 * Add 16MB MPSS support (may get filtered out by userspace) 3396 */ 3397 if (shift != 24) { 3398 int penc = kvmppc_pgsize_lp_encoding(shift, 24); 3399 if (penc != -1) { 3400 (*sps)->enc[1].page_shift = 24; 3401 (*sps)->enc[1].pte_enc = penc; 3402 } 3403 } 3404 (*sps)++; 3405 } 3406 3407 static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm, 3408 struct kvm_ppc_smmu_info *info) 3409 { 3410 struct kvm_ppc_one_seg_page_size *sps; 3411 3412 /* 3413 * POWER7, POWER8 and POWER9 all support 32 storage keys for data. 3414 * POWER7 doesn't support keys for instruction accesses, 3415 * POWER8 and POWER9 do. 3416 */ 3417 info->data_keys = 32; 3418 info->instr_keys = cpu_has_feature(CPU_FTR_ARCH_207S) ? 32 : 0; 3419 3420 /* POWER7, 8 and 9 all have 1T segments and 32-entry SLB */ 3421 info->flags = KVM_PPC_PAGE_SIZES_REAL | KVM_PPC_1T_SEGMENTS; 3422 info->slb_size = 32; 3423 3424 /* We only support these sizes for now, and no muti-size segments */ 3425 sps = &info->sps[0]; 3426 kvmppc_add_seg_page_size(&sps, 12, 0); 3427 kvmppc_add_seg_page_size(&sps, 16, SLB_VSID_L | SLB_VSID_LP_01); 3428 kvmppc_add_seg_page_size(&sps, 24, SLB_VSID_L); 3429 3430 return 0; 3431 } 3432 3433 /* 3434 * Get (and clear) the dirty memory log for a memory slot. 
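 *
 * The memslot's dirty_bitmap is treated as two halves here (this is a
 * summary of the code below, not a different scheme):
 *
 *	first half:	bits accumulated ahead of time, e.g. when the host
 *			writes or pages out guest memory
 *	second half:	scratch buffer rebuilt on every GET_DIRTY_LOG call
 *			from the HPT/radix harvest, the first half (which
 *			is xchg()'d back to zero), and the VPA/DTL areas,
 *			and then copied out to userspace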
3435 */ 3436 static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm, 3437 struct kvm_dirty_log *log) 3438 { 3439 struct kvm_memslots *slots; 3440 struct kvm_memory_slot *memslot; 3441 int i, r; 3442 unsigned long n; 3443 unsigned long *buf, *p; 3444 struct kvm_vcpu *vcpu; 3445 3446 mutex_lock(&kvm->slots_lock); 3447 3448 r = -EINVAL; 3449 if (log->slot >= KVM_USER_MEM_SLOTS) 3450 goto out; 3451 3452 slots = kvm_memslots(kvm); 3453 memslot = id_to_memslot(slots, log->slot); 3454 r = -ENOENT; 3455 if (!memslot->dirty_bitmap) 3456 goto out; 3457 3458 /* 3459 * Use second half of bitmap area because both HPT and radix 3460 * accumulate bits in the first half. 3461 */ 3462 n = kvm_dirty_bitmap_bytes(memslot); 3463 buf = memslot->dirty_bitmap + n / sizeof(long); 3464 memset(buf, 0, n); 3465 3466 if (kvm_is_radix(kvm)) 3467 r = kvmppc_hv_get_dirty_log_radix(kvm, memslot, buf); 3468 else 3469 r = kvmppc_hv_get_dirty_log_hpt(kvm, memslot, buf); 3470 if (r) 3471 goto out; 3472 3473 /* 3474 * We accumulate dirty bits in the first half of the 3475 * memslot's dirty_bitmap area, for when pages are paged 3476 * out or modified by the host directly. Pick up these 3477 * bits and add them to the map. 3478 */ 3479 p = memslot->dirty_bitmap; 3480 for (i = 0; i < n / sizeof(long); ++i) 3481 buf[i] |= xchg(&p[i], 0); 3482 3483 /* Harvest dirty bits from VPA and DTL updates */ 3484 /* Note: we never modify the SLB shadow buffer areas */ 3485 kvm_for_each_vcpu(i, vcpu, kvm) { 3486 spin_lock(&vcpu->arch.vpa_update_lock); 3487 kvmppc_harvest_vpa_dirty(&vcpu->arch.vpa, memslot, buf); 3488 kvmppc_harvest_vpa_dirty(&vcpu->arch.dtl, memslot, buf); 3489 spin_unlock(&vcpu->arch.vpa_update_lock); 3490 } 3491 3492 r = -EFAULT; 3493 if (copy_to_user(log->dirty_bitmap, buf, n)) 3494 goto out; 3495 3496 r = 0; 3497 out: 3498 mutex_unlock(&kvm->slots_lock); 3499 return r; 3500 } 3501 3502 static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *free, 3503 struct kvm_memory_slot *dont) 3504 { 3505 if (!dont || free->arch.rmap != dont->arch.rmap) { 3506 vfree(free->arch.rmap); 3507 free->arch.rmap = NULL; 3508 } 3509 } 3510 3511 static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot, 3512 unsigned long npages) 3513 { 3514 slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap)); 3515 if (!slot->arch.rmap) 3516 return -ENOMEM; 3517 3518 return 0; 3519 } 3520 3521 static int kvmppc_core_prepare_memory_region_hv(struct kvm *kvm, 3522 struct kvm_memory_slot *memslot, 3523 const struct kvm_userspace_memory_region *mem) 3524 { 3525 return 0; 3526 } 3527 3528 static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm, 3529 const struct kvm_userspace_memory_region *mem, 3530 const struct kvm_memory_slot *old, 3531 const struct kvm_memory_slot *new) 3532 { 3533 unsigned long npages = mem->memory_size >> PAGE_SHIFT; 3534 3535 /* 3536 * If we are making a new memslot, it might make 3537 * some address that was previously cached as emulated 3538 * MMIO be no longer emulated MMIO, so invalidate 3539 * all the caches of emulated MMIO translations. 3540 */ 3541 if (npages) 3542 atomic64_inc(&kvm->arch.mmio_update); 3543 } 3544 3545 /* 3546 * Update LPCR values in kvm->arch and in vcores. 3547 * Caller must hold kvm->lock. 
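 *
 * Typical usage, taken from a caller in this file: mask selects the bits
 * the caller owns and lpcr gives their new values, so
 * kvmppc_switch_mmu_to_radix() below effectively does
 *
 *	kvmppc_update_lpcr(kvm, LPCR_UPRT | LPCR_GTSE | LPCR_HR,
 *			   LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
 *
 * i.e. set the radix-related bits and clear LPCR_VPM1 while leaving every
 * other LPCR bit untouched, both in kvm->arch.lpcr and in each vcore.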
3548 */ 3549 void kvmppc_update_lpcr(struct kvm *kvm, unsigned long lpcr, unsigned long mask) 3550 { 3551 long int i; 3552 u32 cores_done = 0; 3553 3554 if ((kvm->arch.lpcr & mask) == lpcr) 3555 return; 3556 3557 kvm->arch.lpcr = (kvm->arch.lpcr & ~mask) | lpcr; 3558 3559 for (i = 0; i < KVM_MAX_VCORES; ++i) { 3560 struct kvmppc_vcore *vc = kvm->arch.vcores[i]; 3561 if (!vc) 3562 continue; 3563 spin_lock(&vc->lock); 3564 vc->lpcr = (vc->lpcr & ~mask) | lpcr; 3565 spin_unlock(&vc->lock); 3566 if (++cores_done >= kvm->arch.online_vcores) 3567 break; 3568 } 3569 } 3570 3571 static void kvmppc_mmu_destroy_hv(struct kvm_vcpu *vcpu) 3572 { 3573 return; 3574 } 3575 3576 void kvmppc_setup_partition_table(struct kvm *kvm) 3577 { 3578 unsigned long dw0, dw1; 3579 3580 if (!kvm_is_radix(kvm)) { 3581 /* PS field - page size for VRMA */ 3582 dw0 = ((kvm->arch.vrma_slb_v & SLB_VSID_L) >> 1) | 3583 ((kvm->arch.vrma_slb_v & SLB_VSID_LP) << 1); 3584 /* HTABSIZE and HTABORG fields */ 3585 dw0 |= kvm->arch.sdr1; 3586 3587 /* Second dword as set by userspace */ 3588 dw1 = kvm->arch.process_table; 3589 } else { 3590 dw0 = PATB_HR | radix__get_tree_size() | 3591 __pa(kvm->arch.pgtable) | RADIX_PGD_INDEX_SIZE; 3592 dw1 = PATB_GR | kvm->arch.process_table; 3593 } 3594 3595 mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1); 3596 } 3597 3598 /* 3599 * Set up HPT (hashed page table) and RMA (real-mode area). 3600 * Must be called with kvm->lock held. 3601 */ 3602 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) 3603 { 3604 int err = 0; 3605 struct kvm *kvm = vcpu->kvm; 3606 unsigned long hva; 3607 struct kvm_memory_slot *memslot; 3608 struct vm_area_struct *vma; 3609 unsigned long lpcr = 0, senc; 3610 unsigned long psize, porder; 3611 int srcu_idx; 3612 3613 /* Allocate hashed page table (if not done already) and reset it */ 3614 if (!kvm->arch.hpt.virt) { 3615 int order = KVM_DEFAULT_HPT_ORDER; 3616 struct kvm_hpt_info info; 3617 3618 err = kvmppc_allocate_hpt(&info, order); 3619 /* If we get here, it means userspace didn't specify a 3620 * size explicitly. So, try successively smaller 3621 * sizes if the default failed. 
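 *
 * For example, if the allocation at KVM_DEFAULT_HPT_ORDER fails with
 * -ENOMEM, the loop below retries at order - 1 (half the size), then
 * order - 2, and so on, giving up only once the order drops below
 * PPC_MIN_HPT_ORDER or the failure is something other than -ENOMEM.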
*/ 3622 while ((err == -ENOMEM) && --order >= PPC_MIN_HPT_ORDER) 3623 err = kvmppc_allocate_hpt(&info, order); 3624 3625 if (err < 0) { 3626 pr_err("KVM: Couldn't alloc HPT\n"); 3627 goto out; 3628 } 3629 3630 kvmppc_set_hpt(kvm, &info); 3631 } 3632 3633 /* Look up the memslot for guest physical address 0 */ 3634 srcu_idx = srcu_read_lock(&kvm->srcu); 3635 memslot = gfn_to_memslot(kvm, 0); 3636 3637 /* We must have some memory at 0 by now */ 3638 err = -EINVAL; 3639 if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) 3640 goto out_srcu; 3641 3642 /* Look up the VMA for the start of this memory slot */ 3643 hva = memslot->userspace_addr; 3644 down_read(¤t->mm->mmap_sem); 3645 vma = find_vma(current->mm, hva); 3646 if (!vma || vma->vm_start > hva || (vma->vm_flags & VM_IO)) 3647 goto up_out; 3648 3649 psize = vma_kernel_pagesize(vma); 3650 porder = __ilog2(psize); 3651 3652 up_read(¤t->mm->mmap_sem); 3653 3654 /* We can handle 4k, 64k or 16M pages in the VRMA */ 3655 err = -EINVAL; 3656 if (!(psize == 0x1000 || psize == 0x10000 || 3657 psize == 0x1000000)) 3658 goto out_srcu; 3659 3660 senc = slb_pgsize_encoding(psize); 3661 kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T | 3662 (VRMA_VSID << SLB_VSID_SHIFT_1T); 3663 /* Create HPTEs in the hash page table for the VRMA */ 3664 kvmppc_map_vrma(vcpu, memslot, porder); 3665 3666 /* Update VRMASD field in the LPCR */ 3667 if (!cpu_has_feature(CPU_FTR_ARCH_300)) { 3668 /* the -4 is to account for senc values starting at 0x10 */ 3669 lpcr = senc << (LPCR_VRMASD_SH - 4); 3670 kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD); 3671 } 3672 3673 /* Order updates to kvm->arch.lpcr etc. vs. mmu_ready */ 3674 smp_wmb(); 3675 err = 0; 3676 out_srcu: 3677 srcu_read_unlock(&kvm->srcu, srcu_idx); 3678 out: 3679 return err; 3680 3681 up_out: 3682 up_read(¤t->mm->mmap_sem); 3683 goto out_srcu; 3684 } 3685 3686 /* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */ 3687 int kvmppc_switch_mmu_to_hpt(struct kvm *kvm) 3688 { 3689 kvmppc_free_radix(kvm); 3690 kvmppc_update_lpcr(kvm, LPCR_VPM1, 3691 LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR); 3692 kvmppc_rmap_reset(kvm); 3693 kvm->arch.radix = 0; 3694 kvm->arch.process_table = 0; 3695 return 0; 3696 } 3697 3698 /* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */ 3699 int kvmppc_switch_mmu_to_radix(struct kvm *kvm) 3700 { 3701 int err; 3702 3703 err = kvmppc_init_vm_radix(kvm); 3704 if (err) 3705 return err; 3706 3707 kvmppc_free_hpt(&kvm->arch.hpt); 3708 kvmppc_update_lpcr(kvm, LPCR_UPRT | LPCR_GTSE | LPCR_HR, 3709 LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR); 3710 kvm->arch.radix = 1; 3711 return 0; 3712 } 3713 3714 #ifdef CONFIG_KVM_XICS 3715 /* 3716 * Allocate a per-core structure for managing state about which cores are 3717 * running in the host versus the guest and for exchanging data between 3718 * real mode KVM and CPU running in the host. 3719 * This is only done for the first VM. 3720 * The allocated structure stays even if all VMs have stopped. 3721 * It is only freed when the kvm-hv module is unloaded. 3722 * It's OK for this routine to fail, we just don't support host 3723 * core operations like redirecting H_IPI wakeups. 3724 */ 3725 void kvmppc_alloc_host_rm_ops(void) 3726 { 3727 struct kvmppc_host_rm_ops *ops; 3728 unsigned long l_ops; 3729 int cpu, core; 3730 int size; 3731 3732 /* Not the first time here ? 
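 *
 * Note that the structure is published further down with cmpxchg64():
 * the contents are made visible with smp_wmb() first, and if another CPU
 * has installed its own copy into kvmppc_host_rm_ops_hv in the meantime,
 * we just free ours and return.  A rough outline of that pattern:
 *
 *	ops = kzalloc(...);  ...fill in ops...
 *	smp_wmb();
 *	if (cmpxchg64(&kvmppc_host_rm_ops_hv, 0, ops) != 0)
 *		kfree(ops);		(somebody beat us to it)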
*/ 3733 if (kvmppc_host_rm_ops_hv != NULL) 3734 return; 3735 3736 ops = kzalloc(sizeof(struct kvmppc_host_rm_ops), GFP_KERNEL); 3737 if (!ops) 3738 return; 3739 3740 size = cpu_nr_cores() * sizeof(struct kvmppc_host_rm_core); 3741 ops->rm_core = kzalloc(size, GFP_KERNEL); 3742 3743 if (!ops->rm_core) { 3744 kfree(ops); 3745 return; 3746 } 3747 3748 cpus_read_lock(); 3749 3750 for (cpu = 0; cpu < nr_cpu_ids; cpu += threads_per_core) { 3751 if (!cpu_online(cpu)) 3752 continue; 3753 3754 core = cpu >> threads_shift; 3755 ops->rm_core[core].rm_state.in_host = 1; 3756 } 3757 3758 ops->vcpu_kick = kvmppc_fast_vcpu_kick_hv; 3759 3760 /* 3761 * Make the contents of the kvmppc_host_rm_ops structure visible 3762 * to other CPUs before we assign it to the global variable. 3763 * Do an atomic assignment (no locks used here), but if someone 3764 * beats us to it, just free our copy and return. 3765 */ 3766 smp_wmb(); 3767 l_ops = (unsigned long) ops; 3768 3769 if (cmpxchg64((unsigned long *)&kvmppc_host_rm_ops_hv, 0, l_ops)) { 3770 cpus_read_unlock(); 3771 kfree(ops->rm_core); 3772 kfree(ops); 3773 return; 3774 } 3775 3776 cpuhp_setup_state_nocalls_cpuslocked(CPUHP_KVM_PPC_BOOK3S_PREPARE, 3777 "ppc/kvm_book3s:prepare", 3778 kvmppc_set_host_core, 3779 kvmppc_clear_host_core); 3780 cpus_read_unlock(); 3781 } 3782 3783 void kvmppc_free_host_rm_ops(void) 3784 { 3785 if (kvmppc_host_rm_ops_hv) { 3786 cpuhp_remove_state_nocalls(CPUHP_KVM_PPC_BOOK3S_PREPARE); 3787 kfree(kvmppc_host_rm_ops_hv->rm_core); 3788 kfree(kvmppc_host_rm_ops_hv); 3789 kvmppc_host_rm_ops_hv = NULL; 3790 } 3791 } 3792 #endif 3793 3794 static int kvmppc_core_init_vm_hv(struct kvm *kvm) 3795 { 3796 unsigned long lpcr, lpid; 3797 char buf[32]; 3798 int ret; 3799 3800 /* Allocate the guest's logical partition ID */ 3801 3802 lpid = kvmppc_alloc_lpid(); 3803 if ((long)lpid < 0) 3804 return -ENOMEM; 3805 kvm->arch.lpid = lpid; 3806 3807 kvmppc_alloc_host_rm_ops(); 3808 3809 /* 3810 * Since we don't flush the TLB when tearing down a VM, 3811 * and this lpid might have previously been used, 3812 * make sure we flush on each core before running the new VM. 3813 * On POWER9, the tlbie in mmu_partition_table_set_entry() 3814 * does this flush for us. 3815 */ 3816 if (!cpu_has_feature(CPU_FTR_ARCH_300)) 3817 cpumask_setall(&kvm->arch.need_tlb_flush); 3818 3819 /* Start out with the default set of hcalls enabled */ 3820 memcpy(kvm->arch.enabled_hcalls, default_enabled_hcalls, 3821 sizeof(kvm->arch.enabled_hcalls)); 3822 3823 if (!cpu_has_feature(CPU_FTR_ARCH_300)) 3824 kvm->arch.host_sdr1 = mfspr(SPRN_SDR1); 3825 3826 /* Init LPCR for virtual RMA mode */ 3827 kvm->arch.host_lpid = mfspr(SPRN_LPID); 3828 kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR); 3829 lpcr &= LPCR_PECE | LPCR_LPES; 3830 lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE | 3831 LPCR_VPM0 | LPCR_VPM1; 3832 kvm->arch.vrma_slb_v = SLB_VSID_B_1T | 3833 (VRMA_VSID << SLB_VSID_SHIFT_1T); 3834 /* On POWER8 turn on online bit to enable PURR/SPURR */ 3835 if (cpu_has_feature(CPU_FTR_ARCH_207S)) 3836 lpcr |= LPCR_ONL; 3837 /* 3838 * On POWER9, VPM0 bit is reserved (VPM0=1 behaviour is assumed) 3839 * Set HVICE bit to enable hypervisor virtualization interrupts. 
3840 * Set HEIC to prevent OS interrupts to go to hypervisor (should 3841 * be unnecessary but better safe than sorry in case we re-enable 3842 * EE in HV mode with this LPCR still set) 3843 */ 3844 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 3845 lpcr &= ~LPCR_VPM0; 3846 lpcr |= LPCR_HVICE | LPCR_HEIC; 3847 3848 /* 3849 * If xive is enabled, we route 0x500 interrupts directly 3850 * to the guest. 3851 */ 3852 if (xive_enabled()) 3853 lpcr |= LPCR_LPES; 3854 } 3855 3856 /* 3857 * If the host uses radix, the guest starts out as radix. 3858 */ 3859 if (radix_enabled()) { 3860 kvm->arch.radix = 1; 3861 kvm->arch.mmu_ready = 1; 3862 lpcr &= ~LPCR_VPM1; 3863 lpcr |= LPCR_UPRT | LPCR_GTSE | LPCR_HR; 3864 ret = kvmppc_init_vm_radix(kvm); 3865 if (ret) { 3866 kvmppc_free_lpid(kvm->arch.lpid); 3867 return ret; 3868 } 3869 kvmppc_setup_partition_table(kvm); 3870 } 3871 3872 kvm->arch.lpcr = lpcr; 3873 3874 /* Initialization for future HPT resizes */ 3875 kvm->arch.resize_hpt = NULL; 3876 3877 /* 3878 * Work out how many sets the TLB has, for the use of 3879 * the TLB invalidation loop in book3s_hv_rmhandlers.S. 3880 */ 3881 if (radix_enabled()) 3882 kvm->arch.tlb_sets = POWER9_TLB_SETS_RADIX; /* 128 */ 3883 else if (cpu_has_feature(CPU_FTR_ARCH_300)) 3884 kvm->arch.tlb_sets = POWER9_TLB_SETS_HASH; /* 256 */ 3885 else if (cpu_has_feature(CPU_FTR_ARCH_207S)) 3886 kvm->arch.tlb_sets = POWER8_TLB_SETS; /* 512 */ 3887 else 3888 kvm->arch.tlb_sets = POWER7_TLB_SETS; /* 128 */ 3889 3890 /* 3891 * Track that we now have a HV mode VM active. This blocks secondary 3892 * CPU threads from coming online. 3893 * On POWER9, we only need to do this if the "indep_threads_mode" 3894 * module parameter has been set to N. 3895 */ 3896 if (cpu_has_feature(CPU_FTR_ARCH_300)) 3897 kvm->arch.threads_indep = indep_threads_mode; 3898 if (!kvm->arch.threads_indep) 3899 kvm_hv_vm_activated(); 3900 3901 /* 3902 * Initialize smt_mode depending on processor. 3903 * POWER8 and earlier have to use "strict" threading, where 3904 * all vCPUs in a vcore have to run on the same (sub)core, 3905 * whereas on POWER9 the threads can each run a different 3906 * guest. 
3907 */ 3908 if (!cpu_has_feature(CPU_FTR_ARCH_300)) 3909 kvm->arch.smt_mode = threads_per_subcore; 3910 else 3911 kvm->arch.smt_mode = 1; 3912 kvm->arch.emul_smt_mode = 1; 3913 3914 /* 3915 * Create a debugfs directory for the VM 3916 */ 3917 snprintf(buf, sizeof(buf), "vm%d", current->pid); 3918 kvm->arch.debugfs_dir = debugfs_create_dir(buf, kvm_debugfs_dir); 3919 if (!IS_ERR_OR_NULL(kvm->arch.debugfs_dir)) 3920 kvmppc_mmu_debugfs_init(kvm); 3921 3922 return 0; 3923 } 3924 3925 static void kvmppc_free_vcores(struct kvm *kvm) 3926 { 3927 long int i; 3928 3929 for (i = 0; i < KVM_MAX_VCORES; ++i) 3930 kfree(kvm->arch.vcores[i]); 3931 kvm->arch.online_vcores = 0; 3932 } 3933 3934 static void kvmppc_core_destroy_vm_hv(struct kvm *kvm) 3935 { 3936 debugfs_remove_recursive(kvm->arch.debugfs_dir); 3937 3938 if (!kvm->arch.threads_indep) 3939 kvm_hv_vm_deactivated(); 3940 3941 kvmppc_free_vcores(kvm); 3942 3943 kvmppc_free_lpid(kvm->arch.lpid); 3944 3945 if (kvm_is_radix(kvm)) 3946 kvmppc_free_radix(kvm); 3947 else 3948 kvmppc_free_hpt(&kvm->arch.hpt); 3949 3950 kvmppc_free_pimap(kvm); 3951 } 3952 3953 /* We don't need to emulate any privileged instructions or dcbz */ 3954 static int kvmppc_core_emulate_op_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, 3955 unsigned int inst, int *advance) 3956 { 3957 return EMULATE_FAIL; 3958 } 3959 3960 static int kvmppc_core_emulate_mtspr_hv(struct kvm_vcpu *vcpu, int sprn, 3961 ulong spr_val) 3962 { 3963 return EMULATE_FAIL; 3964 } 3965 3966 static int kvmppc_core_emulate_mfspr_hv(struct kvm_vcpu *vcpu, int sprn, 3967 ulong *spr_val) 3968 { 3969 return EMULATE_FAIL; 3970 } 3971 3972 static int kvmppc_core_check_processor_compat_hv(void) 3973 { 3974 if (!cpu_has_feature(CPU_FTR_HVMODE) || 3975 !cpu_has_feature(CPU_FTR_ARCH_206)) 3976 return -EIO; 3977 3978 return 0; 3979 } 3980 3981 #ifdef CONFIG_KVM_XICS 3982 3983 void kvmppc_free_pimap(struct kvm *kvm) 3984 { 3985 kfree(kvm->arch.pimap); 3986 } 3987 3988 static struct kvmppc_passthru_irqmap *kvmppc_alloc_pimap(void) 3989 { 3990 return kzalloc(sizeof(struct kvmppc_passthru_irqmap), GFP_KERNEL); 3991 } 3992 3993 static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) 3994 { 3995 struct irq_desc *desc; 3996 struct kvmppc_irq_map *irq_map; 3997 struct kvmppc_passthru_irqmap *pimap; 3998 struct irq_chip *chip; 3999 int i, rc = 0; 4000 4001 if (!kvm_irq_bypass) 4002 return 1; 4003 4004 desc = irq_to_desc(host_irq); 4005 if (!desc) 4006 return -EIO; 4007 4008 mutex_lock(&kvm->lock); 4009 4010 pimap = kvm->arch.pimap; 4011 if (pimap == NULL) { 4012 /* First call, allocate structure to hold IRQ map */ 4013 pimap = kvmppc_alloc_pimap(); 4014 if (pimap == NULL) { 4015 mutex_unlock(&kvm->lock); 4016 return -ENOMEM; 4017 } 4018 kvm->arch.pimap = pimap; 4019 } 4020 4021 /* 4022 * For now, we only support interrupts for which the EOI operation 4023 * is an OPAL call followed by a write to XIRR, since that's 4024 * what our real-mode EOI code does, or a XIVE interrupt 4025 */ 4026 chip = irq_data_get_irq_chip(&desc->irq_data); 4027 if (!chip || !(is_pnv_opal_msi(chip) || is_xive_irq(chip))) { 4028 pr_warn("kvmppc_set_passthru_irq_hv: Could not assign IRQ map for (%d,%d)\n", 4029 host_irq, guest_gsi); 4030 mutex_unlock(&kvm->lock); 4031 return -ENOENT; 4032 } 4033 4034 /* 4035 * See if we already have an entry for this guest IRQ number. 4036 * If it's mapped to a hardware IRQ number, that's an error, 4037 * otherwise re-use this entry. 
4038 */ 4039 for (i = 0; i < pimap->n_mapped; i++) { 4040 if (guest_gsi == pimap->mapped[i].v_hwirq) { 4041 if (pimap->mapped[i].r_hwirq) { 4042 mutex_unlock(&kvm->lock); 4043 return -EINVAL; 4044 } 4045 break; 4046 } 4047 } 4048 4049 if (i == KVMPPC_PIRQ_MAPPED) { 4050 mutex_unlock(&kvm->lock); 4051 return -EAGAIN; /* table is full */ 4052 } 4053 4054 irq_map = &pimap->mapped[i]; 4055 4056 irq_map->v_hwirq = guest_gsi; 4057 irq_map->desc = desc; 4058 4059 /* 4060 * Order the above two stores before the next to serialize with 4061 * the KVM real mode handler. 4062 */ 4063 smp_wmb(); 4064 irq_map->r_hwirq = desc->irq_data.hwirq; 4065 4066 if (i == pimap->n_mapped) 4067 pimap->n_mapped++; 4068 4069 if (xive_enabled()) 4070 rc = kvmppc_xive_set_mapped(kvm, guest_gsi, desc); 4071 else 4072 kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq); 4073 if (rc) 4074 irq_map->r_hwirq = 0; 4075 4076 mutex_unlock(&kvm->lock); 4077 4078 return 0; 4079 } 4080 4081 static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) 4082 { 4083 struct irq_desc *desc; 4084 struct kvmppc_passthru_irqmap *pimap; 4085 int i, rc = 0; 4086 4087 if (!kvm_irq_bypass) 4088 return 0; 4089 4090 desc = irq_to_desc(host_irq); 4091 if (!desc) 4092 return -EIO; 4093 4094 mutex_lock(&kvm->lock); 4095 if (!kvm->arch.pimap) 4096 goto unlock; 4097 4098 pimap = kvm->arch.pimap; 4099 4100 for (i = 0; i < pimap->n_mapped; i++) { 4101 if (guest_gsi == pimap->mapped[i].v_hwirq) 4102 break; 4103 } 4104 4105 if (i == pimap->n_mapped) { 4106 mutex_unlock(&kvm->lock); 4107 return -ENODEV; 4108 } 4109 4110 if (xive_enabled()) 4111 rc = kvmppc_xive_clr_mapped(kvm, guest_gsi, pimap->mapped[i].desc); 4112 else 4113 kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq); 4114 4115 /* invalidate the entry (what do do on error from the above ?) */ 4116 pimap->mapped[i].r_hwirq = 0; 4117 4118 /* 4119 * We don't free this structure even when the count goes to 4120 * zero. The structure is freed when we destroy the VM. 4121 */ 4122 unlock: 4123 mutex_unlock(&kvm->lock); 4124 return rc; 4125 } 4126 4127 static int kvmppc_irq_bypass_add_producer_hv(struct irq_bypass_consumer *cons, 4128 struct irq_bypass_producer *prod) 4129 { 4130 int ret = 0; 4131 struct kvm_kernel_irqfd *irqfd = 4132 container_of(cons, struct kvm_kernel_irqfd, consumer); 4133 4134 irqfd->producer = prod; 4135 4136 ret = kvmppc_set_passthru_irq(irqfd->kvm, prod->irq, irqfd->gsi); 4137 if (ret) 4138 pr_info("kvmppc_set_passthru_irq (irq %d, gsi %d) fails: %d\n", 4139 prod->irq, irqfd->gsi, ret); 4140 4141 return ret; 4142 } 4143 4144 static void kvmppc_irq_bypass_del_producer_hv(struct irq_bypass_consumer *cons, 4145 struct irq_bypass_producer *prod) 4146 { 4147 int ret; 4148 struct kvm_kernel_irqfd *irqfd = 4149 container_of(cons, struct kvm_kernel_irqfd, consumer); 4150 4151 irqfd->producer = NULL; 4152 4153 /* 4154 * When producer of consumer is unregistered, we change back to 4155 * default external interrupt handling mode - KVM real mode 4156 * will switch back to host. 
4157 */ 4158 ret = kvmppc_clr_passthru_irq(irqfd->kvm, prod->irq, irqfd->gsi); 4159 if (ret) 4160 pr_warn("kvmppc_clr_passthru_irq (irq %d, gsi %d) fails: %d\n", 4161 prod->irq, irqfd->gsi, ret); 4162 } 4163 #endif 4164 4165 static long kvm_arch_vm_ioctl_hv(struct file *filp, 4166 unsigned int ioctl, unsigned long arg) 4167 { 4168 struct kvm *kvm __maybe_unused = filp->private_data; 4169 void __user *argp = (void __user *)arg; 4170 long r; 4171 4172 switch (ioctl) { 4173 4174 case KVM_PPC_ALLOCATE_HTAB: { 4175 u32 htab_order; 4176 4177 r = -EFAULT; 4178 if (get_user(htab_order, (u32 __user *)argp)) 4179 break; 4180 r = kvmppc_alloc_reset_hpt(kvm, htab_order); 4181 if (r) 4182 break; 4183 r = 0; 4184 break; 4185 } 4186 4187 case KVM_PPC_GET_HTAB_FD: { 4188 struct kvm_get_htab_fd ghf; 4189 4190 r = -EFAULT; 4191 if (copy_from_user(&ghf, argp, sizeof(ghf))) 4192 break; 4193 r = kvm_vm_ioctl_get_htab_fd(kvm, &ghf); 4194 break; 4195 } 4196 4197 case KVM_PPC_RESIZE_HPT_PREPARE: { 4198 struct kvm_ppc_resize_hpt rhpt; 4199 4200 r = -EFAULT; 4201 if (copy_from_user(&rhpt, argp, sizeof(rhpt))) 4202 break; 4203 4204 r = kvm_vm_ioctl_resize_hpt_prepare(kvm, &rhpt); 4205 break; 4206 } 4207 4208 case KVM_PPC_RESIZE_HPT_COMMIT: { 4209 struct kvm_ppc_resize_hpt rhpt; 4210 4211 r = -EFAULT; 4212 if (copy_from_user(&rhpt, argp, sizeof(rhpt))) 4213 break; 4214 4215 r = kvm_vm_ioctl_resize_hpt_commit(kvm, &rhpt); 4216 break; 4217 } 4218 4219 default: 4220 r = -ENOTTY; 4221 } 4222 4223 return r; 4224 } 4225 4226 /* 4227 * List of hcall numbers to enable by default. 4228 * For compatibility with old userspace, we enable by default 4229 * all hcalls that were implemented before the hcall-enabling 4230 * facility was added. Note this list should not include H_RTAS. 4231 */ 4232 static unsigned int default_hcall_list[] = { 4233 H_REMOVE, 4234 H_ENTER, 4235 H_READ, 4236 H_PROTECT, 4237 H_BULK_REMOVE, 4238 H_GET_TCE, 4239 H_PUT_TCE, 4240 H_SET_DABR, 4241 H_SET_XDABR, 4242 H_CEDE, 4243 H_PROD, 4244 H_CONFER, 4245 H_REGISTER_VPA, 4246 #ifdef CONFIG_KVM_XICS 4247 H_EOI, 4248 H_CPPR, 4249 H_IPI, 4250 H_IPOLL, 4251 H_XIRR, 4252 H_XIRR_X, 4253 #endif 4254 0 4255 }; 4256 4257 static void init_default_hcalls(void) 4258 { 4259 int i; 4260 unsigned int hcall; 4261 4262 for (i = 0; default_hcall_list[i]; ++i) { 4263 hcall = default_hcall_list[i]; 4264 WARN_ON(!kvmppc_hcall_impl_hv(hcall)); 4265 __set_bit(hcall / 4, default_enabled_hcalls); 4266 } 4267 } 4268 4269 static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg) 4270 { 4271 unsigned long lpcr; 4272 int radix; 4273 int err; 4274 4275 /* If not on a POWER9, reject it */ 4276 if (!cpu_has_feature(CPU_FTR_ARCH_300)) 4277 return -ENODEV; 4278 4279 /* If any unknown flags set, reject it */ 4280 if (cfg->flags & ~(KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE)) 4281 return -EINVAL; 4282 4283 /* GR (guest radix) bit in process_table field must match */ 4284 radix = !!(cfg->flags & KVM_PPC_MMUV3_RADIX); 4285 if (!!(cfg->process_table & PATB_GR) != radix) 4286 return -EINVAL; 4287 4288 /* Process table size field must be reasonable, i.e. <= 24 */ 4289 if ((cfg->process_table & PRTS_MASK) > 24) 4290 return -EINVAL; 4291 4292 /* We can change a guest to/from radix now, if the host is radix */ 4293 if (radix && !radix_enabled()) 4294 return -EINVAL; 4295 4296 mutex_lock(&kvm->lock); 4297 if (radix != kvm_is_radix(kvm)) { 4298 if (kvm->arch.mmu_ready) { 4299 kvm->arch.mmu_ready = 0; 4300 /* order mmu_ready vs. 
/*
 * List of hcall numbers to enable by default.
 * For compatibility with old userspace, we enable by default
 * all hcalls that were implemented before the hcall-enabling
 * facility was added. Note this list should not include H_RTAS.
 */
static unsigned int default_hcall_list[] = {
	H_REMOVE,
	H_ENTER,
	H_READ,
	H_PROTECT,
	H_BULK_REMOVE,
	H_GET_TCE,
	H_PUT_TCE,
	H_SET_DABR,
	H_SET_XDABR,
	H_CEDE,
	H_PROD,
	H_CONFER,
	H_REGISTER_VPA,
#ifdef CONFIG_KVM_XICS
	H_EOI,
	H_CPPR,
	H_IPI,
	H_IPOLL,
	H_XIRR,
	H_XIRR_X,
#endif
	0
};

static void init_default_hcalls(void)
{
	int i;
	unsigned int hcall;

	for (i = 0; default_hcall_list[i]; ++i) {
		hcall = default_hcall_list[i];
		WARN_ON(!kvmppc_hcall_impl_hv(hcall));
		__set_bit(hcall / 4, default_enabled_hcalls);
	}
}

static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
{
	unsigned long lpcr;
	int radix;
	int err;

	/* If not on a POWER9, reject it */
	if (!cpu_has_feature(CPU_FTR_ARCH_300))
		return -ENODEV;

	/* If any unknown flags set, reject it */
	if (cfg->flags & ~(KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE))
		return -EINVAL;

	/* GR (guest radix) bit in process_table field must match */
	radix = !!(cfg->flags & KVM_PPC_MMUV3_RADIX);
	if (!!(cfg->process_table & PATB_GR) != radix)
		return -EINVAL;

	/* Process table size field must be reasonable, i.e. <= 24 */
	if ((cfg->process_table & PRTS_MASK) > 24)
		return -EINVAL;

	/* We can change a guest to/from radix now, if the host is radix */
	if (radix && !radix_enabled())
		return -EINVAL;

	mutex_lock(&kvm->lock);
	if (radix != kvm_is_radix(kvm)) {
		if (kvm->arch.mmu_ready) {
			kvm->arch.mmu_ready = 0;
			/* order mmu_ready vs. vcpus_running */
			smp_mb();
			if (atomic_read(&kvm->arch.vcpus_running)) {
				kvm->arch.mmu_ready = 1;
				err = -EBUSY;
				goto out_unlock;
			}
		}
		if (radix)
			err = kvmppc_switch_mmu_to_radix(kvm);
		else
			err = kvmppc_switch_mmu_to_hpt(kvm);
		if (err)
			goto out_unlock;
	}

	kvm->arch.process_table = cfg->process_table;
	kvmppc_setup_partition_table(kvm);

	lpcr = (cfg->flags & KVM_PPC_MMUV3_GTSE) ? LPCR_GTSE : 0;
	kvmppc_update_lpcr(kvm, lpcr, LPCR_GTSE);
	err = 0;

 out_unlock:
	mutex_unlock(&kvm->lock);
	return err;
}

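/*
 * Schematic of the userspace side of kvmhv_configure_mmu() (assumptions:
 * "vmfd" comes from KVM_CREATE_VM, and "proc_tbl" stands for the value
 * the guest supplied via H_REGISTER_PROC_TBL, i.e. the process-table
 * base with the GR bit and size field already encoded):
 *
 *	struct kvm_ppc_mmuv3_cfg cfg = {
 *		.flags		= KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE,
 *		.process_table	= proc_tbl,
 *	};
 *	if (ioctl(vmfd, KVM_PPC_CONFIGURE_V3_MMU, &cfg) < 0)
 *		perror("KVM_PPC_CONFIGURE_V3_MMU");
 */
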
static struct kvmppc_ops kvm_ops_hv = {
	.get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv,
	.set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv,
	.get_one_reg = kvmppc_get_one_reg_hv,
	.set_one_reg = kvmppc_set_one_reg_hv,
	.vcpu_load = kvmppc_core_vcpu_load_hv,
	.vcpu_put = kvmppc_core_vcpu_put_hv,
	.set_msr = kvmppc_set_msr_hv,
	.vcpu_run = kvmppc_vcpu_run_hv,
	.vcpu_create = kvmppc_core_vcpu_create_hv,
	.vcpu_free = kvmppc_core_vcpu_free_hv,
	.check_requests = kvmppc_core_check_requests_hv,
	.get_dirty_log = kvm_vm_ioctl_get_dirty_log_hv,
	.flush_memslot = kvmppc_core_flush_memslot_hv,
	.prepare_memory_region = kvmppc_core_prepare_memory_region_hv,
	.commit_memory_region = kvmppc_core_commit_memory_region_hv,
	.unmap_hva = kvm_unmap_hva_hv,
	.unmap_hva_range = kvm_unmap_hva_range_hv,
	.age_hva = kvm_age_hva_hv,
	.test_age_hva = kvm_test_age_hva_hv,
	.set_spte_hva = kvm_set_spte_hva_hv,
	.mmu_destroy = kvmppc_mmu_destroy_hv,
	.free_memslot = kvmppc_core_free_memslot_hv,
	.create_memslot = kvmppc_core_create_memslot_hv,
	.init_vm = kvmppc_core_init_vm_hv,
	.destroy_vm = kvmppc_core_destroy_vm_hv,
	.get_smmu_info = kvm_vm_ioctl_get_smmu_info_hv,
	.emulate_op = kvmppc_core_emulate_op_hv,
	.emulate_mtspr = kvmppc_core_emulate_mtspr_hv,
	.emulate_mfspr = kvmppc_core_emulate_mfspr_hv,
	.fast_vcpu_kick = kvmppc_fast_vcpu_kick_hv,
	.arch_vm_ioctl = kvm_arch_vm_ioctl_hv,
	.hcall_implemented = kvmppc_hcall_impl_hv,
#ifdef CONFIG_KVM_XICS
	.irq_bypass_add_producer = kvmppc_irq_bypass_add_producer_hv,
	.irq_bypass_del_producer = kvmppc_irq_bypass_del_producer_hv,
#endif
	.configure_mmu = kvmhv_configure_mmu,
	.get_rmmu_info = kvmhv_get_rmmu_info,
	.set_smt_mode = kvmhv_set_smt_mode,
};

static int kvm_init_subcore_bitmap(void)
{
	int i, j;
	int nr_cores = cpu_nr_cores();
	struct sibling_subcore_state *sibling_subcore_state;

	for (i = 0; i < nr_cores; i++) {
		int first_cpu = i * threads_per_core;
		int node = cpu_to_node(first_cpu);

		/* Ignore if it is already allocated. */
		if (paca[first_cpu].sibling_subcore_state)
			continue;

		sibling_subcore_state =
			kmalloc_node(sizeof(struct sibling_subcore_state),
				     GFP_KERNEL, node);
		if (!sibling_subcore_state)
			return -ENOMEM;

		memset(sibling_subcore_state, 0,
		       sizeof(struct sibling_subcore_state));

		for (j = 0; j < threads_per_core; j++) {
			int cpu = first_cpu + j;

			paca[cpu].sibling_subcore_state = sibling_subcore_state;
		}
	}
	return 0;
}

static int kvmppc_radix_possible(void)
{
	return cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled();
}

static int kvmppc_book3s_init_hv(void)
{
	int r;
	/*
	 * FIXME!! Do we need to check on all cpus ?
	 */
	r = kvmppc_core_check_processor_compat_hv();
	if (r < 0)
		return -ENODEV;

	r = kvm_init_subcore_bitmap();
	if (r)
		return r;

	/*
	 * We need a way of accessing the XICS interrupt controller,
	 * either directly, via paca[cpu].kvm_hstate.xics_phys, or
	 * indirectly, via OPAL.
	 */
#ifdef CONFIG_SMP
	if (!xive_enabled() && !local_paca->kvm_hstate.xics_phys) {
		struct device_node *np;

		np = of_find_compatible_node(NULL, NULL, "ibm,opal-intc");
		if (!np) {
			pr_err("KVM-HV: Cannot determine method for accessing XICS\n");
			return -ENODEV;
		}
	}
#endif

	kvm_ops_hv.owner = THIS_MODULE;
	kvmppc_hv_ops = &kvm_ops_hv;

	init_default_hcalls();

	init_vcore_lists();

	r = kvmppc_mmu_hv_init();
	if (r)
		return r;

	if (kvmppc_radix_possible())
		r = kvmppc_radix_init();
	return r;
}

static void kvmppc_book3s_exit_hv(void)
{
	kvmppc_free_host_rm_ops();
	if (kvmppc_radix_possible())
		kvmppc_radix_exit();
	kvmppc_hv_ops = NULL;
}

module_init(kvmppc_book3s_init_hv);
module_exit(kvmppc_book3s_exit_hv);
MODULE_LICENSE("GPL");
MODULE_ALIAS_MISCDEV(KVM_MINOR);
MODULE_ALIAS("devname:kvm");