// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2017 Benjamin Herrenschmidt, IBM Corporation.
 */

#define pr_fmt(fmt) "xive-kvm: " fmt

#include <linux/kernel.h>
#include <linux/kvm_host.h>
#include <linux/err.h>
#include <linux/gfp.h>
#include <linux/spinlock.h>
#include <linux/delay.h>
#include <linux/percpu.h>
#include <linux/cpumask.h>
#include <linux/uaccess.h>
#include <asm/kvm_book3s.h>
#include <asm/kvm_ppc.h>
#include <asm/hvcall.h>
#include <asm/xics.h>
#include <asm/xive.h>
#include <asm/xive-regs.h>
#include <asm/debug.h>
#include <asm/debugfs.h>
#include <asm/time.h>
#include <asm/opal.h>

#include <linux/debugfs.h>
#include <linux/seq_file.h>

#include "book3s_xive.h"


/*
 * Virtual mode variants of the hcalls for use on radix/radix
 * with AIL. They require the VCPU's VP to be "pushed".
 *
 * We still instantiate them here because we use some of the
 * generated utility functions as well in this file.
 */
#define XIVE_RUNTIME_CHECKS
#define X_PFX xive_vm_
#define X_STATIC static
#define X_STAT_PFX stat_vm_
#define __x_tima		xive_tima
#define __x_eoi_page(xd)	((void __iomem *)((xd)->eoi_mmio))
#define __x_trig_page(xd)	((void __iomem *)((xd)->trig_mmio))
#define __x_writeb	__raw_writeb
#define __x_readw	__raw_readw
#define __x_readq	__raw_readq
#define __x_writeq	__raw_writeq

#include "book3s_xive_template.c"

/*
 * We leave a gap of a couple of interrupts in the queue to
 * account for the IPI and additional safety guard.
 */
#define XIVE_Q_GAP	2

/*
 * Push a vcpu's context to the XIVE on guest entry.
 * This assumes we are in virtual mode (MMU on)
 */
void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu)
{
	void __iomem *tima = local_paca->kvm_hstate.xive_tima_virt;
	u64 pq;

	/*
	 * Nothing to do if the platform doesn't have a XIVE
	 * or this vCPU doesn't have its own XIVE context
	 * (e.g. because it's not using an in-kernel interrupt controller).
	 */
	if (!tima || !vcpu->arch.xive_cam_word)
		return;

	eieio();
	__raw_writeq(vcpu->arch.xive_saved_state.w01, tima + TM_QW1_OS);
	__raw_writel(vcpu->arch.xive_cam_word, tima + TM_QW1_OS + TM_WORD2);
	vcpu->arch.xive_pushed = 1;
	eieio();

	/*
	 * We clear the irq_pending flag. There is a small chance of a
	 * race vs. the escalation interrupt happening on another
	 * processor setting it again, but the only consequence is to
	 * cause a spurious wakeup on the next H_CEDE, which is not an
	 * issue.
	 */
	vcpu->arch.irq_pending = 0;

	/*
	 * In single escalation mode, if the escalation interrupt is
	 * on, we mask it.
	 */
	if (vcpu->arch.xive_esc_on) {
		pq = __raw_readq((void __iomem *)(vcpu->arch.xive_esc_vaddr +
						  XIVE_ESB_SET_PQ_01));
		mb();

		/*
		 * We have a possible subtle race here: The escalation
		 * interrupt might have fired and be on its way to the
		 * host queue while we mask it, and if we unmask it
		 * early enough (re-cede right away), there is a
		 * theoretical possibility that it fires again, thus
		 * landing in the target queue more than once which is
		 * a big no-no.
		 *
		 * Fortunately, solving this is rather easy. If the
		 * above load setting PQ to 01 returns a previous
		 * value where P is set, then we know the escalation
		 * interrupt is somewhere on its way to the host. In
		 * that case we simply don't clear the xive_esc_on
		 * flag below.
		 * It will eventually be cleared by the
		 * handler for the escalation interrupt.
		 *
		 * Then, when doing a cede, we check that flag again
		 * before re-enabling the escalation interrupt, and if
		 * set, we abort the cede.
		 */
		if (!(pq & XIVE_ESB_VAL_P))
			/* Now P is 0, we can clear the flag */
			vcpu->arch.xive_esc_on = 0;
	}
}
EXPORT_SYMBOL_GPL(kvmppc_xive_push_vcpu);

/*
 * Pull a vcpu's context from the XIVE on guest exit.
 * This assumes we are in virtual mode (MMU on)
 */
void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu)
{
	void __iomem *tima = local_paca->kvm_hstate.xive_tima_virt;

	if (!vcpu->arch.xive_pushed)
		return;

	/*
	 * Should not have been pushed if there is no tima
	 */
	if (WARN_ON(!tima))
		return;

	eieio();
	/* First load to pull the context, we ignore the value */
	__raw_readl(tima + TM_SPC_PULL_OS_CTX);
	/* Second load to recover the context state (Words 0 and 1) */
	vcpu->arch.xive_saved_state.w01 = __raw_readq(tima + TM_QW1_OS);

	/* Fixup some of the state for the next load */
	vcpu->arch.xive_saved_state.lsmfb = 0;
	vcpu->arch.xive_saved_state.ack = 0xff;
	vcpu->arch.xive_pushed = 0;
	eieio();
}
EXPORT_SYMBOL_GPL(kvmppc_xive_pull_vcpu);

void kvmppc_xive_rearm_escalation(struct kvm_vcpu *vcpu)
{
	void __iomem *esc_vaddr = (void __iomem *)vcpu->arch.xive_esc_vaddr;

	if (!esc_vaddr)
		return;

	/* we are using XIVE with single escalation */

	if (vcpu->arch.xive_esc_on) {
		/*
		 * If we still have a pending escalation, abort the cede,
		 * and we must set PQ to 10 rather than 00 so that we don't
		 * potentially end up with two entries for the escalation
		 * interrupt in the XIVE interrupt queue. In that case
		 * we also don't want to set xive_esc_on to 1 here in
		 * case we race with xive_esc_irq().
		 */
		vcpu->arch.ceded = 0;
		/*
		 * The escalation interrupts are special as we don't EOI them.
		 * There is no need to use the load-after-store ordering offset
		 * to set PQ to 10 as we won't use StoreEOI.
		 */
		__raw_readq(esc_vaddr + XIVE_ESB_SET_PQ_10);
	} else {
		vcpu->arch.xive_esc_on = true;
		mb();
		__raw_readq(esc_vaddr + XIVE_ESB_SET_PQ_00);
	}
	mb();
}
EXPORT_SYMBOL_GPL(kvmppc_xive_rearm_escalation);

/*
 * This is a simple trigger for a generic XIVE IRQ. This must
 * only be called for interrupts that support a trigger page
 */
static bool xive_irq_trigger(struct xive_irq_data *xd)
{
	/* This should be only for MSIs */
	if (WARN_ON(xd->flags & XIVE_IRQ_FLAG_LSI))
		return false;

	/* Those interrupts should always have a trigger page */
	if (WARN_ON(!xd->trig_mmio))
		return false;

	out_be64(xd->trig_mmio, 0);

	return true;
}

static irqreturn_t xive_esc_irq(int irq, void *data)
{
	struct kvm_vcpu *vcpu = data;

	vcpu->arch.irq_pending = 1;
	smp_mb();
	if (vcpu->arch.ceded)
		kvmppc_fast_vcpu_kick(vcpu);

	/* Since we have the no-EOI flag, the interrupt is effectively
	 * disabled now. Clearing xive_esc_on means we won't bother
	 * doing so on the next entry.
	 *
	 * This also allows the entry code to know that if a PQ combination
	 * of 10 is observed while xive_esc_on is true, it means the queue
	 * contains an unprocessed escalation interrupt. We don't make use of
	 * that knowledge today but might (see comment in book3s_hv_rmhandlers.S)
	 */
	vcpu->arch.xive_esc_on = false;

	/* This orders xive_esc_on = false vs. subsequent stale_p = true */
	smp_wmb();	/* goes with smp_mb() in cleanup_single_escalation */

	return IRQ_HANDLED;
}
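
/*
 * Map and request the escalation interrupt for the given priority
 * queue of a vcpu. In single escalation mode, the ESB of the (single)
 * escalation is set to PQ=01, its addresses are cached in the vcpu
 * struct for the assembly entry/exit code, and it is flagged
 * XIVE_IRQ_FLAG_NO_EOI so it stays effectively masked after it fires
 * once.
 */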
int kvmppc_xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio,
				  bool single_escalation)
{
	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
	struct xive_q *q = &xc->queues[prio];
	char *name = NULL;
	int rc;

	/* Already there ? */
	if (xc->esc_virq[prio])
		return 0;

	/* Hook up the escalation interrupt */
	xc->esc_virq[prio] = irq_create_mapping(NULL, q->esc_irq);
	if (!xc->esc_virq[prio]) {
		pr_err("Failed to map escalation interrupt for queue %d of VCPU %d\n",
		       prio, xc->server_num);
		return -EIO;
	}

	if (single_escalation)
		name = kasprintf(GFP_KERNEL, "kvm-%d-%d",
				 vcpu->kvm->arch.lpid, xc->server_num);
	else
		name = kasprintf(GFP_KERNEL, "kvm-%d-%d-%d",
				 vcpu->kvm->arch.lpid, xc->server_num, prio);
	if (!name) {
		pr_err("Failed to allocate escalation irq name for queue %d of VCPU %d\n",
		       prio, xc->server_num);
		rc = -ENOMEM;
		goto error;
	}

	pr_devel("Escalation %s irq %d (prio %d)\n", name, xc->esc_virq[prio], prio);

	rc = request_irq(xc->esc_virq[prio], xive_esc_irq,
			 IRQF_NO_THREAD, name, vcpu);
	if (rc) {
		pr_err("Failed to request escalation interrupt for queue %d of VCPU %d\n",
		       prio, xc->server_num);
		goto error;
	}
	xc->esc_virq_names[prio] = name;

	/* In single escalation mode, we grab the ESB MMIO of the
	 * interrupt and mask it. Also populate the VCPU v/raddr
	 * of the ESB page for use by asm entry/exit code. Finally
	 * set the XIVE_IRQ_FLAG_NO_EOI flag which will prevent the
	 * core code from performing an EOI on the escalation
	 * interrupt, thus leaving it effectively masked after
	 * it fires once.
	 */
	if (single_escalation) {
		struct irq_data *d = irq_get_irq_data(xc->esc_virq[prio]);
		struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);

		xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_01);
		vcpu->arch.xive_esc_raddr = xd->eoi_page;
		vcpu->arch.xive_esc_vaddr = (__force u64)xd->eoi_mmio;
		xd->flags |= XIVE_IRQ_FLAG_NO_EOI;
	}

	return 0;
error:
	irq_dispose_mapping(xc->esc_virq[prio]);
	xc->esc_virq[prio] = 0;
	kfree(name);
	return rc;
}

static int xive_provision_queue(struct kvm_vcpu *vcpu, u8 prio)
{
	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
	struct kvmppc_xive *xive = xc->xive;
	struct xive_q *q = &xc->queues[prio];
	void *qpage;
	int rc;

	if (WARN_ON(q->qpage))
		return 0;

	/* Allocate the queue and retrieve infos on current node for now */
	qpage = (__be32 *)__get_free_pages(GFP_KERNEL, xive->q_page_order);
	if (!qpage) {
		pr_err("Failed to allocate queue %d for VCPU %d\n",
		       prio, xc->server_num);
		return -ENOMEM;
	}
	memset(qpage, 0, 1 << xive->q_order);

	/*
	 * Reconfigure the queue. This will set q->qpage only once the
	 * queue is fully configured.
	 * This is a requirement for prio 0
	 * as we will stop doing EOIs for every IPI as soon as we observe
	 * qpage being non-NULL, and instead will only EOI when we receive
	 * corresponding queue 0 entries
	 */
	rc = xive_native_configure_queue(xc->vp_id, q, prio, qpage,
					 xive->q_order, true);
	if (rc)
		pr_err("Failed to configure queue %d for VCPU %d\n",
		       prio, xc->server_num);
	return rc;
}

/* Called with xive->lock held */
static int xive_check_provisioning(struct kvm *kvm, u8 prio)
{
	struct kvmppc_xive *xive = kvm->arch.xive;
	struct kvm_vcpu *vcpu;
	int i, rc;

	lockdep_assert_held(&xive->lock);

	/* Already provisioned ? */
	if (xive->qmap & (1 << prio))
		return 0;

	pr_devel("Provisioning prio... %d\n", prio);

	/* Provision each VCPU and enable escalations if needed */
	kvm_for_each_vcpu(i, vcpu, kvm) {
		if (!vcpu->arch.xive_vcpu)
			continue;
		rc = xive_provision_queue(vcpu, prio);
		if (rc == 0 && !xive->single_escalation)
			kvmppc_xive_attach_escalation(vcpu, prio,
						      xive->single_escalation);
		if (rc)
			return rc;
	}

	/* Order previous stores and mark it as provisioned */
	mb();
	xive->qmap |= (1 << prio);
	return 0;
}

static void xive_inc_q_pending(struct kvm *kvm, u32 server, u8 prio)
{
	struct kvm_vcpu *vcpu;
	struct kvmppc_xive_vcpu *xc;
	struct xive_q *q;

	/* Locate target server */
	vcpu = kvmppc_xive_find_server(kvm, server);
	if (!vcpu) {
		pr_warn("%s: Can't find server %d\n", __func__, server);
		return;
	}
	xc = vcpu->arch.xive_vcpu;
	if (WARN_ON(!xc))
		return;

	q = &xc->queues[prio];
	atomic_inc(&q->pending_count);
}

static int xive_try_pick_queue(struct kvm_vcpu *vcpu, u8 prio)
{
	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
	struct xive_q *q;
	u32 max;

	if (WARN_ON(!xc))
		return -ENXIO;
	if (!xc->valid)
		return -ENXIO;

	q = &xc->queues[prio];
	if (WARN_ON(!q->qpage))
		return -ENXIO;

	/* Calculate max number of interrupts in that queue. */
	max = (q->msk + 1) - XIVE_Q_GAP;
	return atomic_add_unless(&q->count, 1, max) ? 0 : -EBUSY;
}

int kvmppc_xive_select_target(struct kvm *kvm, u32 *server, u8 prio)
{
	struct kvm_vcpu *vcpu;
	int i, rc;

	/* Locate target server */
	vcpu = kvmppc_xive_find_server(kvm, *server);
	if (!vcpu) {
		pr_devel("Can't find server %d\n", *server);
		return -EINVAL;
	}

	pr_devel("Finding irq target on 0x%x/%d...\n", *server, prio);

	/* Try pick it */
	rc = xive_try_pick_queue(vcpu, prio);
	if (rc == 0)
		return rc;

	pr_devel(" .. failed, looking up candidate...\n");

	/* Failed, pick another VCPU */
	kvm_for_each_vcpu(i, vcpu, kvm) {
		if (!vcpu->arch.xive_vcpu)
			continue;
		rc = xive_try_pick_queue(vcpu, prio);
		if (rc == 0) {
			*server = vcpu->arch.xive_vcpu->server_num;
			pr_devel(" found on 0x%x/%d\n", *server, prio);
			return rc;
		}
	}
	pr_devel(" no available target !\n");

	/* No available target ! */
	return -EBUSY;
}
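
/*
 * Mask a source: take the source block lock (retrying while an H_EOI
 * is in progress), set the guest priority to MASKED, switch the ESB to
 * PQ=10 while recording the previous P/Q bits in old_p/old_q, and sync
 * the source so any in-flight trigger has reached its queue. Returns
 * the previous guest priority so the caller can restore it on unmask.
 */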
static u8 xive_lock_and_mask(struct kvmppc_xive *xive,
			     struct kvmppc_xive_src_block *sb,
			     struct kvmppc_xive_irq_state *state)
{
	struct xive_irq_data *xd;
	u32 hw_num;
	u8 old_prio;
	u64 val;

	/*
	 * Take the lock, set masked, try again if racing
	 * with H_EOI
	 */
	for (;;) {
		arch_spin_lock(&sb->lock);
		old_prio = state->guest_priority;
		state->guest_priority = MASKED;
		mb();
		if (!state->in_eoi)
			break;
		state->guest_priority = old_prio;
		arch_spin_unlock(&sb->lock);
	}

	/* No change ? Bail */
	if (old_prio == MASKED)
		return old_prio;

	/* Get the right irq */
	kvmppc_xive_select_irq(state, &hw_num, &xd);

	/* Set PQ to 10, return old P and old Q and remember them */
	val = xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_10);
	state->old_p = !!(val & 2);
	state->old_q = !!(val & 1);

	/*
	 * Synchronize hardware to ensure the queues are updated when
	 * masking
	 */
	xive_native_sync_source(hw_num);

	return old_prio;
}

static void xive_lock_for_unmask(struct kvmppc_xive_src_block *sb,
				 struct kvmppc_xive_irq_state *state)
{
	/*
	 * Take the lock, try again if racing with H_EOI
	 */
	for (;;) {
		arch_spin_lock(&sb->lock);
		if (!state->in_eoi)
			break;
		arch_spin_unlock(&sb->lock);
	}
}

static void xive_finish_unmask(struct kvmppc_xive *xive,
			       struct kvmppc_xive_src_block *sb,
			       struct kvmppc_xive_irq_state *state,
			       u8 prio)
{
	struct xive_irq_data *xd;
	u32 hw_num;

	/* If we aren't changing a thing, move on */
	if (state->guest_priority != MASKED)
		goto bail;

	/* Get the right irq */
	kvmppc_xive_select_irq(state, &hw_num, &xd);

	/* Old Q set, set PQ to 11 */
	if (state->old_q)
		xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_11);

	/*
	 * If not old P, then perform an "effective" EOI
	 * on the source. This will handle the cases where
	 * FW EOI is needed.
	 */
	if (!state->old_p)
		xive_vm_source_eoi(hw_num, xd);

	/* Synchronize ordering and mark unmasked */
	mb();
bail:
	state->guest_priority = prio;
}

/*
 * Target an interrupt to a given server/prio, this will fallback
 * to another server if necessary and perform the HW targeting
 * updates as needed
 *
 * NOTE: Must be called with the state lock held
 */
static int xive_target_interrupt(struct kvm *kvm,
				 struct kvmppc_xive_irq_state *state,
				 u32 server, u8 prio)
{
	struct kvmppc_xive *xive = kvm->arch.xive;
	u32 hw_num;
	int rc;

	/*
	 * This will return a tentative server and actual
	 * priority. The count for that new target will have
	 * already been incremented.
	 */
	rc = kvmppc_xive_select_target(kvm, &server, prio);

	/*
	 * We failed to find a target ? Not much we can do
	 * at least until we support the GIQ.
	 */
	if (rc)
		return rc;

	/*
	 * Increment the old queue pending count if there
	 * was one so that the old queue count gets adjusted later
	 * when observed to be empty.
	 */
	if (state->act_priority != MASKED)
		xive_inc_q_pending(kvm,
				   state->act_server,
				   state->act_priority);
	/*
	 * Update state and HW
	 */
	state->act_priority = prio;
	state->act_server = server;

	/* Get the right irq */
	kvmppc_xive_select_irq(state, &hw_num, NULL);

	return xive_native_configure_irq(hw_num,
					 kvmppc_xive_vp(xive, server),
					 prio, state->number);
}

/*
 * Targeting rules: In order to avoid losing track of
 * pending interrupts across mask and unmask, which would
 * allow queue overflows, we implement the following rules:
 *
 * - Unless it was never enabled (or we run out of capacity)
 *   an interrupt is always targeted at a valid server/queue
 *   pair even when "masked" by the guest. This pair tends to
 *   be the last one used but it can be changed under some
 *   circumstances. That allows us to separate targeting
 *   from masking, we only handle accounting during (re)targeting,
 *   this also allows us to let an interrupt drain into its target
 *   queue after masking, avoiding complex schemes to remove
 *   interrupts out of remote processor queues.
 *
 * - When masking, we set PQ to 10 and save the previous value
 *   of P and Q.
 *
 * - When unmasking, if saved Q was set, we set PQ to 11
 *   otherwise we leave PQ to the HW state which will be either
 *   10 if nothing happened or 11 if the interrupt fired while
 *   masked. Effectively we are OR'ing the previous Q into the
 *   HW Q.
 *
 *   Then if saved P is clear, we do an effective EOI (Q->P->Trigger)
 *   which will unmask the interrupt and shoot a new one if Q was
 *   set.
 *
 *   Otherwise (saved P is set) we leave PQ unchanged (so 10 or 11,
 *   effectively meaning an H_EOI from the guest is still expected
 *   for that interrupt).
 *
 * - If H_EOI occurs while masked, we clear the saved P.
 *
 * - When changing target, we account on the new target and
 *   increment a separate "pending" counter on the old one.
 *   This pending counter will be used to decrement the old
 *   target's count when its queue has been observed empty.
 */

int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server,
			 u32 priority)
{
	struct kvmppc_xive *xive = kvm->arch.xive;
	struct kvmppc_xive_src_block *sb;
	struct kvmppc_xive_irq_state *state;
	u8 new_act_prio;
	int rc = 0;
	u16 idx;

	if (!xive)
		return -ENODEV;

	pr_devel("set_xive ! irq 0x%x server 0x%x prio %d\n",
		 irq, server, priority);

	/* First, check provisioning of queues */
	if (priority != MASKED) {
		mutex_lock(&xive->lock);
		rc = xive_check_provisioning(xive->kvm,
					     xive_prio_from_guest(priority));
		mutex_unlock(&xive->lock);
	}
	if (rc) {
		pr_devel(" provisioning failure %d !\n", rc);
		return rc;
	}

	sb = kvmppc_xive_find_source(xive, irq, &idx);
	if (!sb)
		return -EINVAL;
	state = &sb->irq_state[idx];

	/*
	 * We first handle masking/unmasking since the locking
	 * might need to be retried due to EOIs, we'll handle
	 * targeting changes later. These functions will return
	 * with the SB lock held.
	 *
	 * xive_lock_and_mask() will also set state->guest_priority
	 * but won't otherwise change other fields of the state.
	 *
	 * xive_lock_for_unmask will not actually unmask, this will
	 * be done later by xive_finish_unmask() once the targeting
	 * has been done, so we don't try to unmask an interrupt
	 * that hasn't yet been targeted.
	 */
	if (priority == MASKED)
		xive_lock_and_mask(xive, sb, state);
	else
		xive_lock_for_unmask(sb, state);


	/*
	 * Then we handle targeting.
	 *
	 * First calculate a new "actual priority"
	 */
	new_act_prio = state->act_priority;
	if (priority != MASKED)
		new_act_prio = xive_prio_from_guest(priority);

	pr_devel(" new_act_prio=%x act_server=%x act_prio=%x\n",
		 new_act_prio, state->act_server, state->act_priority);

	/*
	 * Then check if we actually need to change anything,
	 *
	 * The condition for re-targeting the interrupt is that
	 * we have a valid new priority (new_act_prio is not 0xff)
	 * and either the server or the priority changed.
	 *
	 * Note: If act_priority was ff and the new priority is
	 *       also ff, we don't do anything and leave the interrupt
	 *       untargeted. An attempt of doing an int_on on an
	 *       untargeted interrupt will fail. If that is a problem
	 *       we could initialize interrupts with valid defaults.
	 */

	if (new_act_prio != MASKED &&
	    (state->act_server != server ||
	     state->act_priority != new_act_prio))
		rc = xive_target_interrupt(kvm, state, server, new_act_prio);

	/*
	 * Perform the final unmasking of the interrupt source
	 * if necessary
	 */
	if (priority != MASKED)
		xive_finish_unmask(xive, sb, state, priority);

	/*
	 * Finally, update saved_priority to match. Only int_on/off
	 * set this field to a different value.
	 */
	state->saved_priority = priority;

	arch_spin_unlock(&sb->lock);
	return rc;
}

int kvmppc_xive_get_xive(struct kvm *kvm, u32 irq, u32 *server,
			 u32 *priority)
{
	struct kvmppc_xive *xive = kvm->arch.xive;
	struct kvmppc_xive_src_block *sb;
	struct kvmppc_xive_irq_state *state;
	u16 idx;

	if (!xive)
		return -ENODEV;

	sb = kvmppc_xive_find_source(xive, irq, &idx);
	if (!sb)
		return -EINVAL;
	state = &sb->irq_state[idx];
	arch_spin_lock(&sb->lock);
	*server = state->act_server;
	*priority = state->guest_priority;
	arch_spin_unlock(&sb->lock);

	return 0;
}

int kvmppc_xive_int_on(struct kvm *kvm, u32 irq)
{
	struct kvmppc_xive *xive = kvm->arch.xive;
	struct kvmppc_xive_src_block *sb;
	struct kvmppc_xive_irq_state *state;
	u16 idx;

	if (!xive)
		return -ENODEV;

	sb = kvmppc_xive_find_source(xive, irq, &idx);
	if (!sb)
		return -EINVAL;
	state = &sb->irq_state[idx];

	pr_devel("int_on(irq=0x%x)\n", irq);

	/*
	 * Check if interrupt was not targeted
	 */
	if (state->act_priority == MASKED) {
		pr_devel("int_on on untargeted interrupt\n");
		return -EINVAL;
	}

	/* If saved_priority is 0xff, do nothing */
	if (state->saved_priority == MASKED)
		return 0;

	/*
	 * Lock and unmask it.
	 */
	xive_lock_for_unmask(sb, state);
	xive_finish_unmask(xive, sb, state, state->saved_priority);
	arch_spin_unlock(&sb->lock);

	return 0;
}

int kvmppc_xive_int_off(struct kvm *kvm, u32 irq)
{
	struct kvmppc_xive *xive = kvm->arch.xive;
	struct kvmppc_xive_src_block *sb;
	struct kvmppc_xive_irq_state *state;
	u16 idx;

	if (!xive)
		return -ENODEV;

	sb = kvmppc_xive_find_source(xive, irq, &idx);
	if (!sb)
		return -EINVAL;
	state = &sb->irq_state[idx];

	pr_devel("int_off(irq=0x%x)\n", irq);

	/*
	 * Lock and mask
	 */
	state->saved_priority = xive_lock_and_mask(xive, sb, state);
	arch_spin_unlock(&sb->lock);

	return 0;
}

static bool xive_restore_pending_irq(struct kvmppc_xive *xive, u32 irq)
{
	struct kvmppc_xive_src_block *sb;
	struct kvmppc_xive_irq_state *state;
	u16 idx;

	sb = kvmppc_xive_find_source(xive, irq, &idx);
	if (!sb)
		return false;
	state = &sb->irq_state[idx];
	if (!state->valid)
		return false;

	/*
	 * Trigger the IPI. This assumes we never restore a pass-through
	 * interrupt which should be safe enough
	 */
	xive_irq_trigger(&state->ipi_data);

	return true;
}

u64 kvmppc_xive_get_icp(struct kvm_vcpu *vcpu)
{
	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;

	if (!xc)
		return 0;

	/* Return the per-cpu state for state saving/migration */
	return (u64)xc->cppr << KVM_REG_PPC_ICP_CPPR_SHIFT |
	       (u64)xc->mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT |
	       (u64)0xff << KVM_REG_PPC_ICP_PPRI_SHIFT;
}

int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval)
{
	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
	struct kvmppc_xive *xive = vcpu->kvm->arch.xive;
	u8 cppr, mfrr;
	u32 xisr;

	if (!xc || !xive)
		return -ENOENT;

	/* Grab individual state fields. We don't use pending_pri */
	cppr = icpval >> KVM_REG_PPC_ICP_CPPR_SHIFT;
	xisr = (icpval >> KVM_REG_PPC_ICP_XISR_SHIFT) &
		KVM_REG_PPC_ICP_XISR_MASK;
	mfrr = icpval >> KVM_REG_PPC_ICP_MFRR_SHIFT;

	pr_devel("set_icp vcpu %d cppr=0x%x mfrr=0x%x xisr=0x%x\n",
		 xc->server_num, cppr, mfrr, xisr);

	/*
	 * We can't update the state of a "pushed" VCPU, but that
	 * shouldn't happen because the vcpu->mutex makes running a
	 * vcpu mutually exclusive with doing one_reg get/set on it.
	 */
	if (WARN_ON(vcpu->arch.xive_pushed))
		return -EIO;

	/* Update VCPU HW saved state */
	vcpu->arch.xive_saved_state.cppr = cppr;
	xc->hw_cppr = xc->cppr = cppr;

	/*
	 * Update MFRR state. If it's not 0xff, we mark the VCPU as
	 * having a pending MFRR change, which will re-evaluate the
	 * target. The VCPU will thus potentially get a spurious
	 * interrupt but that's not a big deal.
	 */
	xc->mfrr = mfrr;
	if (mfrr < cppr)
		xive_irq_trigger(&xc->vp_ipi_data);

	/*
	 * Now saved XIRR is "interesting". It means there's something in
	 * the legacy "1 element" queue... for an IPI we simply ignore it,
	 * as the MFRR restore will handle that. For anything else we need
	 * to force a resend of the source.
	 * However the source may not have been setup yet. If that's the
	 * case, we keep that info and increment a counter in the xive to
	 * tell subsequent xive_set_source() to go look.
	 */
	if (xisr > XICS_IPI && !xive_restore_pending_irq(xive, xisr)) {
		xc->delayed_irq = xisr;
		xive->delayed_irqs++;
		pr_devel(" xisr restore delayed\n");
	}

	return 0;
}
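
/*
 * Switch a guest interrupt from its internal IPI backing to a
 * passed-through host interrupt: the source is flagged as going to a
 * vcpu, the IPI is hard-masked, and the HW interrupt is configured to
 * the IPI's existing target so a simple EOI re-enables it.
 */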
int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq,
			   struct irq_desc *host_desc)
{
	struct kvmppc_xive *xive = kvm->arch.xive;
	struct kvmppc_xive_src_block *sb;
	struct kvmppc_xive_irq_state *state;
	struct irq_data *host_data = irq_desc_get_irq_data(host_desc);
	unsigned int host_irq = irq_desc_get_irq(host_desc);
	unsigned int hw_irq = (unsigned int)irqd_to_hwirq(host_data);
	u16 idx;
	u8 prio;
	int rc;

	if (!xive)
		return -ENODEV;

	pr_devel("set_mapped girq 0x%lx host HW irq 0x%x...\n", guest_irq, hw_irq);

	sb = kvmppc_xive_find_source(xive, guest_irq, &idx);
	if (!sb)
		return -EINVAL;
	state = &sb->irq_state[idx];

	/*
	 * Mark the passed-through interrupt as going to a VCPU,
	 * this will prevent further EOIs and similar operations
	 * from the XIVE code. It will also mask the interrupt
	 * to either PQ=10 or 11 state, the latter if the interrupt
	 * is pending. This will allow us to unmask or retrigger it
	 * after routing it to the guest with a simple EOI.
	 *
	 * The "state" argument is a "token", all it needs is to be
	 * non-NULL to switch to passed-through or NULL for the
	 * other way around. We may not yet have an actual VCPU
	 * target here and we don't really care.
	 */
	rc = irq_set_vcpu_affinity(host_irq, state);
	if (rc) {
		pr_err("Failed to set VCPU affinity for irq %d\n", host_irq);
		return rc;
	}

	/*
	 * Mask and read state of IPI. We need to know if its P bit
	 * is set as that means it's potentially already using a
	 * queue entry in the target
	 */
	prio = xive_lock_and_mask(xive, sb, state);
	pr_devel(" old IPI prio %02x P:%d Q:%d\n", prio,
		 state->old_p, state->old_q);

	/* Turn the IPI hard off */
	xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);

	/*
	 * Reset ESB guest mapping. Needed when ESB pages are exposed
	 * to the guest in XIVE native mode
	 */
	if (xive->ops && xive->ops->reset_mapped)
		xive->ops->reset_mapped(kvm, guest_irq);

	/* Grab info about irq */
	state->pt_number = hw_irq;
	state->pt_data = irq_data_get_irq_handler_data(host_data);

	/*
	 * Configure the IRQ to match the existing configuration of
	 * the IPI if it was already targeted. Otherwise this will
	 * mask the interrupt in a lossy way (act_priority is 0xff)
	 * which is fine for a never started interrupt.
	 */
	xive_native_configure_irq(hw_irq,
				  kvmppc_xive_vp(xive, state->act_server),
				  state->act_priority, state->number);

	/*
	 * We do an EOI to enable the interrupt (and retrigger if needed)
	 * if the guest has the interrupt unmasked and the P bit was *not*
	 * set in the IPI.
	 * If it was set, we know a slot may still be in
	 * use in the target queue thus we have to wait for a guest
	 * originated EOI
	 */
	if (prio != MASKED && !state->old_p)
		xive_vm_source_eoi(hw_irq, state->pt_data);

	/* Clear old_p/old_q as they are no longer relevant */
	state->old_p = state->old_q = false;

	/* Restore guest prio (unlocks EOI) */
	mb();
	state->guest_priority = prio;
	arch_spin_unlock(&sb->lock);

	return 0;
}
EXPORT_SYMBOL_GPL(kvmppc_xive_set_mapped);

int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq,
			   struct irq_desc *host_desc)
{
	struct kvmppc_xive *xive = kvm->arch.xive;
	struct kvmppc_xive_src_block *sb;
	struct kvmppc_xive_irq_state *state;
	unsigned int host_irq = irq_desc_get_irq(host_desc);
	u16 idx;
	u8 prio;
	int rc;

	if (!xive)
		return -ENODEV;

	pr_devel("clr_mapped girq 0x%lx...\n", guest_irq);

	sb = kvmppc_xive_find_source(xive, guest_irq, &idx);
	if (!sb)
		return -EINVAL;
	state = &sb->irq_state[idx];

	/*
	 * Mask and read state of IRQ. We need to know if its P bit
	 * is set as that means it's potentially already using a
	 * queue entry in the target
	 */
	prio = xive_lock_and_mask(xive, sb, state);
	pr_devel(" old IRQ prio %02x P:%d Q:%d\n", prio,
		 state->old_p, state->old_q);

	/*
	 * If old_p is set, the interrupt is pending, we switch it to
	 * PQ=11. This will force a resend in the host so the interrupt
	 * isn't lost to whatever host driver may pick it up
	 */
	if (state->old_p)
		xive_vm_esb_load(state->pt_data, XIVE_ESB_SET_PQ_11);

	/* Release the passed-through interrupt to the host */
	rc = irq_set_vcpu_affinity(host_irq, NULL);
	if (rc) {
		pr_err("Failed to clr VCPU affinity for irq %d\n", host_irq);
		return rc;
	}

	/* Forget about the IRQ */
	state->pt_number = 0;
	state->pt_data = NULL;

	/*
	 * Reset ESB guest mapping. Needed when ESB pages are exposed
	 * to the guest in XIVE native mode
	 */
	if (xive->ops && xive->ops->reset_mapped) {
		xive->ops->reset_mapped(kvm, guest_irq);
	}

	/* Reconfigure the IPI */
	xive_native_configure_irq(state->ipi_number,
				  kvmppc_xive_vp(xive, state->act_server),
				  state->act_priority, state->number);

	/*
	 * If old_p is set (we have a queue entry potentially
	 * occupied) or the interrupt is masked, we set the IPI
	 * to PQ=10 state. Otherwise we just re-enable it (PQ=00).
	 */
	if (prio == MASKED || state->old_p)
		xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_10);
	else
		xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_00);

	/* Restore guest prio (unlocks EOI) */
	mb();
	state->guest_priority = prio;
	arch_spin_unlock(&sb->lock);

	return 0;
}
EXPORT_SYMBOL_GPL(kvmppc_xive_clr_mapped);
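
/*
 * De-target from this vcpu every interrupt still routed to it, then
 * mask the vcpu's escalation interrupt and clear the escalation ESB
 * pointers used by the guest entry/exit code.
 */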
void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu)
{
	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
	struct kvm *kvm = vcpu->kvm;
	struct kvmppc_xive *xive = kvm->arch.xive;
	int i, j;

	for (i = 0; i <= xive->max_sbid; i++) {
		struct kvmppc_xive_src_block *sb = xive->src_blocks[i];

		if (!sb)
			continue;
		for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++) {
			struct kvmppc_xive_irq_state *state = &sb->irq_state[j];

			if (!state->valid)
				continue;
			if (state->act_priority == MASKED)
				continue;
			if (state->act_server != xc->server_num)
				continue;

			/* Clean it up */
			arch_spin_lock(&sb->lock);
			state->act_priority = MASKED;
			xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
			xive_native_configure_irq(state->ipi_number, 0, MASKED, 0);
			if (state->pt_number) {
				xive_vm_esb_load(state->pt_data, XIVE_ESB_SET_PQ_01);
				xive_native_configure_irq(state->pt_number, 0, MASKED, 0);
			}
			arch_spin_unlock(&sb->lock);
		}
	}

	/* Disable vcpu's escalation interrupt */
	if (vcpu->arch.xive_esc_on) {
		__raw_readq((void __iomem *)(vcpu->arch.xive_esc_vaddr +
					     XIVE_ESB_SET_PQ_01));
		vcpu->arch.xive_esc_on = false;
	}

	/*
	 * Clear pointers to escalation interrupt ESB.
	 * This is safe because the vcpu->mutex is held, preventing
	 * any other CPU from concurrently executing a KVM_RUN ioctl.
	 */
	vcpu->arch.xive_esc_vaddr = 0;
	vcpu->arch.xive_esc_raddr = 0;
}

/*
 * In single escalation mode, the escalation interrupt is marked so
 * that EOI doesn't re-enable it, but just sets the stale_p flag to
 * indicate that the P bit has already been dealt with. However, the
 * assembly code that enters the guest sets PQ to 00 without clearing
 * stale_p (because it has no easy way to address it). Hence we have
 * to adjust stale_p before shutting down the interrupt.
 */
void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu,
				    struct kvmppc_xive_vcpu *xc, int irq)
{
	struct irq_data *d = irq_get_irq_data(irq);
	struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);

	/*
	 * This slightly odd sequence gives the right result
	 * (i.e. stale_p set if xive_esc_on is false) even if
	 * we race with xive_esc_irq() and xive_irq_eoi().
	 */
	xd->stale_p = false;
	smp_mb();		/* paired with smp_wmb in xive_esc_irq */
	if (!vcpu->arch.xive_esc_on)
		xd->stale_p = true;
}
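
/*
 * Tear down a vcpu's XIVE state: route interrupts away from it, free
 * its escalation interrupts, disable the VP, and release its queues
 * and internal IPI.
 */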
void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
{
	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
	struct kvmppc_xive *xive = vcpu->kvm->arch.xive;
	int i;

	if (!kvmppc_xics_enabled(vcpu))
		return;

	if (!xc)
		return;

	pr_devel("cleanup_vcpu(cpu=%d)\n", xc->server_num);

	/* Ensure no interrupt is still routed to that VP */
	xc->valid = false;
	kvmppc_xive_disable_vcpu_interrupts(vcpu);

	/* Mask the VP IPI */
	xive_vm_esb_load(&xc->vp_ipi_data, XIVE_ESB_SET_PQ_01);

	/* Free escalations */
	for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
		if (xc->esc_virq[i]) {
			if (xc->xive->single_escalation)
				xive_cleanup_single_escalation(vcpu, xc,
							xc->esc_virq[i]);
			free_irq(xc->esc_virq[i], vcpu);
			irq_dispose_mapping(xc->esc_virq[i]);
			kfree(xc->esc_virq_names[i]);
		}
	}

	/* Disable the VP */
	xive_native_disable_vp(xc->vp_id);

	/* Clear the cam word so guest entry won't try to push context */
	vcpu->arch.xive_cam_word = 0;

	/* Free the queues */
	for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
		struct xive_q *q = &xc->queues[i];

		xive_native_disable_queue(xc->vp_id, q, i);
		if (q->qpage) {
			free_pages((unsigned long)q->qpage,
				   xive->q_page_order);
			q->qpage = NULL;
		}
	}

	/* Free the IPI */
	if (xc->vp_ipi) {
		xive_cleanup_irq_data(&xc->vp_ipi_data);
		xive_native_free_irq(xc->vp_ipi);
	}
	/* Free the VP */
	kfree(xc);

	/* Cleanup the vcpu */
	vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT;
	vcpu->arch.xive_vcpu = NULL;
}

static bool kvmppc_xive_vcpu_id_valid(struct kvmppc_xive *xive, u32 cpu)
{
	/* We have a block of xive->nr_servers VPs. We just need to check
	 * packed vCPU ids are below that.
	 */
	return kvmppc_pack_vcpu_id(xive->kvm, cpu) < xive->nr_servers;
}
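
/*
 * Compute the VP id for a new vcpu, allocating the VP block on first
 * use. Fails if the packed vcpu id is out of range or if the VP is
 * already taken by another vcpu.
 */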
int kvmppc_xive_compute_vp_id(struct kvmppc_xive *xive, u32 cpu, u32 *vp)
{
	u32 vp_id;

	if (!kvmppc_xive_vcpu_id_valid(xive, cpu)) {
		pr_devel("Out of bounds !\n");
		return -EINVAL;
	}

	if (xive->vp_base == XIVE_INVALID_VP) {
		xive->vp_base = xive_native_alloc_vp_block(xive->nr_servers);
		pr_devel("VP_Base=%x nr_servers=%d\n", xive->vp_base, xive->nr_servers);

		if (xive->vp_base == XIVE_INVALID_VP)
			return -ENOSPC;
	}

	vp_id = kvmppc_xive_vp(xive, cpu);
	if (kvmppc_xive_vp_in_use(xive->kvm, vp_id)) {
		pr_devel("Duplicate !\n");
		return -EEXIST;
	}

	*vp = vp_id;

	return 0;
}

int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
			     struct kvm_vcpu *vcpu, u32 cpu)
{
	struct kvmppc_xive *xive = dev->private;
	struct kvmppc_xive_vcpu *xc;
	int i, r = -EBUSY;
	u32 vp_id;

	pr_devel("connect_vcpu(cpu=%d)\n", cpu);

	if (dev->ops != &kvm_xive_ops) {
		pr_devel("Wrong ops !\n");
		return -EPERM;
	}
	if (xive->kvm != vcpu->kvm)
		return -EPERM;
	if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
		return -EBUSY;

	/* We need to synchronize with queue provisioning */
	mutex_lock(&xive->lock);

	r = kvmppc_xive_compute_vp_id(xive, cpu, &vp_id);
	if (r)
		goto bail;

	xc = kzalloc(sizeof(*xc), GFP_KERNEL);
	if (!xc) {
		r = -ENOMEM;
		goto bail;
	}

	vcpu->arch.xive_vcpu = xc;
	xc->xive = xive;
	xc->vcpu = vcpu;
	xc->server_num = cpu;
	xc->vp_id = vp_id;
	xc->mfrr = 0xff;
	xc->valid = true;

	r = xive_native_get_vp_info(xc->vp_id, &xc->vp_cam, &xc->vp_chip_id);
	if (r)
		goto bail;

	/* Configure VCPU fields for use by assembly push/pull */
	vcpu->arch.xive_saved_state.w01 = cpu_to_be64(0xff000000);
	vcpu->arch.xive_cam_word = cpu_to_be32(xc->vp_cam | TM_QW1W2_VO);

	/* Allocate IPI */
	xc->vp_ipi = xive_native_alloc_irq();
	if (!xc->vp_ipi) {
		pr_err("Failed to allocate xive irq for VCPU IPI\n");
		r = -EIO;
		goto bail;
	}
	pr_devel(" IPI=0x%x\n", xc->vp_ipi);

	r = xive_native_populate_irq_data(xc->vp_ipi, &xc->vp_ipi_data);
	if (r)
		goto bail;

	/*
	 * Enable the VP first as the single escalation mode will
	 * affect escalation interrupts numbering
	 */
	r = xive_native_enable_vp(xc->vp_id, xive->single_escalation);
	if (r) {
		pr_err("Failed to enable VP in OPAL, err %d\n", r);
		goto bail;
	}

	/*
	 * Initialize queues. Initially we set them all for no queueing
	 * and we enable escalation for queue 0 only which we'll use for
	 * our mfrr change notifications. If the VCPU is hot-plugged, we
	 * do handle provisioning however based on the existing "map"
	 * of enabled queues.
	 */
	for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
		struct xive_q *q = &xc->queues[i];

		/* Single escalation, no queue 7 */
		if (i == 7 && xive->single_escalation)
			break;

		/* Is queue already enabled ? Provision it */
		if (xive->qmap & (1 << i)) {
			r = xive_provision_queue(vcpu, i);
			if (r == 0 && !xive->single_escalation)
				kvmppc_xive_attach_escalation(
					vcpu, i, xive->single_escalation);
			if (r)
				goto bail;
		} else {
			r = xive_native_configure_queue(xc->vp_id,
							q, i, NULL, 0, true);
			if (r) {
				pr_err("Failed to configure queue %d for VCPU %d\n",
				       i, cpu);
				goto bail;
			}
		}
	}

	/* If not done above, attach priority 0 escalation */
	r = kvmppc_xive_attach_escalation(vcpu, 0, xive->single_escalation);
	if (r)
		goto bail;

	/* Route the IPI */
	r = xive_native_configure_irq(xc->vp_ipi, xc->vp_id, 0, XICS_IPI);
	if (!r)
		xive_vm_esb_load(&xc->vp_ipi_data, XIVE_ESB_SET_PQ_00);

bail:
	mutex_unlock(&xive->lock);
	if (r) {
		kvmppc_xive_cleanup_vcpu(vcpu);
		return r;
	}

	vcpu->arch.irq_type = KVMPPC_IRQ_XICS;
	return 0;
}

/*
 * Scanning of queues before/after migration save
 */
static void xive_pre_save_set_queued(struct kvmppc_xive *xive, u32 irq)
{
	struct kvmppc_xive_src_block *sb;
	struct kvmppc_xive_irq_state *state;
	u16 idx;

	sb = kvmppc_xive_find_source(xive, irq, &idx);
	if (!sb)
		return;

	state = &sb->irq_state[idx];

	/* Some sanity checking */
	if (!state->valid) {
		pr_err("invalid irq 0x%x in cpu queue!\n", irq);
		return;
	}

	/*
	 * If the interrupt is in a queue it should have P set.
	 * We warn so that it gets reported. A backtrace isn't useful
	 * so no need to use a WARN_ON.
	 */
	if (!state->saved_p)
		pr_err("Interrupt 0x%x is marked in a queue but P not set !\n", irq);

	/* Set flag */
	state->in_queue = true;
}

static void xive_pre_save_mask_irq(struct kvmppc_xive *xive,
				   struct kvmppc_xive_src_block *sb,
				   u32 irq)
{
	struct kvmppc_xive_irq_state *state = &sb->irq_state[irq];

	if (!state->valid)
		return;

	/* Mask and save state, this will also sync HW queues */
	state->saved_scan_prio = xive_lock_and_mask(xive, sb, state);

	/* Transfer P and Q */
	state->saved_p = state->old_p;
	state->saved_q = state->old_q;

	/* Unlock */
	arch_spin_unlock(&sb->lock);
}

static void xive_pre_save_unmask_irq(struct kvmppc_xive *xive,
				     struct kvmppc_xive_src_block *sb,
				     u32 irq)
{
	struct kvmppc_xive_irq_state *state = &sb->irq_state[irq];

	if (!state->valid)
		return;

	/*
	 * Lock / exclude EOI (not technically necessary if the
	 * guest isn't running concurrently). If this becomes a
	 * performance issue we can probably remove the lock.
	 */
	xive_lock_for_unmask(sb, state);

	/* Restore mask/prio if it wasn't masked */
	if (state->saved_scan_prio != MASKED)
		xive_finish_unmask(xive, sb, state, state->saved_scan_prio);

	/* Unlock */
	arch_spin_unlock(&sb->lock);
}
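
/*
 * Walk one event queue without dequeuing anything and flag every
 * non-IPI interrupt found in it as "in_queue" so it gets saved as
 * pending.
 */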
static void xive_pre_save_queue(struct kvmppc_xive *xive, struct xive_q *q)
{
	u32 idx = q->idx;
	u32 toggle = q->toggle;
	u32 irq;

	do {
		irq = __xive_read_eq(q->qpage, q->msk, &idx, &toggle);
		if (irq > XICS_IPI)
			xive_pre_save_set_queued(xive, irq);
	} while (irq);
}

static void xive_pre_save_scan(struct kvmppc_xive *xive)
{
	struct kvm_vcpu *vcpu = NULL;
	int i, j;

	/*
	 * See comment in xive_get_source() about how this
	 * works. Collect a stable state for all interrupts
	 */
	for (i = 0; i <= xive->max_sbid; i++) {
		struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
		if (!sb)
			continue;
		for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++)
			xive_pre_save_mask_irq(xive, sb, j);
	}

	/* Then scan the queues and update the "in_queue" flag */
	kvm_for_each_vcpu(i, vcpu, xive->kvm) {
		struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
		if (!xc)
			continue;
		for (j = 0; j < KVMPPC_XIVE_Q_COUNT; j++) {
			if (xc->queues[j].qpage)
				xive_pre_save_queue(xive, &xc->queues[j]);
		}
	}

	/* Finally restore interrupt states */
	for (i = 0; i <= xive->max_sbid; i++) {
		struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
		if (!sb)
			continue;
		for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++)
			xive_pre_save_unmask_irq(xive, sb, j);
	}
}

static void xive_post_save_scan(struct kvmppc_xive *xive)
{
	u32 i, j;

	/* Clear all the in_queue flags */
	for (i = 0; i <= xive->max_sbid; i++) {
		struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
		if (!sb)
			continue;
		for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++)
			sb->irq_state[j].in_queue = false;
	}

	/* Next get_source() will do a new scan */
	xive->saved_src_count = 0;
}

/*
 * This returns the source configuration and state to user space.
 */
static int xive_get_source(struct kvmppc_xive *xive, long irq, u64 addr)
{
	struct kvmppc_xive_src_block *sb;
	struct kvmppc_xive_irq_state *state;
	u64 __user *ubufp = (u64 __user *) addr;
	u64 val, prio;
	u16 idx;

	sb = kvmppc_xive_find_source(xive, irq, &idx);
	if (!sb)
		return -ENOENT;

	state = &sb->irq_state[idx];

	if (!state->valid)
		return -ENOENT;

	pr_devel("get_source(%ld)...\n", irq);

	/*
	 * So to properly save the state into something that looks like a
	 * XICS migration stream we cannot treat interrupts individually.
	 *
	 * We need, instead, to mask them all (& save their previous PQ state)
	 * to get a stable state in the HW, then sync them to ensure that
	 * any interrupt that had already fired hits its queue, and finally
	 * scan all the queues to collect which interrupts are still present
	 * in the queues, so we can set the "pending" flag on them and
	 * they can be resent on restore.
	 *
	 * So we do it all when the "first" interrupt gets saved, all the
	 * state is collected at that point, the rest of xive_get_source()
	 * will merely collect and convert that state to the expected
	 * userspace bit mask.
	 */
	if (xive->saved_src_count == 0)
		xive_pre_save_scan(xive);
	xive->saved_src_count++;

	/* Convert saved state into something compatible with xics */
	val = state->act_server;
	prio = state->saved_scan_prio;

	if (prio == MASKED) {
		val |= KVM_XICS_MASKED;
		prio = state->saved_priority;
	}
	val |= prio << KVM_XICS_PRIORITY_SHIFT;
	if (state->lsi) {
		val |= KVM_XICS_LEVEL_SENSITIVE;
		if (state->saved_p)
			val |= KVM_XICS_PENDING;
	} else {
		if (state->saved_p)
			val |= KVM_XICS_PRESENTED;

		if (state->saved_q)
			val |= KVM_XICS_QUEUED;

		/*
		 * We mark it pending (which will attempt a re-delivery)
		 * if we are in a queue *or* we were masked and had
		 * Q set which is equivalent to the XICS "masked pending"
		 * state
		 */
		if (state->in_queue || (prio == MASKED && state->saved_q))
			val |= KVM_XICS_PENDING;
	}

	/*
	 * If that was the last interrupt saved, reset the
	 * in_queue flags
	 */
	if (xive->saved_src_count == xive->src_count)
		xive_post_save_scan(xive);

	/* Copy the result to userspace */
	if (put_user(val, ubufp))
		return -EFAULT;

	return 0;
}

struct kvmppc_xive_src_block *kvmppc_xive_create_src_block(
	struct kvmppc_xive *xive, int irq)
{
	struct kvmppc_xive_src_block *sb;
	int i, bid;

	bid = irq >> KVMPPC_XICS_ICS_SHIFT;

	mutex_lock(&xive->lock);

	/* block already exists - somebody else got here first */
	if (xive->src_blocks[bid])
		goto out;

	/* Create the ICS */
	sb = kzalloc(sizeof(*sb), GFP_KERNEL);
	if (!sb)
		goto out;

	sb->id = bid;

	for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
		sb->irq_state[i].number = (bid << KVMPPC_XICS_ICS_SHIFT) | i;
		sb->irq_state[i].eisn = 0;
		sb->irq_state[i].guest_priority = MASKED;
		sb->irq_state[i].saved_priority = MASKED;
		sb->irq_state[i].act_priority = MASKED;
	}
	smp_wmb();
	xive->src_blocks[bid] = sb;

	if (bid > xive->max_sbid)
		xive->max_sbid = bid;

out:
	mutex_unlock(&xive->lock);
	return xive->src_blocks[bid];
}

static bool xive_check_delayed_irq(struct kvmppc_xive *xive, u32 irq)
{
	struct kvm *kvm = xive->kvm;
	struct kvm_vcpu *vcpu = NULL;
	int i;

	kvm_for_each_vcpu(i, vcpu, kvm) {
		struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;

		if (!xc)
			continue;

		if (xc->delayed_irq == irq) {
			xc->delayed_irq = 0;
			xive->delayed_irqs--;
			return true;
		}
	}
	return false;
}

static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr)
{
	struct kvmppc_xive_src_block *sb;
	struct kvmppc_xive_irq_state *state;
	u64 __user *ubufp = (u64 __user *) addr;
	u16 idx;
	u64 val;
	u8 act_prio, guest_prio;
	u32 server;
	int rc = 0;

	if (irq < KVMPPC_XICS_FIRST_IRQ || irq >= KVMPPC_XICS_NR_IRQS)
		return -ENOENT;

	pr_devel("set_source(irq=0x%lx)\n", irq);

	/* Find the source */
	sb = kvmppc_xive_find_source(xive, irq, &idx);
	if (!sb) {
		pr_devel("No source, creating source block...\n");
		sb = kvmppc_xive_create_src_block(xive, irq);
		if (!sb) {
			pr_devel("Failed to create block...\n");
			return -ENOMEM;
		}
	}
	state = &sb->irq_state[idx];

	/* Read user passed data */
	if (get_user(val, ubufp)) {
		pr_devel("fault getting user info !\n");
		return -EFAULT;
	}

	server = val & KVM_XICS_DESTINATION_MASK;
	guest_prio = val >> KVM_XICS_PRIORITY_SHIFT;

	pr_devel(" val=0x%016llx (server=0x%x, guest_prio=%d)\n",
		 val, server, guest_prio);

	/*
	 * If the source doesn't already have an IPI, allocate
	 * one and get the corresponding data
	 */
	if (!state->ipi_number) {
		state->ipi_number = xive_native_alloc_irq();
		if (state->ipi_number == 0) {
			pr_devel("Failed to allocate IPI !\n");
			return -ENOMEM;
		}
		xive_native_populate_irq_data(state->ipi_number, &state->ipi_data);
		pr_devel(" src_ipi=0x%x\n", state->ipi_number);
	}

	/*
	 * We use lock_and_mask() to set us in the right masked
	 * state. We will override that state from the saved state
	 * further down, but this will handle the cases of interrupts
	 * that need FW masking. We set the initial guest_priority to
	 * 0 before calling it to ensure it actually performs the masking.
	 */
	state->guest_priority = 0;
	xive_lock_and_mask(xive, sb, state);

	/*
	 * Now, we select a target if we have one. If we don't we
	 * leave the interrupt untargeted. It means that an interrupt
	 * can become "untargeted" across migration if it was masked
	 * by set_xive() but there is little we can do about it.
	 */

	/* First convert prio and mark interrupt as untargeted */
	act_prio = xive_prio_from_guest(guest_prio);
	state->act_priority = MASKED;

	/*
	 * We need to drop the lock due to the mutex below. Hopefully
	 * nothing is touching that interrupt yet since it hasn't been
	 * advertised to a running guest yet
	 */
	arch_spin_unlock(&sb->lock);

	/* If we have a priority, target the interrupt */
	if (act_prio != MASKED) {
		/* First, check provisioning of queues */
		mutex_lock(&xive->lock);
		rc = xive_check_provisioning(xive->kvm, act_prio);
		mutex_unlock(&xive->lock);

		/* Target interrupt */
		if (rc == 0)
			rc = xive_target_interrupt(xive->kvm, state,
						   server, act_prio);
		/*
		 * If provisioning or targeting failed, leave it
		 * alone and masked. It will remain disabled until
		 * the guest re-targets it.
		 */
	}

	/*
	 * Find out if this was a delayed irq stashed in an ICP,
	 * in which case, treat it as pending
	 */
	if (xive->delayed_irqs && xive_check_delayed_irq(xive, irq)) {
		val |= KVM_XICS_PENDING;
		pr_devel(" Found delayed ! forcing PENDING !\n");
	}

	/* Cleanup the SW state */
	state->old_p = false;
	state->old_q = false;
	state->lsi = false;
	state->asserted = false;

	/* Restore LSI state */
	if (val & KVM_XICS_LEVEL_SENSITIVE) {
		state->lsi = true;
		if (val & KVM_XICS_PENDING)
			state->asserted = true;
		pr_devel(" LSI ! Asserted=%d\n", state->asserted);
	}

	/*
	 * Restore P and Q. If the interrupt was pending, we
	 * force Q and !P, which will trigger a resend.
	 *
	 * That means that a guest that had both an interrupt
	 * pending (queued) and Q set will restore with only
	 * one instance of that interrupt instead of 2, but that
	 * is perfectly fine as coalescing interrupts that haven't
	 * been presented yet is always allowed.
	 */
	if (val & KVM_XICS_PRESENTED && !(val & KVM_XICS_PENDING))
		state->old_p = true;
	if (val & KVM_XICS_QUEUED || val & KVM_XICS_PENDING)
		state->old_q = true;

	pr_devel(" P=%d, Q=%d\n", state->old_p, state->old_q);

	/*
	 * If the interrupt was unmasked, update guest priority and
	 * perform the appropriate state transition and do a
	 * re-trigger if necessary.
	 */
	if (val & KVM_XICS_MASKED) {
		pr_devel(" masked, saving prio\n");
		state->guest_priority = MASKED;
		state->saved_priority = guest_prio;
	} else {
		pr_devel(" unmasked, restoring to prio %d\n", guest_prio);
		xive_finish_unmask(xive, sb, state, guest_prio);
		state->saved_priority = guest_prio;
	}

	/* Increment the number of valid sources and mark this one valid */
	if (!state->valid)
		xive->src_count++;
	state->valid = true;

	return 0;
}

int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
			bool line_status)
{
	struct kvmppc_xive *xive = kvm->arch.xive;
	struct kvmppc_xive_src_block *sb;
	struct kvmppc_xive_irq_state *state;
	u16 idx;

	if (!xive)
		return -ENODEV;

	sb = kvmppc_xive_find_source(xive, irq, &idx);
	if (!sb)
		return -EINVAL;

	/* Perform locklessly .... (we need to do some RCUisms here...) */
	state = &sb->irq_state[idx];
	if (!state->valid)
		return -EINVAL;

	/* We don't allow a trigger on a passed-through interrupt */
	if (state->pt_number)
		return -EINVAL;

	if ((level == 1 && state->lsi) || level == KVM_INTERRUPT_SET_LEVEL)
		state->asserted = true;
	else if (level == 0 || level == KVM_INTERRUPT_UNSET) {
		state->asserted = false;
		return 0;
	}

	/* Trigger the IPI */
	xive_irq_trigger(&state->ipi_data);

	return 0;
}

int kvmppc_xive_set_nr_servers(struct kvmppc_xive *xive, u64 addr)
{
	u32 __user *ubufp = (u32 __user *) addr;
	u32 nr_servers;
	int rc = 0;

	if (get_user(nr_servers, ubufp))
		return -EFAULT;

	pr_devel("%s nr_servers=%u\n", __func__, nr_servers);

	if (!nr_servers || nr_servers > KVM_MAX_VCPU_ID)
		return -EINVAL;

	mutex_lock(&xive->lock);
	if (xive->vp_base != XIVE_INVALID_VP)
		/* The VP block is allocated once and freed when the device
		 * is released. Better not allow its size to change since it's
		 * used by connect_vcpu to validate vCPU ids (eg, setting it
		 * back to a higher value could allow connect_vcpu to come up
		 * with a VP id that goes beyond the VP block, which is likely
		 * to cause a crash in OPAL).
		 */
		rc = -EBUSY;
	else if (nr_servers > KVM_MAX_VCPUS)
		/* We don't need more servers. Higher vCPU ids get packed
		 * down below KVM_MAX_VCPUS by kvmppc_pack_vcpu_id().
		 */
		xive->nr_servers = KVM_MAX_VCPUS;
	else
		xive->nr_servers = nr_servers;

	mutex_unlock(&xive->lock);

	return rc;
}
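
/*
 * KVM device attribute handlers. The XICS-on-XIVE device reuses the
 * XICS attribute groups, so sources and the number of servers are
 * accessed through the same ioctl interface as the emulated XICS.
 */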
static int xive_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
{
	struct kvmppc_xive *xive = dev->private;

	/* We honor the existing XICS ioctl */
	switch (attr->group) {
	case KVM_DEV_XICS_GRP_SOURCES:
		return xive_set_source(xive, attr->attr, attr->addr);
	case KVM_DEV_XICS_GRP_CTRL:
		switch (attr->attr) {
		case KVM_DEV_XICS_NR_SERVERS:
			return kvmppc_xive_set_nr_servers(xive, attr->addr);
		}
	}
	return -ENXIO;
}

static int xive_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
{
	struct kvmppc_xive *xive = dev->private;

	/* We honor the existing XICS ioctl */
	switch (attr->group) {
	case KVM_DEV_XICS_GRP_SOURCES:
		return xive_get_source(xive, attr->attr, attr->addr);
	}
	return -ENXIO;
}

static int xive_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
{
	/* We honor the same limits as XICS, at least for now */
	switch (attr->group) {
	case KVM_DEV_XICS_GRP_SOURCES:
		if (attr->attr >= KVMPPC_XICS_FIRST_IRQ &&
		    attr->attr < KVMPPC_XICS_NR_IRQS)
			return 0;
		break;
	case KVM_DEV_XICS_GRP_CTRL:
		switch (attr->attr) {
		case KVM_DEV_XICS_NR_SERVERS:
			return 0;
		}
	}
	return -ENXIO;
}

static void kvmppc_xive_cleanup_irq(u32 hw_num, struct xive_irq_data *xd)
{
	xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_01);
	xive_native_configure_irq(hw_num, 0, MASKED, 0);
}

void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb)
{
	int i;

	for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
		struct kvmppc_xive_irq_state *state = &sb->irq_state[i];

		if (!state->valid)
			continue;

		kvmppc_xive_cleanup_irq(state->ipi_number, &state->ipi_data);
		xive_cleanup_irq_data(&state->ipi_data);
		xive_native_free_irq(state->ipi_number);

		/* Pass-through, cleanup too but keep IRQ hw data */
		if (state->pt_number)
			kvmppc_xive_cleanup_irq(state->pt_number, state->pt_data);

		state->valid = false;
	}
}

/*
 * Called when device fd is closed. kvm->lock is held.
 */
static void kvmppc_xive_release(struct kvm_device *dev)
{
	struct kvmppc_xive *xive = dev->private;
	struct kvm *kvm = xive->kvm;
	struct kvm_vcpu *vcpu;
	int i;

	pr_devel("Releasing xive device\n");

	/*
	 * Since this is the device release function, we know that
	 * userspace does not have any open fd referring to the
	 * device. Therefore none of the device attribute set/get
	 * functions can be executing concurrently, and neither can
	 * the connect_vcpu and set/clr_mapped functions.
	 */

	debugfs_remove(xive->dentry);

	/*
	 * We should clean up the vCPU interrupt presenters first.
	 */
	kvm_for_each_vcpu(i, vcpu, kvm) {
		/*
		 * Take vcpu->mutex to ensure that no one_reg get/set ioctl
		 * (i.e. kvmppc_xive_[gs]et_icp) can be done concurrently.
		 * Holding the vcpu->mutex also means that the vcpu cannot
		 * be executing the KVM_RUN ioctl, and therefore it cannot
		 * be executing the XIVE push or pull code or accessing
		 * the XIVE MMIO regions.

static void kvmppc_xive_cleanup_irq(u32 hw_num, struct xive_irq_data *xd)
{
	xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_01);
	xive_native_configure_irq(hw_num, 0, MASKED, 0);
}

void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb)
{
	int i;

	for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
		struct kvmppc_xive_irq_state *state = &sb->irq_state[i];

		if (!state->valid)
			continue;

		kvmppc_xive_cleanup_irq(state->ipi_number, &state->ipi_data);
		xive_cleanup_irq_data(&state->ipi_data);
		xive_native_free_irq(state->ipi_number);

		/* Pass-through, cleanup too but keep IRQ hw data */
		if (state->pt_number)
			kvmppc_xive_cleanup_irq(state->pt_number, state->pt_data);

		state->valid = false;
	}
}

/*
 * Called when device fd is closed. kvm->lock is held.
 */
static void kvmppc_xive_release(struct kvm_device *dev)
{
	struct kvmppc_xive *xive = dev->private;
	struct kvm *kvm = xive->kvm;
	struct kvm_vcpu *vcpu;
	int i;

	pr_devel("Releasing xive device\n");

	/*
	 * Since this is the device release function, we know that
	 * userspace does not have any open fd referring to the
	 * device. Therefore none of the device attribute set/get
	 * functions can be executing concurrently, and similarly,
	 * the connect_vcpu and set/clr_mapped functions cannot be
	 * running either.
	 */

	debugfs_remove(xive->dentry);

	/*
	 * We should clean up the vCPU interrupt presenters first.
	 */
	kvm_for_each_vcpu(i, vcpu, kvm) {
		/*
		 * Take vcpu->mutex to ensure that no one_reg get/set ioctl
		 * (i.e. kvmppc_xive_[gs]et_icp) can be done concurrently.
		 * Holding the vcpu->mutex also means that the vcpu cannot
		 * be executing the KVM_RUN ioctl, and therefore it cannot
		 * be executing the XIVE push or pull code or accessing
		 * the XIVE MMIO regions.
		 */
		mutex_lock(&vcpu->mutex);
		kvmppc_xive_cleanup_vcpu(vcpu);
		mutex_unlock(&vcpu->mutex);
	}

	/*
	 * Now that we have cleared vcpu->arch.xive_vcpu, vcpu->arch.irq_type
	 * and vcpu->arch.xive_esc_[vr]addr on each vcpu, we are safe
	 * against xive code getting called during vcpu execution or
	 * set/get one_reg operations.
	 */
	kvm->arch.xive = NULL;

	/* Mask and free interrupts */
	for (i = 0; i <= xive->max_sbid; i++) {
		if (xive->src_blocks[i])
			kvmppc_xive_free_sources(xive->src_blocks[i]);
		kfree(xive->src_blocks[i]);
		xive->src_blocks[i] = NULL;
	}

	if (xive->vp_base != XIVE_INVALID_VP)
		xive_native_free_vp_block(xive->vp_base);

	/*
	 * A reference to the kvmppc_xive pointer is kept under the
	 * xive_devices struct of the machine for reuse. For now it is
	 * only freed when the VM is destroyed, until all the execution
	 * paths have been properly protected.
	 */

	kfree(dev);
}

/*
 * When the guest chooses the interrupt mode (XICS legacy or XIVE
 * native), the VM switches to a different KVM device. The previous
 * device is "released" before the new one is created.
 *
 * Until we are sure all execution paths are well protected, provide a
 * fail safe (transitional) method for device destruction, in which
 * the XIVE device pointer is recycled and not directly freed.
 */
struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type)
{
	struct kvmppc_xive **kvm_xive_device = type == KVM_DEV_TYPE_XIVE ?
		&kvm->arch.xive_devices.native :
		&kvm->arch.xive_devices.xics_on_xive;
	struct kvmppc_xive *xive = *kvm_xive_device;

	if (!xive) {
		xive = kzalloc(sizeof(*xive), GFP_KERNEL);
		*kvm_xive_device = xive;
	} else {
		memset(xive, 0, sizeof(*xive));
	}

	return xive;
}

/*
 * Create a XICS device with XIVE backend. kvm->lock is held.
 */
static int kvmppc_xive_create(struct kvm_device *dev, u32 type)
{
	struct kvmppc_xive *xive;
	struct kvm *kvm = dev->kvm;

	pr_devel("Creating xive for partition\n");

	/* Already there ? */
	if (kvm->arch.xive)
		return -EEXIST;

	xive = kvmppc_xive_get_device(kvm, type);
	if (!xive)
		return -ENOMEM;

	dev->private = xive;
	xive->dev = dev;
	xive->kvm = kvm;
	mutex_init(&xive->lock);

	/* We use the default queue size set by the host */
	xive->q_order = xive_native_default_eq_shift();
	if (xive->q_order < PAGE_SHIFT)
		xive->q_page_order = 0;
	else
		xive->q_page_order = xive->q_order - PAGE_SHIFT;

	/* VP allocation is delayed to the first call to connect_vcpu */
	xive->vp_base = XIVE_INVALID_VP;
	/* KVM_MAX_VCPUS limits the number of VMs to roughly 64 per socket
	 * on a POWER9 system.
	 */
	xive->nr_servers = KVM_MAX_VCPUS;

	xive->single_escalation = xive_native_has_single_escalation();

	kvm->arch.xive = xive;
	return 0;
}
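
/*
 * Illustrative sketch of how this device is instantiated from
 * userspace (assumes a VM fd "vm_fd"; error handling omitted). The
 * device type is the legacy XICS one, the XIVE backend being an
 * in-kernel implementation detail selected at module init time.
 *
 *	struct kvm_create_device cd = {
 *		.type = KVM_DEV_TYPE_XICS,
 *	};
 *	ioctl(vm_fd, KVM_CREATE_DEVICE, &cd);
 *	// cd.fd is now the device fd used with KVM_SET_DEVICE_ATTR,
 *	// and each vCPU is connected to it with KVM_ENABLE_CAP /
 *	// KVM_CAP_IRQ_XICS, passing the device fd and a server number.
 */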

int kvmppc_xive_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
{
	struct kvmppc_vcore *vc = vcpu->arch.vcore;

	/* The VM should have configured XICS mode before doing XICS hcalls. */
	if (!kvmppc_xics_enabled(vcpu))
		return H_TOO_HARD;

	switch (req) {
	case H_XIRR:
		return xive_vm_h_xirr(vcpu);
	case H_CPPR:
		return xive_vm_h_cppr(vcpu, kvmppc_get_gpr(vcpu, 4));
	case H_EOI:
		return xive_vm_h_eoi(vcpu, kvmppc_get_gpr(vcpu, 4));
	case H_IPI:
		return xive_vm_h_ipi(vcpu, kvmppc_get_gpr(vcpu, 4),
				     kvmppc_get_gpr(vcpu, 5));
	case H_IPOLL:
		return xive_vm_h_ipoll(vcpu, kvmppc_get_gpr(vcpu, 4));
	case H_XIRR_X:
		xive_vm_h_xirr(vcpu);
		kvmppc_set_gpr(vcpu, 5, get_tb() + vc->tb_offset);
		return H_SUCCESS;
	}

	return H_UNSUPPORTED;
}
EXPORT_SYMBOL_GPL(kvmppc_xive_xics_hcall);

int kvmppc_xive_debug_show_queues(struct seq_file *m, struct kvm_vcpu *vcpu)
{
	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
	unsigned int i;

	for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
		struct xive_q *q = &xc->queues[i];
		u32 i0, i1, idx;

		if (!q->qpage && !xc->esc_virq[i])
			continue;

		if (q->qpage) {
			seq_printf(m, " q[%d]: ", i);
			idx = q->idx;
			i0 = be32_to_cpup(q->qpage + idx);
			idx = (idx + 1) & q->msk;
			i1 = be32_to_cpup(q->qpage + idx);
			seq_printf(m, "T=%d %08x %08x...\n", q->toggle,
				   i0, i1);
		}
		if (xc->esc_virq[i]) {
			struct irq_data *d = irq_get_irq_data(xc->esc_virq[i]);
			struct xive_irq_data *xd =
				irq_data_get_irq_handler_data(d);
			u64 pq = xive_vm_esb_load(xd, XIVE_ESB_GET);

			seq_printf(m, " ESC %d %c%c EOI @%llx",
				   xc->esc_virq[i],
				   (pq & XIVE_ESB_VAL_P) ? 'P' : '-',
				   (pq & XIVE_ESB_VAL_Q) ? 'Q' : '-',
				   xd->eoi_page);
			seq_puts(m, "\n");
		}
	}
	return 0;
}
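
/*
 * Note on the queue dump above (for-reference sketch only): each
 * 32-bit EQ entry carries a generation bit in its top bit, and the
 * core XIVE driver (see xive_read_eq() in
 * arch/powerpc/sysdev/xive/common.c) treats a slot whose generation
 * bit matches q->toggle as empty, roughly:
 *
 *	u32 entry = be32_to_cpup(q->qpage + q->idx);
 *	bool empty = (entry >> 31) == q->toggle;	// nothing new queued
 *
 * That is why the dump prints q->toggle ("T=%d") next to the raw
 * entries: it lets the reader tell live entries from stale ones.
 */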

void kvmppc_xive_debug_show_sources(struct seq_file *m,
				    struct kvmppc_xive_src_block *sb)
{
	int i;

	seq_puts(m, " LISN HW/CHIP TYPE PQ EISN CPU/PRIO\n");
	for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
		struct kvmppc_xive_irq_state *state = &sb->irq_state[i];
		struct xive_irq_data *xd;
		u64 pq;
		u32 hw_num;

		if (!state->valid)
			continue;

		kvmppc_xive_select_irq(state, &hw_num, &xd);

		pq = xive_vm_esb_load(xd, XIVE_ESB_GET);

		seq_printf(m, "%08x %08x/%02x", state->number, hw_num,
			   xd->src_chip);
		if (state->lsi)
			seq_printf(m, " %cLSI", state->asserted ? '^' : ' ');
		else
			seq_puts(m, " MSI");

		seq_printf(m, " %s %c%c %08x % 4d/%d",
			   state->ipi_number == hw_num ? "IPI" : " PT",
			   pq & XIVE_ESB_VAL_P ? 'P' : '-',
			   pq & XIVE_ESB_VAL_Q ? 'Q' : '-',
			   state->eisn, state->act_server,
			   state->act_priority);

		seq_puts(m, "\n");
	}
}

static int xive_debug_show(struct seq_file *m, void *private)
{
	struct kvmppc_xive *xive = m->private;
	struct kvm *kvm = xive->kvm;
	struct kvm_vcpu *vcpu;
	u64 t_rm_h_xirr = 0;
	u64 t_rm_h_ipoll = 0;
	u64 t_rm_h_cppr = 0;
	u64 t_rm_h_eoi = 0;
	u64 t_rm_h_ipi = 0;
	u64 t_vm_h_xirr = 0;
	u64 t_vm_h_ipoll = 0;
	u64 t_vm_h_cppr = 0;
	u64 t_vm_h_eoi = 0;
	u64 t_vm_h_ipi = 0;
	unsigned int i;

	if (!kvm)
		return 0;

	seq_puts(m, "=========\nVCPU state\n=========\n");

	kvm_for_each_vcpu(i, vcpu, kvm) {
		struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;

		if (!xc)
			continue;

		seq_printf(m, "VCPU %d: VP:%#x/%02x\n"
			   " CPPR:%#x HWCPPR:%#x MFRR:%#x PEND:%#x h_xirr: R=%lld V=%lld\n",
			   xc->server_num, xc->vp_id, xc->vp_chip_id,
			   xc->cppr, xc->hw_cppr,
			   xc->mfrr, xc->pending,
			   xc->stat_rm_h_xirr, xc->stat_vm_h_xirr);

		kvmppc_xive_debug_show_queues(m, vcpu);

		t_rm_h_xirr += xc->stat_rm_h_xirr;
		t_rm_h_ipoll += xc->stat_rm_h_ipoll;
		t_rm_h_cppr += xc->stat_rm_h_cppr;
		t_rm_h_eoi += xc->stat_rm_h_eoi;
		t_rm_h_ipi += xc->stat_rm_h_ipi;
		t_vm_h_xirr += xc->stat_vm_h_xirr;
		t_vm_h_ipoll += xc->stat_vm_h_ipoll;
		t_vm_h_cppr += xc->stat_vm_h_cppr;
		t_vm_h_eoi += xc->stat_vm_h_eoi;
		t_vm_h_ipi += xc->stat_vm_h_ipi;
	}

	seq_puts(m, "Hcalls totals\n");
	seq_printf(m, " H_XIRR R=%10lld V=%10lld\n", t_rm_h_xirr, t_vm_h_xirr);
	seq_printf(m, " H_IPOLL R=%10lld V=%10lld\n", t_rm_h_ipoll, t_vm_h_ipoll);
	seq_printf(m, " H_CPPR R=%10lld V=%10lld\n", t_rm_h_cppr, t_vm_h_cppr);
	seq_printf(m, " H_EOI R=%10lld V=%10lld\n", t_rm_h_eoi, t_vm_h_eoi);
	seq_printf(m, " H_IPI R=%10lld V=%10lld\n", t_rm_h_ipi, t_vm_h_ipi);

	seq_puts(m, "=========\nSources\n=========\n");

	for (i = 0; i <= xive->max_sbid; i++) {
		struct kvmppc_xive_src_block *sb = xive->src_blocks[i];

		if (sb) {
			arch_spin_lock(&sb->lock);
			kvmppc_xive_debug_show_sources(m, sb);
			arch_spin_unlock(&sb->lock);
		}
	}

	return 0;
}

DEFINE_SHOW_ATTRIBUTE(xive_debug);

static void xive_debugfs_init(struct kvmppc_xive *xive)
{
	char *name;

	name = kasprintf(GFP_KERNEL, "kvm-xive-%p", xive);
	if (!name) {
		pr_err("%s: no memory for name\n", __func__);
		return;
	}

	xive->dentry = debugfs_create_file(name, S_IRUGO, powerpc_debugfs_root,
					   xive, &xive_debug_fops);

	pr_debug("%s: created %s\n", __func__, name);
	kfree(name);
}

static void kvmppc_xive_init(struct kvm_device *dev)
{
	struct kvmppc_xive *xive = (struct kvmppc_xive *)dev->private;

	/* Register some debug interfaces */
	xive_debugfs_init(xive);
}

struct kvm_device_ops kvm_xive_ops = {
	.name = "kvm-xive",
	.create = kvmppc_xive_create,
	.init = kvmppc_xive_init,
	.release = kvmppc_xive_release,
	.set_attr = xive_set_attr,
	.get_attr = xive_get_attr,
	.has_attr = xive_has_attr,
};
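
/*
 * Note: this ops structure is not registered here. The book3s init
 * code hooks it up under the legacy KVM_DEV_TYPE_XICS device type when
 * the host interrupt controller is XIVE, so the XICS ioctl interface
 * transparently lands on this XICS-on-XIVE implementation. A rough
 * sketch of that registration, which lives outside this file:
 *
 *	if (xics_on_xive())
 *		kvm_register_device_ops(&kvm_xive_ops, KVM_DEV_TYPE_XICS);
 */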