1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright 2017 Benjamin Herrenschmidt, IBM Corporation. 4 */ 5 6 #define pr_fmt(fmt) "xive-kvm: " fmt 7 8 #include <linux/kernel.h> 9 #include <linux/kvm_host.h> 10 #include <linux/err.h> 11 #include <linux/gfp.h> 12 #include <linux/spinlock.h> 13 #include <linux/delay.h> 14 #include <linux/percpu.h> 15 #include <linux/cpumask.h> 16 #include <linux/uaccess.h> 17 #include <linux/irqdomain.h> 18 #include <asm/kvm_book3s.h> 19 #include <asm/kvm_ppc.h> 20 #include <asm/hvcall.h> 21 #include <asm/xics.h> 22 #include <asm/xive.h> 23 #include <asm/xive-regs.h> 24 #include <asm/debug.h> 25 #include <asm/time.h> 26 #include <asm/opal.h> 27 28 #include <linux/debugfs.h> 29 #include <linux/seq_file.h> 30 31 #include "book3s_xive.h" 32 33 34 /* 35 * Virtual mode variants of the hcalls for use on radix/radix 36 * with AIL. They require the VCPU's VP to be "pushed" 37 * 38 * We still instantiate them here because we use some of the 39 * generated utility functions as well in this file. 40 */ 41 #define XIVE_RUNTIME_CHECKS 42 #define X_PFX xive_vm_ 43 #define X_STATIC static 44 #define X_STAT_PFX stat_vm_ 45 #define __x_tima xive_tima 46 #define __x_eoi_page(xd) ((void __iomem *)((xd)->eoi_mmio)) 47 #define __x_trig_page(xd) ((void __iomem *)((xd)->trig_mmio)) 48 #define __x_writeb __raw_writeb 49 #define __x_readw __raw_readw 50 #define __x_readq __raw_readq 51 #define __x_writeq __raw_writeq 52 53 #include "book3s_xive_template.c" 54 55 /* 56 * We leave a gap of a couple of interrupts in the queue to 57 * account for the IPI and additional safety guard. 58 */ 59 #define XIVE_Q_GAP 2 60 61 static bool kvmppc_xive_vcpu_has_save_restore(struct kvm_vcpu *vcpu) 62 { 63 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; 64 65 /* Check enablement at VP level */ 66 return xc->vp_cam & TM_QW1W2_HO; 67 } 68 69 bool kvmppc_xive_check_save_restore(struct kvm_vcpu *vcpu) 70 { 71 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; 72 struct kvmppc_xive *xive = xc->xive; 73 74 if (xive->flags & KVMPPC_XIVE_FLAG_SAVE_RESTORE) 75 return kvmppc_xive_vcpu_has_save_restore(vcpu); 76 77 return true; 78 } 79 80 /* 81 * Push a vcpu's context to the XIVE on guest entry. 82 * This assumes we are in virtual mode (MMU on) 83 */ 84 void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) 85 { 86 void __iomem *tima = local_paca->kvm_hstate.xive_tima_virt; 87 u64 pq; 88 89 /* 90 * Nothing to do if the platform doesn't have a XIVE 91 * or this vCPU doesn't have its own XIVE context 92 * (e.g. because it's not using an in-kernel interrupt controller). 93 */ 94 if (!tima || !vcpu->arch.xive_cam_word) 95 return; 96 97 eieio(); 98 if (!kvmppc_xive_vcpu_has_save_restore(vcpu)) 99 __raw_writeq(vcpu->arch.xive_saved_state.w01, tima + TM_QW1_OS); 100 __raw_writel(vcpu->arch.xive_cam_word, tima + TM_QW1_OS + TM_WORD2); 101 vcpu->arch.xive_pushed = 1; 102 eieio(); 103 104 /* 105 * We clear the irq_pending flag. There is a small chance of a 106 * race vs. the escalation interrupt happening on another 107 * processor setting it again, but the only consequence is to 108 * cause a spurious wakeup on the next H_CEDE, which is not an 109 * issue. 110 */ 111 vcpu->arch.irq_pending = 0; 112 113 /* 114 * In single escalation mode, if the escalation interrupt is 115 * on, we mask it. 
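	 * The load from the XIVE_ESB_SET_PQ_01 offset below both returns the
	 * previous PQ bits and leaves the escalation interrupt masked (PQ=01)
	 * in a single MMIO access.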
	 */
	if (vcpu->arch.xive_esc_on) {
		pq = __raw_readq((void __iomem *)(vcpu->arch.xive_esc_vaddr +
						  XIVE_ESB_SET_PQ_01));
		mb();

		/*
		 * We have a possible subtle race here: The escalation
		 * interrupt might have fired and be on its way to the
		 * host queue while we mask it, and if we unmask it
		 * early enough (re-cede right away), there is a
		 * theoretical possibility that it fires again, thus
		 * landing in the target queue more than once which is
		 * a big no-no.
		 *
		 * Fortunately, solving this is rather easy. If the
		 * above load setting PQ to 01 returns a previous
		 * value where P is set, then we know the escalation
		 * interrupt is somewhere on its way to the host. In
		 * that case we simply don't clear the xive_esc_on
		 * flag below. It will be eventually cleared by the
		 * handler for the escalation interrupt.
		 *
		 * Then, when doing a cede, we check that flag again
		 * before re-enabling the escalation interrupt, and if
		 * set, we abort the cede.
		 */
		if (!(pq & XIVE_ESB_VAL_P))
			/* Now P is 0, we can clear the flag */
			vcpu->arch.xive_esc_on = 0;
	}
}
EXPORT_SYMBOL_GPL(kvmppc_xive_push_vcpu);

/*
 * Pull a vcpu's context from the XIVE on guest exit.
 * This assumes we are in virtual mode (MMU on)
 */
void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu)
{
	void __iomem *tima = local_paca->kvm_hstate.xive_tima_virt;

	if (!vcpu->arch.xive_pushed)
		return;

	/*
	 * Should not have been pushed if there is no tima
	 */
	if (WARN_ON(!tima))
		return;

	eieio();
	/* First load to pull the context, we ignore the value */
	__raw_readl(tima + TM_SPC_PULL_OS_CTX);
	/* Second load to recover the context state (Words 0 and 1) */
	if (!kvmppc_xive_vcpu_has_save_restore(vcpu))
		vcpu->arch.xive_saved_state.w01 = __raw_readq(tima + TM_QW1_OS);

	/* Fixup some of the state for the next load */
	vcpu->arch.xive_saved_state.lsmfb = 0;
	vcpu->arch.xive_saved_state.ack = 0xff;
	vcpu->arch.xive_pushed = 0;
	eieio();
}
EXPORT_SYMBOL_GPL(kvmppc_xive_pull_vcpu);

void kvmppc_xive_rearm_escalation(struct kvm_vcpu *vcpu)
{
	void __iomem *esc_vaddr = (void __iomem *)vcpu->arch.xive_esc_vaddr;

	if (!esc_vaddr)
		return;

	/* we are using XIVE with single escalation */

	if (vcpu->arch.xive_esc_on) {
		/*
		 * If we still have a pending escalation, abort the cede,
		 * and we must set PQ to 10 rather than 00 so that we don't
		 * potentially end up with two entries for the escalation
		 * interrupt in the XIVE interrupt queue. In that case
		 * we also don't want to set xive_esc_on to 1 here in
		 * case we race with xive_esc_irq().
		 */
		vcpu->arch.ceded = 0;
		/*
		 * The escalation interrupts are special as we don't EOI them.
		 * There is no need to use the load-after-store ordering offset
		 * to set PQ to 10 as we won't use StoreEOI.
		 */
		__raw_readq(esc_vaddr + XIVE_ESB_SET_PQ_10);
	} else {
		vcpu->arch.xive_esc_on = true;
		mb();
		__raw_readq(esc_vaddr + XIVE_ESB_SET_PQ_00);
	}
	mb();
}
EXPORT_SYMBOL_GPL(kvmppc_xive_rearm_escalation);

/*
 * This is a simple trigger for a generic XIVE IRQ.
This must 218 * only be called for interrupts that support a trigger page 219 */ 220 static bool xive_irq_trigger(struct xive_irq_data *xd) 221 { 222 /* This should be only for MSIs */ 223 if (WARN_ON(xd->flags & XIVE_IRQ_FLAG_LSI)) 224 return false; 225 226 /* Those interrupts should always have a trigger page */ 227 if (WARN_ON(!xd->trig_mmio)) 228 return false; 229 230 out_be64(xd->trig_mmio, 0); 231 232 return true; 233 } 234 235 static irqreturn_t xive_esc_irq(int irq, void *data) 236 { 237 struct kvm_vcpu *vcpu = data; 238 239 vcpu->arch.irq_pending = 1; 240 smp_mb(); 241 if (vcpu->arch.ceded) 242 kvmppc_fast_vcpu_kick(vcpu); 243 244 /* Since we have the no-EOI flag, the interrupt is effectively 245 * disabled now. Clearing xive_esc_on means we won't bother 246 * doing so on the next entry. 247 * 248 * This also allows the entry code to know that if a PQ combination 249 * of 10 is observed while xive_esc_on is true, it means the queue 250 * contains an unprocessed escalation interrupt. We don't make use of 251 * that knowledge today but might (see comment in book3s_hv_rmhandler.S) 252 */ 253 vcpu->arch.xive_esc_on = false; 254 255 /* This orders xive_esc_on = false vs. subsequent stale_p = true */ 256 smp_wmb(); /* goes with smp_mb() in cleanup_single_escalation */ 257 258 return IRQ_HANDLED; 259 } 260 261 int kvmppc_xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio, 262 bool single_escalation) 263 { 264 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; 265 struct xive_q *q = &xc->queues[prio]; 266 char *name = NULL; 267 int rc; 268 269 /* Already there ? */ 270 if (xc->esc_virq[prio]) 271 return 0; 272 273 /* Hook up the escalation interrupt */ 274 xc->esc_virq[prio] = irq_create_mapping(NULL, q->esc_irq); 275 if (!xc->esc_virq[prio]) { 276 pr_err("Failed to map escalation interrupt for queue %d of VCPU %d\n", 277 prio, xc->server_num); 278 return -EIO; 279 } 280 281 if (single_escalation) 282 name = kasprintf(GFP_KERNEL, "kvm-%d-%d", 283 vcpu->kvm->arch.lpid, xc->server_num); 284 else 285 name = kasprintf(GFP_KERNEL, "kvm-%d-%d-%d", 286 vcpu->kvm->arch.lpid, xc->server_num, prio); 287 if (!name) { 288 pr_err("Failed to allocate escalation irq name for queue %d of VCPU %d\n", 289 prio, xc->server_num); 290 rc = -ENOMEM; 291 goto error; 292 } 293 294 pr_devel("Escalation %s irq %d (prio %d)\n", name, xc->esc_virq[prio], prio); 295 296 rc = request_irq(xc->esc_virq[prio], xive_esc_irq, 297 IRQF_NO_THREAD, name, vcpu); 298 if (rc) { 299 pr_err("Failed to request escalation interrupt for queue %d of VCPU %d\n", 300 prio, xc->server_num); 301 goto error; 302 } 303 xc->esc_virq_names[prio] = name; 304 305 /* In single escalation mode, we grab the ESB MMIO of the 306 * interrupt and mask it. Also populate the VCPU v/raddr 307 * of the ESB page for use by asm entry/exit code. Finally 308 * set the XIVE_IRQ_FLAG_NO_EOI flag which will prevent the 309 * core code from performing an EOI on the escalation 310 * interrupt, thus leaving it effectively masked after 311 * it fires once. 
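	 * The xive_vm_esb_load() of XIVE_ESB_SET_PQ_01 below performs that
	 * initial masking.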
312 */ 313 if (single_escalation) { 314 struct irq_data *d = irq_get_irq_data(xc->esc_virq[prio]); 315 struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); 316 317 xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_01); 318 vcpu->arch.xive_esc_raddr = xd->eoi_page; 319 vcpu->arch.xive_esc_vaddr = (__force u64)xd->eoi_mmio; 320 xd->flags |= XIVE_IRQ_FLAG_NO_EOI; 321 } 322 323 return 0; 324 error: 325 irq_dispose_mapping(xc->esc_virq[prio]); 326 xc->esc_virq[prio] = 0; 327 kfree(name); 328 return rc; 329 } 330 331 static int xive_provision_queue(struct kvm_vcpu *vcpu, u8 prio) 332 { 333 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; 334 struct kvmppc_xive *xive = xc->xive; 335 struct xive_q *q = &xc->queues[prio]; 336 void *qpage; 337 int rc; 338 339 if (WARN_ON(q->qpage)) 340 return 0; 341 342 /* Allocate the queue and retrieve infos on current node for now */ 343 qpage = (__be32 *)__get_free_pages(GFP_KERNEL, xive->q_page_order); 344 if (!qpage) { 345 pr_err("Failed to allocate queue %d for VCPU %d\n", 346 prio, xc->server_num); 347 return -ENOMEM; 348 } 349 memset(qpage, 0, 1 << xive->q_order); 350 351 /* 352 * Reconfigure the queue. This will set q->qpage only once the 353 * queue is fully configured. This is a requirement for prio 0 354 * as we will stop doing EOIs for every IPI as soon as we observe 355 * qpage being non-NULL, and instead will only EOI when we receive 356 * corresponding queue 0 entries 357 */ 358 rc = xive_native_configure_queue(xc->vp_id, q, prio, qpage, 359 xive->q_order, true); 360 if (rc) 361 pr_err("Failed to configure queue %d for VCPU %d\n", 362 prio, xc->server_num); 363 return rc; 364 } 365 366 /* Called with xive->lock held */ 367 static int xive_check_provisioning(struct kvm *kvm, u8 prio) 368 { 369 struct kvmppc_xive *xive = kvm->arch.xive; 370 struct kvm_vcpu *vcpu; 371 int i, rc; 372 373 lockdep_assert_held(&xive->lock); 374 375 /* Already provisioned ? */ 376 if (xive->qmap & (1 << prio)) 377 return 0; 378 379 pr_devel("Provisioning prio... %d\n", prio); 380 381 /* Provision each VCPU and enable escalations if needed */ 382 kvm_for_each_vcpu(i, vcpu, kvm) { 383 if (!vcpu->arch.xive_vcpu) 384 continue; 385 rc = xive_provision_queue(vcpu, prio); 386 if (rc == 0 && !kvmppc_xive_has_single_escalation(xive)) 387 kvmppc_xive_attach_escalation(vcpu, prio, 388 kvmppc_xive_has_single_escalation(xive)); 389 if (rc) 390 return rc; 391 } 392 393 /* Order previous stores and mark it as provisioned */ 394 mb(); 395 xive->qmap |= (1 << prio); 396 return 0; 397 } 398 399 static void xive_inc_q_pending(struct kvm *kvm, u32 server, u8 prio) 400 { 401 struct kvm_vcpu *vcpu; 402 struct kvmppc_xive_vcpu *xc; 403 struct xive_q *q; 404 405 /* Locate target server */ 406 vcpu = kvmppc_xive_find_server(kvm, server); 407 if (!vcpu) { 408 pr_warn("%s: Can't find server %d\n", __func__, server); 409 return; 410 } 411 xc = vcpu->arch.xive_vcpu; 412 if (WARN_ON(!xc)) 413 return; 414 415 q = &xc->queues[prio]; 416 atomic_inc(&q->pending_count); 417 } 418 419 static int xive_try_pick_queue(struct kvm_vcpu *vcpu, u8 prio) 420 { 421 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; 422 struct xive_q *q; 423 u32 max; 424 425 if (WARN_ON(!xc)) 426 return -ENXIO; 427 if (!xc->valid) 428 return -ENXIO; 429 430 q = &xc->queues[prio]; 431 if (WARN_ON(!q->qpage)) 432 return -ENXIO; 433 434 /* Calculate max number of interrupts in that queue. */ 435 max = (q->msk + 1) - XIVE_Q_GAP; 436 return atomic_add_unless(&q->count, 1, max) ? 
0 : -EBUSY; 437 } 438 439 int kvmppc_xive_select_target(struct kvm *kvm, u32 *server, u8 prio) 440 { 441 struct kvm_vcpu *vcpu; 442 int i, rc; 443 444 /* Locate target server */ 445 vcpu = kvmppc_xive_find_server(kvm, *server); 446 if (!vcpu) { 447 pr_devel("Can't find server %d\n", *server); 448 return -EINVAL; 449 } 450 451 pr_devel("Finding irq target on 0x%x/%d...\n", *server, prio); 452 453 /* Try pick it */ 454 rc = xive_try_pick_queue(vcpu, prio); 455 if (rc == 0) 456 return rc; 457 458 pr_devel(" .. failed, looking up candidate...\n"); 459 460 /* Failed, pick another VCPU */ 461 kvm_for_each_vcpu(i, vcpu, kvm) { 462 if (!vcpu->arch.xive_vcpu) 463 continue; 464 rc = xive_try_pick_queue(vcpu, prio); 465 if (rc == 0) { 466 *server = vcpu->arch.xive_vcpu->server_num; 467 pr_devel(" found on 0x%x/%d\n", *server, prio); 468 return rc; 469 } 470 } 471 pr_devel(" no available target !\n"); 472 473 /* No available target ! */ 474 return -EBUSY; 475 } 476 477 static u8 xive_lock_and_mask(struct kvmppc_xive *xive, 478 struct kvmppc_xive_src_block *sb, 479 struct kvmppc_xive_irq_state *state) 480 { 481 struct xive_irq_data *xd; 482 u32 hw_num; 483 u8 old_prio; 484 u64 val; 485 486 /* 487 * Take the lock, set masked, try again if racing 488 * with H_EOI 489 */ 490 for (;;) { 491 arch_spin_lock(&sb->lock); 492 old_prio = state->guest_priority; 493 state->guest_priority = MASKED; 494 mb(); 495 if (!state->in_eoi) 496 break; 497 state->guest_priority = old_prio; 498 arch_spin_unlock(&sb->lock); 499 } 500 501 /* No change ? Bail */ 502 if (old_prio == MASKED) 503 return old_prio; 504 505 /* Get the right irq */ 506 kvmppc_xive_select_irq(state, &hw_num, &xd); 507 508 /* Set PQ to 10, return old P and old Q and remember them */ 509 val = xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_10); 510 state->old_p = !!(val & 2); 511 state->old_q = !!(val & 1); 512 513 /* 514 * Synchronize hardware to sensure the queues are updated when 515 * masking 516 */ 517 xive_native_sync_source(hw_num); 518 519 return old_prio; 520 } 521 522 static void xive_lock_for_unmask(struct kvmppc_xive_src_block *sb, 523 struct kvmppc_xive_irq_state *state) 524 { 525 /* 526 * Take the lock try again if racing with H_EOI 527 */ 528 for (;;) { 529 arch_spin_lock(&sb->lock); 530 if (!state->in_eoi) 531 break; 532 arch_spin_unlock(&sb->lock); 533 } 534 } 535 536 static void xive_finish_unmask(struct kvmppc_xive *xive, 537 struct kvmppc_xive_src_block *sb, 538 struct kvmppc_xive_irq_state *state, 539 u8 prio) 540 { 541 struct xive_irq_data *xd; 542 u32 hw_num; 543 544 /* If we aren't changing a thing, move on */ 545 if (state->guest_priority != MASKED) 546 goto bail; 547 548 /* Get the right irq */ 549 kvmppc_xive_select_irq(state, &hw_num, &xd); 550 551 /* Old Q set, set PQ to 11 */ 552 if (state->old_q) 553 xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_11); 554 555 /* 556 * If not old P, then perform an "effective" EOI, 557 * on the source. This will handle the cases where 558 * FW EOI is needed. 
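	 * If old P was set, we leave PQ alone: the guest still owes an H_EOI
	 * for this interrupt (see the targetting rules below).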
	 */
	if (!state->old_p)
		xive_vm_source_eoi(hw_num, xd);

	/* Synchronize ordering and mark unmasked */
	mb();
bail:
	state->guest_priority = prio;
}

/*
 * Target an interrupt to a given server/prio, this will fallback
 * to another server if necessary and perform the HW targetting
 * updates as needed
 *
 * NOTE: Must be called with the state lock held
 */
static int xive_target_interrupt(struct kvm *kvm,
				 struct kvmppc_xive_irq_state *state,
				 u32 server, u8 prio)
{
	struct kvmppc_xive *xive = kvm->arch.xive;
	u32 hw_num;
	int rc;

	/*
	 * This will return a tentative server and actual
	 * priority. The count for that new target will have
	 * already been incremented.
	 */
	rc = kvmppc_xive_select_target(kvm, &server, prio);

	/*
	 * We failed to find a target ? Not much we can do
	 * at least until we support the GIQ.
	 */
	if (rc)
		return rc;

	/*
	 * Increment the old queue pending count if there
	 * was one so that the old queue count gets adjusted later
	 * when observed to be empty.
	 */
	if (state->act_priority != MASKED)
		xive_inc_q_pending(kvm,
				   state->act_server,
				   state->act_priority);
	/*
	 * Update state and HW
	 */
	state->act_priority = prio;
	state->act_server = server;

	/* Get the right irq */
	kvmppc_xive_select_irq(state, &hw_num, NULL);

	return xive_native_configure_irq(hw_num,
					 kvmppc_xive_vp(xive, server),
					 prio, state->number);
}

/*
 * Targetting rules: In order to avoid losing track of
 * pending interrupts across mask and unmask, which would
 * allow queue overflows, we implement the following rules:
 *
 * - Unless it was never enabled (or we run out of capacity)
 * an interrupt is always targetted at a valid server/queue
 * pair even when "masked" by the guest. This pair tends to
 * be the last one used but it can be changed under some
 * circumstances. That allows us to separate targetting
 * from masking, we only handle accounting during (re)targetting,
 * this also allows us to let an interrupt drain into its target
 * queue after masking, avoiding complex schemes to remove
 * interrupts out of remote processor queues.
 *
 * - When masking, we set PQ to 10 and save the previous value
 * of P and Q.
 *
 * - When unmasking, if saved Q was set, we set PQ to 11
 * otherwise we leave PQ to the HW state which will be either
 * 10 if nothing happened or 11 if the interrupt fired while
 * masked. Effectively we are OR'ing the previous Q into the
 * HW Q.
 *
 * Then if saved P is clear, we do an effective EOI (Q->P->Trigger)
 * which will unmask the interrupt and shoot a new one if Q was
 * set.
 *
 * Otherwise (saved P is set) we leave PQ unchanged (so 10 or 11,
 * effectively meaning an H_EOI from the guest is still expected
 * for that interrupt).
 *
 * - If H_EOI occurs while masked, we clear the saved P.
 *
 * - When changing target, we account on the new target and
 * increment a separate "pending" counter on the old one.
 * This pending counter will be used to decrement the old
 * target's count when its queue has been observed empty.
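 *
 * All of the PQ updates above are done through the ESB management
 * page of the source, using the XIVE_ESB_SET_PQ_* load offsets
 * (xive_vm_esb_load()).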
659 */ 660 661 int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server, 662 u32 priority) 663 { 664 struct kvmppc_xive *xive = kvm->arch.xive; 665 struct kvmppc_xive_src_block *sb; 666 struct kvmppc_xive_irq_state *state; 667 u8 new_act_prio; 668 int rc = 0; 669 u16 idx; 670 671 if (!xive) 672 return -ENODEV; 673 674 pr_devel("set_xive ! irq 0x%x server 0x%x prio %d\n", 675 irq, server, priority); 676 677 /* First, check provisioning of queues */ 678 if (priority != MASKED) { 679 mutex_lock(&xive->lock); 680 rc = xive_check_provisioning(xive->kvm, 681 xive_prio_from_guest(priority)); 682 mutex_unlock(&xive->lock); 683 } 684 if (rc) { 685 pr_devel(" provisioning failure %d !\n", rc); 686 return rc; 687 } 688 689 sb = kvmppc_xive_find_source(xive, irq, &idx); 690 if (!sb) 691 return -EINVAL; 692 state = &sb->irq_state[idx]; 693 694 /* 695 * We first handle masking/unmasking since the locking 696 * might need to be retried due to EOIs, we'll handle 697 * targetting changes later. These functions will return 698 * with the SB lock held. 699 * 700 * xive_lock_and_mask() will also set state->guest_priority 701 * but won't otherwise change other fields of the state. 702 * 703 * xive_lock_for_unmask will not actually unmask, this will 704 * be done later by xive_finish_unmask() once the targetting 705 * has been done, so we don't try to unmask an interrupt 706 * that hasn't yet been targetted. 707 */ 708 if (priority == MASKED) 709 xive_lock_and_mask(xive, sb, state); 710 else 711 xive_lock_for_unmask(sb, state); 712 713 714 /* 715 * Then we handle targetting. 716 * 717 * First calculate a new "actual priority" 718 */ 719 new_act_prio = state->act_priority; 720 if (priority != MASKED) 721 new_act_prio = xive_prio_from_guest(priority); 722 723 pr_devel(" new_act_prio=%x act_server=%x act_prio=%x\n", 724 new_act_prio, state->act_server, state->act_priority); 725 726 /* 727 * Then check if we actually need to change anything, 728 * 729 * The condition for re-targetting the interrupt is that 730 * we have a valid new priority (new_act_prio is not 0xff) 731 * and either the server or the priority changed. 732 * 733 * Note: If act_priority was ff and the new priority is 734 * also ff, we don't do anything and leave the interrupt 735 * untargetted. An attempt of doing an int_on on an 736 * untargetted interrupt will fail. If that is a problem 737 * we could initialize interrupts with valid default 738 */ 739 740 if (new_act_prio != MASKED && 741 (state->act_server != server || 742 state->act_priority != new_act_prio)) 743 rc = xive_target_interrupt(kvm, state, server, new_act_prio); 744 745 /* 746 * Perform the final unmasking of the interrupt source 747 * if necessary 748 */ 749 if (priority != MASKED) 750 xive_finish_unmask(xive, sb, state, priority); 751 752 /* 753 * Finally Update saved_priority to match. Only int_on/off 754 * set this field to a different value. 
755 */ 756 state->saved_priority = priority; 757 758 arch_spin_unlock(&sb->lock); 759 return rc; 760 } 761 762 int kvmppc_xive_get_xive(struct kvm *kvm, u32 irq, u32 *server, 763 u32 *priority) 764 { 765 struct kvmppc_xive *xive = kvm->arch.xive; 766 struct kvmppc_xive_src_block *sb; 767 struct kvmppc_xive_irq_state *state; 768 u16 idx; 769 770 if (!xive) 771 return -ENODEV; 772 773 sb = kvmppc_xive_find_source(xive, irq, &idx); 774 if (!sb) 775 return -EINVAL; 776 state = &sb->irq_state[idx]; 777 arch_spin_lock(&sb->lock); 778 *server = state->act_server; 779 *priority = state->guest_priority; 780 arch_spin_unlock(&sb->lock); 781 782 return 0; 783 } 784 785 int kvmppc_xive_int_on(struct kvm *kvm, u32 irq) 786 { 787 struct kvmppc_xive *xive = kvm->arch.xive; 788 struct kvmppc_xive_src_block *sb; 789 struct kvmppc_xive_irq_state *state; 790 u16 idx; 791 792 if (!xive) 793 return -ENODEV; 794 795 sb = kvmppc_xive_find_source(xive, irq, &idx); 796 if (!sb) 797 return -EINVAL; 798 state = &sb->irq_state[idx]; 799 800 pr_devel("int_on(irq=0x%x)\n", irq); 801 802 /* 803 * Check if interrupt was not targetted 804 */ 805 if (state->act_priority == MASKED) { 806 pr_devel("int_on on untargetted interrupt\n"); 807 return -EINVAL; 808 } 809 810 /* If saved_priority is 0xff, do nothing */ 811 if (state->saved_priority == MASKED) 812 return 0; 813 814 /* 815 * Lock and unmask it. 816 */ 817 xive_lock_for_unmask(sb, state); 818 xive_finish_unmask(xive, sb, state, state->saved_priority); 819 arch_spin_unlock(&sb->lock); 820 821 return 0; 822 } 823 824 int kvmppc_xive_int_off(struct kvm *kvm, u32 irq) 825 { 826 struct kvmppc_xive *xive = kvm->arch.xive; 827 struct kvmppc_xive_src_block *sb; 828 struct kvmppc_xive_irq_state *state; 829 u16 idx; 830 831 if (!xive) 832 return -ENODEV; 833 834 sb = kvmppc_xive_find_source(xive, irq, &idx); 835 if (!sb) 836 return -EINVAL; 837 state = &sb->irq_state[idx]; 838 839 pr_devel("int_off(irq=0x%x)\n", irq); 840 841 /* 842 * Lock and mask 843 */ 844 state->saved_priority = xive_lock_and_mask(xive, sb, state); 845 arch_spin_unlock(&sb->lock); 846 847 return 0; 848 } 849 850 static bool xive_restore_pending_irq(struct kvmppc_xive *xive, u32 irq) 851 { 852 struct kvmppc_xive_src_block *sb; 853 struct kvmppc_xive_irq_state *state; 854 u16 idx; 855 856 sb = kvmppc_xive_find_source(xive, irq, &idx); 857 if (!sb) 858 return false; 859 state = &sb->irq_state[idx]; 860 if (!state->valid) 861 return false; 862 863 /* 864 * Trigger the IPI. This assumes we never restore a pass-through 865 * interrupt which should be safe enough 866 */ 867 xive_irq_trigger(&state->ipi_data); 868 869 return true; 870 } 871 872 u64 kvmppc_xive_get_icp(struct kvm_vcpu *vcpu) 873 { 874 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; 875 876 if (!xc) 877 return 0; 878 879 /* Return the per-cpu state for state saving/migration */ 880 return (u64)xc->cppr << KVM_REG_PPC_ICP_CPPR_SHIFT | 881 (u64)xc->mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT | 882 (u64)0xff << KVM_REG_PPC_ICP_PPRI_SHIFT; 883 } 884 885 int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval) 886 { 887 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; 888 struct kvmppc_xive *xive = vcpu->kvm->arch.xive; 889 u8 cppr, mfrr; 890 u32 xisr; 891 892 if (!xc || !xive) 893 return -ENOENT; 894 895 /* Grab individual state fields. 
We don't use pending_pri */ 896 cppr = icpval >> KVM_REG_PPC_ICP_CPPR_SHIFT; 897 xisr = (icpval >> KVM_REG_PPC_ICP_XISR_SHIFT) & 898 KVM_REG_PPC_ICP_XISR_MASK; 899 mfrr = icpval >> KVM_REG_PPC_ICP_MFRR_SHIFT; 900 901 pr_devel("set_icp vcpu %d cppr=0x%x mfrr=0x%x xisr=0x%x\n", 902 xc->server_num, cppr, mfrr, xisr); 903 904 /* 905 * We can't update the state of a "pushed" VCPU, but that 906 * shouldn't happen because the vcpu->mutex makes running a 907 * vcpu mutually exclusive with doing one_reg get/set on it. 908 */ 909 if (WARN_ON(vcpu->arch.xive_pushed)) 910 return -EIO; 911 912 /* Update VCPU HW saved state */ 913 vcpu->arch.xive_saved_state.cppr = cppr; 914 xc->hw_cppr = xc->cppr = cppr; 915 916 /* 917 * Update MFRR state. If it's not 0xff, we mark the VCPU as 918 * having a pending MFRR change, which will re-evaluate the 919 * target. The VCPU will thus potentially get a spurious 920 * interrupt but that's not a big deal. 921 */ 922 xc->mfrr = mfrr; 923 if (mfrr < cppr) 924 xive_irq_trigger(&xc->vp_ipi_data); 925 926 /* 927 * Now saved XIRR is "interesting". It means there's something in 928 * the legacy "1 element" queue... for an IPI we simply ignore it, 929 * as the MFRR restore will handle that. For anything else we need 930 * to force a resend of the source. 931 * However the source may not have been setup yet. If that's the 932 * case, we keep that info and increment a counter in the xive to 933 * tell subsequent xive_set_source() to go look. 934 */ 935 if (xisr > XICS_IPI && !xive_restore_pending_irq(xive, xisr)) { 936 xc->delayed_irq = xisr; 937 xive->delayed_irqs++; 938 pr_devel(" xisr restore delayed\n"); 939 } 940 941 return 0; 942 } 943 944 int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq, 945 unsigned long host_irq) 946 { 947 struct kvmppc_xive *xive = kvm->arch.xive; 948 struct kvmppc_xive_src_block *sb; 949 struct kvmppc_xive_irq_state *state; 950 struct irq_data *host_data = 951 irq_domain_get_irq_data(irq_get_default_host(), host_irq); 952 unsigned int hw_irq = (unsigned int)irqd_to_hwirq(host_data); 953 u16 idx; 954 u8 prio; 955 int rc; 956 957 if (!xive) 958 return -ENODEV; 959 960 pr_debug("%s: GIRQ 0x%lx host IRQ %ld XIVE HW IRQ 0x%x\n", 961 __func__, guest_irq, host_irq, hw_irq); 962 963 sb = kvmppc_xive_find_source(xive, guest_irq, &idx); 964 if (!sb) 965 return -EINVAL; 966 state = &sb->irq_state[idx]; 967 968 /* 969 * Mark the passed-through interrupt as going to a VCPU, 970 * this will prevent further EOIs and similar operations 971 * from the XIVE code. It will also mask the interrupt 972 * to either PQ=10 or 11 state, the latter if the interrupt 973 * is pending. This will allow us to unmask or retrigger it 974 * after routing it to the guest with a simple EOI. 975 * 976 * The "state" argument is a "token", all it needs is to be 977 * non-NULL to switch to passed-through or NULL for the 978 * other way around. We may not yet have an actual VCPU 979 * target here and we don't really care. 980 */ 981 rc = irq_set_vcpu_affinity(host_irq, state); 982 if (rc) { 983 pr_err("Failed to set VCPU affinity for host IRQ %ld\n", host_irq); 984 return rc; 985 } 986 987 /* 988 * Mask and read state of IPI. 
We need to know if its P bit 989 * is set as that means it's potentially already using a 990 * queue entry in the target 991 */ 992 prio = xive_lock_and_mask(xive, sb, state); 993 pr_devel(" old IPI prio %02x P:%d Q:%d\n", prio, 994 state->old_p, state->old_q); 995 996 /* Turn the IPI hard off */ 997 xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01); 998 999 /* 1000 * Reset ESB guest mapping. Needed when ESB pages are exposed 1001 * to the guest in XIVE native mode 1002 */ 1003 if (xive->ops && xive->ops->reset_mapped) 1004 xive->ops->reset_mapped(kvm, guest_irq); 1005 1006 /* Grab info about irq */ 1007 state->pt_number = hw_irq; 1008 state->pt_data = irq_data_get_irq_handler_data(host_data); 1009 1010 /* 1011 * Configure the IRQ to match the existing configuration of 1012 * the IPI if it was already targetted. Otherwise this will 1013 * mask the interrupt in a lossy way (act_priority is 0xff) 1014 * which is fine for a never started interrupt. 1015 */ 1016 xive_native_configure_irq(hw_irq, 1017 kvmppc_xive_vp(xive, state->act_server), 1018 state->act_priority, state->number); 1019 1020 /* 1021 * We do an EOI to enable the interrupt (and retrigger if needed) 1022 * if the guest has the interrupt unmasked and the P bit was *not* 1023 * set in the IPI. If it was set, we know a slot may still be in 1024 * use in the target queue thus we have to wait for a guest 1025 * originated EOI 1026 */ 1027 if (prio != MASKED && !state->old_p) 1028 xive_vm_source_eoi(hw_irq, state->pt_data); 1029 1030 /* Clear old_p/old_q as they are no longer relevant */ 1031 state->old_p = state->old_q = false; 1032 1033 /* Restore guest prio (unlocks EOI) */ 1034 mb(); 1035 state->guest_priority = prio; 1036 arch_spin_unlock(&sb->lock); 1037 1038 return 0; 1039 } 1040 EXPORT_SYMBOL_GPL(kvmppc_xive_set_mapped); 1041 1042 int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq, 1043 unsigned long host_irq) 1044 { 1045 struct kvmppc_xive *xive = kvm->arch.xive; 1046 struct kvmppc_xive_src_block *sb; 1047 struct kvmppc_xive_irq_state *state; 1048 u16 idx; 1049 u8 prio; 1050 int rc; 1051 1052 if (!xive) 1053 return -ENODEV; 1054 1055 pr_debug("%s: GIRQ 0x%lx host IRQ %ld\n", __func__, guest_irq, host_irq); 1056 1057 sb = kvmppc_xive_find_source(xive, guest_irq, &idx); 1058 if (!sb) 1059 return -EINVAL; 1060 state = &sb->irq_state[idx]; 1061 1062 /* 1063 * Mask and read state of IRQ. We need to know if its P bit 1064 * is set as that means it's potentially already using a 1065 * queue entry in the target 1066 */ 1067 prio = xive_lock_and_mask(xive, sb, state); 1068 pr_devel(" old IRQ prio %02x P:%d Q:%d\n", prio, 1069 state->old_p, state->old_q); 1070 1071 /* 1072 * If old_p is set, the interrupt is pending, we switch it to 1073 * PQ=11. This will force a resend in the host so the interrupt 1074 * isn't lost to whatver host driver may pick it up 1075 */ 1076 if (state->old_p) 1077 xive_vm_esb_load(state->pt_data, XIVE_ESB_SET_PQ_11); 1078 1079 /* Release the passed-through interrupt to the host */ 1080 rc = irq_set_vcpu_affinity(host_irq, NULL); 1081 if (rc) { 1082 pr_err("Failed to clr VCPU affinity for host IRQ %ld\n", host_irq); 1083 return rc; 1084 } 1085 1086 /* Forget about the IRQ */ 1087 state->pt_number = 0; 1088 state->pt_data = NULL; 1089 1090 /* 1091 * Reset ESB guest mapping. 
Needed when ESB pages are exposed 1092 * to the guest in XIVE native mode 1093 */ 1094 if (xive->ops && xive->ops->reset_mapped) { 1095 xive->ops->reset_mapped(kvm, guest_irq); 1096 } 1097 1098 /* Reconfigure the IPI */ 1099 xive_native_configure_irq(state->ipi_number, 1100 kvmppc_xive_vp(xive, state->act_server), 1101 state->act_priority, state->number); 1102 1103 /* 1104 * If old_p is set (we have a queue entry potentially 1105 * occupied) or the interrupt is masked, we set the IPI 1106 * to PQ=10 state. Otherwise we just re-enable it (PQ=00). 1107 */ 1108 if (prio == MASKED || state->old_p) 1109 xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_10); 1110 else 1111 xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_00); 1112 1113 /* Restore guest prio (unlocks EOI) */ 1114 mb(); 1115 state->guest_priority = prio; 1116 arch_spin_unlock(&sb->lock); 1117 1118 return 0; 1119 } 1120 EXPORT_SYMBOL_GPL(kvmppc_xive_clr_mapped); 1121 1122 void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu) 1123 { 1124 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; 1125 struct kvm *kvm = vcpu->kvm; 1126 struct kvmppc_xive *xive = kvm->arch.xive; 1127 int i, j; 1128 1129 for (i = 0; i <= xive->max_sbid; i++) { 1130 struct kvmppc_xive_src_block *sb = xive->src_blocks[i]; 1131 1132 if (!sb) 1133 continue; 1134 for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++) { 1135 struct kvmppc_xive_irq_state *state = &sb->irq_state[j]; 1136 1137 if (!state->valid) 1138 continue; 1139 if (state->act_priority == MASKED) 1140 continue; 1141 if (state->act_server != xc->server_num) 1142 continue; 1143 1144 /* Clean it up */ 1145 arch_spin_lock(&sb->lock); 1146 state->act_priority = MASKED; 1147 xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01); 1148 xive_native_configure_irq(state->ipi_number, 0, MASKED, 0); 1149 if (state->pt_number) { 1150 xive_vm_esb_load(state->pt_data, XIVE_ESB_SET_PQ_01); 1151 xive_native_configure_irq(state->pt_number, 0, MASKED, 0); 1152 } 1153 arch_spin_unlock(&sb->lock); 1154 } 1155 } 1156 1157 /* Disable vcpu's escalation interrupt */ 1158 if (vcpu->arch.xive_esc_on) { 1159 __raw_readq((void __iomem *)(vcpu->arch.xive_esc_vaddr + 1160 XIVE_ESB_SET_PQ_01)); 1161 vcpu->arch.xive_esc_on = false; 1162 } 1163 1164 /* 1165 * Clear pointers to escalation interrupt ESB. 1166 * This is safe because the vcpu->mutex is held, preventing 1167 * any other CPU from concurrently executing a KVM_RUN ioctl. 1168 */ 1169 vcpu->arch.xive_esc_vaddr = 0; 1170 vcpu->arch.xive_esc_raddr = 0; 1171 } 1172 1173 /* 1174 * In single escalation mode, the escalation interrupt is marked so 1175 * that EOI doesn't re-enable it, but just sets the stale_p flag to 1176 * indicate that the P bit has already been dealt with. However, the 1177 * assembly code that enters the guest sets PQ to 00 without clearing 1178 * stale_p (because it has no easy way to address it). Hence we have 1179 * to adjust stale_p before shutting down the interrupt. 1180 */ 1181 void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu, 1182 struct kvmppc_xive_vcpu *xc, int irq) 1183 { 1184 struct irq_data *d = irq_get_irq_data(irq); 1185 struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); 1186 1187 /* 1188 * This slightly odd sequence gives the right result 1189 * (i.e. stale_p set if xive_esc_on is false) even if 1190 * we race with xive_esc_irq() and xive_irq_eoi(). 
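	 * xive_esc_irq() clears xive_esc_on and then issues smp_wmb(); the
	 * smp_mb() below pairs with it before we test the flag.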
	 */
	xd->stale_p = false;
	smp_mb();		/* paired with smp_wmb in xive_esc_irq */
	if (!vcpu->arch.xive_esc_on)
		xd->stale_p = true;
}

void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
{
	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
	struct kvmppc_xive *xive = vcpu->kvm->arch.xive;
	int i;

	if (!kvmppc_xics_enabled(vcpu))
		return;

	if (!xc)
		return;

	pr_devel("cleanup_vcpu(cpu=%d)\n", xc->server_num);

	/* Ensure no interrupt is still routed to that VP */
	xc->valid = false;
	kvmppc_xive_disable_vcpu_interrupts(vcpu);

	/* Mask the VP IPI */
	xive_vm_esb_load(&xc->vp_ipi_data, XIVE_ESB_SET_PQ_01);

	/* Free escalations */
	for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
		if (xc->esc_virq[i]) {
			if (kvmppc_xive_has_single_escalation(xc->xive))
				xive_cleanup_single_escalation(vcpu, xc,
							       xc->esc_virq[i]);
			free_irq(xc->esc_virq[i], vcpu);
			irq_dispose_mapping(xc->esc_virq[i]);
			kfree(xc->esc_virq_names[i]);
		}
	}

	/* Disable the VP */
	xive_native_disable_vp(xc->vp_id);

	/* Clear the cam word so guest entry won't try to push context */
	vcpu->arch.xive_cam_word = 0;

	/* Free the queues */
	for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
		struct xive_q *q = &xc->queues[i];

		xive_native_disable_queue(xc->vp_id, q, i);
		if (q->qpage) {
			free_pages((unsigned long)q->qpage,
				   xive->q_page_order);
			q->qpage = NULL;
		}
	}

	/* Free the IPI */
	if (xc->vp_ipi) {
		xive_cleanup_irq_data(&xc->vp_ipi_data);
		xive_native_free_irq(xc->vp_ipi);
	}
	/* Free the VP */
	kfree(xc);

	/* Cleanup the vcpu */
	vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT;
	vcpu->arch.xive_vcpu = NULL;
}

static bool kvmppc_xive_vcpu_id_valid(struct kvmppc_xive *xive, u32 cpu)
{
	/* We have a block of xive->nr_servers VPs. We just need to check
	 * packed vCPU ids are below that.
1266 */ 1267 return kvmppc_pack_vcpu_id(xive->kvm, cpu) < xive->nr_servers; 1268 } 1269 1270 int kvmppc_xive_compute_vp_id(struct kvmppc_xive *xive, u32 cpu, u32 *vp) 1271 { 1272 u32 vp_id; 1273 1274 if (!kvmppc_xive_vcpu_id_valid(xive, cpu)) { 1275 pr_devel("Out of bounds !\n"); 1276 return -EINVAL; 1277 } 1278 1279 if (xive->vp_base == XIVE_INVALID_VP) { 1280 xive->vp_base = xive_native_alloc_vp_block(xive->nr_servers); 1281 pr_devel("VP_Base=%x nr_servers=%d\n", xive->vp_base, xive->nr_servers); 1282 1283 if (xive->vp_base == XIVE_INVALID_VP) 1284 return -ENOSPC; 1285 } 1286 1287 vp_id = kvmppc_xive_vp(xive, cpu); 1288 if (kvmppc_xive_vp_in_use(xive->kvm, vp_id)) { 1289 pr_devel("Duplicate !\n"); 1290 return -EEXIST; 1291 } 1292 1293 *vp = vp_id; 1294 1295 return 0; 1296 } 1297 1298 int kvmppc_xive_connect_vcpu(struct kvm_device *dev, 1299 struct kvm_vcpu *vcpu, u32 cpu) 1300 { 1301 struct kvmppc_xive *xive = dev->private; 1302 struct kvmppc_xive_vcpu *xc; 1303 int i, r = -EBUSY; 1304 u32 vp_id; 1305 1306 pr_devel("connect_vcpu(cpu=%d)\n", cpu); 1307 1308 if (dev->ops != &kvm_xive_ops) { 1309 pr_devel("Wrong ops !\n"); 1310 return -EPERM; 1311 } 1312 if (xive->kvm != vcpu->kvm) 1313 return -EPERM; 1314 if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT) 1315 return -EBUSY; 1316 1317 /* We need to synchronize with queue provisioning */ 1318 mutex_lock(&xive->lock); 1319 1320 r = kvmppc_xive_compute_vp_id(xive, cpu, &vp_id); 1321 if (r) 1322 goto bail; 1323 1324 xc = kzalloc(sizeof(*xc), GFP_KERNEL); 1325 if (!xc) { 1326 r = -ENOMEM; 1327 goto bail; 1328 } 1329 1330 vcpu->arch.xive_vcpu = xc; 1331 xc->xive = xive; 1332 xc->vcpu = vcpu; 1333 xc->server_num = cpu; 1334 xc->vp_id = vp_id; 1335 xc->mfrr = 0xff; 1336 xc->valid = true; 1337 1338 r = xive_native_get_vp_info(xc->vp_id, &xc->vp_cam, &xc->vp_chip_id); 1339 if (r) 1340 goto bail; 1341 1342 if (!kvmppc_xive_check_save_restore(vcpu)) { 1343 pr_err("inconsistent save-restore setup for VCPU %d\n", cpu); 1344 r = -EIO; 1345 goto bail; 1346 } 1347 1348 /* Configure VCPU fields for use by assembly push/pull */ 1349 vcpu->arch.xive_saved_state.w01 = cpu_to_be64(0xff000000); 1350 vcpu->arch.xive_cam_word = cpu_to_be32(xc->vp_cam | TM_QW1W2_VO); 1351 1352 /* Allocate IPI */ 1353 xc->vp_ipi = xive_native_alloc_irq(); 1354 if (!xc->vp_ipi) { 1355 pr_err("Failed to allocate xive irq for VCPU IPI\n"); 1356 r = -EIO; 1357 goto bail; 1358 } 1359 pr_devel(" IPI=0x%x\n", xc->vp_ipi); 1360 1361 r = xive_native_populate_irq_data(xc->vp_ipi, &xc->vp_ipi_data); 1362 if (r) 1363 goto bail; 1364 1365 /* 1366 * Enable the VP first as the single escalation mode will 1367 * affect escalation interrupts numbering 1368 */ 1369 r = xive_native_enable_vp(xc->vp_id, kvmppc_xive_has_single_escalation(xive)); 1370 if (r) { 1371 pr_err("Failed to enable VP in OPAL, err %d\n", r); 1372 goto bail; 1373 } 1374 1375 /* 1376 * Initialize queues. Initially we set them all for no queueing 1377 * and we enable escalation for queue 0 only which we'll use for 1378 * our mfrr change notifications. If the VCPU is hot-plugged, we 1379 * do handle provisioning however based on the existing "map" 1380 * of enabled queues. 1381 */ 1382 for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { 1383 struct xive_q *q = &xc->queues[i]; 1384 1385 /* Single escalation, no queue 7 */ 1386 if (i == 7 && kvmppc_xive_has_single_escalation(xive)) 1387 break; 1388 1389 /* Is queue already enabled ? 
Provision it */ 1390 if (xive->qmap & (1 << i)) { 1391 r = xive_provision_queue(vcpu, i); 1392 if (r == 0 && !kvmppc_xive_has_single_escalation(xive)) 1393 kvmppc_xive_attach_escalation( 1394 vcpu, i, kvmppc_xive_has_single_escalation(xive)); 1395 if (r) 1396 goto bail; 1397 } else { 1398 r = xive_native_configure_queue(xc->vp_id, 1399 q, i, NULL, 0, true); 1400 if (r) { 1401 pr_err("Failed to configure queue %d for VCPU %d\n", 1402 i, cpu); 1403 goto bail; 1404 } 1405 } 1406 } 1407 1408 /* If not done above, attach priority 0 escalation */ 1409 r = kvmppc_xive_attach_escalation(vcpu, 0, kvmppc_xive_has_single_escalation(xive)); 1410 if (r) 1411 goto bail; 1412 1413 /* Route the IPI */ 1414 r = xive_native_configure_irq(xc->vp_ipi, xc->vp_id, 0, XICS_IPI); 1415 if (!r) 1416 xive_vm_esb_load(&xc->vp_ipi_data, XIVE_ESB_SET_PQ_00); 1417 1418 bail: 1419 mutex_unlock(&xive->lock); 1420 if (r) { 1421 kvmppc_xive_cleanup_vcpu(vcpu); 1422 return r; 1423 } 1424 1425 vcpu->arch.irq_type = KVMPPC_IRQ_XICS; 1426 return 0; 1427 } 1428 1429 /* 1430 * Scanning of queues before/after migration save 1431 */ 1432 static void xive_pre_save_set_queued(struct kvmppc_xive *xive, u32 irq) 1433 { 1434 struct kvmppc_xive_src_block *sb; 1435 struct kvmppc_xive_irq_state *state; 1436 u16 idx; 1437 1438 sb = kvmppc_xive_find_source(xive, irq, &idx); 1439 if (!sb) 1440 return; 1441 1442 state = &sb->irq_state[idx]; 1443 1444 /* Some sanity checking */ 1445 if (!state->valid) { 1446 pr_err("invalid irq 0x%x in cpu queue!\n", irq); 1447 return; 1448 } 1449 1450 /* 1451 * If the interrupt is in a queue it should have P set. 1452 * We warn so that gets reported. A backtrace isn't useful 1453 * so no need to use a WARN_ON. 1454 */ 1455 if (!state->saved_p) 1456 pr_err("Interrupt 0x%x is marked in a queue but P not set !\n", irq); 1457 1458 /* Set flag */ 1459 state->in_queue = true; 1460 } 1461 1462 static void xive_pre_save_mask_irq(struct kvmppc_xive *xive, 1463 struct kvmppc_xive_src_block *sb, 1464 u32 irq) 1465 { 1466 struct kvmppc_xive_irq_state *state = &sb->irq_state[irq]; 1467 1468 if (!state->valid) 1469 return; 1470 1471 /* Mask and save state, this will also sync HW queues */ 1472 state->saved_scan_prio = xive_lock_and_mask(xive, sb, state); 1473 1474 /* Transfer P and Q */ 1475 state->saved_p = state->old_p; 1476 state->saved_q = state->old_q; 1477 1478 /* Unlock */ 1479 arch_spin_unlock(&sb->lock); 1480 } 1481 1482 static void xive_pre_save_unmask_irq(struct kvmppc_xive *xive, 1483 struct kvmppc_xive_src_block *sb, 1484 u32 irq) 1485 { 1486 struct kvmppc_xive_irq_state *state = &sb->irq_state[irq]; 1487 1488 if (!state->valid) 1489 return; 1490 1491 /* 1492 * Lock / exclude EOI (not technically necessary if the 1493 * guest isn't running concurrently. If this becomes a 1494 * performance issue we can probably remove the lock. 
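	 * xive_lock_for_unmask() simply spins until any concurrent H_EOI on
	 * this source has completed.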
1495 */ 1496 xive_lock_for_unmask(sb, state); 1497 1498 /* Restore mask/prio if it wasn't masked */ 1499 if (state->saved_scan_prio != MASKED) 1500 xive_finish_unmask(xive, sb, state, state->saved_scan_prio); 1501 1502 /* Unlock */ 1503 arch_spin_unlock(&sb->lock); 1504 } 1505 1506 static void xive_pre_save_queue(struct kvmppc_xive *xive, struct xive_q *q) 1507 { 1508 u32 idx = q->idx; 1509 u32 toggle = q->toggle; 1510 u32 irq; 1511 1512 do { 1513 irq = __xive_read_eq(q->qpage, q->msk, &idx, &toggle); 1514 if (irq > XICS_IPI) 1515 xive_pre_save_set_queued(xive, irq); 1516 } while(irq); 1517 } 1518 1519 static void xive_pre_save_scan(struct kvmppc_xive *xive) 1520 { 1521 struct kvm_vcpu *vcpu = NULL; 1522 int i, j; 1523 1524 /* 1525 * See comment in xive_get_source() about how this 1526 * work. Collect a stable state for all interrupts 1527 */ 1528 for (i = 0; i <= xive->max_sbid; i++) { 1529 struct kvmppc_xive_src_block *sb = xive->src_blocks[i]; 1530 if (!sb) 1531 continue; 1532 for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++) 1533 xive_pre_save_mask_irq(xive, sb, j); 1534 } 1535 1536 /* Then scan the queues and update the "in_queue" flag */ 1537 kvm_for_each_vcpu(i, vcpu, xive->kvm) { 1538 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; 1539 if (!xc) 1540 continue; 1541 for (j = 0; j < KVMPPC_XIVE_Q_COUNT; j++) { 1542 if (xc->queues[j].qpage) 1543 xive_pre_save_queue(xive, &xc->queues[j]); 1544 } 1545 } 1546 1547 /* Finally restore interrupt states */ 1548 for (i = 0; i <= xive->max_sbid; i++) { 1549 struct kvmppc_xive_src_block *sb = xive->src_blocks[i]; 1550 if (!sb) 1551 continue; 1552 for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++) 1553 xive_pre_save_unmask_irq(xive, sb, j); 1554 } 1555 } 1556 1557 static void xive_post_save_scan(struct kvmppc_xive *xive) 1558 { 1559 u32 i, j; 1560 1561 /* Clear all the in_queue flags */ 1562 for (i = 0; i <= xive->max_sbid; i++) { 1563 struct kvmppc_xive_src_block *sb = xive->src_blocks[i]; 1564 if (!sb) 1565 continue; 1566 for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++) 1567 sb->irq_state[j].in_queue = false; 1568 } 1569 1570 /* Next get_source() will do a new scan */ 1571 xive->saved_src_count = 0; 1572 } 1573 1574 /* 1575 * This returns the source configuration and state to user space. 1576 */ 1577 static int xive_get_source(struct kvmppc_xive *xive, long irq, u64 addr) 1578 { 1579 struct kvmppc_xive_src_block *sb; 1580 struct kvmppc_xive_irq_state *state; 1581 u64 __user *ubufp = (u64 __user *) addr; 1582 u64 val, prio; 1583 u16 idx; 1584 1585 sb = kvmppc_xive_find_source(xive, irq, &idx); 1586 if (!sb) 1587 return -ENOENT; 1588 1589 state = &sb->irq_state[idx]; 1590 1591 if (!state->valid) 1592 return -ENOENT; 1593 1594 pr_devel("get_source(%ld)...\n", irq); 1595 1596 /* 1597 * So to properly save the state into something that looks like a 1598 * XICS migration stream we cannot treat interrupts individually. 1599 * 1600 * We need, instead, mask them all (& save their previous PQ state) 1601 * to get a stable state in the HW, then sync them to ensure that 1602 * any interrupt that had already fired hits its queue, and finally 1603 * scan all the queues to collect which interrupts are still present 1604 * in the queues, so we can set the "pending" flag on them and 1605 * they can be resent on restore. 1606 * 1607 * So we do it all when the "first" interrupt gets saved, all the 1608 * state is collected at that point, the rest of xive_get_source() 1609 * will merely collect and convert that state to the expected 1610 * userspace bit mask. 
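	 * xive_pre_save_scan() implements the mask-and-scan pass and
	 * xive_post_save_scan() clears the in_queue flags again once the
	 * last source has been read.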
1611 */ 1612 if (xive->saved_src_count == 0) 1613 xive_pre_save_scan(xive); 1614 xive->saved_src_count++; 1615 1616 /* Convert saved state into something compatible with xics */ 1617 val = state->act_server; 1618 prio = state->saved_scan_prio; 1619 1620 if (prio == MASKED) { 1621 val |= KVM_XICS_MASKED; 1622 prio = state->saved_priority; 1623 } 1624 val |= prio << KVM_XICS_PRIORITY_SHIFT; 1625 if (state->lsi) { 1626 val |= KVM_XICS_LEVEL_SENSITIVE; 1627 if (state->saved_p) 1628 val |= KVM_XICS_PENDING; 1629 } else { 1630 if (state->saved_p) 1631 val |= KVM_XICS_PRESENTED; 1632 1633 if (state->saved_q) 1634 val |= KVM_XICS_QUEUED; 1635 1636 /* 1637 * We mark it pending (which will attempt a re-delivery) 1638 * if we are in a queue *or* we were masked and had 1639 * Q set which is equivalent to the XICS "masked pending" 1640 * state 1641 */ 1642 if (state->in_queue || (prio == MASKED && state->saved_q)) 1643 val |= KVM_XICS_PENDING; 1644 } 1645 1646 /* 1647 * If that was the last interrupt saved, reset the 1648 * in_queue flags 1649 */ 1650 if (xive->saved_src_count == xive->src_count) 1651 xive_post_save_scan(xive); 1652 1653 /* Copy the result to userspace */ 1654 if (put_user(val, ubufp)) 1655 return -EFAULT; 1656 1657 return 0; 1658 } 1659 1660 struct kvmppc_xive_src_block *kvmppc_xive_create_src_block( 1661 struct kvmppc_xive *xive, int irq) 1662 { 1663 struct kvmppc_xive_src_block *sb; 1664 int i, bid; 1665 1666 bid = irq >> KVMPPC_XICS_ICS_SHIFT; 1667 1668 mutex_lock(&xive->lock); 1669 1670 /* block already exists - somebody else got here first */ 1671 if (xive->src_blocks[bid]) 1672 goto out; 1673 1674 /* Create the ICS */ 1675 sb = kzalloc(sizeof(*sb), GFP_KERNEL); 1676 if (!sb) 1677 goto out; 1678 1679 sb->id = bid; 1680 1681 for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { 1682 sb->irq_state[i].number = (bid << KVMPPC_XICS_ICS_SHIFT) | i; 1683 sb->irq_state[i].eisn = 0; 1684 sb->irq_state[i].guest_priority = MASKED; 1685 sb->irq_state[i].saved_priority = MASKED; 1686 sb->irq_state[i].act_priority = MASKED; 1687 } 1688 smp_wmb(); 1689 xive->src_blocks[bid] = sb; 1690 1691 if (bid > xive->max_sbid) 1692 xive->max_sbid = bid; 1693 1694 out: 1695 mutex_unlock(&xive->lock); 1696 return xive->src_blocks[bid]; 1697 } 1698 1699 static bool xive_check_delayed_irq(struct kvmppc_xive *xive, u32 irq) 1700 { 1701 struct kvm *kvm = xive->kvm; 1702 struct kvm_vcpu *vcpu = NULL; 1703 int i; 1704 1705 kvm_for_each_vcpu(i, vcpu, kvm) { 1706 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; 1707 1708 if (!xc) 1709 continue; 1710 1711 if (xc->delayed_irq == irq) { 1712 xc->delayed_irq = 0; 1713 xive->delayed_irqs--; 1714 return true; 1715 } 1716 } 1717 return false; 1718 } 1719 1720 static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr) 1721 { 1722 struct kvmppc_xive_src_block *sb; 1723 struct kvmppc_xive_irq_state *state; 1724 u64 __user *ubufp = (u64 __user *) addr; 1725 u16 idx; 1726 u64 val; 1727 u8 act_prio, guest_prio; 1728 u32 server; 1729 int rc = 0; 1730 1731 if (irq < KVMPPC_XICS_FIRST_IRQ || irq >= KVMPPC_XICS_NR_IRQS) 1732 return -ENOENT; 1733 1734 pr_devel("set_source(irq=0x%lx)\n", irq); 1735 1736 /* Find the source */ 1737 sb = kvmppc_xive_find_source(xive, irq, &idx); 1738 if (!sb) { 1739 pr_devel("No source, creating source block...\n"); 1740 sb = kvmppc_xive_create_src_block(xive, irq); 1741 if (!sb) { 1742 pr_devel("Failed to create block...\n"); 1743 return -ENOMEM; 1744 } 1745 } 1746 state = &sb->irq_state[idx]; 1747 1748 /* Read user passed data */ 1749 if 
(get_user(val, ubufp)) {
		pr_devel("fault getting user info !\n");
		return -EFAULT;
	}

	server = val & KVM_XICS_DESTINATION_MASK;
	guest_prio = val >> KVM_XICS_PRIORITY_SHIFT;

	pr_devel(" val=0x%016llx (server=0x%x, guest_prio=%d)\n",
		 val, server, guest_prio);

	/*
	 * If the source doesn't already have an IPI, allocate
	 * one and get the corresponding data
	 */
	if (!state->ipi_number) {
		state->ipi_number = xive_native_alloc_irq();
		if (state->ipi_number == 0) {
			pr_devel("Failed to allocate IPI !\n");
			return -ENOMEM;
		}
		xive_native_populate_irq_data(state->ipi_number, &state->ipi_data);
		pr_devel(" src_ipi=0x%x\n", state->ipi_number);
	}

	/*
	 * We use lock_and_mask() to set us in the right masked
	 * state. We will override that state from the saved state
	 * further down, but this will handle the cases of interrupts
	 * that need FW masking. We set the initial guest_priority to
	 * 0 before calling it to ensure it actually performs the masking.
	 */
	state->guest_priority = 0;
	xive_lock_and_mask(xive, sb, state);

	/*
	 * Now, we select a target if we have one. If we don't we
	 * leave the interrupt untargetted. It means that an interrupt
	 * can become "untargetted" across migration if it was masked
	 * by set_xive() but there is little we can do about it.
	 */

	/* First convert prio and mark interrupt as untargetted */
	act_prio = xive_prio_from_guest(guest_prio);
	state->act_priority = MASKED;

	/*
	 * We need to drop the lock due to the mutex below. Hopefully
	 * nothing is touching that interrupt yet since it hasn't been
	 * advertised to a running guest yet
	 */
	arch_spin_unlock(&sb->lock);

	/* If we have a priority target the interrupt */
	if (act_prio != MASKED) {
		/* First, check provisioning of queues */
		mutex_lock(&xive->lock);
		rc = xive_check_provisioning(xive->kvm, act_prio);
		mutex_unlock(&xive->lock);

		/* Target interrupt */
		if (rc == 0)
			rc = xive_target_interrupt(xive->kvm, state,
						   server, act_prio);
		/*
		 * If provisioning or targetting failed, leave it
		 * alone and masked. It will remain disabled until
		 * the guest re-targets it.
		 */
	}

	/*
	 * Find out if this was a delayed irq stashed in an ICP,
	 * in which case, treat it as pending
	 */
	if (xive->delayed_irqs && xive_check_delayed_irq(xive, irq)) {
		val |= KVM_XICS_PENDING;
		pr_devel(" Found delayed ! forcing PENDING !\n");
	}

	/* Cleanup the SW state */
	state->old_p = false;
	state->old_q = false;
	state->lsi = false;
	state->asserted = false;

	/* Restore LSI state */
	if (val & KVM_XICS_LEVEL_SENSITIVE) {
		state->lsi = true;
		if (val & KVM_XICS_PENDING)
			state->asserted = true;
		pr_devel(" LSI ! Asserted=%d\n", state->asserted);
	}

	/*
	 * Restore P and Q. If the interrupt was pending, we
	 * force Q and !P, which will trigger a resend.
	 *
	 * That means that a guest that had both an interrupt
	 * pending (queued) and Q set will restore with only
	 * one instance of that interrupt instead of 2, but that
	 * is perfectly fine as coalescing interrupts that haven't
	 * been presented yet is always allowed.
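	 * Below, KVM_XICS_PRESENTED restores P (unless KVM_XICS_PENDING
	 * forces !P) and KVM_XICS_QUEUED or KVM_XICS_PENDING restores Q.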
1852 */ 1853 if (val & KVM_XICS_PRESENTED && !(val & KVM_XICS_PENDING)) 1854 state->old_p = true; 1855 if (val & KVM_XICS_QUEUED || val & KVM_XICS_PENDING) 1856 state->old_q = true; 1857 1858 pr_devel(" P=%d, Q=%d\n", state->old_p, state->old_q); 1859 1860 /* 1861 * If the interrupt was unmasked, update guest priority and 1862 * perform the appropriate state transition and do a 1863 * re-trigger if necessary. 1864 */ 1865 if (val & KVM_XICS_MASKED) { 1866 pr_devel(" masked, saving prio\n"); 1867 state->guest_priority = MASKED; 1868 state->saved_priority = guest_prio; 1869 } else { 1870 pr_devel(" unmasked, restoring to prio %d\n", guest_prio); 1871 xive_finish_unmask(xive, sb, state, guest_prio); 1872 state->saved_priority = guest_prio; 1873 } 1874 1875 /* Increment the number of valid sources and mark this one valid */ 1876 if (!state->valid) 1877 xive->src_count++; 1878 state->valid = true; 1879 1880 return 0; 1881 } 1882 1883 int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, 1884 bool line_status) 1885 { 1886 struct kvmppc_xive *xive = kvm->arch.xive; 1887 struct kvmppc_xive_src_block *sb; 1888 struct kvmppc_xive_irq_state *state; 1889 u16 idx; 1890 1891 if (!xive) 1892 return -ENODEV; 1893 1894 sb = kvmppc_xive_find_source(xive, irq, &idx); 1895 if (!sb) 1896 return -EINVAL; 1897 1898 /* Perform locklessly .... (we need to do some RCUisms here...) */ 1899 state = &sb->irq_state[idx]; 1900 if (!state->valid) 1901 return -EINVAL; 1902 1903 /* We don't allow a trigger on a passed-through interrupt */ 1904 if (state->pt_number) 1905 return -EINVAL; 1906 1907 if ((level == 1 && state->lsi) || level == KVM_INTERRUPT_SET_LEVEL) 1908 state->asserted = true; 1909 else if (level == 0 || level == KVM_INTERRUPT_UNSET) { 1910 state->asserted = false; 1911 return 0; 1912 } 1913 1914 /* Trigger the IPI */ 1915 xive_irq_trigger(&state->ipi_data); 1916 1917 return 0; 1918 } 1919 1920 int kvmppc_xive_set_nr_servers(struct kvmppc_xive *xive, u64 addr) 1921 { 1922 u32 __user *ubufp = (u32 __user *) addr; 1923 u32 nr_servers; 1924 int rc = 0; 1925 1926 if (get_user(nr_servers, ubufp)) 1927 return -EFAULT; 1928 1929 pr_devel("%s nr_servers=%u\n", __func__, nr_servers); 1930 1931 if (!nr_servers || nr_servers > KVM_MAX_VCPU_IDS) 1932 return -EINVAL; 1933 1934 mutex_lock(&xive->lock); 1935 if (xive->vp_base != XIVE_INVALID_VP) 1936 /* The VP block is allocated once and freed when the device 1937 * is released. Better not allow to change its size since its 1938 * used by connect_vcpu to validate vCPU ids are valid (eg, 1939 * setting it back to a higher value could allow connect_vcpu 1940 * to come up with a VP id that goes beyond the VP block, which 1941 * is likely to cause a crash in OPAL). 1942 */ 1943 rc = -EBUSY; 1944 else if (nr_servers > KVM_MAX_VCPUS) 1945 /* We don't need more servers. Higher vCPU ids get packed 1946 * down below KVM_MAX_VCPUS by kvmppc_pack_vcpu_id(). 
1947 */ 1948 xive->nr_servers = KVM_MAX_VCPUS; 1949 else 1950 xive->nr_servers = nr_servers; 1951 1952 mutex_unlock(&xive->lock); 1953 1954 return rc; 1955 } 1956 1957 static int xive_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) 1958 { 1959 struct kvmppc_xive *xive = dev->private; 1960 1961 /* We honor the existing XICS ioctl */ 1962 switch (attr->group) { 1963 case KVM_DEV_XICS_GRP_SOURCES: 1964 return xive_set_source(xive, attr->attr, attr->addr); 1965 case KVM_DEV_XICS_GRP_CTRL: 1966 switch (attr->attr) { 1967 case KVM_DEV_XICS_NR_SERVERS: 1968 return kvmppc_xive_set_nr_servers(xive, attr->addr); 1969 } 1970 } 1971 return -ENXIO; 1972 } 1973 1974 static int xive_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) 1975 { 1976 struct kvmppc_xive *xive = dev->private; 1977 1978 /* We honor the existing XICS ioctl */ 1979 switch (attr->group) { 1980 case KVM_DEV_XICS_GRP_SOURCES: 1981 return xive_get_source(xive, attr->attr, attr->addr); 1982 } 1983 return -ENXIO; 1984 } 1985 1986 static int xive_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) 1987 { 1988 /* We honor the same limits as XICS, at least for now */ 1989 switch (attr->group) { 1990 case KVM_DEV_XICS_GRP_SOURCES: 1991 if (attr->attr >= KVMPPC_XICS_FIRST_IRQ && 1992 attr->attr < KVMPPC_XICS_NR_IRQS) 1993 return 0; 1994 break; 1995 case KVM_DEV_XICS_GRP_CTRL: 1996 switch (attr->attr) { 1997 case KVM_DEV_XICS_NR_SERVERS: 1998 return 0; 1999 } 2000 } 2001 return -ENXIO; 2002 } 2003 2004 static void kvmppc_xive_cleanup_irq(u32 hw_num, struct xive_irq_data *xd) 2005 { 2006 xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_01); 2007 xive_native_configure_irq(hw_num, 0, MASKED, 0); 2008 } 2009 2010 void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb) 2011 { 2012 int i; 2013 2014 for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { 2015 struct kvmppc_xive_irq_state *state = &sb->irq_state[i]; 2016 2017 if (!state->valid) 2018 continue; 2019 2020 kvmppc_xive_cleanup_irq(state->ipi_number, &state->ipi_data); 2021 xive_cleanup_irq_data(&state->ipi_data); 2022 xive_native_free_irq(state->ipi_number); 2023 2024 /* Pass-through, cleanup too but keep IRQ hw data */ 2025 if (state->pt_number) 2026 kvmppc_xive_cleanup_irq(state->pt_number, state->pt_data); 2027 2028 state->valid = false; 2029 } 2030 } 2031 2032 /* 2033 * Called when device fd is closed. kvm->lock is held. 2034 */ 2035 static void kvmppc_xive_release(struct kvm_device *dev) 2036 { 2037 struct kvmppc_xive *xive = dev->private; 2038 struct kvm *kvm = xive->kvm; 2039 struct kvm_vcpu *vcpu; 2040 int i; 2041 2042 pr_devel("Releasing xive device\n"); 2043 2044 /* 2045 * Since this is the device release function, we know that 2046 * userspace does not have any open fd referring to the 2047 * device. Therefore there can not be any of the device 2048 * attribute set/get functions being executed concurrently, 2049 * and similarly, the connect_vcpu and set/clr_mapped 2050 * functions also cannot be being executed. 2051 */ 2052 2053 debugfs_remove(xive->dentry); 2054 2055 /* 2056 * We should clean up the vCPU interrupt presenters first. 2057 */ 2058 kvm_for_each_vcpu(i, vcpu, kvm) { 2059 /* 2060 * Take vcpu->mutex to ensure that no one_reg get/set ioctl 2061 * (i.e. kvmppc_xive_[gs]et_icp) can be done concurrently. 2062 * Holding the vcpu->mutex also means that the vcpu cannot 2063 * be executing the KVM_RUN ioctl, and therefore it cannot 2064 * be executing the XIVE push or pull code or accessing 2065 * the XIVE MMIO regions. 
2066 */ 2067 mutex_lock(&vcpu->mutex); 2068 kvmppc_xive_cleanup_vcpu(vcpu); 2069 mutex_unlock(&vcpu->mutex); 2070 } 2071 2072 /* 2073 * Now that we have cleared vcpu->arch.xive_vcpu, vcpu->arch.irq_type 2074 * and vcpu->arch.xive_esc_[vr]addr on each vcpu, we are safe 2075 * against xive code getting called during vcpu execution or 2076 * set/get one_reg operations. 2077 */ 2078 kvm->arch.xive = NULL; 2079 2080 /* Mask and free interrupts */ 2081 for (i = 0; i <= xive->max_sbid; i++) { 2082 if (xive->src_blocks[i]) 2083 kvmppc_xive_free_sources(xive->src_blocks[i]); 2084 kfree(xive->src_blocks[i]); 2085 xive->src_blocks[i] = NULL; 2086 } 2087 2088 if (xive->vp_base != XIVE_INVALID_VP) 2089 xive_native_free_vp_block(xive->vp_base); 2090 2091 /* 2092 * A reference of the kvmppc_xive pointer is now kept under 2093 * the xive_devices struct of the machine for reuse. It is 2094 * freed when the VM is destroyed for now until we fix all the 2095 * execution paths. 2096 */ 2097 2098 kfree(dev); 2099 } 2100 2101 /* 2102 * When the guest chooses the interrupt mode (XICS legacy or XIVE 2103 * native), the VM will switch of KVM device. The previous device will 2104 * be "released" before the new one is created. 2105 * 2106 * Until we are sure all execution paths are well protected, provide a 2107 * fail safe (transitional) method for device destruction, in which 2108 * the XIVE device pointer is recycled and not directly freed. 2109 */ 2110 struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type) 2111 { 2112 struct kvmppc_xive **kvm_xive_device = type == KVM_DEV_TYPE_XIVE ? 2113 &kvm->arch.xive_devices.native : 2114 &kvm->arch.xive_devices.xics_on_xive; 2115 struct kvmppc_xive *xive = *kvm_xive_device; 2116 2117 if (!xive) { 2118 xive = kzalloc(sizeof(*xive), GFP_KERNEL); 2119 *kvm_xive_device = xive; 2120 } else { 2121 memset(xive, 0, sizeof(*xive)); 2122 } 2123 2124 return xive; 2125 } 2126 2127 /* 2128 * Create a XICS device with XIVE backend. kvm->lock is held. 2129 */ 2130 static int kvmppc_xive_create(struct kvm_device *dev, u32 type) 2131 { 2132 struct kvmppc_xive *xive; 2133 struct kvm *kvm = dev->kvm; 2134 2135 pr_devel("Creating xive for partition\n"); 2136 2137 /* Already there ? */ 2138 if (kvm->arch.xive) 2139 return -EEXIST; 2140 2141 xive = kvmppc_xive_get_device(kvm, type); 2142 if (!xive) 2143 return -ENOMEM; 2144 2145 dev->private = xive; 2146 xive->dev = dev; 2147 xive->kvm = kvm; 2148 mutex_init(&xive->lock); 2149 2150 /* We use the default queue size set by the host */ 2151 xive->q_order = xive_native_default_eq_shift(); 2152 if (xive->q_order < PAGE_SHIFT) 2153 xive->q_page_order = 0; 2154 else 2155 xive->q_page_order = xive->q_order - PAGE_SHIFT; 2156 2157 /* VP allocation is delayed to the first call to connect_vcpu */ 2158 xive->vp_base = XIVE_INVALID_VP; 2159 /* KVM_MAX_VCPUS limits the number of VMs to roughly 64 per sockets 2160 * on a POWER9 system. 2161 */ 2162 xive->nr_servers = KVM_MAX_VCPUS; 2163 2164 if (xive_native_has_single_escalation()) 2165 xive->flags |= KVMPPC_XIVE_FLAG_SINGLE_ESCALATION; 2166 2167 if (xive_native_has_save_restore()) 2168 xive->flags |= KVMPPC_XIVE_FLAG_SAVE_RESTORE; 2169 2170 kvm->arch.xive = xive; 2171 return 0; 2172 } 2173 2174 int kvmppc_xive_xics_hcall(struct kvm_vcpu *vcpu, u32 req) 2175 { 2176 struct kvmppc_vcore *vc = vcpu->arch.vcore; 2177 2178 /* The VM should have configured XICS mode before doing XICS hcalls. 
int kvmppc_xive_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
{
	struct kvmppc_vcore *vc = vcpu->arch.vcore;

	/* The VM should have configured XICS mode before doing XICS hcalls. */
	if (!kvmppc_xics_enabled(vcpu))
		return H_TOO_HARD;

	switch (req) {
	case H_XIRR:
		return xive_vm_h_xirr(vcpu);
	case H_CPPR:
		return xive_vm_h_cppr(vcpu, kvmppc_get_gpr(vcpu, 4));
	case H_EOI:
		return xive_vm_h_eoi(vcpu, kvmppc_get_gpr(vcpu, 4));
	case H_IPI:
		return xive_vm_h_ipi(vcpu, kvmppc_get_gpr(vcpu, 4),
				     kvmppc_get_gpr(vcpu, 5));
	case H_IPOLL:
		return xive_vm_h_ipoll(vcpu, kvmppc_get_gpr(vcpu, 4));
	case H_XIRR_X:
		xive_vm_h_xirr(vcpu);
		kvmppc_set_gpr(vcpu, 5, get_tb() + vc->tb_offset);
		return H_SUCCESS;
	}

	return H_UNSUPPORTED;
}
EXPORT_SYMBOL_GPL(kvmppc_xive_xics_hcall);

int kvmppc_xive_debug_show_queues(struct seq_file *m, struct kvm_vcpu *vcpu)
{
	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
	unsigned int i;

	for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
		struct xive_q *q = &xc->queues[i];
		u32 i0, i1, idx;

		if (!q->qpage && !xc->esc_virq[i])
			continue;

		if (q->qpage) {
			seq_printf(m, " q[%d]: ", i);
			idx = q->idx;
			i0 = be32_to_cpup(q->qpage + idx);
			idx = (idx + 1) & q->msk;
			i1 = be32_to_cpup(q->qpage + idx);
			seq_printf(m, "T=%d %08x %08x...\n", q->toggle,
				   i0, i1);
		}
		if (xc->esc_virq[i]) {
			struct irq_data *d = irq_get_irq_data(xc->esc_virq[i]);
			struct xive_irq_data *xd =
				irq_data_get_irq_handler_data(d);
			u64 pq = xive_vm_esb_load(xd, XIVE_ESB_GET);

			seq_printf(m, " ESC %d %c%c EOI @%llx",
				   xc->esc_virq[i],
				   (pq & XIVE_ESB_VAL_P) ? 'P' : '-',
				   (pq & XIVE_ESB_VAL_Q) ? 'Q' : '-',
				   xd->eoi_page);
			seq_puts(m, "\n");
		}
	}
	return 0;
}
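
/*
 * Dump the state of every valid source in a source block (type, PQ
 * bits, EISN and current target) to the debugfs file. Callers are
 * expected to hold the source block lock, as xive_debug_show() does.
 */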
void kvmppc_xive_debug_show_sources(struct seq_file *m,
				    struct kvmppc_xive_src_block *sb)
{
	int i;

	seq_puts(m, " LISN HW/CHIP TYPE PQ EISN CPU/PRIO\n");
	for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
		struct kvmppc_xive_irq_state *state = &sb->irq_state[i];
		struct xive_irq_data *xd;
		u64 pq;
		u32 hw_num;

		if (!state->valid)
			continue;

		kvmppc_xive_select_irq(state, &hw_num, &xd);

		pq = xive_vm_esb_load(xd, XIVE_ESB_GET);

		seq_printf(m, "%08x %08x/%02x", state->number, hw_num,
			   xd->src_chip);
		if (state->lsi)
			seq_printf(m, " %cLSI", state->asserted ? '^' : ' ');
		else
			seq_puts(m, " MSI");

		seq_printf(m, " %s %c%c %08x % 4d/%d",
			   state->ipi_number == hw_num ? "IPI" : " PT",
			   pq & XIVE_ESB_VAL_P ? 'P' : '-',
			   pq & XIVE_ESB_VAL_Q ? 'Q' : '-',
			   state->eisn, state->act_server,
			   state->act_priority);

		seq_puts(m, "\n");
	}
}

static int xive_debug_show(struct seq_file *m, void *private)
{
	struct kvmppc_xive *xive = m->private;
	struct kvm *kvm = xive->kvm;
	struct kvm_vcpu *vcpu;
	u64 t_rm_h_xirr = 0;
	u64 t_rm_h_ipoll = 0;
	u64 t_rm_h_cppr = 0;
	u64 t_rm_h_eoi = 0;
	u64 t_rm_h_ipi = 0;
	u64 t_vm_h_xirr = 0;
	u64 t_vm_h_ipoll = 0;
	u64 t_vm_h_cppr = 0;
	u64 t_vm_h_eoi = 0;
	u64 t_vm_h_ipi = 0;
	unsigned int i;

	if (!kvm)
		return 0;

	seq_puts(m, "=========\nVCPU state\n=========\n");

	kvm_for_each_vcpu(i, vcpu, kvm) {
		struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;

		if (!xc)
			continue;

		seq_printf(m, "VCPU %d: VP:%#x/%02x\n"
			   " CPPR:%#x HWCPPR:%#x MFRR:%#x PEND:%#x h_xirr: R=%lld V=%lld\n",
			   xc->server_num, xc->vp_id, xc->vp_chip_id,
			   xc->cppr, xc->hw_cppr,
			   xc->mfrr, xc->pending,
			   xc->stat_rm_h_xirr, xc->stat_vm_h_xirr);

		kvmppc_xive_debug_show_queues(m, vcpu);

		t_rm_h_xirr += xc->stat_rm_h_xirr;
		t_rm_h_ipoll += xc->stat_rm_h_ipoll;
		t_rm_h_cppr += xc->stat_rm_h_cppr;
		t_rm_h_eoi += xc->stat_rm_h_eoi;
		t_rm_h_ipi += xc->stat_rm_h_ipi;
		t_vm_h_xirr += xc->stat_vm_h_xirr;
		t_vm_h_ipoll += xc->stat_vm_h_ipoll;
		t_vm_h_cppr += xc->stat_vm_h_cppr;
		t_vm_h_eoi += xc->stat_vm_h_eoi;
		t_vm_h_ipi += xc->stat_vm_h_ipi;
	}

	seq_puts(m, "Hcalls totals\n");
	seq_printf(m, " H_XIRR R=%10lld V=%10lld\n", t_rm_h_xirr, t_vm_h_xirr);
	seq_printf(m, " H_IPOLL R=%10lld V=%10lld\n", t_rm_h_ipoll, t_vm_h_ipoll);
	seq_printf(m, " H_CPPR R=%10lld V=%10lld\n", t_rm_h_cppr, t_vm_h_cppr);
	seq_printf(m, " H_EOI R=%10lld V=%10lld\n", t_rm_h_eoi, t_vm_h_eoi);
	seq_printf(m, " H_IPI R=%10lld V=%10lld\n", t_rm_h_ipi, t_vm_h_ipi);

	seq_puts(m, "=========\nSources\n=========\n");

	for (i = 0; i <= xive->max_sbid; i++) {
		struct kvmppc_xive_src_block *sb = xive->src_blocks[i];

		if (sb) {
			arch_spin_lock(&sb->lock);
			kvmppc_xive_debug_show_sources(m, sb);
			arch_spin_unlock(&sb->lock);
		}
	}

	return 0;
}

DEFINE_SHOW_ATTRIBUTE(xive_debug);

static void xive_debugfs_init(struct kvmppc_xive *xive)
{
	char *name;

	name = kasprintf(GFP_KERNEL, "kvm-xive-%p", xive);
	if (!name) {
		pr_err("%s: no memory for name\n", __func__);
		return;
	}

	xive->dentry = debugfs_create_file(name, S_IRUGO, arch_debugfs_dir,
					   xive, &xive_debug_fops);

	pr_debug("%s: created %s\n", __func__, name);
	kfree(name);
}

static void kvmppc_xive_init(struct kvm_device *dev)
{
	struct kvmppc_xive *xive = (struct kvmppc_xive *)dev->private;

	/* Register some debug interfaces */
	xive_debugfs_init(xive);
}

struct kvm_device_ops kvm_xive_ops = {
	.name = "kvm-xive",
	.create = kvmppc_xive_create,
	.init = kvmppc_xive_init,
	.release = kvmppc_xive_release,
	.set_attr = xive_set_attr,
	.get_attr = xive_get_attr,
	.has_attr = xive_has_attr,
};
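
/*
 * Illustrative userspace sketch (not compiled here): how a VMM might
 * reach the attribute handlers above through the generic KVM device
 * API, on a host where KVM_DEV_TYPE_XICS is backed by this
 * XICS-on-XIVE device. "vm_fd" and the value of "nr_servers" are
 * placeholders.
 *
 *	struct kvm_create_device cd = { .type = KVM_DEV_TYPE_XICS };
 *	__u32 nr_servers = 8;
 *	struct kvm_device_attr attr = {
 *		.group = KVM_DEV_XICS_GRP_CTRL,
 *		.attr  = KVM_DEV_XICS_NR_SERVERS,
 *		.addr  = (__u64)(uintptr_t)&nr_servers,
 *	};
 *
 *	ioctl(vm_fd, KVM_CREATE_DEVICE, &cd);      // device fd returned in cd.fd
 *	ioctl(cd.fd, KVM_SET_DEVICE_ATTR, &attr);  // handled by xive_set_attr()
 */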