1 // SPDX-License-Identifier: GPL-2.0+ 2 // Copyright 2017 IBM Corp. 3 #include <linux/sched/mm.h> 4 #include <linux/mutex.h> 5 #include <linux/mmu_context.h> 6 #include <asm/copro.h> 7 #include <asm/pnv-ocxl.h> 8 #include <misc/ocxl.h> 9 #include "ocxl_internal.h" 10 #include "trace.h" 11 12 13 #define SPA_PASID_BITS 15 14 #define SPA_PASID_MAX ((1 << SPA_PASID_BITS) - 1) 15 #define SPA_PE_MASK SPA_PASID_MAX 16 #define SPA_SPA_SIZE_LOG 22 /* Each SPA is 4 Mb */ 17 18 #define SPA_CFG_SF (1ull << (63-0)) 19 #define SPA_CFG_TA (1ull << (63-1)) 20 #define SPA_CFG_HV (1ull << (63-3)) 21 #define SPA_CFG_UV (1ull << (63-4)) 22 #define SPA_CFG_XLAT_hpt (0ull << (63-6)) /* Hashed page table (HPT) mode */ 23 #define SPA_CFG_XLAT_roh (2ull << (63-6)) /* Radix on HPT mode */ 24 #define SPA_CFG_XLAT_ror (3ull << (63-6)) /* Radix on Radix mode */ 25 #define SPA_CFG_PR (1ull << (63-49)) 26 #define SPA_CFG_TC (1ull << (63-54)) 27 #define SPA_CFG_DR (1ull << (63-59)) 28 29 #define SPA_XSL_TF (1ull << (63-3)) /* Translation fault */ 30 #define SPA_XSL_S (1ull << (63-38)) /* Store operation */ 31 32 #define SPA_PE_VALID 0x80000000 33 34 35 struct pe_data { 36 struct mm_struct *mm; 37 /* callback to trigger when a translation fault occurs */ 38 void (*xsl_err_cb)(void *data, u64 addr, u64 dsisr); 39 /* opaque pointer to be passed to the above callback */ 40 void *xsl_err_data; 41 struct rcu_head rcu; 42 }; 43 44 struct spa { 45 struct ocxl_process_element *spa_mem; 46 int spa_order; 47 struct mutex spa_lock; 48 struct radix_tree_root pe_tree; /* Maps PE handles to pe_data */ 49 char *irq_name; 50 int virq; 51 void __iomem *reg_dsisr; 52 void __iomem *reg_dar; 53 void __iomem *reg_tfc; 54 void __iomem *reg_pe_handle; 55 /* 56 * The following field are used by the memory fault 57 * interrupt handler. We can only have one interrupt at a 58 * time. The NPU won't raise another interrupt until the 59 * previous one has been ack'd by writing to the TFC register 60 */ 61 struct xsl_fault { 62 struct work_struct fault_work; 63 u64 pe; 64 u64 dsisr; 65 u64 dar; 66 struct pe_data pe_data; 67 } xsl_fault; 68 }; 69 70 /* 71 * A opencapi link can be used be by several PCI functions. We have 72 * one link per device slot. 73 * 74 * A linked list of opencapi links should suffice, as there's a 75 * limited number of opencapi slots on a system and lookup is only 76 * done when the device is probed 77 */ 78 struct link { 79 struct list_head list; 80 struct kref ref; 81 int domain; 82 int bus; 83 int dev; 84 atomic_t irq_available; 85 struct spa *spa; 86 void *platform_data; 87 }; 88 static struct list_head links_list = LIST_HEAD_INIT(links_list); 89 static DEFINE_MUTEX(links_list_lock); 90 91 enum xsl_response { 92 CONTINUE, 93 ADDRESS_ERROR, 94 RESTART, 95 }; 96 97 98 static void read_irq(struct spa *spa, u64 *dsisr, u64 *dar, u64 *pe) 99 { 100 u64 reg; 101 102 *dsisr = in_be64(spa->reg_dsisr); 103 *dar = in_be64(spa->reg_dar); 104 reg = in_be64(spa->reg_pe_handle); 105 *pe = reg & SPA_PE_MASK; 106 } 107 108 static void ack_irq(struct spa *spa, enum xsl_response r) 109 { 110 u64 reg = 0; 111 112 /* continue is not supported */ 113 if (r == RESTART) 114 reg = PPC_BIT(31); 115 else if (r == ADDRESS_ERROR) 116 reg = PPC_BIT(30); 117 else 118 WARN(1, "Invalid irq response %d\n", r); 119 120 if (reg) { 121 trace_ocxl_fault_ack(spa->spa_mem, spa->xsl_fault.pe, 122 spa->xsl_fault.dsisr, spa->xsl_fault.dar, reg); 123 out_be64(spa->reg_tfc, reg); 124 } 125 } 126 127 static void xsl_fault_handler_bh(struct work_struct *fault_work) 128 { 129 unsigned int flt = 0; 130 unsigned long access, flags, inv_flags = 0; 131 enum xsl_response r; 132 struct xsl_fault *fault = container_of(fault_work, struct xsl_fault, 133 fault_work); 134 struct spa *spa = container_of(fault, struct spa, xsl_fault); 135 136 int rc; 137 138 /* 139 * We need to release a reference on the mm whenever exiting this 140 * function (taken in the memory fault interrupt handler) 141 */ 142 rc = copro_handle_mm_fault(fault->pe_data.mm, fault->dar, fault->dsisr, 143 &flt); 144 if (rc) { 145 pr_debug("copro_handle_mm_fault failed: %d\n", rc); 146 if (fault->pe_data.xsl_err_cb) { 147 fault->pe_data.xsl_err_cb( 148 fault->pe_data.xsl_err_data, 149 fault->dar, fault->dsisr); 150 } 151 r = ADDRESS_ERROR; 152 goto ack; 153 } 154 155 if (!radix_enabled()) { 156 /* 157 * update_mmu_cache() will not have loaded the hash 158 * since current->trap is not a 0x400 or 0x300, so 159 * just call hash_page_mm() here. 160 */ 161 access = _PAGE_PRESENT | _PAGE_READ; 162 if (fault->dsisr & SPA_XSL_S) 163 access |= _PAGE_WRITE; 164 165 if (REGION_ID(fault->dar) != USER_REGION_ID) 166 access |= _PAGE_PRIVILEGED; 167 168 local_irq_save(flags); 169 hash_page_mm(fault->pe_data.mm, fault->dar, access, 0x300, 170 inv_flags); 171 local_irq_restore(flags); 172 } 173 r = RESTART; 174 ack: 175 mmdrop(fault->pe_data.mm); 176 ack_irq(spa, r); 177 } 178 179 static irqreturn_t xsl_fault_handler(int irq, void *data) 180 { 181 struct link *link = (struct link *) data; 182 struct spa *spa = link->spa; 183 u64 dsisr, dar, pe_handle; 184 struct pe_data *pe_data; 185 struct ocxl_process_element *pe; 186 int lpid, pid, tid; 187 188 read_irq(spa, &dsisr, &dar, &pe_handle); 189 trace_ocxl_fault(spa->spa_mem, pe_handle, dsisr, dar, -1); 190 191 WARN_ON(pe_handle > SPA_PE_MASK); 192 pe = spa->spa_mem + pe_handle; 193 lpid = be32_to_cpu(pe->lpid); 194 pid = be32_to_cpu(pe->pid); 195 tid = be32_to_cpu(pe->tid); 196 /* We could be reading all null values here if the PE is being 197 * removed while an interrupt kicks in. It's not supposed to 198 * happen if the driver notified the AFU to terminate the 199 * PASID, and the AFU waited for pending operations before 200 * acknowledging. But even if it happens, we won't find a 201 * memory context below and fail silently, so it should be ok. 202 */ 203 if (!(dsisr & SPA_XSL_TF)) { 204 WARN(1, "Invalid xsl interrupt fault register %#llx\n", dsisr); 205 ack_irq(spa, ADDRESS_ERROR); 206 return IRQ_HANDLED; 207 } 208 209 rcu_read_lock(); 210 pe_data = radix_tree_lookup(&spa->pe_tree, pe_handle); 211 if (!pe_data) { 212 /* 213 * Could only happen if the driver didn't notify the 214 * AFU about PASID termination before removing the PE, 215 * or the AFU didn't wait for all memory access to 216 * have completed. 217 * 218 * Either way, we fail early, but we shouldn't log an 219 * error message, as it is a valid (if unexpected) 220 * scenario 221 */ 222 rcu_read_unlock(); 223 pr_debug("Unknown mm context for xsl interrupt\n"); 224 ack_irq(spa, ADDRESS_ERROR); 225 return IRQ_HANDLED; 226 } 227 WARN_ON(pe_data->mm->context.id != pid); 228 229 spa->xsl_fault.pe = pe_handle; 230 spa->xsl_fault.dar = dar; 231 spa->xsl_fault.dsisr = dsisr; 232 spa->xsl_fault.pe_data = *pe_data; 233 mmgrab(pe_data->mm); /* mm count is released by bottom half */ 234 235 rcu_read_unlock(); 236 schedule_work(&spa->xsl_fault.fault_work); 237 return IRQ_HANDLED; 238 } 239 240 static void unmap_irq_registers(struct spa *spa) 241 { 242 pnv_ocxl_unmap_xsl_regs(spa->reg_dsisr, spa->reg_dar, spa->reg_tfc, 243 spa->reg_pe_handle); 244 } 245 246 static int map_irq_registers(struct pci_dev *dev, struct spa *spa) 247 { 248 return pnv_ocxl_map_xsl_regs(dev, &spa->reg_dsisr, &spa->reg_dar, 249 &spa->reg_tfc, &spa->reg_pe_handle); 250 } 251 252 static int setup_xsl_irq(struct pci_dev *dev, struct link *link) 253 { 254 struct spa *spa = link->spa; 255 int rc; 256 int hwirq; 257 258 rc = pnv_ocxl_get_xsl_irq(dev, &hwirq); 259 if (rc) 260 return rc; 261 262 rc = map_irq_registers(dev, spa); 263 if (rc) 264 return rc; 265 266 spa->irq_name = kasprintf(GFP_KERNEL, "ocxl-xsl-%x-%x-%x", 267 link->domain, link->bus, link->dev); 268 if (!spa->irq_name) { 269 unmap_irq_registers(spa); 270 dev_err(&dev->dev, "Can't allocate name for xsl interrupt\n"); 271 return -ENOMEM; 272 } 273 /* 274 * At some point, we'll need to look into allowing a higher 275 * number of interrupts. Could we have an IRQ domain per link? 276 */ 277 spa->virq = irq_create_mapping(NULL, hwirq); 278 if (!spa->virq) { 279 kfree(spa->irq_name); 280 unmap_irq_registers(spa); 281 dev_err(&dev->dev, 282 "irq_create_mapping failed for translation interrupt\n"); 283 return -EINVAL; 284 } 285 286 dev_dbg(&dev->dev, "hwirq %d mapped to virq %d\n", hwirq, spa->virq); 287 288 rc = request_irq(spa->virq, xsl_fault_handler, 0, spa->irq_name, 289 link); 290 if (rc) { 291 irq_dispose_mapping(spa->virq); 292 kfree(spa->irq_name); 293 unmap_irq_registers(spa); 294 dev_err(&dev->dev, 295 "request_irq failed for translation interrupt: %d\n", 296 rc); 297 return -EINVAL; 298 } 299 return 0; 300 } 301 302 static void release_xsl_irq(struct link *link) 303 { 304 struct spa *spa = link->spa; 305 306 if (spa->virq) { 307 free_irq(spa->virq, link); 308 irq_dispose_mapping(spa->virq); 309 } 310 kfree(spa->irq_name); 311 unmap_irq_registers(spa); 312 } 313 314 static int alloc_spa(struct pci_dev *dev, struct link *link) 315 { 316 struct spa *spa; 317 318 spa = kzalloc(sizeof(struct spa), GFP_KERNEL); 319 if (!spa) 320 return -ENOMEM; 321 322 mutex_init(&spa->spa_lock); 323 INIT_RADIX_TREE(&spa->pe_tree, GFP_KERNEL); 324 INIT_WORK(&spa->xsl_fault.fault_work, xsl_fault_handler_bh); 325 326 spa->spa_order = SPA_SPA_SIZE_LOG - PAGE_SHIFT; 327 spa->spa_mem = (struct ocxl_process_element *) 328 __get_free_pages(GFP_KERNEL | __GFP_ZERO, spa->spa_order); 329 if (!spa->spa_mem) { 330 dev_err(&dev->dev, "Can't allocate Shared Process Area\n"); 331 kfree(spa); 332 return -ENOMEM; 333 } 334 pr_debug("Allocated SPA for %x:%x:%x at %p\n", link->domain, link->bus, 335 link->dev, spa->spa_mem); 336 337 link->spa = spa; 338 return 0; 339 } 340 341 static void free_spa(struct link *link) 342 { 343 struct spa *spa = link->spa; 344 345 pr_debug("Freeing SPA for %x:%x:%x\n", link->domain, link->bus, 346 link->dev); 347 348 if (spa && spa->spa_mem) { 349 free_pages((unsigned long) spa->spa_mem, spa->spa_order); 350 kfree(spa); 351 link->spa = NULL; 352 } 353 } 354 355 static int alloc_link(struct pci_dev *dev, int PE_mask, struct link **out_link) 356 { 357 struct link *link; 358 int rc; 359 360 link = kzalloc(sizeof(struct link), GFP_KERNEL); 361 if (!link) 362 return -ENOMEM; 363 364 kref_init(&link->ref); 365 link->domain = pci_domain_nr(dev->bus); 366 link->bus = dev->bus->number; 367 link->dev = PCI_SLOT(dev->devfn); 368 atomic_set(&link->irq_available, MAX_IRQ_PER_LINK); 369 370 rc = alloc_spa(dev, link); 371 if (rc) 372 goto err_free; 373 374 rc = setup_xsl_irq(dev, link); 375 if (rc) 376 goto err_spa; 377 378 /* platform specific hook */ 379 rc = pnv_ocxl_spa_setup(dev, link->spa->spa_mem, PE_mask, 380 &link->platform_data); 381 if (rc) 382 goto err_xsl_irq; 383 384 *out_link = link; 385 return 0; 386 387 err_xsl_irq: 388 release_xsl_irq(link); 389 err_spa: 390 free_spa(link); 391 err_free: 392 kfree(link); 393 return rc; 394 } 395 396 static void free_link(struct link *link) 397 { 398 release_xsl_irq(link); 399 free_spa(link); 400 kfree(link); 401 } 402 403 int ocxl_link_setup(struct pci_dev *dev, int PE_mask, void **link_handle) 404 { 405 int rc = 0; 406 struct link *link; 407 408 mutex_lock(&links_list_lock); 409 list_for_each_entry(link, &links_list, list) { 410 /* The functions of a device all share the same link */ 411 if (link->domain == pci_domain_nr(dev->bus) && 412 link->bus == dev->bus->number && 413 link->dev == PCI_SLOT(dev->devfn)) { 414 kref_get(&link->ref); 415 *link_handle = link; 416 goto unlock; 417 } 418 } 419 rc = alloc_link(dev, PE_mask, &link); 420 if (rc) 421 goto unlock; 422 423 list_add(&link->list, &links_list); 424 *link_handle = link; 425 unlock: 426 mutex_unlock(&links_list_lock); 427 return rc; 428 } 429 EXPORT_SYMBOL_GPL(ocxl_link_setup); 430 431 static void release_xsl(struct kref *ref) 432 { 433 struct link *link = container_of(ref, struct link, ref); 434 435 list_del(&link->list); 436 /* call platform code before releasing data */ 437 pnv_ocxl_spa_release(link->platform_data); 438 free_link(link); 439 } 440 441 void ocxl_link_release(struct pci_dev *dev, void *link_handle) 442 { 443 struct link *link = (struct link *) link_handle; 444 445 mutex_lock(&links_list_lock); 446 kref_put(&link->ref, release_xsl); 447 mutex_unlock(&links_list_lock); 448 } 449 EXPORT_SYMBOL_GPL(ocxl_link_release); 450 451 static u64 calculate_cfg_state(bool kernel) 452 { 453 u64 state; 454 455 state = SPA_CFG_DR; 456 if (mfspr(SPRN_LPCR) & LPCR_TC) 457 state |= SPA_CFG_TC; 458 if (radix_enabled()) 459 state |= SPA_CFG_XLAT_ror; 460 else 461 state |= SPA_CFG_XLAT_hpt; 462 state |= SPA_CFG_HV; 463 if (kernel) { 464 if (mfmsr() & MSR_SF) 465 state |= SPA_CFG_SF; 466 } else { 467 state |= SPA_CFG_PR; 468 if (!test_tsk_thread_flag(current, TIF_32BIT)) 469 state |= SPA_CFG_SF; 470 } 471 return state; 472 } 473 474 int ocxl_link_add_pe(void *link_handle, int pasid, u32 pidr, u32 tidr, 475 u64 amr, struct mm_struct *mm, 476 void (*xsl_err_cb)(void *data, u64 addr, u64 dsisr), 477 void *xsl_err_data) 478 { 479 struct link *link = (struct link *) link_handle; 480 struct spa *spa = link->spa; 481 struct ocxl_process_element *pe; 482 int pe_handle, rc = 0; 483 struct pe_data *pe_data; 484 485 BUILD_BUG_ON(sizeof(struct ocxl_process_element) != 128); 486 if (pasid > SPA_PASID_MAX) 487 return -EINVAL; 488 489 mutex_lock(&spa->spa_lock); 490 pe_handle = pasid & SPA_PE_MASK; 491 pe = spa->spa_mem + pe_handle; 492 493 if (pe->software_state) { 494 rc = -EBUSY; 495 goto unlock; 496 } 497 498 pe_data = kmalloc(sizeof(*pe_data), GFP_KERNEL); 499 if (!pe_data) { 500 rc = -ENOMEM; 501 goto unlock; 502 } 503 504 pe_data->mm = mm; 505 pe_data->xsl_err_cb = xsl_err_cb; 506 pe_data->xsl_err_data = xsl_err_data; 507 508 memset(pe, 0, sizeof(struct ocxl_process_element)); 509 pe->config_state = cpu_to_be64(calculate_cfg_state(pidr == 0)); 510 pe->lpid = cpu_to_be32(mfspr(SPRN_LPID)); 511 pe->pid = cpu_to_be32(pidr); 512 pe->tid = cpu_to_be32(tidr); 513 pe->amr = cpu_to_be64(amr); 514 pe->software_state = cpu_to_be32(SPA_PE_VALID); 515 516 mm_context_add_copro(mm); 517 /* 518 * Barrier is to make sure PE is visible in the SPA before it 519 * is used by the device. It also helps with the global TLBI 520 * invalidation 521 */ 522 mb(); 523 radix_tree_insert(&spa->pe_tree, pe_handle, pe_data); 524 525 /* 526 * The mm must stay valid for as long as the device uses it. We 527 * lower the count when the context is removed from the SPA. 528 * 529 * We grab mm_count (and not mm_users), as we don't want to 530 * end up in a circular dependency if a process mmaps its 531 * mmio, therefore incrementing the file ref count when 532 * calling mmap(), and forgets to unmap before exiting. In 533 * that scenario, when the kernel handles the death of the 534 * process, the file is not cleaned because unmap was not 535 * called, and the mm wouldn't be freed because we would still 536 * have a reference on mm_users. Incrementing mm_count solves 537 * the problem. 538 */ 539 mmgrab(mm); 540 trace_ocxl_context_add(current->pid, spa->spa_mem, pasid, pidr, tidr); 541 unlock: 542 mutex_unlock(&spa->spa_lock); 543 return rc; 544 } 545 EXPORT_SYMBOL_GPL(ocxl_link_add_pe); 546 547 int ocxl_link_remove_pe(void *link_handle, int pasid) 548 { 549 struct link *link = (struct link *) link_handle; 550 struct spa *spa = link->spa; 551 struct ocxl_process_element *pe; 552 struct pe_data *pe_data; 553 int pe_handle, rc; 554 555 if (pasid > SPA_PASID_MAX) 556 return -EINVAL; 557 558 /* 559 * About synchronization with our memory fault handler: 560 * 561 * Before removing the PE, the driver is supposed to have 562 * notified the AFU, which should have cleaned up and make 563 * sure the PASID is no longer in use, including pending 564 * interrupts. However, there's no way to be sure... 565 * 566 * We clear the PE and remove the context from our radix 567 * tree. From that point on, any new interrupt for that 568 * context will fail silently, which is ok. As mentioned 569 * above, that's not expected, but it could happen if the 570 * driver or AFU didn't do the right thing. 571 * 572 * There could still be a bottom half running, but we don't 573 * need to wait/flush, as it is managing a reference count on 574 * the mm it reads from the radix tree. 575 */ 576 pe_handle = pasid & SPA_PE_MASK; 577 pe = spa->spa_mem + pe_handle; 578 579 mutex_lock(&spa->spa_lock); 580 581 if (!(be32_to_cpu(pe->software_state) & SPA_PE_VALID)) { 582 rc = -EINVAL; 583 goto unlock; 584 } 585 586 trace_ocxl_context_remove(current->pid, spa->spa_mem, pasid, 587 be32_to_cpu(pe->pid), be32_to_cpu(pe->tid)); 588 589 memset(pe, 0, sizeof(struct ocxl_process_element)); 590 /* 591 * The barrier makes sure the PE is removed from the SPA 592 * before we clear the NPU context cache below, so that the 593 * old PE cannot be reloaded erroneously. 594 */ 595 mb(); 596 597 /* 598 * hook to platform code 599 * On powerpc, the entry needs to be cleared from the context 600 * cache of the NPU. 601 */ 602 rc = pnv_ocxl_spa_remove_pe(link->platform_data, pe_handle); 603 WARN_ON(rc); 604 605 pe_data = radix_tree_delete(&spa->pe_tree, pe_handle); 606 if (!pe_data) { 607 WARN(1, "Couldn't find pe data when removing PE\n"); 608 } else { 609 mm_context_remove_copro(pe_data->mm); 610 mmdrop(pe_data->mm); 611 kfree_rcu(pe_data, rcu); 612 } 613 unlock: 614 mutex_unlock(&spa->spa_lock); 615 return rc; 616 } 617 EXPORT_SYMBOL_GPL(ocxl_link_remove_pe); 618 619 int ocxl_link_irq_alloc(void *link_handle, int *hw_irq, u64 *trigger_addr) 620 { 621 struct link *link = (struct link *) link_handle; 622 int rc, irq; 623 u64 addr; 624 625 if (atomic_dec_if_positive(&link->irq_available) < 0) 626 return -ENOSPC; 627 628 rc = pnv_ocxl_alloc_xive_irq(&irq, &addr); 629 if (rc) { 630 atomic_inc(&link->irq_available); 631 return rc; 632 } 633 634 *hw_irq = irq; 635 *trigger_addr = addr; 636 return 0; 637 } 638 EXPORT_SYMBOL_GPL(ocxl_link_irq_alloc); 639 640 void ocxl_link_free_irq(void *link_handle, int hw_irq) 641 { 642 struct link *link = (struct link *) link_handle; 643 644 pnv_ocxl_free_xive_irq(hw_irq); 645 atomic_inc(&link->irq_available); 646 } 647 EXPORT_SYMBOL_GPL(ocxl_link_free_irq); 648