1 /* 2 * Support PCI/PCIe on PowerNV platforms 3 * 4 * Copyright 2011 Benjamin Herrenschmidt, IBM Corp. 5 * 6 * This program is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU General Public License 8 * as published by the Free Software Foundation; either version 9 * 2 of the License, or (at your option) any later version. 10 */ 11 12 #undef DEBUG 13 14 #include <linux/kernel.h> 15 #include <linux/pci.h> 16 #include <linux/crash_dump.h> 17 #include <linux/debugfs.h> 18 #include <linux/delay.h> 19 #include <linux/string.h> 20 #include <linux/init.h> 21 #include <linux/bootmem.h> 22 #include <linux/irq.h> 23 #include <linux/io.h> 24 #include <linux/msi.h> 25 #include <linux/memblock.h> 26 #include <linux/iommu.h> 27 28 #include <asm/sections.h> 29 #include <asm/io.h> 30 #include <asm/prom.h> 31 #include <asm/pci-bridge.h> 32 #include <asm/machdep.h> 33 #include <asm/msi_bitmap.h> 34 #include <asm/ppc-pci.h> 35 #include <asm/opal.h> 36 #include <asm/iommu.h> 37 #include <asm/tce.h> 38 #include <asm/xics.h> 39 #include <asm/debug.h> 40 #include <asm/firmware.h> 41 #include <asm/pnv-pci.h> 42 43 #include <misc/cxl-base.h> 44 45 #include "powernv.h" 46 #include "pci.h" 47 48 /* 256M DMA window, 4K TCE pages, 8 bytes TCE */ 49 #define TCE32_TABLE_SIZE ((0x10000000 / 0x1000) * 8) 50 51 static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, 52 const char *fmt, ...) 53 { 54 struct va_format vaf; 55 va_list args; 56 char pfix[32]; 57 58 va_start(args, fmt); 59 60 vaf.fmt = fmt; 61 vaf.va = &args; 62 63 if (pe->flags & PNV_IODA_PE_DEV) 64 strlcpy(pfix, dev_name(&pe->pdev->dev), sizeof(pfix)); 65 else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) 66 sprintf(pfix, "%04x:%02x ", 67 pci_domain_nr(pe->pbus), pe->pbus->number); 68 #ifdef CONFIG_PCI_IOV 69 else if (pe->flags & PNV_IODA_PE_VF) 70 sprintf(pfix, "%04x:%02x:%2x.%d", 71 pci_domain_nr(pe->parent_dev->bus), 72 (pe->rid & 0xff00) >> 8, 73 PCI_SLOT(pe->rid), PCI_FUNC(pe->rid)); 74 #endif /* CONFIG_PCI_IOV*/ 75 76 printk("%spci %s: [PE# %.3d] %pV", 77 level, pfix, pe->pe_number, &vaf); 78 79 va_end(args); 80 } 81 82 #define pe_err(pe, fmt, ...) \ 83 pe_level_printk(pe, KERN_ERR, fmt, ##__VA_ARGS__) 84 #define pe_warn(pe, fmt, ...) \ 85 pe_level_printk(pe, KERN_WARNING, fmt, ##__VA_ARGS__) 86 #define pe_info(pe, fmt, ...) \ 87 pe_level_printk(pe, KERN_INFO, fmt, ##__VA_ARGS__) 88 89 static bool pnv_iommu_bypass_disabled __read_mostly; 90 91 static int __init iommu_setup(char *str) 92 { 93 if (!str) 94 return -EINVAL; 95 96 while (*str) { 97 if (!strncmp(str, "nobypass", 8)) { 98 pnv_iommu_bypass_disabled = true; 99 pr_info("PowerNV: IOMMU bypass window disabled.\n"); 100 break; 101 } 102 str += strcspn(str, ","); 103 if (*str == ',') 104 str++; 105 } 106 107 return 0; 108 } 109 early_param("iommu", iommu_setup); 110 111 /* 112 * stdcix is only supposed to be used in hypervisor real mode as per 113 * the architecture spec 114 */ 115 static inline void __raw_rm_writeq(u64 val, volatile void __iomem *paddr) 116 { 117 __asm__ __volatile__("stdcix %0,0,%1" 118 : : "r" (val), "r" (paddr) : "memory"); 119 } 120 121 static inline bool pnv_pci_is_mem_pref_64(unsigned long flags) 122 { 123 return ((flags & (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH)) == 124 (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH)); 125 } 126 127 static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no) 128 { 129 if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe)) { 130 pr_warn("%s: Invalid PE %d on PHB#%x\n", 131 __func__, pe_no, phb->hose->global_number); 132 return; 133 } 134 135 if (test_and_set_bit(pe_no, phb->ioda.pe_alloc)) { 136 pr_warn("%s: PE %d was assigned on PHB#%x\n", 137 __func__, pe_no, phb->hose->global_number); 138 return; 139 } 140 141 phb->ioda.pe_array[pe_no].phb = phb; 142 phb->ioda.pe_array[pe_no].pe_number = pe_no; 143 } 144 145 static int pnv_ioda_alloc_pe(struct pnv_phb *phb) 146 { 147 unsigned long pe; 148 149 do { 150 pe = find_next_zero_bit(phb->ioda.pe_alloc, 151 phb->ioda.total_pe, 0); 152 if (pe >= phb->ioda.total_pe) 153 return IODA_INVALID_PE; 154 } while(test_and_set_bit(pe, phb->ioda.pe_alloc)); 155 156 phb->ioda.pe_array[pe].phb = phb; 157 phb->ioda.pe_array[pe].pe_number = pe; 158 return pe; 159 } 160 161 static void pnv_ioda_free_pe(struct pnv_phb *phb, int pe) 162 { 163 WARN_ON(phb->ioda.pe_array[pe].pdev); 164 165 memset(&phb->ioda.pe_array[pe], 0, sizeof(struct pnv_ioda_pe)); 166 clear_bit(pe, phb->ioda.pe_alloc); 167 } 168 169 /* The default M64 BAR is shared by all PEs */ 170 static int pnv_ioda2_init_m64(struct pnv_phb *phb) 171 { 172 const char *desc; 173 struct resource *r; 174 s64 rc; 175 176 /* Configure the default M64 BAR */ 177 rc = opal_pci_set_phb_mem_window(phb->opal_id, 178 OPAL_M64_WINDOW_TYPE, 179 phb->ioda.m64_bar_idx, 180 phb->ioda.m64_base, 181 0, /* unused */ 182 phb->ioda.m64_size); 183 if (rc != OPAL_SUCCESS) { 184 desc = "configuring"; 185 goto fail; 186 } 187 188 /* Enable the default M64 BAR */ 189 rc = opal_pci_phb_mmio_enable(phb->opal_id, 190 OPAL_M64_WINDOW_TYPE, 191 phb->ioda.m64_bar_idx, 192 OPAL_ENABLE_M64_SPLIT); 193 if (rc != OPAL_SUCCESS) { 194 desc = "enabling"; 195 goto fail; 196 } 197 198 /* Mark the M64 BAR assigned */ 199 set_bit(phb->ioda.m64_bar_idx, &phb->ioda.m64_bar_alloc); 200 201 /* 202 * Strip off the segment used by the reserved PE, which is 203 * expected to be 0 or last one of PE capabicity. 204 */ 205 r = &phb->hose->mem_resources[1]; 206 if (phb->ioda.reserved_pe == 0) 207 r->start += phb->ioda.m64_segsize; 208 else if (phb->ioda.reserved_pe == (phb->ioda.total_pe - 1)) 209 r->end -= phb->ioda.m64_segsize; 210 else 211 pr_warn(" Cannot strip M64 segment for reserved PE#%d\n", 212 phb->ioda.reserved_pe); 213 214 return 0; 215 216 fail: 217 pr_warn(" Failure %lld %s M64 BAR#%d\n", 218 rc, desc, phb->ioda.m64_bar_idx); 219 opal_pci_phb_mmio_enable(phb->opal_id, 220 OPAL_M64_WINDOW_TYPE, 221 phb->ioda.m64_bar_idx, 222 OPAL_DISABLE_M64); 223 return -EIO; 224 } 225 226 static void pnv_ioda2_reserve_m64_pe(struct pnv_phb *phb) 227 { 228 resource_size_t sgsz = phb->ioda.m64_segsize; 229 struct pci_dev *pdev; 230 struct resource *r; 231 int base, step, i; 232 233 /* 234 * Root bus always has full M64 range and root port has 235 * M64 range used in reality. So we're checking root port 236 * instead of root bus. 237 */ 238 list_for_each_entry(pdev, &phb->hose->bus->devices, bus_list) { 239 for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++) { 240 r = &pdev->resource[PCI_BRIDGE_RESOURCES + i]; 241 if (!r->parent || 242 !pnv_pci_is_mem_pref_64(r->flags)) 243 continue; 244 245 base = (r->start - phb->ioda.m64_base) / sgsz; 246 for (step = 0; step < resource_size(r) / sgsz; step++) 247 pnv_ioda_reserve_pe(phb, base + step); 248 } 249 } 250 } 251 252 static int pnv_ioda2_pick_m64_pe(struct pnv_phb *phb, 253 struct pci_bus *bus, int all) 254 { 255 resource_size_t segsz = phb->ioda.m64_segsize; 256 struct pci_dev *pdev; 257 struct resource *r; 258 struct pnv_ioda_pe *master_pe, *pe; 259 unsigned long size, *pe_alloc; 260 bool found; 261 int start, i, j; 262 263 /* Root bus shouldn't use M64 */ 264 if (pci_is_root_bus(bus)) 265 return IODA_INVALID_PE; 266 267 /* We support only one M64 window on each bus */ 268 found = false; 269 pci_bus_for_each_resource(bus, r, i) { 270 if (r && r->parent && 271 pnv_pci_is_mem_pref_64(r->flags)) { 272 found = true; 273 break; 274 } 275 } 276 277 /* No M64 window found ? */ 278 if (!found) 279 return IODA_INVALID_PE; 280 281 /* Allocate bitmap */ 282 size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long)); 283 pe_alloc = kzalloc(size, GFP_KERNEL); 284 if (!pe_alloc) { 285 pr_warn("%s: Out of memory !\n", 286 __func__); 287 return IODA_INVALID_PE; 288 } 289 290 /* 291 * Figure out reserved PE numbers by the PE 292 * the its child PEs. 293 */ 294 start = (r->start - phb->ioda.m64_base) / segsz; 295 for (i = 0; i < resource_size(r) / segsz; i++) 296 set_bit(start + i, pe_alloc); 297 298 if (all) 299 goto done; 300 301 /* 302 * If the PE doesn't cover all subordinate buses, 303 * we need subtract from reserved PEs for children. 304 */ 305 list_for_each_entry(pdev, &bus->devices, bus_list) { 306 if (!pdev->subordinate) 307 continue; 308 309 pci_bus_for_each_resource(pdev->subordinate, r, i) { 310 if (!r || !r->parent || 311 !pnv_pci_is_mem_pref_64(r->flags)) 312 continue; 313 314 start = (r->start - phb->ioda.m64_base) / segsz; 315 for (j = 0; j < resource_size(r) / segsz ; j++) 316 clear_bit(start + j, pe_alloc); 317 } 318 } 319 320 /* 321 * the current bus might not own M64 window and that's all 322 * contributed by its child buses. For the case, we needn't 323 * pick M64 dependent PE#. 324 */ 325 if (bitmap_empty(pe_alloc, phb->ioda.total_pe)) { 326 kfree(pe_alloc); 327 return IODA_INVALID_PE; 328 } 329 330 /* 331 * Figure out the master PE and put all slave PEs to master 332 * PE's list to form compound PE. 333 */ 334 done: 335 master_pe = NULL; 336 i = -1; 337 while ((i = find_next_bit(pe_alloc, phb->ioda.total_pe, i + 1)) < 338 phb->ioda.total_pe) { 339 pe = &phb->ioda.pe_array[i]; 340 341 if (!master_pe) { 342 pe->flags |= PNV_IODA_PE_MASTER; 343 INIT_LIST_HEAD(&pe->slaves); 344 master_pe = pe; 345 } else { 346 pe->flags |= PNV_IODA_PE_SLAVE; 347 pe->master = master_pe; 348 list_add_tail(&pe->list, &master_pe->slaves); 349 } 350 } 351 352 kfree(pe_alloc); 353 return master_pe->pe_number; 354 } 355 356 static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb) 357 { 358 struct pci_controller *hose = phb->hose; 359 struct device_node *dn = hose->dn; 360 struct resource *res; 361 const u32 *r; 362 u64 pci_addr; 363 364 /* FIXME: Support M64 for P7IOC */ 365 if (phb->type != PNV_PHB_IODA2) { 366 pr_info(" Not support M64 window\n"); 367 return; 368 } 369 370 if (!firmware_has_feature(FW_FEATURE_OPALv3)) { 371 pr_info(" Firmware too old to support M64 window\n"); 372 return; 373 } 374 375 r = of_get_property(dn, "ibm,opal-m64-window", NULL); 376 if (!r) { 377 pr_info(" No <ibm,opal-m64-window> on %s\n", 378 dn->full_name); 379 return; 380 } 381 382 res = &hose->mem_resources[1]; 383 res->start = of_translate_address(dn, r + 2); 384 res->end = res->start + of_read_number(r + 4, 2) - 1; 385 res->flags = (IORESOURCE_MEM | IORESOURCE_MEM_64 | IORESOURCE_PREFETCH); 386 pci_addr = of_read_number(r, 2); 387 hose->mem_offset[1] = res->start - pci_addr; 388 389 phb->ioda.m64_size = resource_size(res); 390 phb->ioda.m64_segsize = phb->ioda.m64_size / phb->ioda.total_pe; 391 phb->ioda.m64_base = pci_addr; 392 393 pr_info(" MEM64 0x%016llx..0x%016llx -> 0x%016llx\n", 394 res->start, res->end, pci_addr); 395 396 /* Use last M64 BAR to cover M64 window */ 397 phb->ioda.m64_bar_idx = 15; 398 phb->init_m64 = pnv_ioda2_init_m64; 399 phb->reserve_m64_pe = pnv_ioda2_reserve_m64_pe; 400 phb->pick_m64_pe = pnv_ioda2_pick_m64_pe; 401 } 402 403 static void pnv_ioda_freeze_pe(struct pnv_phb *phb, int pe_no) 404 { 405 struct pnv_ioda_pe *pe = &phb->ioda.pe_array[pe_no]; 406 struct pnv_ioda_pe *slave; 407 s64 rc; 408 409 /* Fetch master PE */ 410 if (pe->flags & PNV_IODA_PE_SLAVE) { 411 pe = pe->master; 412 if (WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER))) 413 return; 414 415 pe_no = pe->pe_number; 416 } 417 418 /* Freeze master PE */ 419 rc = opal_pci_eeh_freeze_set(phb->opal_id, 420 pe_no, 421 OPAL_EEH_ACTION_SET_FREEZE_ALL); 422 if (rc != OPAL_SUCCESS) { 423 pr_warn("%s: Failure %lld freezing PHB#%x-PE#%x\n", 424 __func__, rc, phb->hose->global_number, pe_no); 425 return; 426 } 427 428 /* Freeze slave PEs */ 429 if (!(pe->flags & PNV_IODA_PE_MASTER)) 430 return; 431 432 list_for_each_entry(slave, &pe->slaves, list) { 433 rc = opal_pci_eeh_freeze_set(phb->opal_id, 434 slave->pe_number, 435 OPAL_EEH_ACTION_SET_FREEZE_ALL); 436 if (rc != OPAL_SUCCESS) 437 pr_warn("%s: Failure %lld freezing PHB#%x-PE#%x\n", 438 __func__, rc, phb->hose->global_number, 439 slave->pe_number); 440 } 441 } 442 443 static int pnv_ioda_unfreeze_pe(struct pnv_phb *phb, int pe_no, int opt) 444 { 445 struct pnv_ioda_pe *pe, *slave; 446 s64 rc; 447 448 /* Find master PE */ 449 pe = &phb->ioda.pe_array[pe_no]; 450 if (pe->flags & PNV_IODA_PE_SLAVE) { 451 pe = pe->master; 452 WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER)); 453 pe_no = pe->pe_number; 454 } 455 456 /* Clear frozen state for master PE */ 457 rc = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no, opt); 458 if (rc != OPAL_SUCCESS) { 459 pr_warn("%s: Failure %lld clear %d on PHB#%x-PE#%x\n", 460 __func__, rc, opt, phb->hose->global_number, pe_no); 461 return -EIO; 462 } 463 464 if (!(pe->flags & PNV_IODA_PE_MASTER)) 465 return 0; 466 467 /* Clear frozen state for slave PEs */ 468 list_for_each_entry(slave, &pe->slaves, list) { 469 rc = opal_pci_eeh_freeze_clear(phb->opal_id, 470 slave->pe_number, 471 opt); 472 if (rc != OPAL_SUCCESS) { 473 pr_warn("%s: Failure %lld clear %d on PHB#%x-PE#%x\n", 474 __func__, rc, opt, phb->hose->global_number, 475 slave->pe_number); 476 return -EIO; 477 } 478 } 479 480 return 0; 481 } 482 483 static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no) 484 { 485 struct pnv_ioda_pe *slave, *pe; 486 u8 fstate, state; 487 __be16 pcierr; 488 s64 rc; 489 490 /* Sanity check on PE number */ 491 if (pe_no < 0 || pe_no >= phb->ioda.total_pe) 492 return OPAL_EEH_STOPPED_PERM_UNAVAIL; 493 494 /* 495 * Fetch the master PE and the PE instance might be 496 * not initialized yet. 497 */ 498 pe = &phb->ioda.pe_array[pe_no]; 499 if (pe->flags & PNV_IODA_PE_SLAVE) { 500 pe = pe->master; 501 WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER)); 502 pe_no = pe->pe_number; 503 } 504 505 /* Check the master PE */ 506 rc = opal_pci_eeh_freeze_status(phb->opal_id, pe_no, 507 &state, &pcierr, NULL); 508 if (rc != OPAL_SUCCESS) { 509 pr_warn("%s: Failure %lld getting " 510 "PHB#%x-PE#%x state\n", 511 __func__, rc, 512 phb->hose->global_number, pe_no); 513 return OPAL_EEH_STOPPED_TEMP_UNAVAIL; 514 } 515 516 /* Check the slave PE */ 517 if (!(pe->flags & PNV_IODA_PE_MASTER)) 518 return state; 519 520 list_for_each_entry(slave, &pe->slaves, list) { 521 rc = opal_pci_eeh_freeze_status(phb->opal_id, 522 slave->pe_number, 523 &fstate, 524 &pcierr, 525 NULL); 526 if (rc != OPAL_SUCCESS) { 527 pr_warn("%s: Failure %lld getting " 528 "PHB#%x-PE#%x state\n", 529 __func__, rc, 530 phb->hose->global_number, slave->pe_number); 531 return OPAL_EEH_STOPPED_TEMP_UNAVAIL; 532 } 533 534 /* 535 * Override the result based on the ascending 536 * priority. 537 */ 538 if (fstate > state) 539 state = fstate; 540 } 541 542 return state; 543 } 544 545 /* Currently those 2 are only used when MSIs are enabled, this will change 546 * but in the meantime, we need to protect them to avoid warnings 547 */ 548 #ifdef CONFIG_PCI_MSI 549 static struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev) 550 { 551 struct pci_controller *hose = pci_bus_to_host(dev->bus); 552 struct pnv_phb *phb = hose->private_data; 553 struct pci_dn *pdn = pci_get_pdn(dev); 554 555 if (!pdn) 556 return NULL; 557 if (pdn->pe_number == IODA_INVALID_PE) 558 return NULL; 559 return &phb->ioda.pe_array[pdn->pe_number]; 560 } 561 #endif /* CONFIG_PCI_MSI */ 562 563 static int pnv_ioda_set_one_peltv(struct pnv_phb *phb, 564 struct pnv_ioda_pe *parent, 565 struct pnv_ioda_pe *child, 566 bool is_add) 567 { 568 const char *desc = is_add ? "adding" : "removing"; 569 uint8_t op = is_add ? OPAL_ADD_PE_TO_DOMAIN : 570 OPAL_REMOVE_PE_FROM_DOMAIN; 571 struct pnv_ioda_pe *slave; 572 long rc; 573 574 /* Parent PE affects child PE */ 575 rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number, 576 child->pe_number, op); 577 if (rc != OPAL_SUCCESS) { 578 pe_warn(child, "OPAL error %ld %s to parent PELTV\n", 579 rc, desc); 580 return -ENXIO; 581 } 582 583 if (!(child->flags & PNV_IODA_PE_MASTER)) 584 return 0; 585 586 /* Compound case: parent PE affects slave PEs */ 587 list_for_each_entry(slave, &child->slaves, list) { 588 rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number, 589 slave->pe_number, op); 590 if (rc != OPAL_SUCCESS) { 591 pe_warn(slave, "OPAL error %ld %s to parent PELTV\n", 592 rc, desc); 593 return -ENXIO; 594 } 595 } 596 597 return 0; 598 } 599 600 static int pnv_ioda_set_peltv(struct pnv_phb *phb, 601 struct pnv_ioda_pe *pe, 602 bool is_add) 603 { 604 struct pnv_ioda_pe *slave; 605 struct pci_dev *pdev = NULL; 606 int ret; 607 608 /* 609 * Clear PE frozen state. If it's master PE, we need 610 * clear slave PE frozen state as well. 611 */ 612 if (is_add) { 613 opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number, 614 OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); 615 if (pe->flags & PNV_IODA_PE_MASTER) { 616 list_for_each_entry(slave, &pe->slaves, list) 617 opal_pci_eeh_freeze_clear(phb->opal_id, 618 slave->pe_number, 619 OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); 620 } 621 } 622 623 /* 624 * Associate PE in PELT. We need add the PE into the 625 * corresponding PELT-V as well. Otherwise, the error 626 * originated from the PE might contribute to other 627 * PEs. 628 */ 629 ret = pnv_ioda_set_one_peltv(phb, pe, pe, is_add); 630 if (ret) 631 return ret; 632 633 /* For compound PEs, any one affects all of them */ 634 if (pe->flags & PNV_IODA_PE_MASTER) { 635 list_for_each_entry(slave, &pe->slaves, list) { 636 ret = pnv_ioda_set_one_peltv(phb, slave, pe, is_add); 637 if (ret) 638 return ret; 639 } 640 } 641 642 if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS)) 643 pdev = pe->pbus->self; 644 else if (pe->flags & PNV_IODA_PE_DEV) 645 pdev = pe->pdev->bus->self; 646 #ifdef CONFIG_PCI_IOV 647 else if (pe->flags & PNV_IODA_PE_VF) 648 pdev = pe->parent_dev->bus->self; 649 #endif /* CONFIG_PCI_IOV */ 650 while (pdev) { 651 struct pci_dn *pdn = pci_get_pdn(pdev); 652 struct pnv_ioda_pe *parent; 653 654 if (pdn && pdn->pe_number != IODA_INVALID_PE) { 655 parent = &phb->ioda.pe_array[pdn->pe_number]; 656 ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add); 657 if (ret) 658 return ret; 659 } 660 661 pdev = pdev->bus->self; 662 } 663 664 return 0; 665 } 666 667 #ifdef CONFIG_PCI_IOV 668 static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) 669 { 670 struct pci_dev *parent; 671 uint8_t bcomp, dcomp, fcomp; 672 int64_t rc; 673 long rid_end, rid; 674 675 /* Currently, we just deconfigure VF PE. Bus PE will always there.*/ 676 if (pe->pbus) { 677 int count; 678 679 dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER; 680 fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER; 681 parent = pe->pbus->self; 682 if (pe->flags & PNV_IODA_PE_BUS_ALL) 683 count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1; 684 else 685 count = 1; 686 687 switch(count) { 688 case 1: bcomp = OpalPciBusAll; break; 689 case 2: bcomp = OpalPciBus7Bits; break; 690 case 4: bcomp = OpalPciBus6Bits; break; 691 case 8: bcomp = OpalPciBus5Bits; break; 692 case 16: bcomp = OpalPciBus4Bits; break; 693 case 32: bcomp = OpalPciBus3Bits; break; 694 default: 695 dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n", 696 count); 697 /* Do an exact match only */ 698 bcomp = OpalPciBusAll; 699 } 700 rid_end = pe->rid + (count << 8); 701 } else { 702 if (pe->flags & PNV_IODA_PE_VF) 703 parent = pe->parent_dev; 704 else 705 parent = pe->pdev->bus->self; 706 bcomp = OpalPciBusAll; 707 dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER; 708 fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER; 709 rid_end = pe->rid + 1; 710 } 711 712 /* Clear the reverse map */ 713 for (rid = pe->rid; rid < rid_end; rid++) 714 phb->ioda.pe_rmap[rid] = 0; 715 716 /* Release from all parents PELT-V */ 717 while (parent) { 718 struct pci_dn *pdn = pci_get_pdn(parent); 719 if (pdn && pdn->pe_number != IODA_INVALID_PE) { 720 rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number, 721 pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN); 722 /* XXX What to do in case of error ? */ 723 } 724 parent = parent->bus->self; 725 } 726 727 opal_pci_eeh_freeze_set(phb->opal_id, pe->pe_number, 728 OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); 729 730 /* Disassociate PE in PELT */ 731 rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number, 732 pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN); 733 if (rc) 734 pe_warn(pe, "OPAL error %ld remove self from PELTV\n", rc); 735 rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid, 736 bcomp, dcomp, fcomp, OPAL_UNMAP_PE); 737 if (rc) 738 pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc); 739 740 pe->pbus = NULL; 741 pe->pdev = NULL; 742 pe->parent_dev = NULL; 743 744 return 0; 745 } 746 #endif /* CONFIG_PCI_IOV */ 747 748 static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) 749 { 750 struct pci_dev *parent; 751 uint8_t bcomp, dcomp, fcomp; 752 long rc, rid_end, rid; 753 754 /* Bus validation ? */ 755 if (pe->pbus) { 756 int count; 757 758 dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER; 759 fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER; 760 parent = pe->pbus->self; 761 if (pe->flags & PNV_IODA_PE_BUS_ALL) 762 count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1; 763 else 764 count = 1; 765 766 switch(count) { 767 case 1: bcomp = OpalPciBusAll; break; 768 case 2: bcomp = OpalPciBus7Bits; break; 769 case 4: bcomp = OpalPciBus6Bits; break; 770 case 8: bcomp = OpalPciBus5Bits; break; 771 case 16: bcomp = OpalPciBus4Bits; break; 772 case 32: bcomp = OpalPciBus3Bits; break; 773 default: 774 dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n", 775 count); 776 /* Do an exact match only */ 777 bcomp = OpalPciBusAll; 778 } 779 rid_end = pe->rid + (count << 8); 780 } else { 781 #ifdef CONFIG_PCI_IOV 782 if (pe->flags & PNV_IODA_PE_VF) 783 parent = pe->parent_dev; 784 else 785 #endif /* CONFIG_PCI_IOV */ 786 parent = pe->pdev->bus->self; 787 bcomp = OpalPciBusAll; 788 dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER; 789 fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER; 790 rid_end = pe->rid + 1; 791 } 792 793 /* 794 * Associate PE in PELT. We need add the PE into the 795 * corresponding PELT-V as well. Otherwise, the error 796 * originated from the PE might contribute to other 797 * PEs. 798 */ 799 rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid, 800 bcomp, dcomp, fcomp, OPAL_MAP_PE); 801 if (rc) { 802 pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc); 803 return -ENXIO; 804 } 805 806 /* Configure PELTV */ 807 pnv_ioda_set_peltv(phb, pe, true); 808 809 /* Setup reverse map */ 810 for (rid = pe->rid; rid < rid_end; rid++) 811 phb->ioda.pe_rmap[rid] = pe->pe_number; 812 813 /* Setup one MVTs on IODA1 */ 814 if (phb->type != PNV_PHB_IODA1) { 815 pe->mve_number = 0; 816 goto out; 817 } 818 819 pe->mve_number = pe->pe_number; 820 rc = opal_pci_set_mve(phb->opal_id, pe->mve_number, pe->pe_number); 821 if (rc != OPAL_SUCCESS) { 822 pe_err(pe, "OPAL error %ld setting up MVE %d\n", 823 rc, pe->mve_number); 824 pe->mve_number = -1; 825 } else { 826 rc = opal_pci_set_mve_enable(phb->opal_id, 827 pe->mve_number, OPAL_ENABLE_MVE); 828 if (rc) { 829 pe_err(pe, "OPAL error %ld enabling MVE %d\n", 830 rc, pe->mve_number); 831 pe->mve_number = -1; 832 } 833 } 834 835 out: 836 return 0; 837 } 838 839 static void pnv_ioda_link_pe_by_weight(struct pnv_phb *phb, 840 struct pnv_ioda_pe *pe) 841 { 842 struct pnv_ioda_pe *lpe; 843 844 list_for_each_entry(lpe, &phb->ioda.pe_dma_list, dma_link) { 845 if (lpe->dma_weight < pe->dma_weight) { 846 list_add_tail(&pe->dma_link, &lpe->dma_link); 847 return; 848 } 849 } 850 list_add_tail(&pe->dma_link, &phb->ioda.pe_dma_list); 851 } 852 853 static unsigned int pnv_ioda_dma_weight(struct pci_dev *dev) 854 { 855 /* This is quite simplistic. The "base" weight of a device 856 * is 10. 0 means no DMA is to be accounted for it. 857 */ 858 859 /* If it's a bridge, no DMA */ 860 if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) 861 return 0; 862 863 /* Reduce the weight of slow USB controllers */ 864 if (dev->class == PCI_CLASS_SERIAL_USB_UHCI || 865 dev->class == PCI_CLASS_SERIAL_USB_OHCI || 866 dev->class == PCI_CLASS_SERIAL_USB_EHCI) 867 return 3; 868 869 /* Increase the weight of RAID (includes Obsidian) */ 870 if ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID) 871 return 15; 872 873 /* Default */ 874 return 10; 875 } 876 877 #ifdef CONFIG_PCI_IOV 878 static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset) 879 { 880 struct pci_dn *pdn = pci_get_pdn(dev); 881 int i; 882 struct resource *res, res2; 883 resource_size_t size; 884 u16 num_vfs; 885 886 if (!dev->is_physfn) 887 return -EINVAL; 888 889 /* 890 * "offset" is in VFs. The M64 windows are sized so that when they 891 * are segmented, each segment is the same size as the IOV BAR. 892 * Each segment is in a separate PE, and the high order bits of the 893 * address are the PE number. Therefore, each VF's BAR is in a 894 * separate PE, and changing the IOV BAR start address changes the 895 * range of PEs the VFs are in. 896 */ 897 num_vfs = pdn->num_vfs; 898 for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { 899 res = &dev->resource[i + PCI_IOV_RESOURCES]; 900 if (!res->flags || !res->parent) 901 continue; 902 903 if (!pnv_pci_is_mem_pref_64(res->flags)) 904 continue; 905 906 /* 907 * The actual IOV BAR range is determined by the start address 908 * and the actual size for num_vfs VFs BAR. This check is to 909 * make sure that after shifting, the range will not overlap 910 * with another device. 911 */ 912 size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES); 913 res2.flags = res->flags; 914 res2.start = res->start + (size * offset); 915 res2.end = res2.start + (size * num_vfs) - 1; 916 917 if (res2.end > res->end) { 918 dev_err(&dev->dev, "VF BAR%d: %pR would extend past %pR (trying to enable %d VFs shifted by %d)\n", 919 i, &res2, res, num_vfs, offset); 920 return -EBUSY; 921 } 922 } 923 924 /* 925 * After doing so, there would be a "hole" in the /proc/iomem when 926 * offset is a positive value. It looks like the device return some 927 * mmio back to the system, which actually no one could use it. 928 */ 929 for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { 930 res = &dev->resource[i + PCI_IOV_RESOURCES]; 931 if (!res->flags || !res->parent) 932 continue; 933 934 if (!pnv_pci_is_mem_pref_64(res->flags)) 935 continue; 936 937 size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES); 938 res2 = *res; 939 res->start += size * offset; 940 941 dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (enabling %d VFs shifted by %d)\n", 942 i, &res2, res, num_vfs, offset); 943 pci_update_resource(dev, i + PCI_IOV_RESOURCES); 944 } 945 return 0; 946 } 947 #endif /* CONFIG_PCI_IOV */ 948 949 #if 0 950 static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) 951 { 952 struct pci_controller *hose = pci_bus_to_host(dev->bus); 953 struct pnv_phb *phb = hose->private_data; 954 struct pci_dn *pdn = pci_get_pdn(dev); 955 struct pnv_ioda_pe *pe; 956 int pe_num; 957 958 if (!pdn) { 959 pr_err("%s: Device tree node not associated properly\n", 960 pci_name(dev)); 961 return NULL; 962 } 963 if (pdn->pe_number != IODA_INVALID_PE) 964 return NULL; 965 966 /* PE#0 has been pre-set */ 967 if (dev->bus->number == 0) 968 pe_num = 0; 969 else 970 pe_num = pnv_ioda_alloc_pe(phb); 971 if (pe_num == IODA_INVALID_PE) { 972 pr_warning("%s: Not enough PE# available, disabling device\n", 973 pci_name(dev)); 974 return NULL; 975 } 976 977 /* NOTE: We get only one ref to the pci_dev for the pdn, not for the 978 * pointer in the PE data structure, both should be destroyed at the 979 * same time. However, this needs to be looked at more closely again 980 * once we actually start removing things (Hotplug, SR-IOV, ...) 981 * 982 * At some point we want to remove the PDN completely anyways 983 */ 984 pe = &phb->ioda.pe_array[pe_num]; 985 pci_dev_get(dev); 986 pdn->pcidev = dev; 987 pdn->pe_number = pe_num; 988 pe->pdev = dev; 989 pe->pbus = NULL; 990 pe->tce32_seg = -1; 991 pe->mve_number = -1; 992 pe->rid = dev->bus->number << 8 | pdn->devfn; 993 994 pe_info(pe, "Associated device to PE\n"); 995 996 if (pnv_ioda_configure_pe(phb, pe)) { 997 /* XXX What do we do here ? */ 998 if (pe_num) 999 pnv_ioda_free_pe(phb, pe_num); 1000 pdn->pe_number = IODA_INVALID_PE; 1001 pe->pdev = NULL; 1002 pci_dev_put(dev); 1003 return NULL; 1004 } 1005 1006 /* Assign a DMA weight to the device */ 1007 pe->dma_weight = pnv_ioda_dma_weight(dev); 1008 if (pe->dma_weight != 0) { 1009 phb->ioda.dma_weight += pe->dma_weight; 1010 phb->ioda.dma_pe_count++; 1011 } 1012 1013 /* Link the PE */ 1014 pnv_ioda_link_pe_by_weight(phb, pe); 1015 1016 return pe; 1017 } 1018 #endif /* Useful for SRIOV case */ 1019 1020 static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe) 1021 { 1022 struct pci_dev *dev; 1023 1024 list_for_each_entry(dev, &bus->devices, bus_list) { 1025 struct pci_dn *pdn = pci_get_pdn(dev); 1026 1027 if (pdn == NULL) { 1028 pr_warn("%s: No device node associated with device !\n", 1029 pci_name(dev)); 1030 continue; 1031 } 1032 pdn->pe_number = pe->pe_number; 1033 pe->dma_weight += pnv_ioda_dma_weight(dev); 1034 if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate) 1035 pnv_ioda_setup_same_PE(dev->subordinate, pe); 1036 } 1037 } 1038 1039 /* 1040 * There're 2 types of PCI bus sensitive PEs: One that is compromised of 1041 * single PCI bus. Another one that contains the primary PCI bus and its 1042 * subordinate PCI devices and buses. The second type of PE is normally 1043 * orgiriated by PCIe-to-PCI bridge or PLX switch downstream ports. 1044 */ 1045 static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all) 1046 { 1047 struct pci_controller *hose = pci_bus_to_host(bus); 1048 struct pnv_phb *phb = hose->private_data; 1049 struct pnv_ioda_pe *pe; 1050 int pe_num = IODA_INVALID_PE; 1051 1052 /* Check if PE is determined by M64 */ 1053 if (phb->pick_m64_pe) 1054 pe_num = phb->pick_m64_pe(phb, bus, all); 1055 1056 /* The PE number isn't pinned by M64 */ 1057 if (pe_num == IODA_INVALID_PE) 1058 pe_num = pnv_ioda_alloc_pe(phb); 1059 1060 if (pe_num == IODA_INVALID_PE) { 1061 pr_warning("%s: Not enough PE# available for PCI bus %04x:%02x\n", 1062 __func__, pci_domain_nr(bus), bus->number); 1063 return; 1064 } 1065 1066 pe = &phb->ioda.pe_array[pe_num]; 1067 pe->flags |= (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS); 1068 pe->pbus = bus; 1069 pe->pdev = NULL; 1070 pe->tce32_seg = -1; 1071 pe->mve_number = -1; 1072 pe->rid = bus->busn_res.start << 8; 1073 pe->dma_weight = 0; 1074 1075 if (all) 1076 pe_info(pe, "Secondary bus %d..%d associated with PE#%d\n", 1077 bus->busn_res.start, bus->busn_res.end, pe_num); 1078 else 1079 pe_info(pe, "Secondary bus %d associated with PE#%d\n", 1080 bus->busn_res.start, pe_num); 1081 1082 if (pnv_ioda_configure_pe(phb, pe)) { 1083 /* XXX What do we do here ? */ 1084 if (pe_num) 1085 pnv_ioda_free_pe(phb, pe_num); 1086 pe->pbus = NULL; 1087 return; 1088 } 1089 1090 /* Associate it with all child devices */ 1091 pnv_ioda_setup_same_PE(bus, pe); 1092 1093 /* Put PE to the list */ 1094 list_add_tail(&pe->list, &phb->ioda.pe_list); 1095 1096 /* Account for one DMA PE if at least one DMA capable device exist 1097 * below the bridge 1098 */ 1099 if (pe->dma_weight != 0) { 1100 phb->ioda.dma_weight += pe->dma_weight; 1101 phb->ioda.dma_pe_count++; 1102 } 1103 1104 /* Link the PE */ 1105 pnv_ioda_link_pe_by_weight(phb, pe); 1106 } 1107 1108 static void pnv_ioda_setup_PEs(struct pci_bus *bus) 1109 { 1110 struct pci_dev *dev; 1111 1112 pnv_ioda_setup_bus_PE(bus, 0); 1113 1114 list_for_each_entry(dev, &bus->devices, bus_list) { 1115 if (dev->subordinate) { 1116 if (pci_pcie_type(dev) == PCI_EXP_TYPE_PCI_BRIDGE) 1117 pnv_ioda_setup_bus_PE(dev->subordinate, 1); 1118 else 1119 pnv_ioda_setup_PEs(dev->subordinate); 1120 } 1121 } 1122 } 1123 1124 /* 1125 * Configure PEs so that the downstream PCI buses and devices 1126 * could have their associated PE#. Unfortunately, we didn't 1127 * figure out the way to identify the PLX bridge yet. So we 1128 * simply put the PCI bus and the subordinate behind the root 1129 * port to PE# here. The game rule here is expected to be changed 1130 * as soon as we can detected PLX bridge correctly. 1131 */ 1132 static void pnv_pci_ioda_setup_PEs(void) 1133 { 1134 struct pci_controller *hose, *tmp; 1135 struct pnv_phb *phb; 1136 1137 list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { 1138 phb = hose->private_data; 1139 1140 /* M64 layout might affect PE allocation */ 1141 if (phb->reserve_m64_pe) 1142 phb->reserve_m64_pe(phb); 1143 1144 pnv_ioda_setup_PEs(hose->bus); 1145 } 1146 } 1147 1148 #ifdef CONFIG_PCI_IOV 1149 static int pnv_pci_vf_release_m64(struct pci_dev *pdev) 1150 { 1151 struct pci_bus *bus; 1152 struct pci_controller *hose; 1153 struct pnv_phb *phb; 1154 struct pci_dn *pdn; 1155 int i, j; 1156 1157 bus = pdev->bus; 1158 hose = pci_bus_to_host(bus); 1159 phb = hose->private_data; 1160 pdn = pci_get_pdn(pdev); 1161 1162 for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) 1163 for (j = 0; j < M64_PER_IOV; j++) { 1164 if (pdn->m64_wins[i][j] == IODA_INVALID_M64) 1165 continue; 1166 opal_pci_phb_mmio_enable(phb->opal_id, 1167 OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 0); 1168 clear_bit(pdn->m64_wins[i][j], &phb->ioda.m64_bar_alloc); 1169 pdn->m64_wins[i][j] = IODA_INVALID_M64; 1170 } 1171 1172 return 0; 1173 } 1174 1175 static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) 1176 { 1177 struct pci_bus *bus; 1178 struct pci_controller *hose; 1179 struct pnv_phb *phb; 1180 struct pci_dn *pdn; 1181 unsigned int win; 1182 struct resource *res; 1183 int i, j; 1184 int64_t rc; 1185 int total_vfs; 1186 resource_size_t size, start; 1187 int pe_num; 1188 int vf_groups; 1189 int vf_per_group; 1190 1191 bus = pdev->bus; 1192 hose = pci_bus_to_host(bus); 1193 phb = hose->private_data; 1194 pdn = pci_get_pdn(pdev); 1195 total_vfs = pci_sriov_get_totalvfs(pdev); 1196 1197 /* Initialize the m64_wins to IODA_INVALID_M64 */ 1198 for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) 1199 for (j = 0; j < M64_PER_IOV; j++) 1200 pdn->m64_wins[i][j] = IODA_INVALID_M64; 1201 1202 if (pdn->m64_per_iov == M64_PER_IOV) { 1203 vf_groups = (num_vfs <= M64_PER_IOV) ? num_vfs: M64_PER_IOV; 1204 vf_per_group = (num_vfs <= M64_PER_IOV)? 1: 1205 roundup_pow_of_two(num_vfs) / pdn->m64_per_iov; 1206 } else { 1207 vf_groups = 1; 1208 vf_per_group = 1; 1209 } 1210 1211 for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { 1212 res = &pdev->resource[i + PCI_IOV_RESOURCES]; 1213 if (!res->flags || !res->parent) 1214 continue; 1215 1216 if (!pnv_pci_is_mem_pref_64(res->flags)) 1217 continue; 1218 1219 for (j = 0; j < vf_groups; j++) { 1220 do { 1221 win = find_next_zero_bit(&phb->ioda.m64_bar_alloc, 1222 phb->ioda.m64_bar_idx + 1, 0); 1223 1224 if (win >= phb->ioda.m64_bar_idx + 1) 1225 goto m64_failed; 1226 } while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc)); 1227 1228 pdn->m64_wins[i][j] = win; 1229 1230 if (pdn->m64_per_iov == M64_PER_IOV) { 1231 size = pci_iov_resource_size(pdev, 1232 PCI_IOV_RESOURCES + i); 1233 size = size * vf_per_group; 1234 start = res->start + size * j; 1235 } else { 1236 size = resource_size(res); 1237 start = res->start; 1238 } 1239 1240 /* Map the M64 here */ 1241 if (pdn->m64_per_iov == M64_PER_IOV) { 1242 pe_num = pdn->offset + j; 1243 rc = opal_pci_map_pe_mmio_window(phb->opal_id, 1244 pe_num, OPAL_M64_WINDOW_TYPE, 1245 pdn->m64_wins[i][j], 0); 1246 } 1247 1248 rc = opal_pci_set_phb_mem_window(phb->opal_id, 1249 OPAL_M64_WINDOW_TYPE, 1250 pdn->m64_wins[i][j], 1251 start, 1252 0, /* unused */ 1253 size); 1254 1255 1256 if (rc != OPAL_SUCCESS) { 1257 dev_err(&pdev->dev, "Failed to map M64 window #%d: %lld\n", 1258 win, rc); 1259 goto m64_failed; 1260 } 1261 1262 if (pdn->m64_per_iov == M64_PER_IOV) 1263 rc = opal_pci_phb_mmio_enable(phb->opal_id, 1264 OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 2); 1265 else 1266 rc = opal_pci_phb_mmio_enable(phb->opal_id, 1267 OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 1); 1268 1269 if (rc != OPAL_SUCCESS) { 1270 dev_err(&pdev->dev, "Failed to enable M64 window #%d: %llx\n", 1271 win, rc); 1272 goto m64_failed; 1273 } 1274 } 1275 } 1276 return 0; 1277 1278 m64_failed: 1279 pnv_pci_vf_release_m64(pdev); 1280 return -EBUSY; 1281 } 1282 1283 static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe) 1284 { 1285 struct pci_bus *bus; 1286 struct pci_controller *hose; 1287 struct pnv_phb *phb; 1288 struct iommu_table *tbl; 1289 unsigned long addr; 1290 int64_t rc; 1291 1292 bus = dev->bus; 1293 hose = pci_bus_to_host(bus); 1294 phb = hose->private_data; 1295 tbl = pe->table_group.tables[0]; 1296 addr = tbl->it_base; 1297 1298 opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number, 1299 pe->pe_number << 1, 1, __pa(addr), 1300 0, 0x1000); 1301 1302 rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id, 1303 pe->pe_number, 1304 (pe->pe_number << 1) + 1, 1305 pe->tce_bypass_base, 1306 0); 1307 if (rc) 1308 pe_warn(pe, "OPAL error %ld release DMA window\n", rc); 1309 1310 pnv_pci_unlink_table_and_group(tbl, &pe->table_group); 1311 if (pe->table_group.group) { 1312 iommu_group_put(pe->table_group.group); 1313 BUG_ON(pe->table_group.group); 1314 } 1315 iommu_free_table(tbl, of_node_full_name(dev->dev.of_node)); 1316 free_pages(addr, get_order(TCE32_TABLE_SIZE)); 1317 } 1318 1319 static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs) 1320 { 1321 struct pci_bus *bus; 1322 struct pci_controller *hose; 1323 struct pnv_phb *phb; 1324 struct pnv_ioda_pe *pe, *pe_n; 1325 struct pci_dn *pdn; 1326 u16 vf_index; 1327 int64_t rc; 1328 1329 bus = pdev->bus; 1330 hose = pci_bus_to_host(bus); 1331 phb = hose->private_data; 1332 pdn = pci_get_pdn(pdev); 1333 1334 if (!pdev->is_physfn) 1335 return; 1336 1337 if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) { 1338 int vf_group; 1339 int vf_per_group; 1340 int vf_index1; 1341 1342 vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov; 1343 1344 for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++) 1345 for (vf_index = vf_group * vf_per_group; 1346 vf_index < (vf_group + 1) * vf_per_group && 1347 vf_index < num_vfs; 1348 vf_index++) 1349 for (vf_index1 = vf_group * vf_per_group; 1350 vf_index1 < (vf_group + 1) * vf_per_group && 1351 vf_index1 < num_vfs; 1352 vf_index1++){ 1353 1354 rc = opal_pci_set_peltv(phb->opal_id, 1355 pdn->offset + vf_index, 1356 pdn->offset + vf_index1, 1357 OPAL_REMOVE_PE_FROM_DOMAIN); 1358 1359 if (rc) 1360 dev_warn(&pdev->dev, "%s: Failed to unlink same group PE#%d(%lld)\n", 1361 __func__, 1362 pdn->offset + vf_index1, rc); 1363 } 1364 } 1365 1366 list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) { 1367 if (pe->parent_dev != pdev) 1368 continue; 1369 1370 pnv_pci_ioda2_release_dma_pe(pdev, pe); 1371 1372 /* Remove from list */ 1373 mutex_lock(&phb->ioda.pe_list_mutex); 1374 list_del(&pe->list); 1375 mutex_unlock(&phb->ioda.pe_list_mutex); 1376 1377 pnv_ioda_deconfigure_pe(phb, pe); 1378 1379 pnv_ioda_free_pe(phb, pe->pe_number); 1380 } 1381 } 1382 1383 void pnv_pci_sriov_disable(struct pci_dev *pdev) 1384 { 1385 struct pci_bus *bus; 1386 struct pci_controller *hose; 1387 struct pnv_phb *phb; 1388 struct pci_dn *pdn; 1389 struct pci_sriov *iov; 1390 u16 num_vfs; 1391 1392 bus = pdev->bus; 1393 hose = pci_bus_to_host(bus); 1394 phb = hose->private_data; 1395 pdn = pci_get_pdn(pdev); 1396 iov = pdev->sriov; 1397 num_vfs = pdn->num_vfs; 1398 1399 /* Release VF PEs */ 1400 pnv_ioda_release_vf_PE(pdev, num_vfs); 1401 1402 if (phb->type == PNV_PHB_IODA2) { 1403 if (pdn->m64_per_iov == 1) 1404 pnv_pci_vf_resource_shift(pdev, -pdn->offset); 1405 1406 /* Release M64 windows */ 1407 pnv_pci_vf_release_m64(pdev); 1408 1409 /* Release PE numbers */ 1410 bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs); 1411 pdn->offset = 0; 1412 } 1413 } 1414 1415 static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, 1416 struct pnv_ioda_pe *pe); 1417 static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) 1418 { 1419 struct pci_bus *bus; 1420 struct pci_controller *hose; 1421 struct pnv_phb *phb; 1422 struct pnv_ioda_pe *pe; 1423 int pe_num; 1424 u16 vf_index; 1425 struct pci_dn *pdn; 1426 int64_t rc; 1427 1428 bus = pdev->bus; 1429 hose = pci_bus_to_host(bus); 1430 phb = hose->private_data; 1431 pdn = pci_get_pdn(pdev); 1432 1433 if (!pdev->is_physfn) 1434 return; 1435 1436 /* Reserve PE for each VF */ 1437 for (vf_index = 0; vf_index < num_vfs; vf_index++) { 1438 pe_num = pdn->offset + vf_index; 1439 1440 pe = &phb->ioda.pe_array[pe_num]; 1441 pe->pe_number = pe_num; 1442 pe->phb = phb; 1443 pe->flags = PNV_IODA_PE_VF; 1444 pe->pbus = NULL; 1445 pe->parent_dev = pdev; 1446 pe->tce32_seg = -1; 1447 pe->mve_number = -1; 1448 pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) | 1449 pci_iov_virtfn_devfn(pdev, vf_index); 1450 1451 pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%d\n", 1452 hose->global_number, pdev->bus->number, 1453 PCI_SLOT(pci_iov_virtfn_devfn(pdev, vf_index)), 1454 PCI_FUNC(pci_iov_virtfn_devfn(pdev, vf_index)), pe_num); 1455 1456 if (pnv_ioda_configure_pe(phb, pe)) { 1457 /* XXX What do we do here ? */ 1458 if (pe_num) 1459 pnv_ioda_free_pe(phb, pe_num); 1460 pe->pdev = NULL; 1461 continue; 1462 } 1463 1464 /* Put PE to the list */ 1465 mutex_lock(&phb->ioda.pe_list_mutex); 1466 list_add_tail(&pe->list, &phb->ioda.pe_list); 1467 mutex_unlock(&phb->ioda.pe_list_mutex); 1468 1469 pnv_pci_ioda2_setup_dma_pe(phb, pe); 1470 } 1471 1472 if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) { 1473 int vf_group; 1474 int vf_per_group; 1475 int vf_index1; 1476 1477 vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov; 1478 1479 for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++) { 1480 for (vf_index = vf_group * vf_per_group; 1481 vf_index < (vf_group + 1) * vf_per_group && 1482 vf_index < num_vfs; 1483 vf_index++) { 1484 for (vf_index1 = vf_group * vf_per_group; 1485 vf_index1 < (vf_group + 1) * vf_per_group && 1486 vf_index1 < num_vfs; 1487 vf_index1++) { 1488 1489 rc = opal_pci_set_peltv(phb->opal_id, 1490 pdn->offset + vf_index, 1491 pdn->offset + vf_index1, 1492 OPAL_ADD_PE_TO_DOMAIN); 1493 1494 if (rc) 1495 dev_warn(&pdev->dev, "%s: Failed to link same group PE#%d(%lld)\n", 1496 __func__, 1497 pdn->offset + vf_index1, rc); 1498 } 1499 } 1500 } 1501 } 1502 } 1503 1504 int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) 1505 { 1506 struct pci_bus *bus; 1507 struct pci_controller *hose; 1508 struct pnv_phb *phb; 1509 struct pci_dn *pdn; 1510 int ret; 1511 1512 bus = pdev->bus; 1513 hose = pci_bus_to_host(bus); 1514 phb = hose->private_data; 1515 pdn = pci_get_pdn(pdev); 1516 1517 if (phb->type == PNV_PHB_IODA2) { 1518 /* Calculate available PE for required VFs */ 1519 mutex_lock(&phb->ioda.pe_alloc_mutex); 1520 pdn->offset = bitmap_find_next_zero_area( 1521 phb->ioda.pe_alloc, phb->ioda.total_pe, 1522 0, num_vfs, 0); 1523 if (pdn->offset >= phb->ioda.total_pe) { 1524 mutex_unlock(&phb->ioda.pe_alloc_mutex); 1525 dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs); 1526 pdn->offset = 0; 1527 return -EBUSY; 1528 } 1529 bitmap_set(phb->ioda.pe_alloc, pdn->offset, num_vfs); 1530 pdn->num_vfs = num_vfs; 1531 mutex_unlock(&phb->ioda.pe_alloc_mutex); 1532 1533 /* Assign M64 window accordingly */ 1534 ret = pnv_pci_vf_assign_m64(pdev, num_vfs); 1535 if (ret) { 1536 dev_info(&pdev->dev, "Not enough M64 window resources\n"); 1537 goto m64_failed; 1538 } 1539 1540 /* 1541 * When using one M64 BAR to map one IOV BAR, we need to shift 1542 * the IOV BAR according to the PE# allocated to the VFs. 1543 * Otherwise, the PE# for the VF will conflict with others. 1544 */ 1545 if (pdn->m64_per_iov == 1) { 1546 ret = pnv_pci_vf_resource_shift(pdev, pdn->offset); 1547 if (ret) 1548 goto m64_failed; 1549 } 1550 } 1551 1552 /* Setup VF PEs */ 1553 pnv_ioda_setup_vf_PE(pdev, num_vfs); 1554 1555 return 0; 1556 1557 m64_failed: 1558 bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs); 1559 pdn->offset = 0; 1560 1561 return ret; 1562 } 1563 1564 int pcibios_sriov_disable(struct pci_dev *pdev) 1565 { 1566 pnv_pci_sriov_disable(pdev); 1567 1568 /* Release PCI data */ 1569 remove_dev_pci_data(pdev); 1570 return 0; 1571 } 1572 1573 int pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs) 1574 { 1575 /* Allocate PCI data */ 1576 add_dev_pci_data(pdev); 1577 1578 pnv_pci_sriov_enable(pdev, num_vfs); 1579 return 0; 1580 } 1581 #endif /* CONFIG_PCI_IOV */ 1582 1583 static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev) 1584 { 1585 struct pci_dn *pdn = pci_get_pdn(pdev); 1586 struct pnv_ioda_pe *pe; 1587 1588 /* 1589 * The function can be called while the PE# 1590 * hasn't been assigned. Do nothing for the 1591 * case. 1592 */ 1593 if (!pdn || pdn->pe_number == IODA_INVALID_PE) 1594 return; 1595 1596 pe = &phb->ioda.pe_array[pdn->pe_number]; 1597 WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops); 1598 set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]); 1599 /* 1600 * Note: iommu_add_device() will fail here as 1601 * for physical PE: the device is already added by now; 1602 * for virtual PE: sysfs entries are not ready yet and 1603 * tce_iommu_bus_notifier will add the device to a group later. 1604 */ 1605 } 1606 1607 static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) 1608 { 1609 struct pci_controller *hose = pci_bus_to_host(pdev->bus); 1610 struct pnv_phb *phb = hose->private_data; 1611 struct pci_dn *pdn = pci_get_pdn(pdev); 1612 struct pnv_ioda_pe *pe; 1613 uint64_t top; 1614 bool bypass = false; 1615 1616 if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) 1617 return -ENODEV;; 1618 1619 pe = &phb->ioda.pe_array[pdn->pe_number]; 1620 if (pe->tce_bypass_enabled) { 1621 top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1; 1622 bypass = (dma_mask >= top); 1623 } 1624 1625 if (bypass) { 1626 dev_info(&pdev->dev, "Using 64-bit DMA iommu bypass\n"); 1627 set_dma_ops(&pdev->dev, &dma_direct_ops); 1628 set_dma_offset(&pdev->dev, pe->tce_bypass_base); 1629 } else { 1630 dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n"); 1631 set_dma_ops(&pdev->dev, &dma_iommu_ops); 1632 set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]); 1633 } 1634 *pdev->dev.dma_mask = dma_mask; 1635 return 0; 1636 } 1637 1638 static u64 pnv_pci_ioda_dma_get_required_mask(struct pnv_phb *phb, 1639 struct pci_dev *pdev) 1640 { 1641 struct pci_dn *pdn = pci_get_pdn(pdev); 1642 struct pnv_ioda_pe *pe; 1643 u64 end, mask; 1644 1645 if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) 1646 return 0; 1647 1648 pe = &phb->ioda.pe_array[pdn->pe_number]; 1649 if (!pe->tce_bypass_enabled) 1650 return __dma_get_required_mask(&pdev->dev); 1651 1652 1653 end = pe->tce_bypass_base + memblock_end_of_DRAM(); 1654 mask = 1ULL << (fls64(end) - 1); 1655 mask += mask - 1; 1656 1657 return mask; 1658 } 1659 1660 static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, 1661 struct pci_bus *bus) 1662 { 1663 struct pci_dev *dev; 1664 1665 list_for_each_entry(dev, &bus->devices, bus_list) { 1666 set_iommu_table_base(&dev->dev, pe->table_group.tables[0]); 1667 iommu_add_device(&dev->dev); 1668 1669 if (dev->subordinate) 1670 pnv_ioda_setup_bus_dma(pe, dev->subordinate); 1671 } 1672 } 1673 1674 static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl, 1675 unsigned long index, unsigned long npages, bool rm) 1676 { 1677 struct iommu_table_group_link *tgl = list_first_entry_or_null( 1678 &tbl->it_group_list, struct iommu_table_group_link, 1679 next); 1680 struct pnv_ioda_pe *pe = container_of(tgl->table_group, 1681 struct pnv_ioda_pe, table_group); 1682 __be64 __iomem *invalidate = rm ? 1683 (__be64 __iomem *)pe->tce_inval_reg_phys : 1684 (__be64 __iomem *)tbl->it_index; 1685 unsigned long start, end, inc; 1686 const unsigned shift = tbl->it_page_shift; 1687 1688 start = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset); 1689 end = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset + 1690 npages - 1); 1691 1692 /* BML uses this case for p6/p7/galaxy2: Shift addr and put in node */ 1693 if (tbl->it_busno) { 1694 start <<= shift; 1695 end <<= shift; 1696 inc = 128ull << shift; 1697 start |= tbl->it_busno; 1698 end |= tbl->it_busno; 1699 } else if (tbl->it_type & TCE_PCI_SWINV_PAIR) { 1700 /* p7ioc-style invalidation, 2 TCEs per write */ 1701 start |= (1ull << 63); 1702 end |= (1ull << 63); 1703 inc = 16; 1704 } else { 1705 /* Default (older HW) */ 1706 inc = 128; 1707 } 1708 1709 end |= inc - 1; /* round up end to be different than start */ 1710 1711 mb(); /* Ensure above stores are visible */ 1712 while (start <= end) { 1713 if (rm) 1714 __raw_rm_writeq(cpu_to_be64(start), invalidate); 1715 else 1716 __raw_writeq(cpu_to_be64(start), invalidate); 1717 start += inc; 1718 } 1719 1720 /* 1721 * The iommu layer will do another mb() for us on build() 1722 * and we don't care on free() 1723 */ 1724 } 1725 1726 static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index, 1727 long npages, unsigned long uaddr, 1728 enum dma_data_direction direction, 1729 struct dma_attrs *attrs) 1730 { 1731 int ret = pnv_tce_build(tbl, index, npages, uaddr, direction, 1732 attrs); 1733 1734 if (!ret && (tbl->it_type & TCE_PCI_SWINV_CREATE)) 1735 pnv_pci_ioda1_tce_invalidate(tbl, index, npages, false); 1736 1737 return ret; 1738 } 1739 1740 static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index, 1741 long npages) 1742 { 1743 pnv_tce_free(tbl, index, npages); 1744 1745 if (tbl->it_type & TCE_PCI_SWINV_FREE) 1746 pnv_pci_ioda1_tce_invalidate(tbl, index, npages, false); 1747 } 1748 1749 static struct iommu_table_ops pnv_ioda1_iommu_ops = { 1750 .set = pnv_ioda1_tce_build, 1751 .clear = pnv_ioda1_tce_free, 1752 .get = pnv_tce_get, 1753 }; 1754 1755 static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl, 1756 unsigned long index, unsigned long npages, bool rm) 1757 { 1758 struct iommu_table_group_link *tgl = list_first_entry_or_null( 1759 &tbl->it_group_list, struct iommu_table_group_link, 1760 next); 1761 struct pnv_ioda_pe *pe = container_of(tgl->table_group, 1762 struct pnv_ioda_pe, table_group); 1763 unsigned long start, end, inc; 1764 __be64 __iomem *invalidate = rm ? 1765 (__be64 __iomem *)pe->tce_inval_reg_phys : 1766 (__be64 __iomem *)tbl->it_index; 1767 const unsigned shift = tbl->it_page_shift; 1768 1769 /* We'll invalidate DMA address in PE scope */ 1770 start = 0x2ull << 60; 1771 start |= (pe->pe_number & 0xFF); 1772 end = start; 1773 1774 /* Figure out the start, end and step */ 1775 start |= (index << shift); 1776 end |= ((index + npages - 1) << shift); 1777 inc = (0x1ull << shift); 1778 mb(); 1779 1780 while (start <= end) { 1781 if (rm) 1782 __raw_rm_writeq(cpu_to_be64(start), invalidate); 1783 else 1784 __raw_writeq(cpu_to_be64(start), invalidate); 1785 start += inc; 1786 } 1787 } 1788 1789 static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index, 1790 long npages, unsigned long uaddr, 1791 enum dma_data_direction direction, 1792 struct dma_attrs *attrs) 1793 { 1794 int ret = pnv_tce_build(tbl, index, npages, uaddr, direction, 1795 attrs); 1796 1797 if (!ret && (tbl->it_type & TCE_PCI_SWINV_CREATE)) 1798 pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false); 1799 1800 return ret; 1801 } 1802 1803 static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index, 1804 long npages) 1805 { 1806 pnv_tce_free(tbl, index, npages); 1807 1808 if (tbl->it_type & TCE_PCI_SWINV_FREE) 1809 pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false); 1810 } 1811 1812 static struct iommu_table_ops pnv_ioda2_iommu_ops = { 1813 .set = pnv_ioda2_tce_build, 1814 .clear = pnv_ioda2_tce_free, 1815 .get = pnv_tce_get, 1816 }; 1817 1818 static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, 1819 struct pnv_ioda_pe *pe, unsigned int base, 1820 unsigned int segs) 1821 { 1822 1823 struct page *tce_mem = NULL; 1824 const __be64 *swinvp; 1825 struct iommu_table *tbl; 1826 unsigned int i; 1827 int64_t rc; 1828 void *addr; 1829 1830 /* XXX FIXME: Handle 64-bit only DMA devices */ 1831 /* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. */ 1832 /* XXX FIXME: Allocate multi-level tables on PHB3 */ 1833 1834 /* We shouldn't already have a 32-bit DMA associated */ 1835 if (WARN_ON(pe->tce32_seg >= 0)) 1836 return; 1837 1838 tbl = pnv_pci_table_alloc(phb->hose->node); 1839 iommu_register_group(&pe->table_group, phb->hose->global_number, 1840 pe->pe_number); 1841 pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group); 1842 1843 /* Grab a 32-bit TCE table */ 1844 pe->tce32_seg = base; 1845 pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n", 1846 (base << 28), ((base + segs) << 28) - 1); 1847 1848 /* XXX Currently, we allocate one big contiguous table for the 1849 * TCEs. We only really need one chunk per 256M of TCE space 1850 * (ie per segment) but that's an optimization for later, it 1851 * requires some added smarts with our get/put_tce implementation 1852 */ 1853 tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL, 1854 get_order(TCE32_TABLE_SIZE * segs)); 1855 if (!tce_mem) { 1856 pe_err(pe, " Failed to allocate a 32-bit TCE memory\n"); 1857 goto fail; 1858 } 1859 addr = page_address(tce_mem); 1860 memset(addr, 0, TCE32_TABLE_SIZE * segs); 1861 1862 /* Configure HW */ 1863 for (i = 0; i < segs; i++) { 1864 rc = opal_pci_map_pe_dma_window(phb->opal_id, 1865 pe->pe_number, 1866 base + i, 1, 1867 __pa(addr) + TCE32_TABLE_SIZE * i, 1868 TCE32_TABLE_SIZE, 0x1000); 1869 if (rc) { 1870 pe_err(pe, " Failed to configure 32-bit TCE table," 1871 " err %ld\n", rc); 1872 goto fail; 1873 } 1874 } 1875 1876 /* Setup linux iommu table */ 1877 pnv_pci_setup_iommu_table(tbl, addr, TCE32_TABLE_SIZE * segs, 1878 base << 28, IOMMU_PAGE_SHIFT_4K); 1879 1880 /* OPAL variant of P7IOC SW invalidated TCEs */ 1881 swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL); 1882 if (swinvp) { 1883 /* We need a couple more fields -- an address and a data 1884 * to or. Since the bus is only printed out on table free 1885 * errors, and on the first pass the data will be a relative 1886 * bus number, print that out instead. 1887 */ 1888 pe->tce_inval_reg_phys = be64_to_cpup(swinvp); 1889 tbl->it_index = (unsigned long)ioremap(pe->tce_inval_reg_phys, 1890 8); 1891 tbl->it_type |= (TCE_PCI_SWINV_CREATE | 1892 TCE_PCI_SWINV_FREE | 1893 TCE_PCI_SWINV_PAIR); 1894 } 1895 tbl->it_ops = &pnv_ioda1_iommu_ops; 1896 iommu_init_table(tbl, phb->hose->node); 1897 1898 if (pe->flags & PNV_IODA_PE_DEV) { 1899 /* 1900 * Setting table base here only for carrying iommu_group 1901 * further down to let iommu_add_device() do the job. 1902 * pnv_pci_ioda_dma_dev_setup will override it later anyway. 1903 */ 1904 set_iommu_table_base(&pe->pdev->dev, tbl); 1905 iommu_add_device(&pe->pdev->dev); 1906 } else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) 1907 pnv_ioda_setup_bus_dma(pe, pe->pbus); 1908 1909 return; 1910 fail: 1911 /* XXX Failure: Try to fallback to 64-bit only ? */ 1912 if (pe->tce32_seg >= 0) 1913 pe->tce32_seg = -1; 1914 if (tce_mem) 1915 __free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs)); 1916 if (tbl) { 1917 pnv_pci_unlink_table_and_group(tbl, &pe->table_group); 1918 iommu_free_table(tbl, "pnv"); 1919 } 1920 } 1921 1922 static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable) 1923 { 1924 struct iommu_table_group_link *tgl = list_first_entry_or_null( 1925 &tbl->it_group_list, struct iommu_table_group_link, 1926 next); 1927 struct pnv_ioda_pe *pe = container_of(tgl->table_group, 1928 struct pnv_ioda_pe, table_group); 1929 uint16_t window_id = (pe->pe_number << 1 ) + 1; 1930 int64_t rc; 1931 1932 pe_info(pe, "%sabling 64-bit DMA bypass\n", enable ? "En" : "Dis"); 1933 if (enable) { 1934 phys_addr_t top = memblock_end_of_DRAM(); 1935 1936 top = roundup_pow_of_two(top); 1937 rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id, 1938 pe->pe_number, 1939 window_id, 1940 pe->tce_bypass_base, 1941 top); 1942 } else { 1943 rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id, 1944 pe->pe_number, 1945 window_id, 1946 pe->tce_bypass_base, 1947 0); 1948 } 1949 if (rc) 1950 pe_err(pe, "OPAL error %lld configuring bypass window\n", rc); 1951 else 1952 pe->tce_bypass_enabled = enable; 1953 } 1954 1955 static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb, 1956 struct pnv_ioda_pe *pe) 1957 { 1958 /* TVE #1 is selected by PCI address bit 59 */ 1959 pe->tce_bypass_base = 1ull << 59; 1960 1961 /* Install set_bypass callback for VFIO */ 1962 pe->table_group.tables[0]->set_bypass = pnv_pci_ioda2_set_bypass; 1963 1964 /* Enable bypass by default */ 1965 pnv_pci_ioda2_set_bypass(pe->table_group.tables[0], true); 1966 } 1967 1968 static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, 1969 struct pnv_ioda_pe *pe) 1970 { 1971 struct page *tce_mem = NULL; 1972 void *addr; 1973 const __be64 *swinvp; 1974 struct iommu_table *tbl; 1975 unsigned int tce_table_size, end; 1976 int64_t rc; 1977 1978 /* We shouldn't already have a 32-bit DMA associated */ 1979 if (WARN_ON(pe->tce32_seg >= 0)) 1980 return; 1981 1982 tbl = pnv_pci_table_alloc(phb->hose->node); 1983 iommu_register_group(&pe->table_group, phb->hose->global_number, 1984 pe->pe_number); 1985 pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group); 1986 1987 /* The PE will reserve all possible 32-bits space */ 1988 pe->tce32_seg = 0; 1989 end = (1 << ilog2(phb->ioda.m32_pci_base)); 1990 tce_table_size = (end / 0x1000) * 8; 1991 pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n", 1992 end); 1993 1994 /* Allocate TCE table */ 1995 tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL, 1996 get_order(tce_table_size)); 1997 if (!tce_mem) { 1998 pe_err(pe, "Failed to allocate a 32-bit TCE memory\n"); 1999 goto fail; 2000 } 2001 addr = page_address(tce_mem); 2002 memset(addr, 0, tce_table_size); 2003 2004 /* 2005 * Map TCE table through TVT. The TVE index is the PE number 2006 * shifted by 1 bit for 32-bits DMA space. 2007 */ 2008 rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number, 2009 pe->pe_number << 1, 1, __pa(addr), 2010 tce_table_size, 0x1000); 2011 if (rc) { 2012 pe_err(pe, "Failed to configure 32-bit TCE table," 2013 " err %ld\n", rc); 2014 goto fail; 2015 } 2016 2017 /* Setup linux iommu table */ 2018 pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0, 2019 IOMMU_PAGE_SHIFT_4K); 2020 2021 /* OPAL variant of PHB3 invalidated TCEs */ 2022 swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL); 2023 if (swinvp) { 2024 /* We need a couple more fields -- an address and a data 2025 * to or. Since the bus is only printed out on table free 2026 * errors, and on the first pass the data will be a relative 2027 * bus number, print that out instead. 2028 */ 2029 pe->tce_inval_reg_phys = be64_to_cpup(swinvp); 2030 tbl->it_index = (unsigned long)ioremap(pe->tce_inval_reg_phys, 2031 8); 2032 tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE); 2033 } 2034 tbl->it_ops = &pnv_ioda2_iommu_ops; 2035 iommu_init_table(tbl, phb->hose->node); 2036 2037 if (pe->flags & PNV_IODA_PE_DEV) { 2038 /* 2039 * Setting table base here only for carrying iommu_group 2040 * further down to let iommu_add_device() do the job. 2041 * pnv_pci_ioda_dma_dev_setup will override it later anyway. 2042 */ 2043 set_iommu_table_base(&pe->pdev->dev, tbl); 2044 iommu_add_device(&pe->pdev->dev); 2045 } else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) 2046 pnv_ioda_setup_bus_dma(pe, pe->pbus); 2047 2048 /* Also create a bypass window */ 2049 if (!pnv_iommu_bypass_disabled) 2050 pnv_pci_ioda2_setup_bypass_pe(phb, pe); 2051 2052 return; 2053 fail: 2054 if (pe->tce32_seg >= 0) 2055 pe->tce32_seg = -1; 2056 if (tce_mem) 2057 __free_pages(tce_mem, get_order(tce_table_size)); 2058 if (tbl) { 2059 pnv_pci_unlink_table_and_group(tbl, &pe->table_group); 2060 iommu_free_table(tbl, "pnv"); 2061 } 2062 } 2063 2064 static void pnv_ioda_setup_dma(struct pnv_phb *phb) 2065 { 2066 struct pci_controller *hose = phb->hose; 2067 unsigned int residual, remaining, segs, tw, base; 2068 struct pnv_ioda_pe *pe; 2069 2070 /* If we have more PE# than segments available, hand out one 2071 * per PE until we run out and let the rest fail. If not, 2072 * then we assign at least one segment per PE, plus more based 2073 * on the amount of devices under that PE 2074 */ 2075 if (phb->ioda.dma_pe_count > phb->ioda.tce32_count) 2076 residual = 0; 2077 else 2078 residual = phb->ioda.tce32_count - 2079 phb->ioda.dma_pe_count; 2080 2081 pr_info("PCI: Domain %04x has %ld available 32-bit DMA segments\n", 2082 hose->global_number, phb->ioda.tce32_count); 2083 pr_info("PCI: %d PE# for a total weight of %d\n", 2084 phb->ioda.dma_pe_count, phb->ioda.dma_weight); 2085 2086 /* Walk our PE list and configure their DMA segments, hand them 2087 * out one base segment plus any residual segments based on 2088 * weight 2089 */ 2090 remaining = phb->ioda.tce32_count; 2091 tw = phb->ioda.dma_weight; 2092 base = 0; 2093 list_for_each_entry(pe, &phb->ioda.pe_dma_list, dma_link) { 2094 if (!pe->dma_weight) 2095 continue; 2096 if (!remaining) { 2097 pe_warn(pe, "No DMA32 resources available\n"); 2098 continue; 2099 } 2100 segs = 1; 2101 if (residual) { 2102 segs += ((pe->dma_weight * residual) + (tw / 2)) / tw; 2103 if (segs > remaining) 2104 segs = remaining; 2105 } 2106 2107 /* 2108 * For IODA2 compliant PHB3, we needn't care about the weight. 2109 * The all available 32-bits DMA space will be assigned to 2110 * the specific PE. 2111 */ 2112 if (phb->type == PNV_PHB_IODA1) { 2113 pe_info(pe, "DMA weight %d, assigned %d DMA32 segments\n", 2114 pe->dma_weight, segs); 2115 pnv_pci_ioda_setup_dma_pe(phb, pe, base, segs); 2116 } else { 2117 pe_info(pe, "Assign DMA32 space\n"); 2118 segs = 0; 2119 pnv_pci_ioda2_setup_dma_pe(phb, pe); 2120 } 2121 2122 remaining -= segs; 2123 base += segs; 2124 } 2125 } 2126 2127 #ifdef CONFIG_PCI_MSI 2128 static void pnv_ioda2_msi_eoi(struct irq_data *d) 2129 { 2130 unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); 2131 struct irq_chip *chip = irq_data_get_irq_chip(d); 2132 struct pnv_phb *phb = container_of(chip, struct pnv_phb, 2133 ioda.irq_chip); 2134 int64_t rc; 2135 2136 rc = opal_pci_msi_eoi(phb->opal_id, hw_irq); 2137 WARN_ON_ONCE(rc); 2138 2139 icp_native_eoi(d); 2140 } 2141 2142 2143 static void set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq) 2144 { 2145 struct irq_data *idata; 2146 struct irq_chip *ichip; 2147 2148 if (phb->type != PNV_PHB_IODA2) 2149 return; 2150 2151 if (!phb->ioda.irq_chip_init) { 2152 /* 2153 * First time we setup an MSI IRQ, we need to setup the 2154 * corresponding IRQ chip to route correctly. 2155 */ 2156 idata = irq_get_irq_data(virq); 2157 ichip = irq_data_get_irq_chip(idata); 2158 phb->ioda.irq_chip_init = 1; 2159 phb->ioda.irq_chip = *ichip; 2160 phb->ioda.irq_chip.irq_eoi = pnv_ioda2_msi_eoi; 2161 } 2162 irq_set_chip(virq, &phb->ioda.irq_chip); 2163 } 2164 2165 #ifdef CONFIG_CXL_BASE 2166 2167 struct device_node *pnv_pci_get_phb_node(struct pci_dev *dev) 2168 { 2169 struct pci_controller *hose = pci_bus_to_host(dev->bus); 2170 2171 return of_node_get(hose->dn); 2172 } 2173 EXPORT_SYMBOL(pnv_pci_get_phb_node); 2174 2175 int pnv_phb_to_cxl_mode(struct pci_dev *dev, uint64_t mode) 2176 { 2177 struct pci_controller *hose = pci_bus_to_host(dev->bus); 2178 struct pnv_phb *phb = hose->private_data; 2179 struct pnv_ioda_pe *pe; 2180 int rc; 2181 2182 pe = pnv_ioda_get_pe(dev); 2183 if (!pe) 2184 return -ENODEV; 2185 2186 pe_info(pe, "Switching PHB to CXL\n"); 2187 2188 rc = opal_pci_set_phb_cxl_mode(phb->opal_id, mode, pe->pe_number); 2189 if (rc) 2190 dev_err(&dev->dev, "opal_pci_set_phb_cxl_mode failed: %i\n", rc); 2191 2192 return rc; 2193 } 2194 EXPORT_SYMBOL(pnv_phb_to_cxl_mode); 2195 2196 /* Find PHB for cxl dev and allocate MSI hwirqs? 2197 * Returns the absolute hardware IRQ number 2198 */ 2199 int pnv_cxl_alloc_hwirqs(struct pci_dev *dev, int num) 2200 { 2201 struct pci_controller *hose = pci_bus_to_host(dev->bus); 2202 struct pnv_phb *phb = hose->private_data; 2203 int hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, num); 2204 2205 if (hwirq < 0) { 2206 dev_warn(&dev->dev, "Failed to find a free MSI\n"); 2207 return -ENOSPC; 2208 } 2209 2210 return phb->msi_base + hwirq; 2211 } 2212 EXPORT_SYMBOL(pnv_cxl_alloc_hwirqs); 2213 2214 void pnv_cxl_release_hwirqs(struct pci_dev *dev, int hwirq, int num) 2215 { 2216 struct pci_controller *hose = pci_bus_to_host(dev->bus); 2217 struct pnv_phb *phb = hose->private_data; 2218 2219 msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq - phb->msi_base, num); 2220 } 2221 EXPORT_SYMBOL(pnv_cxl_release_hwirqs); 2222 2223 void pnv_cxl_release_hwirq_ranges(struct cxl_irq_ranges *irqs, 2224 struct pci_dev *dev) 2225 { 2226 struct pci_controller *hose = pci_bus_to_host(dev->bus); 2227 struct pnv_phb *phb = hose->private_data; 2228 int i, hwirq; 2229 2230 for (i = 1; i < CXL_IRQ_RANGES; i++) { 2231 if (!irqs->range[i]) 2232 continue; 2233 pr_devel("cxl release irq range 0x%x: offset: 0x%lx limit: %ld\n", 2234 i, irqs->offset[i], 2235 irqs->range[i]); 2236 hwirq = irqs->offset[i] - phb->msi_base; 2237 msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, 2238 irqs->range[i]); 2239 } 2240 } 2241 EXPORT_SYMBOL(pnv_cxl_release_hwirq_ranges); 2242 2243 int pnv_cxl_alloc_hwirq_ranges(struct cxl_irq_ranges *irqs, 2244 struct pci_dev *dev, int num) 2245 { 2246 struct pci_controller *hose = pci_bus_to_host(dev->bus); 2247 struct pnv_phb *phb = hose->private_data; 2248 int i, hwirq, try; 2249 2250 memset(irqs, 0, sizeof(struct cxl_irq_ranges)); 2251 2252 /* 0 is reserved for the multiplexed PSL DSI interrupt */ 2253 for (i = 1; i < CXL_IRQ_RANGES && num; i++) { 2254 try = num; 2255 while (try) { 2256 hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, try); 2257 if (hwirq >= 0) 2258 break; 2259 try /= 2; 2260 } 2261 if (!try) 2262 goto fail; 2263 2264 irqs->offset[i] = phb->msi_base + hwirq; 2265 irqs->range[i] = try; 2266 pr_devel("cxl alloc irq range 0x%x: offset: 0x%lx limit: %li\n", 2267 i, irqs->offset[i], irqs->range[i]); 2268 num -= try; 2269 } 2270 if (num) 2271 goto fail; 2272 2273 return 0; 2274 fail: 2275 pnv_cxl_release_hwirq_ranges(irqs, dev); 2276 return -ENOSPC; 2277 } 2278 EXPORT_SYMBOL(pnv_cxl_alloc_hwirq_ranges); 2279 2280 int pnv_cxl_get_irq_count(struct pci_dev *dev) 2281 { 2282 struct pci_controller *hose = pci_bus_to_host(dev->bus); 2283 struct pnv_phb *phb = hose->private_data; 2284 2285 return phb->msi_bmp.irq_count; 2286 } 2287 EXPORT_SYMBOL(pnv_cxl_get_irq_count); 2288 2289 int pnv_cxl_ioda_msi_setup(struct pci_dev *dev, unsigned int hwirq, 2290 unsigned int virq) 2291 { 2292 struct pci_controller *hose = pci_bus_to_host(dev->bus); 2293 struct pnv_phb *phb = hose->private_data; 2294 unsigned int xive_num = hwirq - phb->msi_base; 2295 struct pnv_ioda_pe *pe; 2296 int rc; 2297 2298 if (!(pe = pnv_ioda_get_pe(dev))) 2299 return -ENODEV; 2300 2301 /* Assign XIVE to PE */ 2302 rc = opal_pci_set_xive_pe(phb->opal_id, pe->pe_number, xive_num); 2303 if (rc) { 2304 pe_warn(pe, "%s: OPAL error %d setting msi_base 0x%x " 2305 "hwirq 0x%x XIVE 0x%x PE\n", 2306 pci_name(dev), rc, phb->msi_base, hwirq, xive_num); 2307 return -EIO; 2308 } 2309 set_msi_irq_chip(phb, virq); 2310 2311 return 0; 2312 } 2313 EXPORT_SYMBOL(pnv_cxl_ioda_msi_setup); 2314 #endif 2315 2316 static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, 2317 unsigned int hwirq, unsigned int virq, 2318 unsigned int is_64, struct msi_msg *msg) 2319 { 2320 struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev); 2321 unsigned int xive_num = hwirq - phb->msi_base; 2322 __be32 data; 2323 int rc; 2324 2325 /* No PE assigned ? bail out ... no MSI for you ! */ 2326 if (pe == NULL) 2327 return -ENXIO; 2328 2329 /* Check if we have an MVE */ 2330 if (pe->mve_number < 0) 2331 return -ENXIO; 2332 2333 /* Force 32-bit MSI on some broken devices */ 2334 if (dev->no_64bit_msi) 2335 is_64 = 0; 2336 2337 /* Assign XIVE to PE */ 2338 rc = opal_pci_set_xive_pe(phb->opal_id, pe->pe_number, xive_num); 2339 if (rc) { 2340 pr_warn("%s: OPAL error %d setting XIVE %d PE\n", 2341 pci_name(dev), rc, xive_num); 2342 return -EIO; 2343 } 2344 2345 if (is_64) { 2346 __be64 addr64; 2347 2348 rc = opal_get_msi_64(phb->opal_id, pe->mve_number, xive_num, 1, 2349 &addr64, &data); 2350 if (rc) { 2351 pr_warn("%s: OPAL error %d getting 64-bit MSI data\n", 2352 pci_name(dev), rc); 2353 return -EIO; 2354 } 2355 msg->address_hi = be64_to_cpu(addr64) >> 32; 2356 msg->address_lo = be64_to_cpu(addr64) & 0xfffffffful; 2357 } else { 2358 __be32 addr32; 2359 2360 rc = opal_get_msi_32(phb->opal_id, pe->mve_number, xive_num, 1, 2361 &addr32, &data); 2362 if (rc) { 2363 pr_warn("%s: OPAL error %d getting 32-bit MSI data\n", 2364 pci_name(dev), rc); 2365 return -EIO; 2366 } 2367 msg->address_hi = 0; 2368 msg->address_lo = be32_to_cpu(addr32); 2369 } 2370 msg->data = be32_to_cpu(data); 2371 2372 set_msi_irq_chip(phb, virq); 2373 2374 pr_devel("%s: %s-bit MSI on hwirq %x (xive #%d)," 2375 " address=%x_%08x data=%x PE# %d\n", 2376 pci_name(dev), is_64 ? "64" : "32", hwirq, xive_num, 2377 msg->address_hi, msg->address_lo, data, pe->pe_number); 2378 2379 return 0; 2380 } 2381 2382 static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) 2383 { 2384 unsigned int count; 2385 const __be32 *prop = of_get_property(phb->hose->dn, 2386 "ibm,opal-msi-ranges", NULL); 2387 if (!prop) { 2388 /* BML Fallback */ 2389 prop = of_get_property(phb->hose->dn, "msi-ranges", NULL); 2390 } 2391 if (!prop) 2392 return; 2393 2394 phb->msi_base = be32_to_cpup(prop); 2395 count = be32_to_cpup(prop + 1); 2396 if (msi_bitmap_alloc(&phb->msi_bmp, count, phb->hose->dn)) { 2397 pr_err("PCI %d: Failed to allocate MSI bitmap !\n", 2398 phb->hose->global_number); 2399 return; 2400 } 2401 2402 phb->msi_setup = pnv_pci_ioda_msi_setup; 2403 phb->msi32_support = 1; 2404 pr_info(" Allocated bitmap for %d MSIs (base IRQ 0x%x)\n", 2405 count, phb->msi_base); 2406 } 2407 #else 2408 static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) { } 2409 #endif /* CONFIG_PCI_MSI */ 2410 2411 #ifdef CONFIG_PCI_IOV 2412 static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) 2413 { 2414 struct pci_controller *hose; 2415 struct pnv_phb *phb; 2416 struct resource *res; 2417 int i; 2418 resource_size_t size; 2419 struct pci_dn *pdn; 2420 int mul, total_vfs; 2421 2422 if (!pdev->is_physfn || pdev->is_added) 2423 return; 2424 2425 hose = pci_bus_to_host(pdev->bus); 2426 phb = hose->private_data; 2427 2428 pdn = pci_get_pdn(pdev); 2429 pdn->vfs_expanded = 0; 2430 2431 total_vfs = pci_sriov_get_totalvfs(pdev); 2432 pdn->m64_per_iov = 1; 2433 mul = phb->ioda.total_pe; 2434 2435 for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { 2436 res = &pdev->resource[i + PCI_IOV_RESOURCES]; 2437 if (!res->flags || res->parent) 2438 continue; 2439 if (!pnv_pci_is_mem_pref_64(res->flags)) { 2440 dev_warn(&pdev->dev, " non M64 VF BAR%d: %pR\n", 2441 i, res); 2442 continue; 2443 } 2444 2445 size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES); 2446 2447 /* bigger than 64M */ 2448 if (size > (1 << 26)) { 2449 dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size is bigger than 64M, roundup power2\n", 2450 i, res); 2451 pdn->m64_per_iov = M64_PER_IOV; 2452 mul = roundup_pow_of_two(total_vfs); 2453 break; 2454 } 2455 } 2456 2457 for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { 2458 res = &pdev->resource[i + PCI_IOV_RESOURCES]; 2459 if (!res->flags || res->parent) 2460 continue; 2461 if (!pnv_pci_is_mem_pref_64(res->flags)) { 2462 dev_warn(&pdev->dev, "Skipping expanding VF BAR%d: %pR\n", 2463 i, res); 2464 continue; 2465 } 2466 2467 dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res); 2468 size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES); 2469 res->end = res->start + size * mul - 1; 2470 dev_dbg(&pdev->dev, " %pR\n", res); 2471 dev_info(&pdev->dev, "VF BAR%d: %pR (expanded to %d VFs for PE alignment)", 2472 i, res, mul); 2473 } 2474 pdn->vfs_expanded = mul; 2475 } 2476 #endif /* CONFIG_PCI_IOV */ 2477 2478 /* 2479 * This function is supposed to be called on basis of PE from top 2480 * to bottom style. So the the I/O or MMIO segment assigned to 2481 * parent PE could be overrided by its child PEs if necessary. 2482 */ 2483 static void pnv_ioda_setup_pe_seg(struct pci_controller *hose, 2484 struct pnv_ioda_pe *pe) 2485 { 2486 struct pnv_phb *phb = hose->private_data; 2487 struct pci_bus_region region; 2488 struct resource *res; 2489 int i, index; 2490 int rc; 2491 2492 /* 2493 * NOTE: We only care PCI bus based PE for now. For PCI 2494 * device based PE, for example SRIOV sensitive VF should 2495 * be figured out later. 2496 */ 2497 BUG_ON(!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))); 2498 2499 pci_bus_for_each_resource(pe->pbus, res, i) { 2500 if (!res || !res->flags || 2501 res->start > res->end) 2502 continue; 2503 2504 if (res->flags & IORESOURCE_IO) { 2505 region.start = res->start - phb->ioda.io_pci_base; 2506 region.end = res->end - phb->ioda.io_pci_base; 2507 index = region.start / phb->ioda.io_segsize; 2508 2509 while (index < phb->ioda.total_pe && 2510 region.start <= region.end) { 2511 phb->ioda.io_segmap[index] = pe->pe_number; 2512 rc = opal_pci_map_pe_mmio_window(phb->opal_id, 2513 pe->pe_number, OPAL_IO_WINDOW_TYPE, 0, index); 2514 if (rc != OPAL_SUCCESS) { 2515 pr_err("%s: OPAL error %d when mapping IO " 2516 "segment #%d to PE#%d\n", 2517 __func__, rc, index, pe->pe_number); 2518 break; 2519 } 2520 2521 region.start += phb->ioda.io_segsize; 2522 index++; 2523 } 2524 } else if ((res->flags & IORESOURCE_MEM) && 2525 !pnv_pci_is_mem_pref_64(res->flags)) { 2526 region.start = res->start - 2527 hose->mem_offset[0] - 2528 phb->ioda.m32_pci_base; 2529 region.end = res->end - 2530 hose->mem_offset[0] - 2531 phb->ioda.m32_pci_base; 2532 index = region.start / phb->ioda.m32_segsize; 2533 2534 while (index < phb->ioda.total_pe && 2535 region.start <= region.end) { 2536 phb->ioda.m32_segmap[index] = pe->pe_number; 2537 rc = opal_pci_map_pe_mmio_window(phb->opal_id, 2538 pe->pe_number, OPAL_M32_WINDOW_TYPE, 0, index); 2539 if (rc != OPAL_SUCCESS) { 2540 pr_err("%s: OPAL error %d when mapping M32 " 2541 "segment#%d to PE#%d", 2542 __func__, rc, index, pe->pe_number); 2543 break; 2544 } 2545 2546 region.start += phb->ioda.m32_segsize; 2547 index++; 2548 } 2549 } 2550 } 2551 } 2552 2553 static void pnv_pci_ioda_setup_seg(void) 2554 { 2555 struct pci_controller *tmp, *hose; 2556 struct pnv_phb *phb; 2557 struct pnv_ioda_pe *pe; 2558 2559 list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { 2560 phb = hose->private_data; 2561 list_for_each_entry(pe, &phb->ioda.pe_list, list) { 2562 pnv_ioda_setup_pe_seg(hose, pe); 2563 } 2564 } 2565 } 2566 2567 static void pnv_pci_ioda_setup_DMA(void) 2568 { 2569 struct pci_controller *hose, *tmp; 2570 struct pnv_phb *phb; 2571 2572 list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { 2573 pnv_ioda_setup_dma(hose->private_data); 2574 2575 /* Mark the PHB initialization done */ 2576 phb = hose->private_data; 2577 phb->initialized = 1; 2578 } 2579 } 2580 2581 static void pnv_pci_ioda_create_dbgfs(void) 2582 { 2583 #ifdef CONFIG_DEBUG_FS 2584 struct pci_controller *hose, *tmp; 2585 struct pnv_phb *phb; 2586 char name[16]; 2587 2588 list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { 2589 phb = hose->private_data; 2590 2591 sprintf(name, "PCI%04x", hose->global_number); 2592 phb->dbgfs = debugfs_create_dir(name, powerpc_debugfs_root); 2593 if (!phb->dbgfs) 2594 pr_warning("%s: Error on creating debugfs on PHB#%x\n", 2595 __func__, hose->global_number); 2596 } 2597 #endif /* CONFIG_DEBUG_FS */ 2598 } 2599 2600 static void pnv_pci_ioda_fixup(void) 2601 { 2602 pnv_pci_ioda_setup_PEs(); 2603 pnv_pci_ioda_setup_seg(); 2604 pnv_pci_ioda_setup_DMA(); 2605 2606 pnv_pci_ioda_create_dbgfs(); 2607 2608 #ifdef CONFIG_EEH 2609 eeh_init(); 2610 eeh_addr_cache_build(); 2611 #endif 2612 } 2613 2614 /* 2615 * Returns the alignment for I/O or memory windows for P2P 2616 * bridges. That actually depends on how PEs are segmented. 2617 * For now, we return I/O or M32 segment size for PE sensitive 2618 * P2P bridges. Otherwise, the default values (4KiB for I/O, 2619 * 1MiB for memory) will be returned. 2620 * 2621 * The current PCI bus might be put into one PE, which was 2622 * create against the parent PCI bridge. For that case, we 2623 * needn't enlarge the alignment so that we can save some 2624 * resources. 2625 */ 2626 static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus, 2627 unsigned long type) 2628 { 2629 struct pci_dev *bridge; 2630 struct pci_controller *hose = pci_bus_to_host(bus); 2631 struct pnv_phb *phb = hose->private_data; 2632 int num_pci_bridges = 0; 2633 2634 bridge = bus->self; 2635 while (bridge) { 2636 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) { 2637 num_pci_bridges++; 2638 if (num_pci_bridges >= 2) 2639 return 1; 2640 } 2641 2642 bridge = bridge->bus->self; 2643 } 2644 2645 /* We fail back to M32 if M64 isn't supported */ 2646 if (phb->ioda.m64_segsize && 2647 pnv_pci_is_mem_pref_64(type)) 2648 return phb->ioda.m64_segsize; 2649 if (type & IORESOURCE_MEM) 2650 return phb->ioda.m32_segsize; 2651 2652 return phb->ioda.io_segsize; 2653 } 2654 2655 #ifdef CONFIG_PCI_IOV 2656 static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev, 2657 int resno) 2658 { 2659 struct pci_dn *pdn = pci_get_pdn(pdev); 2660 resource_size_t align, iov_align; 2661 2662 iov_align = resource_size(&pdev->resource[resno]); 2663 if (iov_align) 2664 return iov_align; 2665 2666 align = pci_iov_resource_size(pdev, resno); 2667 if (pdn->vfs_expanded) 2668 return pdn->vfs_expanded * align; 2669 2670 return align; 2671 } 2672 #endif /* CONFIG_PCI_IOV */ 2673 2674 /* Prevent enabling devices for which we couldn't properly 2675 * assign a PE 2676 */ 2677 static bool pnv_pci_enable_device_hook(struct pci_dev *dev) 2678 { 2679 struct pci_controller *hose = pci_bus_to_host(dev->bus); 2680 struct pnv_phb *phb = hose->private_data; 2681 struct pci_dn *pdn; 2682 2683 /* The function is probably called while the PEs have 2684 * not be created yet. For example, resource reassignment 2685 * during PCI probe period. We just skip the check if 2686 * PEs isn't ready. 2687 */ 2688 if (!phb->initialized) 2689 return true; 2690 2691 pdn = pci_get_pdn(dev); 2692 if (!pdn || pdn->pe_number == IODA_INVALID_PE) 2693 return false; 2694 2695 return true; 2696 } 2697 2698 static u32 pnv_ioda_bdfn_to_pe(struct pnv_phb *phb, struct pci_bus *bus, 2699 u32 devfn) 2700 { 2701 return phb->ioda.pe_rmap[(bus->number << 8) | devfn]; 2702 } 2703 2704 static void pnv_pci_ioda_shutdown(struct pci_controller *hose) 2705 { 2706 struct pnv_phb *phb = hose->private_data; 2707 2708 opal_pci_reset(phb->opal_id, OPAL_RESET_PCI_IODA_TABLE, 2709 OPAL_ASSERT_RESET); 2710 } 2711 2712 static const struct pci_controller_ops pnv_pci_ioda_controller_ops = { 2713 .dma_dev_setup = pnv_pci_dma_dev_setup, 2714 #ifdef CONFIG_PCI_MSI 2715 .setup_msi_irqs = pnv_setup_msi_irqs, 2716 .teardown_msi_irqs = pnv_teardown_msi_irqs, 2717 #endif 2718 .enable_device_hook = pnv_pci_enable_device_hook, 2719 .window_alignment = pnv_pci_window_alignment, 2720 .reset_secondary_bus = pnv_pci_reset_secondary_bus, 2721 .dma_set_mask = pnv_pci_ioda_dma_set_mask, 2722 .shutdown = pnv_pci_ioda_shutdown, 2723 }; 2724 2725 static void __init pnv_pci_init_ioda_phb(struct device_node *np, 2726 u64 hub_id, int ioda_type) 2727 { 2728 struct pci_controller *hose; 2729 struct pnv_phb *phb; 2730 unsigned long size, m32map_off, pemap_off, iomap_off = 0; 2731 const __be64 *prop64; 2732 const __be32 *prop32; 2733 int len; 2734 u64 phb_id; 2735 void *aux; 2736 long rc; 2737 2738 pr_info("Initializing IODA%d OPAL PHB %s\n", ioda_type, np->full_name); 2739 2740 prop64 = of_get_property(np, "ibm,opal-phbid", NULL); 2741 if (!prop64) { 2742 pr_err(" Missing \"ibm,opal-phbid\" property !\n"); 2743 return; 2744 } 2745 phb_id = be64_to_cpup(prop64); 2746 pr_debug(" PHB-ID : 0x%016llx\n", phb_id); 2747 2748 phb = memblock_virt_alloc(sizeof(struct pnv_phb), 0); 2749 2750 /* Allocate PCI controller */ 2751 phb->hose = hose = pcibios_alloc_controller(np); 2752 if (!phb->hose) { 2753 pr_err(" Can't allocate PCI controller for %s\n", 2754 np->full_name); 2755 memblock_free(__pa(phb), sizeof(struct pnv_phb)); 2756 return; 2757 } 2758 2759 spin_lock_init(&phb->lock); 2760 prop32 = of_get_property(np, "bus-range", &len); 2761 if (prop32 && len == 8) { 2762 hose->first_busno = be32_to_cpu(prop32[0]); 2763 hose->last_busno = be32_to_cpu(prop32[1]); 2764 } else { 2765 pr_warn(" Broken <bus-range> on %s\n", np->full_name); 2766 hose->first_busno = 0; 2767 hose->last_busno = 0xff; 2768 } 2769 hose->private_data = phb; 2770 phb->hub_id = hub_id; 2771 phb->opal_id = phb_id; 2772 phb->type = ioda_type; 2773 mutex_init(&phb->ioda.pe_alloc_mutex); 2774 2775 /* Detect specific models for error handling */ 2776 if (of_device_is_compatible(np, "ibm,p7ioc-pciex")) 2777 phb->model = PNV_PHB_MODEL_P7IOC; 2778 else if (of_device_is_compatible(np, "ibm,power8-pciex")) 2779 phb->model = PNV_PHB_MODEL_PHB3; 2780 else 2781 phb->model = PNV_PHB_MODEL_UNKNOWN; 2782 2783 /* Parse 32-bit and IO ranges (if any) */ 2784 pci_process_bridge_OF_ranges(hose, np, !hose->global_number); 2785 2786 /* Get registers */ 2787 phb->regs = of_iomap(np, 0); 2788 if (phb->regs == NULL) 2789 pr_err(" Failed to map registers !\n"); 2790 2791 /* Initialize more IODA stuff */ 2792 phb->ioda.total_pe = 1; 2793 prop32 = of_get_property(np, "ibm,opal-num-pes", NULL); 2794 if (prop32) 2795 phb->ioda.total_pe = be32_to_cpup(prop32); 2796 prop32 = of_get_property(np, "ibm,opal-reserved-pe", NULL); 2797 if (prop32) 2798 phb->ioda.reserved_pe = be32_to_cpup(prop32); 2799 2800 /* Parse 64-bit MMIO range */ 2801 pnv_ioda_parse_m64_window(phb); 2802 2803 phb->ioda.m32_size = resource_size(&hose->mem_resources[0]); 2804 /* FW Has already off top 64k of M32 space (MSI space) */ 2805 phb->ioda.m32_size += 0x10000; 2806 2807 phb->ioda.m32_segsize = phb->ioda.m32_size / phb->ioda.total_pe; 2808 phb->ioda.m32_pci_base = hose->mem_resources[0].start - hose->mem_offset[0]; 2809 phb->ioda.io_size = hose->pci_io_size; 2810 phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe; 2811 phb->ioda.io_pci_base = 0; /* XXX calculate this ? */ 2812 2813 /* Allocate aux data & arrays. We don't have IO ports on PHB3 */ 2814 size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long)); 2815 m32map_off = size; 2816 size += phb->ioda.total_pe * sizeof(phb->ioda.m32_segmap[0]); 2817 if (phb->type == PNV_PHB_IODA1) { 2818 iomap_off = size; 2819 size += phb->ioda.total_pe * sizeof(phb->ioda.io_segmap[0]); 2820 } 2821 pemap_off = size; 2822 size += phb->ioda.total_pe * sizeof(struct pnv_ioda_pe); 2823 aux = memblock_virt_alloc(size, 0); 2824 phb->ioda.pe_alloc = aux; 2825 phb->ioda.m32_segmap = aux + m32map_off; 2826 if (phb->type == PNV_PHB_IODA1) 2827 phb->ioda.io_segmap = aux + iomap_off; 2828 phb->ioda.pe_array = aux + pemap_off; 2829 set_bit(phb->ioda.reserved_pe, phb->ioda.pe_alloc); 2830 2831 INIT_LIST_HEAD(&phb->ioda.pe_dma_list); 2832 INIT_LIST_HEAD(&phb->ioda.pe_list); 2833 mutex_init(&phb->ioda.pe_list_mutex); 2834 2835 /* Calculate how many 32-bit TCE segments we have */ 2836 phb->ioda.tce32_count = phb->ioda.m32_pci_base >> 28; 2837 2838 #if 0 /* We should really do that ... */ 2839 rc = opal_pci_set_phb_mem_window(opal->phb_id, 2840 window_type, 2841 window_num, 2842 starting_real_address, 2843 starting_pci_address, 2844 segment_size); 2845 #endif 2846 2847 pr_info(" %03d (%03d) PE's M32: 0x%x [segment=0x%x]\n", 2848 phb->ioda.total_pe, phb->ioda.reserved_pe, 2849 phb->ioda.m32_size, phb->ioda.m32_segsize); 2850 if (phb->ioda.m64_size) 2851 pr_info(" M64: 0x%lx [segment=0x%lx]\n", 2852 phb->ioda.m64_size, phb->ioda.m64_segsize); 2853 if (phb->ioda.io_size) 2854 pr_info(" IO: 0x%x [segment=0x%x]\n", 2855 phb->ioda.io_size, phb->ioda.io_segsize); 2856 2857 2858 phb->hose->ops = &pnv_pci_ops; 2859 phb->get_pe_state = pnv_ioda_get_pe_state; 2860 phb->freeze_pe = pnv_ioda_freeze_pe; 2861 phb->unfreeze_pe = pnv_ioda_unfreeze_pe; 2862 2863 /* Setup RID -> PE mapping function */ 2864 phb->bdfn_to_pe = pnv_ioda_bdfn_to_pe; 2865 2866 /* Setup TCEs */ 2867 phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup; 2868 phb->dma_get_required_mask = pnv_pci_ioda_dma_get_required_mask; 2869 2870 /* Setup MSI support */ 2871 pnv_pci_init_ioda_msis(phb); 2872 2873 /* 2874 * We pass the PCI probe flag PCI_REASSIGN_ALL_RSRC here 2875 * to let the PCI core do resource assignment. It's supposed 2876 * that the PCI core will do correct I/O and MMIO alignment 2877 * for the P2P bridge bars so that each PCI bus (excluding 2878 * the child P2P bridges) can form individual PE. 2879 */ 2880 ppc_md.pcibios_fixup = pnv_pci_ioda_fixup; 2881 hose->controller_ops = pnv_pci_ioda_controller_ops; 2882 2883 #ifdef CONFIG_PCI_IOV 2884 ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov_resources; 2885 ppc_md.pcibios_iov_resource_alignment = pnv_pci_iov_resource_alignment; 2886 #endif 2887 2888 pci_add_flags(PCI_REASSIGN_ALL_RSRC); 2889 2890 /* Reset IODA tables to a clean state */ 2891 rc = opal_pci_reset(phb_id, OPAL_RESET_PCI_IODA_TABLE, OPAL_ASSERT_RESET); 2892 if (rc) 2893 pr_warning(" OPAL Error %ld performing IODA table reset !\n", rc); 2894 2895 /* If we're running in kdump kerenl, the previous kerenl never 2896 * shutdown PCI devices correctly. We already got IODA table 2897 * cleaned out. So we have to issue PHB reset to stop all PCI 2898 * transactions from previous kerenl. 2899 */ 2900 if (is_kdump_kernel()) { 2901 pr_info(" Issue PHB reset ...\n"); 2902 pnv_eeh_phb_reset(hose, EEH_RESET_FUNDAMENTAL); 2903 pnv_eeh_phb_reset(hose, EEH_RESET_DEACTIVATE); 2904 } 2905 2906 /* Remove M64 resource if we can't configure it successfully */ 2907 if (!phb->init_m64 || phb->init_m64(phb)) 2908 hose->mem_resources[1].flags = 0; 2909 } 2910 2911 void __init pnv_pci_init_ioda2_phb(struct device_node *np) 2912 { 2913 pnv_pci_init_ioda_phb(np, 0, PNV_PHB_IODA2); 2914 } 2915 2916 void __init pnv_pci_init_ioda_hub(struct device_node *np) 2917 { 2918 struct device_node *phbn; 2919 const __be64 *prop64; 2920 u64 hub_id; 2921 2922 pr_info("Probing IODA IO-Hub %s\n", np->full_name); 2923 2924 prop64 = of_get_property(np, "ibm,opal-hubid", NULL); 2925 if (!prop64) { 2926 pr_err(" Missing \"ibm,opal-hubid\" property !\n"); 2927 return; 2928 } 2929 hub_id = be64_to_cpup(prop64); 2930 pr_devel(" HUB-ID : 0x%016llx\n", hub_id); 2931 2932 /* Count child PHBs */ 2933 for_each_child_of_node(np, phbn) { 2934 /* Look for IODA1 PHBs */ 2935 if (of_device_is_compatible(phbn, "ibm,ioda-phb")) 2936 pnv_pci_init_ioda_phb(phbn, hub_id, PNV_PHB_IODA1); 2937 } 2938 } 2939