1 /* 2 * Support PCI/PCIe on PowerNV platforms 3 * 4 * Copyright 2011 Benjamin Herrenschmidt, IBM Corp. 5 * 6 * This program is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU General Public License 8 * as published by the Free Software Foundation; either version 9 * 2 of the License, or (at your option) any later version. 10 */ 11 12 #undef DEBUG 13 14 #include <linux/kernel.h> 15 #include <linux/pci.h> 16 #include <linux/delay.h> 17 #include <linux/string.h> 18 #include <linux/init.h> 19 #include <linux/bootmem.h> 20 #include <linux/irq.h> 21 #include <linux/io.h> 22 #include <linux/msi.h> 23 24 #include <asm/sections.h> 25 #include <asm/io.h> 26 #include <asm/prom.h> 27 #include <asm/pci-bridge.h> 28 #include <asm/machdep.h> 29 #include <asm/msi_bitmap.h> 30 #include <asm/ppc-pci.h> 31 #include <asm/opal.h> 32 #include <asm/iommu.h> 33 #include <asm/tce.h> 34 #include <asm/xics.h> 35 36 #include "powernv.h" 37 #include "pci.h" 38 39 #define define_pe_printk_level(func, kern_level) \ 40 static int func(const struct pnv_ioda_pe *pe, const char *fmt, ...) \ 41 { \ 42 struct va_format vaf; \ 43 va_list args; \ 44 char pfix[32]; \ 45 int r; \ 46 \ 47 va_start(args, fmt); \ 48 \ 49 vaf.fmt = fmt; \ 50 vaf.va = &args; \ 51 \ 52 if (pe->pdev) \ 53 strlcpy(pfix, dev_name(&pe->pdev->dev), \ 54 sizeof(pfix)); \ 55 else \ 56 sprintf(pfix, "%04x:%02x ", \ 57 pci_domain_nr(pe->pbus), \ 58 pe->pbus->number); \ 59 r = printk(kern_level "pci %s: [PE# %.3d] %pV", \ 60 pfix, pe->pe_number, &vaf); \ 61 \ 62 va_end(args); \ 63 \ 64 return r; \ 65 } \ 66 67 define_pe_printk_level(pe_err, KERN_ERR); 68 define_pe_printk_level(pe_warn, KERN_WARNING); 69 define_pe_printk_level(pe_info, KERN_INFO); 70 71 static struct pci_dn *pnv_ioda_get_pdn(struct pci_dev *dev) 72 { 73 struct device_node *np; 74 75 np = pci_device_to_OF_node(dev); 76 if (!np) 77 return NULL; 78 return PCI_DN(np); 79 } 80 81 static int pnv_ioda_alloc_pe(struct pnv_phb *phb) 82 { 83 unsigned long pe; 84 85 do { 86 pe = find_next_zero_bit(phb->ioda.pe_alloc, 87 phb->ioda.total_pe, 0); 88 if (pe >= phb->ioda.total_pe) 89 return IODA_INVALID_PE; 90 } while(test_and_set_bit(pe, phb->ioda.pe_alloc)); 91 92 phb->ioda.pe_array[pe].phb = phb; 93 phb->ioda.pe_array[pe].pe_number = pe; 94 return pe; 95 } 96 97 static void pnv_ioda_free_pe(struct pnv_phb *phb, int pe) 98 { 99 WARN_ON(phb->ioda.pe_array[pe].pdev); 100 101 memset(&phb->ioda.pe_array[pe], 0, sizeof(struct pnv_ioda_pe)); 102 clear_bit(pe, phb->ioda.pe_alloc); 103 } 104 105 /* Currently those 2 are only used when MSIs are enabled, this will change 106 * but in the meantime, we need to protect them to avoid warnings 107 */ 108 #ifdef CONFIG_PCI_MSI 109 static struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev) 110 { 111 struct pci_controller *hose = pci_bus_to_host(dev->bus); 112 struct pnv_phb *phb = hose->private_data; 113 struct pci_dn *pdn = pnv_ioda_get_pdn(dev); 114 115 if (!pdn) 116 return NULL; 117 if (pdn->pe_number == IODA_INVALID_PE) 118 return NULL; 119 return &phb->ioda.pe_array[pdn->pe_number]; 120 } 121 #endif /* CONFIG_PCI_MSI */ 122 123 static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) 124 { 125 struct pci_dev *parent; 126 uint8_t bcomp, dcomp, fcomp; 127 long rc, rid_end, rid; 128 129 /* Bus validation ? */ 130 if (pe->pbus) { 131 int count; 132 133 dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER; 134 fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER; 135 parent = pe->pbus->self; 136 if (pe->flags & PNV_IODA_PE_BUS_ALL) 137 count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1; 138 else 139 count = 1; 140 141 switch(count) { 142 case 1: bcomp = OpalPciBusAll; break; 143 case 2: bcomp = OpalPciBus7Bits; break; 144 case 4: bcomp = OpalPciBus6Bits; break; 145 case 8: bcomp = OpalPciBus5Bits; break; 146 case 16: bcomp = OpalPciBus4Bits; break; 147 case 32: bcomp = OpalPciBus3Bits; break; 148 default: 149 pr_err("%s: Number of subordinate busses %d" 150 " unsupported\n", 151 pci_name(pe->pbus->self), count); 152 /* Do an exact match only */ 153 bcomp = OpalPciBusAll; 154 } 155 rid_end = pe->rid + (count << 8); 156 } else { 157 parent = pe->pdev->bus->self; 158 bcomp = OpalPciBusAll; 159 dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER; 160 fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER; 161 rid_end = pe->rid + 1; 162 } 163 164 /* Associate PE in PELT */ 165 rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid, 166 bcomp, dcomp, fcomp, OPAL_MAP_PE); 167 if (rc) { 168 pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc); 169 return -ENXIO; 170 } 171 opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number, 172 OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); 173 174 /* Add to all parents PELT-V */ 175 while (parent) { 176 struct pci_dn *pdn = pnv_ioda_get_pdn(parent); 177 if (pdn && pdn->pe_number != IODA_INVALID_PE) { 178 rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number, 179 pe->pe_number, OPAL_ADD_PE_TO_DOMAIN); 180 /* XXX What to do in case of error ? */ 181 } 182 parent = parent->bus->self; 183 } 184 /* Setup reverse map */ 185 for (rid = pe->rid; rid < rid_end; rid++) 186 phb->ioda.pe_rmap[rid] = pe->pe_number; 187 188 /* Setup one MVTs on IODA1 */ 189 if (phb->type == PNV_PHB_IODA1) { 190 pe->mve_number = pe->pe_number; 191 rc = opal_pci_set_mve(phb->opal_id, pe->mve_number, 192 pe->pe_number); 193 if (rc) { 194 pe_err(pe, "OPAL error %ld setting up MVE %d\n", 195 rc, pe->mve_number); 196 pe->mve_number = -1; 197 } else { 198 rc = opal_pci_set_mve_enable(phb->opal_id, 199 pe->mve_number, OPAL_ENABLE_MVE); 200 if (rc) { 201 pe_err(pe, "OPAL error %ld enabling MVE %d\n", 202 rc, pe->mve_number); 203 pe->mve_number = -1; 204 } 205 } 206 } else if (phb->type == PNV_PHB_IODA2) 207 pe->mve_number = 0; 208 209 return 0; 210 } 211 212 static void pnv_ioda_link_pe_by_weight(struct pnv_phb *phb, 213 struct pnv_ioda_pe *pe) 214 { 215 struct pnv_ioda_pe *lpe; 216 217 list_for_each_entry(lpe, &phb->ioda.pe_dma_list, dma_link) { 218 if (lpe->dma_weight < pe->dma_weight) { 219 list_add_tail(&pe->dma_link, &lpe->dma_link); 220 return; 221 } 222 } 223 list_add_tail(&pe->dma_link, &phb->ioda.pe_dma_list); 224 } 225 226 static unsigned int pnv_ioda_dma_weight(struct pci_dev *dev) 227 { 228 /* This is quite simplistic. The "base" weight of a device 229 * is 10. 0 means no DMA is to be accounted for it. 230 */ 231 232 /* If it's a bridge, no DMA */ 233 if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) 234 return 0; 235 236 /* Reduce the weight of slow USB controllers */ 237 if (dev->class == PCI_CLASS_SERIAL_USB_UHCI || 238 dev->class == PCI_CLASS_SERIAL_USB_OHCI || 239 dev->class == PCI_CLASS_SERIAL_USB_EHCI) 240 return 3; 241 242 /* Increase the weight of RAID (includes Obsidian) */ 243 if ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID) 244 return 15; 245 246 /* Default */ 247 return 10; 248 } 249 250 #if 0 251 static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) 252 { 253 struct pci_controller *hose = pci_bus_to_host(dev->bus); 254 struct pnv_phb *phb = hose->private_data; 255 struct pci_dn *pdn = pnv_ioda_get_pdn(dev); 256 struct pnv_ioda_pe *pe; 257 int pe_num; 258 259 if (!pdn) { 260 pr_err("%s: Device tree node not associated properly\n", 261 pci_name(dev)); 262 return NULL; 263 } 264 if (pdn->pe_number != IODA_INVALID_PE) 265 return NULL; 266 267 /* PE#0 has been pre-set */ 268 if (dev->bus->number == 0) 269 pe_num = 0; 270 else 271 pe_num = pnv_ioda_alloc_pe(phb); 272 if (pe_num == IODA_INVALID_PE) { 273 pr_warning("%s: Not enough PE# available, disabling device\n", 274 pci_name(dev)); 275 return NULL; 276 } 277 278 /* NOTE: We get only one ref to the pci_dev for the pdn, not for the 279 * pointer in the PE data structure, both should be destroyed at the 280 * same time. However, this needs to be looked at more closely again 281 * once we actually start removing things (Hotplug, SR-IOV, ...) 282 * 283 * At some point we want to remove the PDN completely anyways 284 */ 285 pe = &phb->ioda.pe_array[pe_num]; 286 pci_dev_get(dev); 287 pdn->pcidev = dev; 288 pdn->pe_number = pe_num; 289 pe->pdev = dev; 290 pe->pbus = NULL; 291 pe->tce32_seg = -1; 292 pe->mve_number = -1; 293 pe->rid = dev->bus->number << 8 | pdn->devfn; 294 295 pe_info(pe, "Associated device to PE\n"); 296 297 if (pnv_ioda_configure_pe(phb, pe)) { 298 /* XXX What do we do here ? */ 299 if (pe_num) 300 pnv_ioda_free_pe(phb, pe_num); 301 pdn->pe_number = IODA_INVALID_PE; 302 pe->pdev = NULL; 303 pci_dev_put(dev); 304 return NULL; 305 } 306 307 /* Assign a DMA weight to the device */ 308 pe->dma_weight = pnv_ioda_dma_weight(dev); 309 if (pe->dma_weight != 0) { 310 phb->ioda.dma_weight += pe->dma_weight; 311 phb->ioda.dma_pe_count++; 312 } 313 314 /* Link the PE */ 315 pnv_ioda_link_pe_by_weight(phb, pe); 316 317 return pe; 318 } 319 #endif /* Useful for SRIOV case */ 320 321 static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe) 322 { 323 struct pci_dev *dev; 324 325 list_for_each_entry(dev, &bus->devices, bus_list) { 326 struct pci_dn *pdn = pnv_ioda_get_pdn(dev); 327 328 if (pdn == NULL) { 329 pr_warn("%s: No device node associated with device !\n", 330 pci_name(dev)); 331 continue; 332 } 333 pci_dev_get(dev); 334 pdn->pcidev = dev; 335 pdn->pe_number = pe->pe_number; 336 pe->dma_weight += pnv_ioda_dma_weight(dev); 337 if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate) 338 pnv_ioda_setup_same_PE(dev->subordinate, pe); 339 } 340 } 341 342 /* 343 * There're 2 types of PCI bus sensitive PEs: One that is compromised of 344 * single PCI bus. Another one that contains the primary PCI bus and its 345 * subordinate PCI devices and buses. The second type of PE is normally 346 * orgiriated by PCIe-to-PCI bridge or PLX switch downstream ports. 347 */ 348 static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all) 349 { 350 struct pci_controller *hose = pci_bus_to_host(bus); 351 struct pnv_phb *phb = hose->private_data; 352 struct pnv_ioda_pe *pe; 353 int pe_num; 354 355 pe_num = pnv_ioda_alloc_pe(phb); 356 if (pe_num == IODA_INVALID_PE) { 357 pr_warning("%s: Not enough PE# available for PCI bus %04x:%02x\n", 358 __func__, pci_domain_nr(bus), bus->number); 359 return; 360 } 361 362 pe = &phb->ioda.pe_array[pe_num]; 363 pe->flags = (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS); 364 pe->pbus = bus; 365 pe->pdev = NULL; 366 pe->tce32_seg = -1; 367 pe->mve_number = -1; 368 pe->rid = bus->busn_res.start << 8; 369 pe->dma_weight = 0; 370 371 if (all) 372 pe_info(pe, "Secondary bus %d..%d associated with PE#%d\n", 373 bus->busn_res.start, bus->busn_res.end, pe_num); 374 else 375 pe_info(pe, "Secondary bus %d associated with PE#%d\n", 376 bus->busn_res.start, pe_num); 377 378 if (pnv_ioda_configure_pe(phb, pe)) { 379 /* XXX What do we do here ? */ 380 if (pe_num) 381 pnv_ioda_free_pe(phb, pe_num); 382 pe->pbus = NULL; 383 return; 384 } 385 386 /* Associate it with all child devices */ 387 pnv_ioda_setup_same_PE(bus, pe); 388 389 /* Put PE to the list */ 390 list_add_tail(&pe->list, &phb->ioda.pe_list); 391 392 /* Account for one DMA PE if at least one DMA capable device exist 393 * below the bridge 394 */ 395 if (pe->dma_weight != 0) { 396 phb->ioda.dma_weight += pe->dma_weight; 397 phb->ioda.dma_pe_count++; 398 } 399 400 /* Link the PE */ 401 pnv_ioda_link_pe_by_weight(phb, pe); 402 } 403 404 static void pnv_ioda_setup_PEs(struct pci_bus *bus) 405 { 406 struct pci_dev *dev; 407 408 pnv_ioda_setup_bus_PE(bus, 0); 409 410 list_for_each_entry(dev, &bus->devices, bus_list) { 411 if (dev->subordinate) { 412 if (pci_pcie_type(dev) == PCI_EXP_TYPE_PCI_BRIDGE) 413 pnv_ioda_setup_bus_PE(dev->subordinate, 1); 414 else 415 pnv_ioda_setup_PEs(dev->subordinate); 416 } 417 } 418 } 419 420 /* 421 * Configure PEs so that the downstream PCI buses and devices 422 * could have their associated PE#. Unfortunately, we didn't 423 * figure out the way to identify the PLX bridge yet. So we 424 * simply put the PCI bus and the subordinate behind the root 425 * port to PE# here. The game rule here is expected to be changed 426 * as soon as we can detected PLX bridge correctly. 427 */ 428 static void pnv_pci_ioda_setup_PEs(void) 429 { 430 struct pci_controller *hose, *tmp; 431 432 list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { 433 pnv_ioda_setup_PEs(hose->bus); 434 } 435 } 436 437 static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev) 438 { 439 struct pci_dn *pdn = pnv_ioda_get_pdn(pdev); 440 struct pnv_ioda_pe *pe; 441 442 /* 443 * The function can be called while the PE# 444 * hasn't been assigned. Do nothing for the 445 * case. 446 */ 447 if (!pdn || pdn->pe_number == IODA_INVALID_PE) 448 return; 449 450 pe = &phb->ioda.pe_array[pdn->pe_number]; 451 set_iommu_table_base(&pdev->dev, &pe->tce32_table); 452 } 453 454 static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl, 455 u64 *startp, u64 *endp) 456 { 457 u64 __iomem *invalidate = (u64 __iomem *)tbl->it_index; 458 unsigned long start, end, inc; 459 460 start = __pa(startp); 461 end = __pa(endp); 462 463 /* BML uses this case for p6/p7/galaxy2: Shift addr and put in node */ 464 if (tbl->it_busno) { 465 start <<= 12; 466 end <<= 12; 467 inc = 128 << 12; 468 start |= tbl->it_busno; 469 end |= tbl->it_busno; 470 } else if (tbl->it_type & TCE_PCI_SWINV_PAIR) { 471 /* p7ioc-style invalidation, 2 TCEs per write */ 472 start |= (1ull << 63); 473 end |= (1ull << 63); 474 inc = 16; 475 } else { 476 /* Default (older HW) */ 477 inc = 128; 478 } 479 480 end |= inc - 1; /* round up end to be different than start */ 481 482 mb(); /* Ensure above stores are visible */ 483 while (start <= end) { 484 __raw_writeq(start, invalidate); 485 start += inc; 486 } 487 488 /* 489 * The iommu layer will do another mb() for us on build() 490 * and we don't care on free() 491 */ 492 } 493 494 static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe, 495 struct iommu_table *tbl, 496 u64 *startp, u64 *endp) 497 { 498 unsigned long start, end, inc; 499 u64 __iomem *invalidate = (u64 __iomem *)tbl->it_index; 500 501 /* We'll invalidate DMA address in PE scope */ 502 start = 0x2ul << 60; 503 start |= (pe->pe_number & 0xFF); 504 end = start; 505 506 /* Figure out the start, end and step */ 507 inc = tbl->it_offset + (((u64)startp - tbl->it_base) / sizeof(u64)); 508 start |= (inc << 12); 509 inc = tbl->it_offset + (((u64)endp - tbl->it_base) / sizeof(u64)); 510 end |= (inc << 12); 511 inc = (0x1ul << 12); 512 mb(); 513 514 while (start <= end) { 515 __raw_writeq(start, invalidate); 516 start += inc; 517 } 518 } 519 520 void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl, 521 u64 *startp, u64 *endp) 522 { 523 struct pnv_ioda_pe *pe = container_of(tbl, struct pnv_ioda_pe, 524 tce32_table); 525 struct pnv_phb *phb = pe->phb; 526 527 if (phb->type == PNV_PHB_IODA1) 528 pnv_pci_ioda1_tce_invalidate(tbl, startp, endp); 529 else 530 pnv_pci_ioda2_tce_invalidate(pe, tbl, startp, endp); 531 } 532 533 static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, 534 struct pnv_ioda_pe *pe, unsigned int base, 535 unsigned int segs) 536 { 537 538 struct page *tce_mem = NULL; 539 const __be64 *swinvp; 540 struct iommu_table *tbl; 541 unsigned int i; 542 int64_t rc; 543 void *addr; 544 545 /* 256M DMA window, 4K TCE pages, 8 bytes TCE */ 546 #define TCE32_TABLE_SIZE ((0x10000000 / 0x1000) * 8) 547 548 /* XXX FIXME: Handle 64-bit only DMA devices */ 549 /* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. */ 550 /* XXX FIXME: Allocate multi-level tables on PHB3 */ 551 552 /* We shouldn't already have a 32-bit DMA associated */ 553 if (WARN_ON(pe->tce32_seg >= 0)) 554 return; 555 556 /* Grab a 32-bit TCE table */ 557 pe->tce32_seg = base; 558 pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n", 559 (base << 28), ((base + segs) << 28) - 1); 560 561 /* XXX Currently, we allocate one big contiguous table for the 562 * TCEs. We only really need one chunk per 256M of TCE space 563 * (ie per segment) but that's an optimization for later, it 564 * requires some added smarts with our get/put_tce implementation 565 */ 566 tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL, 567 get_order(TCE32_TABLE_SIZE * segs)); 568 if (!tce_mem) { 569 pe_err(pe, " Failed to allocate a 32-bit TCE memory\n"); 570 goto fail; 571 } 572 addr = page_address(tce_mem); 573 memset(addr, 0, TCE32_TABLE_SIZE * segs); 574 575 /* Configure HW */ 576 for (i = 0; i < segs; i++) { 577 rc = opal_pci_map_pe_dma_window(phb->opal_id, 578 pe->pe_number, 579 base + i, 1, 580 __pa(addr) + TCE32_TABLE_SIZE * i, 581 TCE32_TABLE_SIZE, 0x1000); 582 if (rc) { 583 pe_err(pe, " Failed to configure 32-bit TCE table," 584 " err %ld\n", rc); 585 goto fail; 586 } 587 } 588 589 /* Setup linux iommu table */ 590 tbl = &pe->tce32_table; 591 pnv_pci_setup_iommu_table(tbl, addr, TCE32_TABLE_SIZE * segs, 592 base << 28); 593 594 /* OPAL variant of P7IOC SW invalidated TCEs */ 595 swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL); 596 if (swinvp) { 597 /* We need a couple more fields -- an address and a data 598 * to or. Since the bus is only printed out on table free 599 * errors, and on the first pass the data will be a relative 600 * bus number, print that out instead. 601 */ 602 tbl->it_busno = 0; 603 tbl->it_index = (unsigned long)ioremap(be64_to_cpup(swinvp), 8); 604 tbl->it_type = TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE | 605 TCE_PCI_SWINV_PAIR; 606 } 607 iommu_init_table(tbl, phb->hose->node); 608 609 return; 610 fail: 611 /* XXX Failure: Try to fallback to 64-bit only ? */ 612 if (pe->tce32_seg >= 0) 613 pe->tce32_seg = -1; 614 if (tce_mem) 615 __free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs)); 616 } 617 618 static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, 619 struct pnv_ioda_pe *pe) 620 { 621 struct page *tce_mem = NULL; 622 void *addr; 623 const __be64 *swinvp; 624 struct iommu_table *tbl; 625 unsigned int tce_table_size, end; 626 int64_t rc; 627 628 /* We shouldn't already have a 32-bit DMA associated */ 629 if (WARN_ON(pe->tce32_seg >= 0)) 630 return; 631 632 /* The PE will reserve all possible 32-bits space */ 633 pe->tce32_seg = 0; 634 end = (1 << ilog2(phb->ioda.m32_pci_base)); 635 tce_table_size = (end / 0x1000) * 8; 636 pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n", 637 end); 638 639 /* Allocate TCE table */ 640 tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL, 641 get_order(tce_table_size)); 642 if (!tce_mem) { 643 pe_err(pe, "Failed to allocate a 32-bit TCE memory\n"); 644 goto fail; 645 } 646 addr = page_address(tce_mem); 647 memset(addr, 0, tce_table_size); 648 649 /* 650 * Map TCE table through TVT. The TVE index is the PE number 651 * shifted by 1 bit for 32-bits DMA space. 652 */ 653 rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number, 654 pe->pe_number << 1, 1, __pa(addr), 655 tce_table_size, 0x1000); 656 if (rc) { 657 pe_err(pe, "Failed to configure 32-bit TCE table," 658 " err %ld\n", rc); 659 goto fail; 660 } 661 662 /* Setup linux iommu table */ 663 tbl = &pe->tce32_table; 664 pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0); 665 666 /* OPAL variant of PHB3 invalidated TCEs */ 667 swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL); 668 if (swinvp) { 669 /* We need a couple more fields -- an address and a data 670 * to or. Since the bus is only printed out on table free 671 * errors, and on the first pass the data will be a relative 672 * bus number, print that out instead. 673 */ 674 tbl->it_busno = 0; 675 tbl->it_index = (unsigned long)ioremap(be64_to_cpup(swinvp), 8); 676 tbl->it_type = TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE; 677 } 678 iommu_init_table(tbl, phb->hose->node); 679 680 return; 681 fail: 682 if (pe->tce32_seg >= 0) 683 pe->tce32_seg = -1; 684 if (tce_mem) 685 __free_pages(tce_mem, get_order(tce_table_size)); 686 } 687 688 static void pnv_ioda_setup_dma(struct pnv_phb *phb) 689 { 690 struct pci_controller *hose = phb->hose; 691 unsigned int residual, remaining, segs, tw, base; 692 struct pnv_ioda_pe *pe; 693 694 /* If we have more PE# than segments available, hand out one 695 * per PE until we run out and let the rest fail. If not, 696 * then we assign at least one segment per PE, plus more based 697 * on the amount of devices under that PE 698 */ 699 if (phb->ioda.dma_pe_count > phb->ioda.tce32_count) 700 residual = 0; 701 else 702 residual = phb->ioda.tce32_count - 703 phb->ioda.dma_pe_count; 704 705 pr_info("PCI: Domain %04x has %ld available 32-bit DMA segments\n", 706 hose->global_number, phb->ioda.tce32_count); 707 pr_info("PCI: %d PE# for a total weight of %d\n", 708 phb->ioda.dma_pe_count, phb->ioda.dma_weight); 709 710 /* Walk our PE list and configure their DMA segments, hand them 711 * out one base segment plus any residual segments based on 712 * weight 713 */ 714 remaining = phb->ioda.tce32_count; 715 tw = phb->ioda.dma_weight; 716 base = 0; 717 list_for_each_entry(pe, &phb->ioda.pe_dma_list, dma_link) { 718 if (!pe->dma_weight) 719 continue; 720 if (!remaining) { 721 pe_warn(pe, "No DMA32 resources available\n"); 722 continue; 723 } 724 segs = 1; 725 if (residual) { 726 segs += ((pe->dma_weight * residual) + (tw / 2)) / tw; 727 if (segs > remaining) 728 segs = remaining; 729 } 730 731 /* 732 * For IODA2 compliant PHB3, we needn't care about the weight. 733 * The all available 32-bits DMA space will be assigned to 734 * the specific PE. 735 */ 736 if (phb->type == PNV_PHB_IODA1) { 737 pe_info(pe, "DMA weight %d, assigned %d DMA32 segments\n", 738 pe->dma_weight, segs); 739 pnv_pci_ioda_setup_dma_pe(phb, pe, base, segs); 740 } else { 741 pe_info(pe, "Assign DMA32 space\n"); 742 segs = 0; 743 pnv_pci_ioda2_setup_dma_pe(phb, pe); 744 } 745 746 remaining -= segs; 747 base += segs; 748 } 749 } 750 751 #ifdef CONFIG_PCI_MSI 752 static void pnv_ioda2_msi_eoi(struct irq_data *d) 753 { 754 unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); 755 struct irq_chip *chip = irq_data_get_irq_chip(d); 756 struct pnv_phb *phb = container_of(chip, struct pnv_phb, 757 ioda.irq_chip); 758 int64_t rc; 759 760 rc = opal_pci_msi_eoi(phb->opal_id, hw_irq); 761 WARN_ON_ONCE(rc); 762 763 icp_native_eoi(d); 764 } 765 766 static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, 767 unsigned int hwirq, unsigned int virq, 768 unsigned int is_64, struct msi_msg *msg) 769 { 770 struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev); 771 struct irq_data *idata; 772 struct irq_chip *ichip; 773 unsigned int xive_num = hwirq - phb->msi_base; 774 uint64_t addr64; 775 uint32_t addr32, data; 776 int rc; 777 778 /* No PE assigned ? bail out ... no MSI for you ! */ 779 if (pe == NULL) 780 return -ENXIO; 781 782 /* Check if we have an MVE */ 783 if (pe->mve_number < 0) 784 return -ENXIO; 785 786 /* Assign XIVE to PE */ 787 rc = opal_pci_set_xive_pe(phb->opal_id, pe->pe_number, xive_num); 788 if (rc) { 789 pr_warn("%s: OPAL error %d setting XIVE %d PE\n", 790 pci_name(dev), rc, xive_num); 791 return -EIO; 792 } 793 794 if (is_64) { 795 rc = opal_get_msi_64(phb->opal_id, pe->mve_number, xive_num, 1, 796 &addr64, &data); 797 if (rc) { 798 pr_warn("%s: OPAL error %d getting 64-bit MSI data\n", 799 pci_name(dev), rc); 800 return -EIO; 801 } 802 msg->address_hi = addr64 >> 32; 803 msg->address_lo = addr64 & 0xfffffffful; 804 } else { 805 rc = opal_get_msi_32(phb->opal_id, pe->mve_number, xive_num, 1, 806 &addr32, &data); 807 if (rc) { 808 pr_warn("%s: OPAL error %d getting 32-bit MSI data\n", 809 pci_name(dev), rc); 810 return -EIO; 811 } 812 msg->address_hi = 0; 813 msg->address_lo = addr32; 814 } 815 msg->data = data; 816 817 /* 818 * Change the IRQ chip for the MSI interrupts on PHB3. 819 * The corresponding IRQ chip should be populated for 820 * the first time. 821 */ 822 if (phb->type == PNV_PHB_IODA2) { 823 if (!phb->ioda.irq_chip_init) { 824 idata = irq_get_irq_data(virq); 825 ichip = irq_data_get_irq_chip(idata); 826 phb->ioda.irq_chip_init = 1; 827 phb->ioda.irq_chip = *ichip; 828 phb->ioda.irq_chip.irq_eoi = pnv_ioda2_msi_eoi; 829 } 830 831 irq_set_chip(virq, &phb->ioda.irq_chip); 832 } 833 834 pr_devel("%s: %s-bit MSI on hwirq %x (xive #%d)," 835 " address=%x_%08x data=%x PE# %d\n", 836 pci_name(dev), is_64 ? "64" : "32", hwirq, xive_num, 837 msg->address_hi, msg->address_lo, data, pe->pe_number); 838 839 return 0; 840 } 841 842 static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) 843 { 844 unsigned int count; 845 const __be32 *prop = of_get_property(phb->hose->dn, 846 "ibm,opal-msi-ranges", NULL); 847 if (!prop) { 848 /* BML Fallback */ 849 prop = of_get_property(phb->hose->dn, "msi-ranges", NULL); 850 } 851 if (!prop) 852 return; 853 854 phb->msi_base = be32_to_cpup(prop); 855 count = be32_to_cpup(prop + 1); 856 if (msi_bitmap_alloc(&phb->msi_bmp, count, phb->hose->dn)) { 857 pr_err("PCI %d: Failed to allocate MSI bitmap !\n", 858 phb->hose->global_number); 859 return; 860 } 861 862 phb->msi_setup = pnv_pci_ioda_msi_setup; 863 phb->msi32_support = 1; 864 pr_info(" Allocated bitmap for %d MSIs (base IRQ 0x%x)\n", 865 count, phb->msi_base); 866 } 867 #else 868 static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) { } 869 #endif /* CONFIG_PCI_MSI */ 870 871 /* 872 * This function is supposed to be called on basis of PE from top 873 * to bottom style. So the the I/O or MMIO segment assigned to 874 * parent PE could be overrided by its child PEs if necessary. 875 */ 876 static void pnv_ioda_setup_pe_seg(struct pci_controller *hose, 877 struct pnv_ioda_pe *pe) 878 { 879 struct pnv_phb *phb = hose->private_data; 880 struct pci_bus_region region; 881 struct resource *res; 882 int i, index; 883 int rc; 884 885 /* 886 * NOTE: We only care PCI bus based PE for now. For PCI 887 * device based PE, for example SRIOV sensitive VF should 888 * be figured out later. 889 */ 890 BUG_ON(!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))); 891 892 pci_bus_for_each_resource(pe->pbus, res, i) { 893 if (!res || !res->flags || 894 res->start > res->end) 895 continue; 896 897 if (res->flags & IORESOURCE_IO) { 898 region.start = res->start - phb->ioda.io_pci_base; 899 region.end = res->end - phb->ioda.io_pci_base; 900 index = region.start / phb->ioda.io_segsize; 901 902 while (index < phb->ioda.total_pe && 903 region.start <= region.end) { 904 phb->ioda.io_segmap[index] = pe->pe_number; 905 rc = opal_pci_map_pe_mmio_window(phb->opal_id, 906 pe->pe_number, OPAL_IO_WINDOW_TYPE, 0, index); 907 if (rc != OPAL_SUCCESS) { 908 pr_err("%s: OPAL error %d when mapping IO " 909 "segment #%d to PE#%d\n", 910 __func__, rc, index, pe->pe_number); 911 break; 912 } 913 914 region.start += phb->ioda.io_segsize; 915 index++; 916 } 917 } else if (res->flags & IORESOURCE_MEM) { 918 /* WARNING: Assumes M32 is mem region 0 in PHB. We need to 919 * harden that algorithm when we start supporting M64 920 */ 921 region.start = res->start - 922 hose->mem_offset[0] - 923 phb->ioda.m32_pci_base; 924 region.end = res->end - 925 hose->mem_offset[0] - 926 phb->ioda.m32_pci_base; 927 index = region.start / phb->ioda.m32_segsize; 928 929 while (index < phb->ioda.total_pe && 930 region.start <= region.end) { 931 phb->ioda.m32_segmap[index] = pe->pe_number; 932 rc = opal_pci_map_pe_mmio_window(phb->opal_id, 933 pe->pe_number, OPAL_M32_WINDOW_TYPE, 0, index); 934 if (rc != OPAL_SUCCESS) { 935 pr_err("%s: OPAL error %d when mapping M32 " 936 "segment#%d to PE#%d", 937 __func__, rc, index, pe->pe_number); 938 break; 939 } 940 941 region.start += phb->ioda.m32_segsize; 942 index++; 943 } 944 } 945 } 946 } 947 948 static void pnv_pci_ioda_setup_seg(void) 949 { 950 struct pci_controller *tmp, *hose; 951 struct pnv_phb *phb; 952 struct pnv_ioda_pe *pe; 953 954 list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { 955 phb = hose->private_data; 956 list_for_each_entry(pe, &phb->ioda.pe_list, list) { 957 pnv_ioda_setup_pe_seg(hose, pe); 958 } 959 } 960 } 961 962 static void pnv_pci_ioda_setup_DMA(void) 963 { 964 struct pci_controller *hose, *tmp; 965 struct pnv_phb *phb; 966 967 list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { 968 pnv_ioda_setup_dma(hose->private_data); 969 970 /* Mark the PHB initialization done */ 971 phb = hose->private_data; 972 phb->initialized = 1; 973 } 974 } 975 976 static void pnv_pci_ioda_fixup(void) 977 { 978 pnv_pci_ioda_setup_PEs(); 979 pnv_pci_ioda_setup_seg(); 980 pnv_pci_ioda_setup_DMA(); 981 } 982 983 /* 984 * Returns the alignment for I/O or memory windows for P2P 985 * bridges. That actually depends on how PEs are segmented. 986 * For now, we return I/O or M32 segment size for PE sensitive 987 * P2P bridges. Otherwise, the default values (4KiB for I/O, 988 * 1MiB for memory) will be returned. 989 * 990 * The current PCI bus might be put into one PE, which was 991 * create against the parent PCI bridge. For that case, we 992 * needn't enlarge the alignment so that we can save some 993 * resources. 994 */ 995 static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus, 996 unsigned long type) 997 { 998 struct pci_dev *bridge; 999 struct pci_controller *hose = pci_bus_to_host(bus); 1000 struct pnv_phb *phb = hose->private_data; 1001 int num_pci_bridges = 0; 1002 1003 bridge = bus->self; 1004 while (bridge) { 1005 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) { 1006 num_pci_bridges++; 1007 if (num_pci_bridges >= 2) 1008 return 1; 1009 } 1010 1011 bridge = bridge->bus->self; 1012 } 1013 1014 /* We need support prefetchable memory window later */ 1015 if (type & IORESOURCE_MEM) 1016 return phb->ioda.m32_segsize; 1017 1018 return phb->ioda.io_segsize; 1019 } 1020 1021 /* Prevent enabling devices for which we couldn't properly 1022 * assign a PE 1023 */ 1024 static int pnv_pci_enable_device_hook(struct pci_dev *dev) 1025 { 1026 struct pci_controller *hose = pci_bus_to_host(dev->bus); 1027 struct pnv_phb *phb = hose->private_data; 1028 struct pci_dn *pdn; 1029 1030 /* The function is probably called while the PEs have 1031 * not be created yet. For example, resource reassignment 1032 * during PCI probe period. We just skip the check if 1033 * PEs isn't ready. 1034 */ 1035 if (!phb->initialized) 1036 return 0; 1037 1038 pdn = pnv_ioda_get_pdn(dev); 1039 if (!pdn || pdn->pe_number == IODA_INVALID_PE) 1040 return -EINVAL; 1041 1042 return 0; 1043 } 1044 1045 static u32 pnv_ioda_bdfn_to_pe(struct pnv_phb *phb, struct pci_bus *bus, 1046 u32 devfn) 1047 { 1048 return phb->ioda.pe_rmap[(bus->number << 8) | devfn]; 1049 } 1050 1051 void __init pnv_pci_init_ioda_phb(struct device_node *np, int ioda_type) 1052 { 1053 struct pci_controller *hose; 1054 static int primary = 1; 1055 struct pnv_phb *phb; 1056 unsigned long size, m32map_off, iomap_off, pemap_off; 1057 const u64 *prop64; 1058 const u32 *prop32; 1059 u64 phb_id; 1060 void *aux; 1061 long rc; 1062 1063 pr_info(" Initializing IODA%d OPAL PHB %s\n", ioda_type, np->full_name); 1064 1065 prop64 = of_get_property(np, "ibm,opal-phbid", NULL); 1066 if (!prop64) { 1067 pr_err(" Missing \"ibm,opal-phbid\" property !\n"); 1068 return; 1069 } 1070 phb_id = be64_to_cpup(prop64); 1071 pr_debug(" PHB-ID : 0x%016llx\n", phb_id); 1072 1073 phb = alloc_bootmem(sizeof(struct pnv_phb)); 1074 if (phb) { 1075 memset(phb, 0, sizeof(struct pnv_phb)); 1076 phb->hose = hose = pcibios_alloc_controller(np); 1077 } 1078 if (!phb || !phb->hose) { 1079 pr_err("PCI: Failed to allocate PCI controller for %s\n", 1080 np->full_name); 1081 return; 1082 } 1083 1084 spin_lock_init(&phb->lock); 1085 /* XXX Use device-tree */ 1086 hose->first_busno = 0; 1087 hose->last_busno = 0xff; 1088 hose->private_data = phb; 1089 phb->opal_id = phb_id; 1090 phb->type = ioda_type; 1091 1092 /* Detect specific models for error handling */ 1093 if (of_device_is_compatible(np, "ibm,p7ioc-pciex")) 1094 phb->model = PNV_PHB_MODEL_P7IOC; 1095 else if (of_device_is_compatible(np, "ibm,power8-pciex")) 1096 phb->model = PNV_PHB_MODEL_PHB3; 1097 else 1098 phb->model = PNV_PHB_MODEL_UNKNOWN; 1099 1100 /* Parse 32-bit and IO ranges (if any) */ 1101 pci_process_bridge_OF_ranges(phb->hose, np, primary); 1102 primary = 0; 1103 1104 /* Get registers */ 1105 phb->regs = of_iomap(np, 0); 1106 if (phb->regs == NULL) 1107 pr_err(" Failed to map registers !\n"); 1108 1109 /* Initialize more IODA stuff */ 1110 prop32 = of_get_property(np, "ibm,opal-num-pes", NULL); 1111 if (!prop32) 1112 phb->ioda.total_pe = 1; 1113 else 1114 phb->ioda.total_pe = *prop32; 1115 1116 phb->ioda.m32_size = resource_size(&hose->mem_resources[0]); 1117 /* FW Has already off top 64k of M32 space (MSI space) */ 1118 phb->ioda.m32_size += 0x10000; 1119 1120 phb->ioda.m32_segsize = phb->ioda.m32_size / phb->ioda.total_pe; 1121 phb->ioda.m32_pci_base = hose->mem_resources[0].start - hose->mem_offset[0]; 1122 phb->ioda.io_size = hose->pci_io_size; 1123 phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe; 1124 phb->ioda.io_pci_base = 0; /* XXX calculate this ? */ 1125 1126 /* Allocate aux data & arrays 1127 * 1128 * XXX TODO: Don't allocate io segmap on PHB3 1129 */ 1130 size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long)); 1131 m32map_off = size; 1132 size += phb->ioda.total_pe * sizeof(phb->ioda.m32_segmap[0]); 1133 iomap_off = size; 1134 size += phb->ioda.total_pe * sizeof(phb->ioda.io_segmap[0]); 1135 pemap_off = size; 1136 size += phb->ioda.total_pe * sizeof(struct pnv_ioda_pe); 1137 aux = alloc_bootmem(size); 1138 memset(aux, 0, size); 1139 phb->ioda.pe_alloc = aux; 1140 phb->ioda.m32_segmap = aux + m32map_off; 1141 phb->ioda.io_segmap = aux + iomap_off; 1142 phb->ioda.pe_array = aux + pemap_off; 1143 set_bit(0, phb->ioda.pe_alloc); 1144 1145 INIT_LIST_HEAD(&phb->ioda.pe_dma_list); 1146 INIT_LIST_HEAD(&phb->ioda.pe_list); 1147 1148 /* Calculate how many 32-bit TCE segments we have */ 1149 phb->ioda.tce32_count = phb->ioda.m32_pci_base >> 28; 1150 1151 /* Clear unusable m64 */ 1152 hose->mem_resources[1].flags = 0; 1153 hose->mem_resources[1].start = 0; 1154 hose->mem_resources[1].end = 0; 1155 hose->mem_resources[2].flags = 0; 1156 hose->mem_resources[2].start = 0; 1157 hose->mem_resources[2].end = 0; 1158 1159 #if 0 /* We should really do that ... */ 1160 rc = opal_pci_set_phb_mem_window(opal->phb_id, 1161 window_type, 1162 window_num, 1163 starting_real_address, 1164 starting_pci_address, 1165 segment_size); 1166 #endif 1167 1168 pr_info(" %d PE's M32: 0x%x [segment=0x%x] IO: 0x%x [segment=0x%x]\n", 1169 phb->ioda.total_pe, 1170 phb->ioda.m32_size, phb->ioda.m32_segsize, 1171 phb->ioda.io_size, phb->ioda.io_segsize); 1172 1173 phb->hose->ops = &pnv_pci_ops; 1174 1175 /* Setup RID -> PE mapping function */ 1176 phb->bdfn_to_pe = pnv_ioda_bdfn_to_pe; 1177 1178 /* Setup TCEs */ 1179 phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup; 1180 1181 /* Setup MSI support */ 1182 pnv_pci_init_ioda_msis(phb); 1183 1184 /* 1185 * We pass the PCI probe flag PCI_REASSIGN_ALL_RSRC here 1186 * to let the PCI core do resource assignment. It's supposed 1187 * that the PCI core will do correct I/O and MMIO alignment 1188 * for the P2P bridge bars so that each PCI bus (excluding 1189 * the child P2P bridges) can form individual PE. 1190 */ 1191 ppc_md.pcibios_fixup = pnv_pci_ioda_fixup; 1192 ppc_md.pcibios_enable_device_hook = pnv_pci_enable_device_hook; 1193 ppc_md.pcibios_window_alignment = pnv_pci_window_alignment; 1194 pci_add_flags(PCI_REASSIGN_ALL_RSRC); 1195 1196 /* Reset IODA tables to a clean state */ 1197 rc = opal_pci_reset(phb_id, OPAL_PCI_IODA_TABLE_RESET, OPAL_ASSERT_RESET); 1198 if (rc) 1199 pr_warning(" OPAL Error %ld performing IODA table reset !\n", rc); 1200 1201 /* 1202 * On IODA1 map everything to PE#0, on IODA2 we assume the IODA reset 1203 * has cleared the RTT which has the same effect 1204 */ 1205 if (ioda_type == PNV_PHB_IODA1) 1206 opal_pci_set_pe(phb_id, 0, 0, 7, 1, 1 , OPAL_MAP_PE); 1207 } 1208 1209 void pnv_pci_init_ioda2_phb(struct device_node *np) 1210 { 1211 pnv_pci_init_ioda_phb(np, PNV_PHB_IODA2); 1212 } 1213 1214 void __init pnv_pci_init_ioda_hub(struct device_node *np) 1215 { 1216 struct device_node *phbn; 1217 const u64 *prop64; 1218 u64 hub_id; 1219 1220 pr_info("Probing IODA IO-Hub %s\n", np->full_name); 1221 1222 prop64 = of_get_property(np, "ibm,opal-hubid", NULL); 1223 if (!prop64) { 1224 pr_err(" Missing \"ibm,opal-hubid\" property !\n"); 1225 return; 1226 } 1227 hub_id = be64_to_cpup(prop64); 1228 pr_devel(" HUB-ID : 0x%016llx\n", hub_id); 1229 1230 /* Count child PHBs */ 1231 for_each_child_of_node(np, phbn) { 1232 /* Look for IODA1 PHBs */ 1233 if (of_device_is_compatible(phbn, "ibm,ioda-phb")) 1234 pnv_pci_init_ioda_phb(phbn, PNV_PHB_IODA1); 1235 } 1236 } 1237