1 /* 2 * Support PCI/PCIe on PowerNV platforms 3 * 4 * Copyright 2011 Benjamin Herrenschmidt, IBM Corp. 5 * 6 * This program is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU General Public License 8 * as published by the Free Software Foundation; either version 9 * 2 of the License, or (at your option) any later version. 10 */ 11 12 #undef DEBUG 13 14 #include <linux/kernel.h> 15 #include <linux/pci.h> 16 #include <linux/crash_dump.h> 17 #include <linux/debugfs.h> 18 #include <linux/delay.h> 19 #include <linux/string.h> 20 #include <linux/init.h> 21 #include <linux/bootmem.h> 22 #include <linux/irq.h> 23 #include <linux/io.h> 24 #include <linux/msi.h> 25 #include <linux/memblock.h> 26 27 #include <asm/sections.h> 28 #include <asm/io.h> 29 #include <asm/prom.h> 30 #include <asm/pci-bridge.h> 31 #include <asm/machdep.h> 32 #include <asm/msi_bitmap.h> 33 #include <asm/ppc-pci.h> 34 #include <asm/opal.h> 35 #include <asm/iommu.h> 36 #include <asm/tce.h> 37 #include <asm/xics.h> 38 #include <asm/debug.h> 39 #include <asm/firmware.h> 40 41 #include "powernv.h" 42 #include "pci.h" 43 44 #define define_pe_printk_level(func, kern_level) \ 45 static int func(const struct pnv_ioda_pe *pe, const char *fmt, ...) \ 46 { \ 47 struct va_format vaf; \ 48 va_list args; \ 49 char pfix[32]; \ 50 int r; \ 51 \ 52 va_start(args, fmt); \ 53 \ 54 vaf.fmt = fmt; \ 55 vaf.va = &args; \ 56 \ 57 if (pe->pdev) \ 58 strlcpy(pfix, dev_name(&pe->pdev->dev), \ 59 sizeof(pfix)); \ 60 else \ 61 sprintf(pfix, "%04x:%02x ", \ 62 pci_domain_nr(pe->pbus), \ 63 pe->pbus->number); \ 64 r = printk(kern_level "pci %s: [PE# %.3d] %pV", \ 65 pfix, pe->pe_number, &vaf); \ 66 \ 67 va_end(args); \ 68 \ 69 return r; \ 70 } \ 71 72 define_pe_printk_level(pe_err, KERN_ERR); 73 define_pe_printk_level(pe_warn, KERN_WARNING); 74 define_pe_printk_level(pe_info, KERN_INFO); 75 76 /* 77 * stdcix is only supposed to be used in hypervisor real mode as per 78 * the architecture spec 79 */ 80 static inline void __raw_rm_writeq(u64 val, volatile void __iomem *paddr) 81 { 82 __asm__ __volatile__("stdcix %0,0,%1" 83 : : "r" (val), "r" (paddr) : "memory"); 84 } 85 86 static inline bool pnv_pci_is_mem_pref_64(unsigned long flags) 87 { 88 return ((flags & (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH)) == 89 (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH)); 90 } 91 92 static int pnv_ioda_alloc_pe(struct pnv_phb *phb) 93 { 94 unsigned long pe; 95 96 do { 97 pe = find_next_zero_bit(phb->ioda.pe_alloc, 98 phb->ioda.total_pe, 0); 99 if (pe >= phb->ioda.total_pe) 100 return IODA_INVALID_PE; 101 } while(test_and_set_bit(pe, phb->ioda.pe_alloc)); 102 103 phb->ioda.pe_array[pe].phb = phb; 104 phb->ioda.pe_array[pe].pe_number = pe; 105 return pe; 106 } 107 108 static void pnv_ioda_free_pe(struct pnv_phb *phb, int pe) 109 { 110 WARN_ON(phb->ioda.pe_array[pe].pdev); 111 112 memset(&phb->ioda.pe_array[pe], 0, sizeof(struct pnv_ioda_pe)); 113 clear_bit(pe, phb->ioda.pe_alloc); 114 } 115 116 /* The default M64 BAR is shared by all PEs */ 117 static int pnv_ioda2_init_m64(struct pnv_phb *phb) 118 { 119 const char *desc; 120 struct resource *r; 121 s64 rc; 122 123 /* Configure the default M64 BAR */ 124 rc = opal_pci_set_phb_mem_window(phb->opal_id, 125 OPAL_M64_WINDOW_TYPE, 126 phb->ioda.m64_bar_idx, 127 phb->ioda.m64_base, 128 0, /* unused */ 129 phb->ioda.m64_size); 130 if (rc != OPAL_SUCCESS) { 131 desc = "configuring"; 132 goto fail; 133 } 134 135 /* Enable the default M64 BAR */ 136 rc = opal_pci_phb_mmio_enable(phb->opal_id, 137 OPAL_M64_WINDOW_TYPE, 138 phb->ioda.m64_bar_idx, 139 OPAL_ENABLE_M64_SPLIT); 140 if (rc != OPAL_SUCCESS) { 141 desc = "enabling"; 142 goto fail; 143 } 144 145 /* Mark the M64 BAR assigned */ 146 set_bit(phb->ioda.m64_bar_idx, &phb->ioda.m64_bar_alloc); 147 148 /* 149 * Strip off the segment used by the reserved PE, which is 150 * expected to be 0 or last one of PE capabicity. 151 */ 152 r = &phb->hose->mem_resources[1]; 153 if (phb->ioda.reserved_pe == 0) 154 r->start += phb->ioda.m64_segsize; 155 else if (phb->ioda.reserved_pe == (phb->ioda.total_pe - 1)) 156 r->end -= phb->ioda.m64_segsize; 157 else 158 pr_warn(" Cannot strip M64 segment for reserved PE#%d\n", 159 phb->ioda.reserved_pe); 160 161 return 0; 162 163 fail: 164 pr_warn(" Failure %lld %s M64 BAR#%d\n", 165 rc, desc, phb->ioda.m64_bar_idx); 166 opal_pci_phb_mmio_enable(phb->opal_id, 167 OPAL_M64_WINDOW_TYPE, 168 phb->ioda.m64_bar_idx, 169 OPAL_DISABLE_M64); 170 return -EIO; 171 } 172 173 static void pnv_ioda2_alloc_m64_pe(struct pnv_phb *phb) 174 { 175 resource_size_t sgsz = phb->ioda.m64_segsize; 176 struct pci_dev *pdev; 177 struct resource *r; 178 int base, step, i; 179 180 /* 181 * Root bus always has full M64 range and root port has 182 * M64 range used in reality. So we're checking root port 183 * instead of root bus. 184 */ 185 list_for_each_entry(pdev, &phb->hose->bus->devices, bus_list) { 186 for (i = PCI_BRIDGE_RESOURCES; 187 i <= PCI_BRIDGE_RESOURCE_END; i++) { 188 r = &pdev->resource[i]; 189 if (!r->parent || 190 !pnv_pci_is_mem_pref_64(r->flags)) 191 continue; 192 193 base = (r->start - phb->ioda.m64_base) / sgsz; 194 for (step = 0; step < resource_size(r) / sgsz; step++) 195 set_bit(base + step, phb->ioda.pe_alloc); 196 } 197 } 198 } 199 200 static int pnv_ioda2_pick_m64_pe(struct pnv_phb *phb, 201 struct pci_bus *bus, int all) 202 { 203 resource_size_t segsz = phb->ioda.m64_segsize; 204 struct pci_dev *pdev; 205 struct resource *r; 206 struct pnv_ioda_pe *master_pe, *pe; 207 unsigned long size, *pe_alloc; 208 bool found; 209 int start, i, j; 210 211 /* Root bus shouldn't use M64 */ 212 if (pci_is_root_bus(bus)) 213 return IODA_INVALID_PE; 214 215 /* We support only one M64 window on each bus */ 216 found = false; 217 pci_bus_for_each_resource(bus, r, i) { 218 if (r && r->parent && 219 pnv_pci_is_mem_pref_64(r->flags)) { 220 found = true; 221 break; 222 } 223 } 224 225 /* No M64 window found ? */ 226 if (!found) 227 return IODA_INVALID_PE; 228 229 /* Allocate bitmap */ 230 size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long)); 231 pe_alloc = kzalloc(size, GFP_KERNEL); 232 if (!pe_alloc) { 233 pr_warn("%s: Out of memory !\n", 234 __func__); 235 return IODA_INVALID_PE; 236 } 237 238 /* 239 * Figure out reserved PE numbers by the PE 240 * the its child PEs. 241 */ 242 start = (r->start - phb->ioda.m64_base) / segsz; 243 for (i = 0; i < resource_size(r) / segsz; i++) 244 set_bit(start + i, pe_alloc); 245 246 if (all) 247 goto done; 248 249 /* 250 * If the PE doesn't cover all subordinate buses, 251 * we need subtract from reserved PEs for children. 252 */ 253 list_for_each_entry(pdev, &bus->devices, bus_list) { 254 if (!pdev->subordinate) 255 continue; 256 257 pci_bus_for_each_resource(pdev->subordinate, r, i) { 258 if (!r || !r->parent || 259 !pnv_pci_is_mem_pref_64(r->flags)) 260 continue; 261 262 start = (r->start - phb->ioda.m64_base) / segsz; 263 for (j = 0; j < resource_size(r) / segsz ; j++) 264 clear_bit(start + j, pe_alloc); 265 } 266 } 267 268 /* 269 * the current bus might not own M64 window and that's all 270 * contributed by its child buses. For the case, we needn't 271 * pick M64 dependent PE#. 272 */ 273 if (bitmap_empty(pe_alloc, phb->ioda.total_pe)) { 274 kfree(pe_alloc); 275 return IODA_INVALID_PE; 276 } 277 278 /* 279 * Figure out the master PE and put all slave PEs to master 280 * PE's list to form compound PE. 281 */ 282 done: 283 master_pe = NULL; 284 i = -1; 285 while ((i = find_next_bit(pe_alloc, phb->ioda.total_pe, i + 1)) < 286 phb->ioda.total_pe) { 287 pe = &phb->ioda.pe_array[i]; 288 pe->phb = phb; 289 pe->pe_number = i; 290 291 if (!master_pe) { 292 pe->flags |= PNV_IODA_PE_MASTER; 293 INIT_LIST_HEAD(&pe->slaves); 294 master_pe = pe; 295 } else { 296 pe->flags |= PNV_IODA_PE_SLAVE; 297 pe->master = master_pe; 298 list_add_tail(&pe->list, &master_pe->slaves); 299 } 300 } 301 302 kfree(pe_alloc); 303 return master_pe->pe_number; 304 } 305 306 static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb) 307 { 308 struct pci_controller *hose = phb->hose; 309 struct device_node *dn = hose->dn; 310 struct resource *res; 311 const u32 *r; 312 u64 pci_addr; 313 314 if (!firmware_has_feature(FW_FEATURE_OPALv3)) { 315 pr_info(" Firmware too old to support M64 window\n"); 316 return; 317 } 318 319 r = of_get_property(dn, "ibm,opal-m64-window", NULL); 320 if (!r) { 321 pr_info(" No <ibm,opal-m64-window> on %s\n", 322 dn->full_name); 323 return; 324 } 325 326 /* FIXME: Support M64 for P7IOC */ 327 if (phb->type != PNV_PHB_IODA2) { 328 pr_info(" Not support M64 window\n"); 329 return; 330 } 331 332 res = &hose->mem_resources[1]; 333 res->start = of_translate_address(dn, r + 2); 334 res->end = res->start + of_read_number(r + 4, 2) - 1; 335 res->flags = (IORESOURCE_MEM | IORESOURCE_MEM_64 | IORESOURCE_PREFETCH); 336 pci_addr = of_read_number(r, 2); 337 hose->mem_offset[1] = res->start - pci_addr; 338 339 phb->ioda.m64_size = resource_size(res); 340 phb->ioda.m64_segsize = phb->ioda.m64_size / phb->ioda.total_pe; 341 phb->ioda.m64_base = pci_addr; 342 343 /* Use last M64 BAR to cover M64 window */ 344 phb->ioda.m64_bar_idx = 15; 345 phb->init_m64 = pnv_ioda2_init_m64; 346 phb->alloc_m64_pe = pnv_ioda2_alloc_m64_pe; 347 phb->pick_m64_pe = pnv_ioda2_pick_m64_pe; 348 } 349 350 static void pnv_ioda_freeze_pe(struct pnv_phb *phb, int pe_no) 351 { 352 struct pnv_ioda_pe *pe = &phb->ioda.pe_array[pe_no]; 353 struct pnv_ioda_pe *slave; 354 s64 rc; 355 356 /* Fetch master PE */ 357 if (pe->flags & PNV_IODA_PE_SLAVE) { 358 pe = pe->master; 359 WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER)); 360 pe_no = pe->pe_number; 361 } 362 363 /* Freeze master PE */ 364 rc = opal_pci_eeh_freeze_set(phb->opal_id, 365 pe_no, 366 OPAL_EEH_ACTION_SET_FREEZE_ALL); 367 if (rc != OPAL_SUCCESS) { 368 pr_warn("%s: Failure %lld freezing PHB#%x-PE#%x\n", 369 __func__, rc, phb->hose->global_number, pe_no); 370 return; 371 } 372 373 /* Freeze slave PEs */ 374 if (!(pe->flags & PNV_IODA_PE_MASTER)) 375 return; 376 377 list_for_each_entry(slave, &pe->slaves, list) { 378 rc = opal_pci_eeh_freeze_set(phb->opal_id, 379 slave->pe_number, 380 OPAL_EEH_ACTION_SET_FREEZE_ALL); 381 if (rc != OPAL_SUCCESS) 382 pr_warn("%s: Failure %lld freezing PHB#%x-PE#%x\n", 383 __func__, rc, phb->hose->global_number, 384 slave->pe_number); 385 } 386 } 387 388 int pnv_ioda_unfreeze_pe(struct pnv_phb *phb, int pe_no, int opt) 389 { 390 struct pnv_ioda_pe *pe, *slave; 391 s64 rc; 392 393 /* Find master PE */ 394 pe = &phb->ioda.pe_array[pe_no]; 395 if (pe->flags & PNV_IODA_PE_SLAVE) { 396 pe = pe->master; 397 WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER)); 398 pe_no = pe->pe_number; 399 } 400 401 /* Clear frozen state for master PE */ 402 rc = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no, opt); 403 if (rc != OPAL_SUCCESS) { 404 pr_warn("%s: Failure %lld clear %d on PHB#%x-PE#%x\n", 405 __func__, rc, opt, phb->hose->global_number, pe_no); 406 return -EIO; 407 } 408 409 if (!(pe->flags & PNV_IODA_PE_MASTER)) 410 return 0; 411 412 /* Clear frozen state for slave PEs */ 413 list_for_each_entry(slave, &pe->slaves, list) { 414 rc = opal_pci_eeh_freeze_clear(phb->opal_id, 415 slave->pe_number, 416 opt); 417 if (rc != OPAL_SUCCESS) { 418 pr_warn("%s: Failure %lld clear %d on PHB#%x-PE#%x\n", 419 __func__, rc, opt, phb->hose->global_number, 420 slave->pe_number); 421 return -EIO; 422 } 423 } 424 425 return 0; 426 } 427 428 static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no) 429 { 430 struct pnv_ioda_pe *slave, *pe; 431 u8 fstate, state; 432 __be16 pcierr; 433 s64 rc; 434 435 /* Sanity check on PE number */ 436 if (pe_no < 0 || pe_no >= phb->ioda.total_pe) 437 return OPAL_EEH_STOPPED_PERM_UNAVAIL; 438 439 /* 440 * Fetch the master PE and the PE instance might be 441 * not initialized yet. 442 */ 443 pe = &phb->ioda.pe_array[pe_no]; 444 if (pe->flags & PNV_IODA_PE_SLAVE) { 445 pe = pe->master; 446 WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER)); 447 pe_no = pe->pe_number; 448 } 449 450 /* Check the master PE */ 451 rc = opal_pci_eeh_freeze_status(phb->opal_id, pe_no, 452 &state, &pcierr, NULL); 453 if (rc != OPAL_SUCCESS) { 454 pr_warn("%s: Failure %lld getting " 455 "PHB#%x-PE#%x state\n", 456 __func__, rc, 457 phb->hose->global_number, pe_no); 458 return OPAL_EEH_STOPPED_TEMP_UNAVAIL; 459 } 460 461 /* Check the slave PE */ 462 if (!(pe->flags & PNV_IODA_PE_MASTER)) 463 return state; 464 465 list_for_each_entry(slave, &pe->slaves, list) { 466 rc = opal_pci_eeh_freeze_status(phb->opal_id, 467 slave->pe_number, 468 &fstate, 469 &pcierr, 470 NULL); 471 if (rc != OPAL_SUCCESS) { 472 pr_warn("%s: Failure %lld getting " 473 "PHB#%x-PE#%x state\n", 474 __func__, rc, 475 phb->hose->global_number, slave->pe_number); 476 return OPAL_EEH_STOPPED_TEMP_UNAVAIL; 477 } 478 479 /* 480 * Override the result based on the ascending 481 * priority. 482 */ 483 if (fstate > state) 484 state = fstate; 485 } 486 487 return state; 488 } 489 490 /* Currently those 2 are only used when MSIs are enabled, this will change 491 * but in the meantime, we need to protect them to avoid warnings 492 */ 493 #ifdef CONFIG_PCI_MSI 494 static struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev) 495 { 496 struct pci_controller *hose = pci_bus_to_host(dev->bus); 497 struct pnv_phb *phb = hose->private_data; 498 struct pci_dn *pdn = pci_get_pdn(dev); 499 500 if (!pdn) 501 return NULL; 502 if (pdn->pe_number == IODA_INVALID_PE) 503 return NULL; 504 return &phb->ioda.pe_array[pdn->pe_number]; 505 } 506 #endif /* CONFIG_PCI_MSI */ 507 508 static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) 509 { 510 struct pci_dev *parent; 511 uint8_t bcomp, dcomp, fcomp; 512 long rc, rid_end, rid; 513 514 /* Bus validation ? */ 515 if (pe->pbus) { 516 int count; 517 518 dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER; 519 fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER; 520 parent = pe->pbus->self; 521 if (pe->flags & PNV_IODA_PE_BUS_ALL) 522 count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1; 523 else 524 count = 1; 525 526 switch(count) { 527 case 1: bcomp = OpalPciBusAll; break; 528 case 2: bcomp = OpalPciBus7Bits; break; 529 case 4: bcomp = OpalPciBus6Bits; break; 530 case 8: bcomp = OpalPciBus5Bits; break; 531 case 16: bcomp = OpalPciBus4Bits; break; 532 case 32: bcomp = OpalPciBus3Bits; break; 533 default: 534 pr_err("%s: Number of subordinate busses %d" 535 " unsupported\n", 536 pci_name(pe->pbus->self), count); 537 /* Do an exact match only */ 538 bcomp = OpalPciBusAll; 539 } 540 rid_end = pe->rid + (count << 8); 541 } else { 542 parent = pe->pdev->bus->self; 543 bcomp = OpalPciBusAll; 544 dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER; 545 fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER; 546 rid_end = pe->rid + 1; 547 } 548 549 /* 550 * Associate PE in PELT. We need add the PE into the 551 * corresponding PELT-V as well. Otherwise, the error 552 * originated from the PE might contribute to other 553 * PEs. 554 */ 555 rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid, 556 bcomp, dcomp, fcomp, OPAL_MAP_PE); 557 if (rc) { 558 pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc); 559 return -ENXIO; 560 } 561 562 rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number, 563 pe->pe_number, OPAL_ADD_PE_TO_DOMAIN); 564 if (rc) 565 pe_warn(pe, "OPAL error %d adding self to PELTV\n", rc); 566 opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number, 567 OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); 568 569 /* Add to all parents PELT-V */ 570 while (parent) { 571 struct pci_dn *pdn = pci_get_pdn(parent); 572 if (pdn && pdn->pe_number != IODA_INVALID_PE) { 573 rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number, 574 pe->pe_number, OPAL_ADD_PE_TO_DOMAIN); 575 /* XXX What to do in case of error ? */ 576 } 577 parent = parent->bus->self; 578 } 579 /* Setup reverse map */ 580 for (rid = pe->rid; rid < rid_end; rid++) 581 phb->ioda.pe_rmap[rid] = pe->pe_number; 582 583 /* Setup one MVTs on IODA1 */ 584 if (phb->type == PNV_PHB_IODA1) { 585 pe->mve_number = pe->pe_number; 586 rc = opal_pci_set_mve(phb->opal_id, pe->mve_number, 587 pe->pe_number); 588 if (rc) { 589 pe_err(pe, "OPAL error %ld setting up MVE %d\n", 590 rc, pe->mve_number); 591 pe->mve_number = -1; 592 } else { 593 rc = opal_pci_set_mve_enable(phb->opal_id, 594 pe->mve_number, OPAL_ENABLE_MVE); 595 if (rc) { 596 pe_err(pe, "OPAL error %ld enabling MVE %d\n", 597 rc, pe->mve_number); 598 pe->mve_number = -1; 599 } 600 } 601 } else if (phb->type == PNV_PHB_IODA2) 602 pe->mve_number = 0; 603 604 return 0; 605 } 606 607 static void pnv_ioda_link_pe_by_weight(struct pnv_phb *phb, 608 struct pnv_ioda_pe *pe) 609 { 610 struct pnv_ioda_pe *lpe; 611 612 list_for_each_entry(lpe, &phb->ioda.pe_dma_list, dma_link) { 613 if (lpe->dma_weight < pe->dma_weight) { 614 list_add_tail(&pe->dma_link, &lpe->dma_link); 615 return; 616 } 617 } 618 list_add_tail(&pe->dma_link, &phb->ioda.pe_dma_list); 619 } 620 621 static unsigned int pnv_ioda_dma_weight(struct pci_dev *dev) 622 { 623 /* This is quite simplistic. The "base" weight of a device 624 * is 10. 0 means no DMA is to be accounted for it. 625 */ 626 627 /* If it's a bridge, no DMA */ 628 if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) 629 return 0; 630 631 /* Reduce the weight of slow USB controllers */ 632 if (dev->class == PCI_CLASS_SERIAL_USB_UHCI || 633 dev->class == PCI_CLASS_SERIAL_USB_OHCI || 634 dev->class == PCI_CLASS_SERIAL_USB_EHCI) 635 return 3; 636 637 /* Increase the weight of RAID (includes Obsidian) */ 638 if ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID) 639 return 15; 640 641 /* Default */ 642 return 10; 643 } 644 645 #if 0 646 static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) 647 { 648 struct pci_controller *hose = pci_bus_to_host(dev->bus); 649 struct pnv_phb *phb = hose->private_data; 650 struct pci_dn *pdn = pci_get_pdn(dev); 651 struct pnv_ioda_pe *pe; 652 int pe_num; 653 654 if (!pdn) { 655 pr_err("%s: Device tree node not associated properly\n", 656 pci_name(dev)); 657 return NULL; 658 } 659 if (pdn->pe_number != IODA_INVALID_PE) 660 return NULL; 661 662 /* PE#0 has been pre-set */ 663 if (dev->bus->number == 0) 664 pe_num = 0; 665 else 666 pe_num = pnv_ioda_alloc_pe(phb); 667 if (pe_num == IODA_INVALID_PE) { 668 pr_warning("%s: Not enough PE# available, disabling device\n", 669 pci_name(dev)); 670 return NULL; 671 } 672 673 /* NOTE: We get only one ref to the pci_dev for the pdn, not for the 674 * pointer in the PE data structure, both should be destroyed at the 675 * same time. However, this needs to be looked at more closely again 676 * once we actually start removing things (Hotplug, SR-IOV, ...) 677 * 678 * At some point we want to remove the PDN completely anyways 679 */ 680 pe = &phb->ioda.pe_array[pe_num]; 681 pci_dev_get(dev); 682 pdn->pcidev = dev; 683 pdn->pe_number = pe_num; 684 pe->pdev = dev; 685 pe->pbus = NULL; 686 pe->tce32_seg = -1; 687 pe->mve_number = -1; 688 pe->rid = dev->bus->number << 8 | pdn->devfn; 689 690 pe_info(pe, "Associated device to PE\n"); 691 692 if (pnv_ioda_configure_pe(phb, pe)) { 693 /* XXX What do we do here ? */ 694 if (pe_num) 695 pnv_ioda_free_pe(phb, pe_num); 696 pdn->pe_number = IODA_INVALID_PE; 697 pe->pdev = NULL; 698 pci_dev_put(dev); 699 return NULL; 700 } 701 702 /* Assign a DMA weight to the device */ 703 pe->dma_weight = pnv_ioda_dma_weight(dev); 704 if (pe->dma_weight != 0) { 705 phb->ioda.dma_weight += pe->dma_weight; 706 phb->ioda.dma_pe_count++; 707 } 708 709 /* Link the PE */ 710 pnv_ioda_link_pe_by_weight(phb, pe); 711 712 return pe; 713 } 714 #endif /* Useful for SRIOV case */ 715 716 static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe) 717 { 718 struct pci_dev *dev; 719 720 list_for_each_entry(dev, &bus->devices, bus_list) { 721 struct pci_dn *pdn = pci_get_pdn(dev); 722 723 if (pdn == NULL) { 724 pr_warn("%s: No device node associated with device !\n", 725 pci_name(dev)); 726 continue; 727 } 728 pdn->pcidev = dev; 729 pdn->pe_number = pe->pe_number; 730 pe->dma_weight += pnv_ioda_dma_weight(dev); 731 if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate) 732 pnv_ioda_setup_same_PE(dev->subordinate, pe); 733 } 734 } 735 736 /* 737 * There're 2 types of PCI bus sensitive PEs: One that is compromised of 738 * single PCI bus. Another one that contains the primary PCI bus and its 739 * subordinate PCI devices and buses. The second type of PE is normally 740 * orgiriated by PCIe-to-PCI bridge or PLX switch downstream ports. 741 */ 742 static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all) 743 { 744 struct pci_controller *hose = pci_bus_to_host(bus); 745 struct pnv_phb *phb = hose->private_data; 746 struct pnv_ioda_pe *pe; 747 int pe_num = IODA_INVALID_PE; 748 749 /* Check if PE is determined by M64 */ 750 if (phb->pick_m64_pe) 751 pe_num = phb->pick_m64_pe(phb, bus, all); 752 753 /* The PE number isn't pinned by M64 */ 754 if (pe_num == IODA_INVALID_PE) 755 pe_num = pnv_ioda_alloc_pe(phb); 756 757 if (pe_num == IODA_INVALID_PE) { 758 pr_warning("%s: Not enough PE# available for PCI bus %04x:%02x\n", 759 __func__, pci_domain_nr(bus), bus->number); 760 return; 761 } 762 763 pe = &phb->ioda.pe_array[pe_num]; 764 pe->flags |= (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS); 765 pe->pbus = bus; 766 pe->pdev = NULL; 767 pe->tce32_seg = -1; 768 pe->mve_number = -1; 769 pe->rid = bus->busn_res.start << 8; 770 pe->dma_weight = 0; 771 772 if (all) 773 pe_info(pe, "Secondary bus %d..%d associated with PE#%d\n", 774 bus->busn_res.start, bus->busn_res.end, pe_num); 775 else 776 pe_info(pe, "Secondary bus %d associated with PE#%d\n", 777 bus->busn_res.start, pe_num); 778 779 if (pnv_ioda_configure_pe(phb, pe)) { 780 /* XXX What do we do here ? */ 781 if (pe_num) 782 pnv_ioda_free_pe(phb, pe_num); 783 pe->pbus = NULL; 784 return; 785 } 786 787 /* Associate it with all child devices */ 788 pnv_ioda_setup_same_PE(bus, pe); 789 790 /* Put PE to the list */ 791 list_add_tail(&pe->list, &phb->ioda.pe_list); 792 793 /* Account for one DMA PE if at least one DMA capable device exist 794 * below the bridge 795 */ 796 if (pe->dma_weight != 0) { 797 phb->ioda.dma_weight += pe->dma_weight; 798 phb->ioda.dma_pe_count++; 799 } 800 801 /* Link the PE */ 802 pnv_ioda_link_pe_by_weight(phb, pe); 803 } 804 805 static void pnv_ioda_setup_PEs(struct pci_bus *bus) 806 { 807 struct pci_dev *dev; 808 809 pnv_ioda_setup_bus_PE(bus, 0); 810 811 list_for_each_entry(dev, &bus->devices, bus_list) { 812 if (dev->subordinate) { 813 if (pci_pcie_type(dev) == PCI_EXP_TYPE_PCI_BRIDGE) 814 pnv_ioda_setup_bus_PE(dev->subordinate, 1); 815 else 816 pnv_ioda_setup_PEs(dev->subordinate); 817 } 818 } 819 } 820 821 /* 822 * Configure PEs so that the downstream PCI buses and devices 823 * could have their associated PE#. Unfortunately, we didn't 824 * figure out the way to identify the PLX bridge yet. So we 825 * simply put the PCI bus and the subordinate behind the root 826 * port to PE# here. The game rule here is expected to be changed 827 * as soon as we can detected PLX bridge correctly. 828 */ 829 static void pnv_pci_ioda_setup_PEs(void) 830 { 831 struct pci_controller *hose, *tmp; 832 struct pnv_phb *phb; 833 834 list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { 835 phb = hose->private_data; 836 837 /* M64 layout might affect PE allocation */ 838 if (phb->alloc_m64_pe) 839 phb->alloc_m64_pe(phb); 840 841 pnv_ioda_setup_PEs(hose->bus); 842 } 843 } 844 845 static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev) 846 { 847 struct pci_dn *pdn = pci_get_pdn(pdev); 848 struct pnv_ioda_pe *pe; 849 850 /* 851 * The function can be called while the PE# 852 * hasn't been assigned. Do nothing for the 853 * case. 854 */ 855 if (!pdn || pdn->pe_number == IODA_INVALID_PE) 856 return; 857 858 pe = &phb->ioda.pe_array[pdn->pe_number]; 859 WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops); 860 set_iommu_table_base_and_group(&pdev->dev, &pe->tce32_table); 861 } 862 863 static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb, 864 struct pci_dev *pdev, u64 dma_mask) 865 { 866 struct pci_dn *pdn = pci_get_pdn(pdev); 867 struct pnv_ioda_pe *pe; 868 uint64_t top; 869 bool bypass = false; 870 871 if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) 872 return -ENODEV;; 873 874 pe = &phb->ioda.pe_array[pdn->pe_number]; 875 if (pe->tce_bypass_enabled) { 876 top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1; 877 bypass = (dma_mask >= top); 878 } 879 880 if (bypass) { 881 dev_info(&pdev->dev, "Using 64-bit DMA iommu bypass\n"); 882 set_dma_ops(&pdev->dev, &dma_direct_ops); 883 set_dma_offset(&pdev->dev, pe->tce_bypass_base); 884 } else { 885 dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n"); 886 set_dma_ops(&pdev->dev, &dma_iommu_ops); 887 set_iommu_table_base(&pdev->dev, &pe->tce32_table); 888 } 889 *pdev->dev.dma_mask = dma_mask; 890 return 0; 891 } 892 893 static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, 894 struct pci_bus *bus, 895 bool add_to_iommu_group) 896 { 897 struct pci_dev *dev; 898 899 list_for_each_entry(dev, &bus->devices, bus_list) { 900 if (add_to_iommu_group) 901 set_iommu_table_base_and_group(&dev->dev, 902 &pe->tce32_table); 903 else 904 set_iommu_table_base(&dev->dev, &pe->tce32_table); 905 906 if (dev->subordinate) 907 pnv_ioda_setup_bus_dma(pe, dev->subordinate, 908 add_to_iommu_group); 909 } 910 } 911 912 static void pnv_pci_ioda1_tce_invalidate(struct pnv_ioda_pe *pe, 913 struct iommu_table *tbl, 914 __be64 *startp, __be64 *endp, bool rm) 915 { 916 __be64 __iomem *invalidate = rm ? 917 (__be64 __iomem *)pe->tce_inval_reg_phys : 918 (__be64 __iomem *)tbl->it_index; 919 unsigned long start, end, inc; 920 const unsigned shift = tbl->it_page_shift; 921 922 start = __pa(startp); 923 end = __pa(endp); 924 925 /* BML uses this case for p6/p7/galaxy2: Shift addr and put in node */ 926 if (tbl->it_busno) { 927 start <<= shift; 928 end <<= shift; 929 inc = 128ull << shift; 930 start |= tbl->it_busno; 931 end |= tbl->it_busno; 932 } else if (tbl->it_type & TCE_PCI_SWINV_PAIR) { 933 /* p7ioc-style invalidation, 2 TCEs per write */ 934 start |= (1ull << 63); 935 end |= (1ull << 63); 936 inc = 16; 937 } else { 938 /* Default (older HW) */ 939 inc = 128; 940 } 941 942 end |= inc - 1; /* round up end to be different than start */ 943 944 mb(); /* Ensure above stores are visible */ 945 while (start <= end) { 946 if (rm) 947 __raw_rm_writeq(cpu_to_be64(start), invalidate); 948 else 949 __raw_writeq(cpu_to_be64(start), invalidate); 950 start += inc; 951 } 952 953 /* 954 * The iommu layer will do another mb() for us on build() 955 * and we don't care on free() 956 */ 957 } 958 959 static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe, 960 struct iommu_table *tbl, 961 __be64 *startp, __be64 *endp, bool rm) 962 { 963 unsigned long start, end, inc; 964 __be64 __iomem *invalidate = rm ? 965 (__be64 __iomem *)pe->tce_inval_reg_phys : 966 (__be64 __iomem *)tbl->it_index; 967 const unsigned shift = tbl->it_page_shift; 968 969 /* We'll invalidate DMA address in PE scope */ 970 start = 0x2ull << 60; 971 start |= (pe->pe_number & 0xFF); 972 end = start; 973 974 /* Figure out the start, end and step */ 975 inc = tbl->it_offset + (((u64)startp - tbl->it_base) / sizeof(u64)); 976 start |= (inc << shift); 977 inc = tbl->it_offset + (((u64)endp - tbl->it_base) / sizeof(u64)); 978 end |= (inc << shift); 979 inc = (0x1ull << shift); 980 mb(); 981 982 while (start <= end) { 983 if (rm) 984 __raw_rm_writeq(cpu_to_be64(start), invalidate); 985 else 986 __raw_writeq(cpu_to_be64(start), invalidate); 987 start += inc; 988 } 989 } 990 991 void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl, 992 __be64 *startp, __be64 *endp, bool rm) 993 { 994 struct pnv_ioda_pe *pe = container_of(tbl, struct pnv_ioda_pe, 995 tce32_table); 996 struct pnv_phb *phb = pe->phb; 997 998 if (phb->type == PNV_PHB_IODA1) 999 pnv_pci_ioda1_tce_invalidate(pe, tbl, startp, endp, rm); 1000 else 1001 pnv_pci_ioda2_tce_invalidate(pe, tbl, startp, endp, rm); 1002 } 1003 1004 static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, 1005 struct pnv_ioda_pe *pe, unsigned int base, 1006 unsigned int segs) 1007 { 1008 1009 struct page *tce_mem = NULL; 1010 const __be64 *swinvp; 1011 struct iommu_table *tbl; 1012 unsigned int i; 1013 int64_t rc; 1014 void *addr; 1015 1016 /* 256M DMA window, 4K TCE pages, 8 bytes TCE */ 1017 #define TCE32_TABLE_SIZE ((0x10000000 / 0x1000) * 8) 1018 1019 /* XXX FIXME: Handle 64-bit only DMA devices */ 1020 /* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. */ 1021 /* XXX FIXME: Allocate multi-level tables on PHB3 */ 1022 1023 /* We shouldn't already have a 32-bit DMA associated */ 1024 if (WARN_ON(pe->tce32_seg >= 0)) 1025 return; 1026 1027 /* Grab a 32-bit TCE table */ 1028 pe->tce32_seg = base; 1029 pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n", 1030 (base << 28), ((base + segs) << 28) - 1); 1031 1032 /* XXX Currently, we allocate one big contiguous table for the 1033 * TCEs. We only really need one chunk per 256M of TCE space 1034 * (ie per segment) but that's an optimization for later, it 1035 * requires some added smarts with our get/put_tce implementation 1036 */ 1037 tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL, 1038 get_order(TCE32_TABLE_SIZE * segs)); 1039 if (!tce_mem) { 1040 pe_err(pe, " Failed to allocate a 32-bit TCE memory\n"); 1041 goto fail; 1042 } 1043 addr = page_address(tce_mem); 1044 memset(addr, 0, TCE32_TABLE_SIZE * segs); 1045 1046 /* Configure HW */ 1047 for (i = 0; i < segs; i++) { 1048 rc = opal_pci_map_pe_dma_window(phb->opal_id, 1049 pe->pe_number, 1050 base + i, 1, 1051 __pa(addr) + TCE32_TABLE_SIZE * i, 1052 TCE32_TABLE_SIZE, 0x1000); 1053 if (rc) { 1054 pe_err(pe, " Failed to configure 32-bit TCE table," 1055 " err %ld\n", rc); 1056 goto fail; 1057 } 1058 } 1059 1060 /* Setup linux iommu table */ 1061 tbl = &pe->tce32_table; 1062 pnv_pci_setup_iommu_table(tbl, addr, TCE32_TABLE_SIZE * segs, 1063 base << 28, IOMMU_PAGE_SHIFT_4K); 1064 1065 /* OPAL variant of P7IOC SW invalidated TCEs */ 1066 swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL); 1067 if (swinvp) { 1068 /* We need a couple more fields -- an address and a data 1069 * to or. Since the bus is only printed out on table free 1070 * errors, and on the first pass the data will be a relative 1071 * bus number, print that out instead. 1072 */ 1073 pe->tce_inval_reg_phys = be64_to_cpup(swinvp); 1074 tbl->it_index = (unsigned long)ioremap(pe->tce_inval_reg_phys, 1075 8); 1076 tbl->it_type |= (TCE_PCI_SWINV_CREATE | 1077 TCE_PCI_SWINV_FREE | 1078 TCE_PCI_SWINV_PAIR); 1079 } 1080 iommu_init_table(tbl, phb->hose->node); 1081 iommu_register_group(tbl, phb->hose->global_number, pe->pe_number); 1082 1083 if (pe->pdev) 1084 set_iommu_table_base_and_group(&pe->pdev->dev, tbl); 1085 else 1086 pnv_ioda_setup_bus_dma(pe, pe->pbus, true); 1087 1088 return; 1089 fail: 1090 /* XXX Failure: Try to fallback to 64-bit only ? */ 1091 if (pe->tce32_seg >= 0) 1092 pe->tce32_seg = -1; 1093 if (tce_mem) 1094 __free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs)); 1095 } 1096 1097 static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable) 1098 { 1099 struct pnv_ioda_pe *pe = container_of(tbl, struct pnv_ioda_pe, 1100 tce32_table); 1101 uint16_t window_id = (pe->pe_number << 1 ) + 1; 1102 int64_t rc; 1103 1104 pe_info(pe, "%sabling 64-bit DMA bypass\n", enable ? "En" : "Dis"); 1105 if (enable) { 1106 phys_addr_t top = memblock_end_of_DRAM(); 1107 1108 top = roundup_pow_of_two(top); 1109 rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id, 1110 pe->pe_number, 1111 window_id, 1112 pe->tce_bypass_base, 1113 top); 1114 } else { 1115 rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id, 1116 pe->pe_number, 1117 window_id, 1118 pe->tce_bypass_base, 1119 0); 1120 1121 /* 1122 * EEH needs the mapping between IOMMU table and group 1123 * of those VFIO/KVM pass-through devices. We can postpone 1124 * resetting DMA ops until the DMA mask is configured in 1125 * host side. 1126 */ 1127 if (pe->pdev) 1128 set_iommu_table_base(&pe->pdev->dev, tbl); 1129 else 1130 pnv_ioda_setup_bus_dma(pe, pe->pbus, false); 1131 } 1132 if (rc) 1133 pe_err(pe, "OPAL error %lld configuring bypass window\n", rc); 1134 else 1135 pe->tce_bypass_enabled = enable; 1136 } 1137 1138 static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb, 1139 struct pnv_ioda_pe *pe) 1140 { 1141 /* TVE #1 is selected by PCI address bit 59 */ 1142 pe->tce_bypass_base = 1ull << 59; 1143 1144 /* Install set_bypass callback for VFIO */ 1145 pe->tce32_table.set_bypass = pnv_pci_ioda2_set_bypass; 1146 1147 /* Enable bypass by default */ 1148 pnv_pci_ioda2_set_bypass(&pe->tce32_table, true); 1149 } 1150 1151 static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, 1152 struct pnv_ioda_pe *pe) 1153 { 1154 struct page *tce_mem = NULL; 1155 void *addr; 1156 const __be64 *swinvp; 1157 struct iommu_table *tbl; 1158 unsigned int tce_table_size, end; 1159 int64_t rc; 1160 1161 /* We shouldn't already have a 32-bit DMA associated */ 1162 if (WARN_ON(pe->tce32_seg >= 0)) 1163 return; 1164 1165 /* The PE will reserve all possible 32-bits space */ 1166 pe->tce32_seg = 0; 1167 end = (1 << ilog2(phb->ioda.m32_pci_base)); 1168 tce_table_size = (end / 0x1000) * 8; 1169 pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n", 1170 end); 1171 1172 /* Allocate TCE table */ 1173 tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL, 1174 get_order(tce_table_size)); 1175 if (!tce_mem) { 1176 pe_err(pe, "Failed to allocate a 32-bit TCE memory\n"); 1177 goto fail; 1178 } 1179 addr = page_address(tce_mem); 1180 memset(addr, 0, tce_table_size); 1181 1182 /* 1183 * Map TCE table through TVT. The TVE index is the PE number 1184 * shifted by 1 bit for 32-bits DMA space. 1185 */ 1186 rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number, 1187 pe->pe_number << 1, 1, __pa(addr), 1188 tce_table_size, 0x1000); 1189 if (rc) { 1190 pe_err(pe, "Failed to configure 32-bit TCE table," 1191 " err %ld\n", rc); 1192 goto fail; 1193 } 1194 1195 /* Setup linux iommu table */ 1196 tbl = &pe->tce32_table; 1197 pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0, 1198 IOMMU_PAGE_SHIFT_4K); 1199 1200 /* OPAL variant of PHB3 invalidated TCEs */ 1201 swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL); 1202 if (swinvp) { 1203 /* We need a couple more fields -- an address and a data 1204 * to or. Since the bus is only printed out on table free 1205 * errors, and on the first pass the data will be a relative 1206 * bus number, print that out instead. 1207 */ 1208 pe->tce_inval_reg_phys = be64_to_cpup(swinvp); 1209 tbl->it_index = (unsigned long)ioremap(pe->tce_inval_reg_phys, 1210 8); 1211 tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE); 1212 } 1213 iommu_init_table(tbl, phb->hose->node); 1214 iommu_register_group(tbl, phb->hose->global_number, pe->pe_number); 1215 1216 if (pe->pdev) 1217 set_iommu_table_base_and_group(&pe->pdev->dev, tbl); 1218 else 1219 pnv_ioda_setup_bus_dma(pe, pe->pbus, true); 1220 1221 /* Also create a bypass window */ 1222 pnv_pci_ioda2_setup_bypass_pe(phb, pe); 1223 return; 1224 fail: 1225 if (pe->tce32_seg >= 0) 1226 pe->tce32_seg = -1; 1227 if (tce_mem) 1228 __free_pages(tce_mem, get_order(tce_table_size)); 1229 } 1230 1231 static void pnv_ioda_setup_dma(struct pnv_phb *phb) 1232 { 1233 struct pci_controller *hose = phb->hose; 1234 unsigned int residual, remaining, segs, tw, base; 1235 struct pnv_ioda_pe *pe; 1236 1237 /* If we have more PE# than segments available, hand out one 1238 * per PE until we run out and let the rest fail. If not, 1239 * then we assign at least one segment per PE, plus more based 1240 * on the amount of devices under that PE 1241 */ 1242 if (phb->ioda.dma_pe_count > phb->ioda.tce32_count) 1243 residual = 0; 1244 else 1245 residual = phb->ioda.tce32_count - 1246 phb->ioda.dma_pe_count; 1247 1248 pr_info("PCI: Domain %04x has %ld available 32-bit DMA segments\n", 1249 hose->global_number, phb->ioda.tce32_count); 1250 pr_info("PCI: %d PE# for a total weight of %d\n", 1251 phb->ioda.dma_pe_count, phb->ioda.dma_weight); 1252 1253 /* Walk our PE list and configure their DMA segments, hand them 1254 * out one base segment plus any residual segments based on 1255 * weight 1256 */ 1257 remaining = phb->ioda.tce32_count; 1258 tw = phb->ioda.dma_weight; 1259 base = 0; 1260 list_for_each_entry(pe, &phb->ioda.pe_dma_list, dma_link) { 1261 if (!pe->dma_weight) 1262 continue; 1263 if (!remaining) { 1264 pe_warn(pe, "No DMA32 resources available\n"); 1265 continue; 1266 } 1267 segs = 1; 1268 if (residual) { 1269 segs += ((pe->dma_weight * residual) + (tw / 2)) / tw; 1270 if (segs > remaining) 1271 segs = remaining; 1272 } 1273 1274 /* 1275 * For IODA2 compliant PHB3, we needn't care about the weight. 1276 * The all available 32-bits DMA space will be assigned to 1277 * the specific PE. 1278 */ 1279 if (phb->type == PNV_PHB_IODA1) { 1280 pe_info(pe, "DMA weight %d, assigned %d DMA32 segments\n", 1281 pe->dma_weight, segs); 1282 pnv_pci_ioda_setup_dma_pe(phb, pe, base, segs); 1283 } else { 1284 pe_info(pe, "Assign DMA32 space\n"); 1285 segs = 0; 1286 pnv_pci_ioda2_setup_dma_pe(phb, pe); 1287 } 1288 1289 remaining -= segs; 1290 base += segs; 1291 } 1292 } 1293 1294 #ifdef CONFIG_PCI_MSI 1295 static void pnv_ioda2_msi_eoi(struct irq_data *d) 1296 { 1297 unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); 1298 struct irq_chip *chip = irq_data_get_irq_chip(d); 1299 struct pnv_phb *phb = container_of(chip, struct pnv_phb, 1300 ioda.irq_chip); 1301 int64_t rc; 1302 1303 rc = opal_pci_msi_eoi(phb->opal_id, hw_irq); 1304 WARN_ON_ONCE(rc); 1305 1306 icp_native_eoi(d); 1307 } 1308 1309 static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, 1310 unsigned int hwirq, unsigned int virq, 1311 unsigned int is_64, struct msi_msg *msg) 1312 { 1313 struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev); 1314 struct pci_dn *pdn = pci_get_pdn(dev); 1315 struct irq_data *idata; 1316 struct irq_chip *ichip; 1317 unsigned int xive_num = hwirq - phb->msi_base; 1318 __be32 data; 1319 int rc; 1320 1321 /* No PE assigned ? bail out ... no MSI for you ! */ 1322 if (pe == NULL) 1323 return -ENXIO; 1324 1325 /* Check if we have an MVE */ 1326 if (pe->mve_number < 0) 1327 return -ENXIO; 1328 1329 /* Force 32-bit MSI on some broken devices */ 1330 if (pdn && pdn->force_32bit_msi) 1331 is_64 = 0; 1332 1333 /* Assign XIVE to PE */ 1334 rc = opal_pci_set_xive_pe(phb->opal_id, pe->pe_number, xive_num); 1335 if (rc) { 1336 pr_warn("%s: OPAL error %d setting XIVE %d PE\n", 1337 pci_name(dev), rc, xive_num); 1338 return -EIO; 1339 } 1340 1341 if (is_64) { 1342 __be64 addr64; 1343 1344 rc = opal_get_msi_64(phb->opal_id, pe->mve_number, xive_num, 1, 1345 &addr64, &data); 1346 if (rc) { 1347 pr_warn("%s: OPAL error %d getting 64-bit MSI data\n", 1348 pci_name(dev), rc); 1349 return -EIO; 1350 } 1351 msg->address_hi = be64_to_cpu(addr64) >> 32; 1352 msg->address_lo = be64_to_cpu(addr64) & 0xfffffffful; 1353 } else { 1354 __be32 addr32; 1355 1356 rc = opal_get_msi_32(phb->opal_id, pe->mve_number, xive_num, 1, 1357 &addr32, &data); 1358 if (rc) { 1359 pr_warn("%s: OPAL error %d getting 32-bit MSI data\n", 1360 pci_name(dev), rc); 1361 return -EIO; 1362 } 1363 msg->address_hi = 0; 1364 msg->address_lo = be32_to_cpu(addr32); 1365 } 1366 msg->data = be32_to_cpu(data); 1367 1368 /* 1369 * Change the IRQ chip for the MSI interrupts on PHB3. 1370 * The corresponding IRQ chip should be populated for 1371 * the first time. 1372 */ 1373 if (phb->type == PNV_PHB_IODA2) { 1374 if (!phb->ioda.irq_chip_init) { 1375 idata = irq_get_irq_data(virq); 1376 ichip = irq_data_get_irq_chip(idata); 1377 phb->ioda.irq_chip_init = 1; 1378 phb->ioda.irq_chip = *ichip; 1379 phb->ioda.irq_chip.irq_eoi = pnv_ioda2_msi_eoi; 1380 } 1381 1382 irq_set_chip(virq, &phb->ioda.irq_chip); 1383 } 1384 1385 pr_devel("%s: %s-bit MSI on hwirq %x (xive #%d)," 1386 " address=%x_%08x data=%x PE# %d\n", 1387 pci_name(dev), is_64 ? "64" : "32", hwirq, xive_num, 1388 msg->address_hi, msg->address_lo, data, pe->pe_number); 1389 1390 return 0; 1391 } 1392 1393 static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) 1394 { 1395 unsigned int count; 1396 const __be32 *prop = of_get_property(phb->hose->dn, 1397 "ibm,opal-msi-ranges", NULL); 1398 if (!prop) { 1399 /* BML Fallback */ 1400 prop = of_get_property(phb->hose->dn, "msi-ranges", NULL); 1401 } 1402 if (!prop) 1403 return; 1404 1405 phb->msi_base = be32_to_cpup(prop); 1406 count = be32_to_cpup(prop + 1); 1407 if (msi_bitmap_alloc(&phb->msi_bmp, count, phb->hose->dn)) { 1408 pr_err("PCI %d: Failed to allocate MSI bitmap !\n", 1409 phb->hose->global_number); 1410 return; 1411 } 1412 1413 phb->msi_setup = pnv_pci_ioda_msi_setup; 1414 phb->msi32_support = 1; 1415 pr_info(" Allocated bitmap for %d MSIs (base IRQ 0x%x)\n", 1416 count, phb->msi_base); 1417 } 1418 #else 1419 static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) { } 1420 #endif /* CONFIG_PCI_MSI */ 1421 1422 /* 1423 * This function is supposed to be called on basis of PE from top 1424 * to bottom style. So the the I/O or MMIO segment assigned to 1425 * parent PE could be overrided by its child PEs if necessary. 1426 */ 1427 static void pnv_ioda_setup_pe_seg(struct pci_controller *hose, 1428 struct pnv_ioda_pe *pe) 1429 { 1430 struct pnv_phb *phb = hose->private_data; 1431 struct pci_bus_region region; 1432 struct resource *res; 1433 int i, index; 1434 int rc; 1435 1436 /* 1437 * NOTE: We only care PCI bus based PE for now. For PCI 1438 * device based PE, for example SRIOV sensitive VF should 1439 * be figured out later. 1440 */ 1441 BUG_ON(!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))); 1442 1443 pci_bus_for_each_resource(pe->pbus, res, i) { 1444 if (!res || !res->flags || 1445 res->start > res->end) 1446 continue; 1447 1448 if (res->flags & IORESOURCE_IO) { 1449 region.start = res->start - phb->ioda.io_pci_base; 1450 region.end = res->end - phb->ioda.io_pci_base; 1451 index = region.start / phb->ioda.io_segsize; 1452 1453 while (index < phb->ioda.total_pe && 1454 region.start <= region.end) { 1455 phb->ioda.io_segmap[index] = pe->pe_number; 1456 rc = opal_pci_map_pe_mmio_window(phb->opal_id, 1457 pe->pe_number, OPAL_IO_WINDOW_TYPE, 0, index); 1458 if (rc != OPAL_SUCCESS) { 1459 pr_err("%s: OPAL error %d when mapping IO " 1460 "segment #%d to PE#%d\n", 1461 __func__, rc, index, pe->pe_number); 1462 break; 1463 } 1464 1465 region.start += phb->ioda.io_segsize; 1466 index++; 1467 } 1468 } else if (res->flags & IORESOURCE_MEM) { 1469 region.start = res->start - 1470 hose->mem_offset[0] - 1471 phb->ioda.m32_pci_base; 1472 region.end = res->end - 1473 hose->mem_offset[0] - 1474 phb->ioda.m32_pci_base; 1475 index = region.start / phb->ioda.m32_segsize; 1476 1477 while (index < phb->ioda.total_pe && 1478 region.start <= region.end) { 1479 phb->ioda.m32_segmap[index] = pe->pe_number; 1480 rc = opal_pci_map_pe_mmio_window(phb->opal_id, 1481 pe->pe_number, OPAL_M32_WINDOW_TYPE, 0, index); 1482 if (rc != OPAL_SUCCESS) { 1483 pr_err("%s: OPAL error %d when mapping M32 " 1484 "segment#%d to PE#%d", 1485 __func__, rc, index, pe->pe_number); 1486 break; 1487 } 1488 1489 region.start += phb->ioda.m32_segsize; 1490 index++; 1491 } 1492 } 1493 } 1494 } 1495 1496 static void pnv_pci_ioda_setup_seg(void) 1497 { 1498 struct pci_controller *tmp, *hose; 1499 struct pnv_phb *phb; 1500 struct pnv_ioda_pe *pe; 1501 1502 list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { 1503 phb = hose->private_data; 1504 list_for_each_entry(pe, &phb->ioda.pe_list, list) { 1505 pnv_ioda_setup_pe_seg(hose, pe); 1506 } 1507 } 1508 } 1509 1510 static void pnv_pci_ioda_setup_DMA(void) 1511 { 1512 struct pci_controller *hose, *tmp; 1513 struct pnv_phb *phb; 1514 1515 list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { 1516 pnv_ioda_setup_dma(hose->private_data); 1517 1518 /* Mark the PHB initialization done */ 1519 phb = hose->private_data; 1520 phb->initialized = 1; 1521 } 1522 } 1523 1524 static void pnv_pci_ioda_create_dbgfs(void) 1525 { 1526 #ifdef CONFIG_DEBUG_FS 1527 struct pci_controller *hose, *tmp; 1528 struct pnv_phb *phb; 1529 char name[16]; 1530 1531 list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { 1532 phb = hose->private_data; 1533 1534 sprintf(name, "PCI%04x", hose->global_number); 1535 phb->dbgfs = debugfs_create_dir(name, powerpc_debugfs_root); 1536 if (!phb->dbgfs) 1537 pr_warning("%s: Error on creating debugfs on PHB#%x\n", 1538 __func__, hose->global_number); 1539 } 1540 #endif /* CONFIG_DEBUG_FS */ 1541 } 1542 1543 static void pnv_pci_ioda_fixup(void) 1544 { 1545 pnv_pci_ioda_setup_PEs(); 1546 pnv_pci_ioda_setup_seg(); 1547 pnv_pci_ioda_setup_DMA(); 1548 1549 pnv_pci_ioda_create_dbgfs(); 1550 1551 #ifdef CONFIG_EEH 1552 eeh_init(); 1553 eeh_addr_cache_build(); 1554 #endif 1555 } 1556 1557 /* 1558 * Returns the alignment for I/O or memory windows for P2P 1559 * bridges. That actually depends on how PEs are segmented. 1560 * For now, we return I/O or M32 segment size for PE sensitive 1561 * P2P bridges. Otherwise, the default values (4KiB for I/O, 1562 * 1MiB for memory) will be returned. 1563 * 1564 * The current PCI bus might be put into one PE, which was 1565 * create against the parent PCI bridge. For that case, we 1566 * needn't enlarge the alignment so that we can save some 1567 * resources. 1568 */ 1569 static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus, 1570 unsigned long type) 1571 { 1572 struct pci_dev *bridge; 1573 struct pci_controller *hose = pci_bus_to_host(bus); 1574 struct pnv_phb *phb = hose->private_data; 1575 int num_pci_bridges = 0; 1576 1577 bridge = bus->self; 1578 while (bridge) { 1579 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) { 1580 num_pci_bridges++; 1581 if (num_pci_bridges >= 2) 1582 return 1; 1583 } 1584 1585 bridge = bridge->bus->self; 1586 } 1587 1588 /* We fail back to M32 if M64 isn't supported */ 1589 if (phb->ioda.m64_segsize && 1590 pnv_pci_is_mem_pref_64(type)) 1591 return phb->ioda.m64_segsize; 1592 if (type & IORESOURCE_MEM) 1593 return phb->ioda.m32_segsize; 1594 1595 return phb->ioda.io_segsize; 1596 } 1597 1598 /* Prevent enabling devices for which we couldn't properly 1599 * assign a PE 1600 */ 1601 static int pnv_pci_enable_device_hook(struct pci_dev *dev) 1602 { 1603 struct pci_controller *hose = pci_bus_to_host(dev->bus); 1604 struct pnv_phb *phb = hose->private_data; 1605 struct pci_dn *pdn; 1606 1607 /* The function is probably called while the PEs have 1608 * not be created yet. For example, resource reassignment 1609 * during PCI probe period. We just skip the check if 1610 * PEs isn't ready. 1611 */ 1612 if (!phb->initialized) 1613 return 0; 1614 1615 pdn = pci_get_pdn(dev); 1616 if (!pdn || pdn->pe_number == IODA_INVALID_PE) 1617 return -EINVAL; 1618 1619 return 0; 1620 } 1621 1622 static u32 pnv_ioda_bdfn_to_pe(struct pnv_phb *phb, struct pci_bus *bus, 1623 u32 devfn) 1624 { 1625 return phb->ioda.pe_rmap[(bus->number << 8) | devfn]; 1626 } 1627 1628 static void pnv_pci_ioda_shutdown(struct pnv_phb *phb) 1629 { 1630 opal_pci_reset(phb->opal_id, OPAL_PCI_IODA_TABLE_RESET, 1631 OPAL_ASSERT_RESET); 1632 } 1633 1634 void __init pnv_pci_init_ioda_phb(struct device_node *np, 1635 u64 hub_id, int ioda_type) 1636 { 1637 struct pci_controller *hose; 1638 struct pnv_phb *phb; 1639 unsigned long size, m32map_off, pemap_off, iomap_off = 0; 1640 const __be64 *prop64; 1641 const __be32 *prop32; 1642 int len; 1643 u64 phb_id; 1644 void *aux; 1645 long rc; 1646 1647 pr_info("Initializing IODA%d OPAL PHB %s\n", ioda_type, np->full_name); 1648 1649 prop64 = of_get_property(np, "ibm,opal-phbid", NULL); 1650 if (!prop64) { 1651 pr_err(" Missing \"ibm,opal-phbid\" property !\n"); 1652 return; 1653 } 1654 phb_id = be64_to_cpup(prop64); 1655 pr_debug(" PHB-ID : 0x%016llx\n", phb_id); 1656 1657 phb = alloc_bootmem(sizeof(struct pnv_phb)); 1658 if (!phb) { 1659 pr_err(" Out of memory !\n"); 1660 return; 1661 } 1662 1663 /* Allocate PCI controller */ 1664 memset(phb, 0, sizeof(struct pnv_phb)); 1665 phb->hose = hose = pcibios_alloc_controller(np); 1666 if (!phb->hose) { 1667 pr_err(" Can't allocate PCI controller for %s\n", 1668 np->full_name); 1669 free_bootmem((unsigned long)phb, sizeof(struct pnv_phb)); 1670 return; 1671 } 1672 1673 spin_lock_init(&phb->lock); 1674 prop32 = of_get_property(np, "bus-range", &len); 1675 if (prop32 && len == 8) { 1676 hose->first_busno = be32_to_cpu(prop32[0]); 1677 hose->last_busno = be32_to_cpu(prop32[1]); 1678 } else { 1679 pr_warn(" Broken <bus-range> on %s\n", np->full_name); 1680 hose->first_busno = 0; 1681 hose->last_busno = 0xff; 1682 } 1683 hose->private_data = phb; 1684 phb->hub_id = hub_id; 1685 phb->opal_id = phb_id; 1686 phb->type = ioda_type; 1687 1688 /* Detect specific models for error handling */ 1689 if (of_device_is_compatible(np, "ibm,p7ioc-pciex")) 1690 phb->model = PNV_PHB_MODEL_P7IOC; 1691 else if (of_device_is_compatible(np, "ibm,power8-pciex")) 1692 phb->model = PNV_PHB_MODEL_PHB3; 1693 else 1694 phb->model = PNV_PHB_MODEL_UNKNOWN; 1695 1696 /* Parse 32-bit and IO ranges (if any) */ 1697 pci_process_bridge_OF_ranges(hose, np, !hose->global_number); 1698 1699 /* Get registers */ 1700 phb->regs = of_iomap(np, 0); 1701 if (phb->regs == NULL) 1702 pr_err(" Failed to map registers !\n"); 1703 1704 /* Initialize more IODA stuff */ 1705 phb->ioda.total_pe = 1; 1706 prop32 = of_get_property(np, "ibm,opal-num-pes", NULL); 1707 if (prop32) 1708 phb->ioda.total_pe = be32_to_cpup(prop32); 1709 prop32 = of_get_property(np, "ibm,opal-reserved-pe", NULL); 1710 if (prop32) 1711 phb->ioda.reserved_pe = be32_to_cpup(prop32); 1712 1713 /* Parse 64-bit MMIO range */ 1714 pnv_ioda_parse_m64_window(phb); 1715 1716 phb->ioda.m32_size = resource_size(&hose->mem_resources[0]); 1717 /* FW Has already off top 64k of M32 space (MSI space) */ 1718 phb->ioda.m32_size += 0x10000; 1719 1720 phb->ioda.m32_segsize = phb->ioda.m32_size / phb->ioda.total_pe; 1721 phb->ioda.m32_pci_base = hose->mem_resources[0].start - hose->mem_offset[0]; 1722 phb->ioda.io_size = hose->pci_io_size; 1723 phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe; 1724 phb->ioda.io_pci_base = 0; /* XXX calculate this ? */ 1725 1726 /* Allocate aux data & arrays. We don't have IO ports on PHB3 */ 1727 size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long)); 1728 m32map_off = size; 1729 size += phb->ioda.total_pe * sizeof(phb->ioda.m32_segmap[0]); 1730 if (phb->type == PNV_PHB_IODA1) { 1731 iomap_off = size; 1732 size += phb->ioda.total_pe * sizeof(phb->ioda.io_segmap[0]); 1733 } 1734 pemap_off = size; 1735 size += phb->ioda.total_pe * sizeof(struct pnv_ioda_pe); 1736 aux = alloc_bootmem(size); 1737 memset(aux, 0, size); 1738 phb->ioda.pe_alloc = aux; 1739 phb->ioda.m32_segmap = aux + m32map_off; 1740 if (phb->type == PNV_PHB_IODA1) 1741 phb->ioda.io_segmap = aux + iomap_off; 1742 phb->ioda.pe_array = aux + pemap_off; 1743 set_bit(phb->ioda.reserved_pe, phb->ioda.pe_alloc); 1744 1745 INIT_LIST_HEAD(&phb->ioda.pe_dma_list); 1746 INIT_LIST_HEAD(&phb->ioda.pe_list); 1747 1748 /* Calculate how many 32-bit TCE segments we have */ 1749 phb->ioda.tce32_count = phb->ioda.m32_pci_base >> 28; 1750 1751 #if 0 /* We should really do that ... */ 1752 rc = opal_pci_set_phb_mem_window(opal->phb_id, 1753 window_type, 1754 window_num, 1755 starting_real_address, 1756 starting_pci_address, 1757 segment_size); 1758 #endif 1759 1760 pr_info(" %03d (%03d) PE's M32: 0x%x [segment=0x%x]\n", 1761 phb->ioda.total_pe, phb->ioda.reserved_pe, 1762 phb->ioda.m32_size, phb->ioda.m32_segsize); 1763 if (phb->ioda.m64_size) 1764 pr_info(" M64: 0x%lx [segment=0x%lx]\n", 1765 phb->ioda.m64_size, phb->ioda.m64_segsize); 1766 if (phb->ioda.io_size) 1767 pr_info(" IO: 0x%x [segment=0x%x]\n", 1768 phb->ioda.io_size, phb->ioda.io_segsize); 1769 1770 1771 phb->hose->ops = &pnv_pci_ops; 1772 phb->get_pe_state = pnv_ioda_get_pe_state; 1773 phb->freeze_pe = pnv_ioda_freeze_pe; 1774 phb->unfreeze_pe = pnv_ioda_unfreeze_pe; 1775 #ifdef CONFIG_EEH 1776 phb->eeh_ops = &ioda_eeh_ops; 1777 #endif 1778 1779 /* Setup RID -> PE mapping function */ 1780 phb->bdfn_to_pe = pnv_ioda_bdfn_to_pe; 1781 1782 /* Setup TCEs */ 1783 phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup; 1784 phb->dma_set_mask = pnv_pci_ioda_dma_set_mask; 1785 1786 /* Setup shutdown function for kexec */ 1787 phb->shutdown = pnv_pci_ioda_shutdown; 1788 1789 /* Setup MSI support */ 1790 pnv_pci_init_ioda_msis(phb); 1791 1792 /* 1793 * We pass the PCI probe flag PCI_REASSIGN_ALL_RSRC here 1794 * to let the PCI core do resource assignment. It's supposed 1795 * that the PCI core will do correct I/O and MMIO alignment 1796 * for the P2P bridge bars so that each PCI bus (excluding 1797 * the child P2P bridges) can form individual PE. 1798 */ 1799 ppc_md.pcibios_fixup = pnv_pci_ioda_fixup; 1800 ppc_md.pcibios_enable_device_hook = pnv_pci_enable_device_hook; 1801 ppc_md.pcibios_window_alignment = pnv_pci_window_alignment; 1802 ppc_md.pcibios_reset_secondary_bus = pnv_pci_reset_secondary_bus; 1803 pci_add_flags(PCI_REASSIGN_ALL_RSRC); 1804 1805 /* Reset IODA tables to a clean state */ 1806 rc = opal_pci_reset(phb_id, OPAL_PCI_IODA_TABLE_RESET, OPAL_ASSERT_RESET); 1807 if (rc) 1808 pr_warning(" OPAL Error %ld performing IODA table reset !\n", rc); 1809 1810 /* If we're running in kdump kerenl, the previous kerenl never 1811 * shutdown PCI devices correctly. We already got IODA table 1812 * cleaned out. So we have to issue PHB reset to stop all PCI 1813 * transactions from previous kerenl. 1814 */ 1815 if (is_kdump_kernel()) { 1816 pr_info(" Issue PHB reset ...\n"); 1817 ioda_eeh_phb_reset(hose, EEH_RESET_FUNDAMENTAL); 1818 ioda_eeh_phb_reset(hose, OPAL_DEASSERT_RESET); 1819 } 1820 1821 /* Configure M64 window */ 1822 if (phb->init_m64 && phb->init_m64(phb)) 1823 hose->mem_resources[1].flags = 0; 1824 } 1825 1826 void __init pnv_pci_init_ioda2_phb(struct device_node *np) 1827 { 1828 pnv_pci_init_ioda_phb(np, 0, PNV_PHB_IODA2); 1829 } 1830 1831 void __init pnv_pci_init_ioda_hub(struct device_node *np) 1832 { 1833 struct device_node *phbn; 1834 const __be64 *prop64; 1835 u64 hub_id; 1836 1837 pr_info("Probing IODA IO-Hub %s\n", np->full_name); 1838 1839 prop64 = of_get_property(np, "ibm,opal-hubid", NULL); 1840 if (!prop64) { 1841 pr_err(" Missing \"ibm,opal-hubid\" property !\n"); 1842 return; 1843 } 1844 hub_id = be64_to_cpup(prop64); 1845 pr_devel(" HUB-ID : 0x%016llx\n", hub_id); 1846 1847 /* Count child PHBs */ 1848 for_each_child_of_node(np, phbn) { 1849 /* Look for IODA1 PHBs */ 1850 if (of_device_is_compatible(phbn, "ibm,ioda-phb")) 1851 pnv_pci_init_ioda_phb(phbn, hub_id, PNV_PHB_IODA1); 1852 } 1853 } 1854