xref: /openbmc/linux/arch/powerpc/platforms/powernv/pci-ioda.c (revision 7051924f771722c6dd235e693742cda6488ac700)
1 /*
2  * Support PCI/PCIe on PowerNV platforms
3  *
4  * Copyright 2011 Benjamin Herrenschmidt, IBM Corp.
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version
9  * 2 of the License, or (at your option) any later version.
10  */
11 
12 #undef DEBUG
13 
14 #include <linux/kernel.h>
15 #include <linux/pci.h>
16 #include <linux/crash_dump.h>
17 #include <linux/debugfs.h>
18 #include <linux/delay.h>
19 #include <linux/string.h>
20 #include <linux/init.h>
21 #include <linux/bootmem.h>
22 #include <linux/irq.h>
23 #include <linux/io.h>
24 #include <linux/msi.h>
25 #include <linux/memblock.h>
26 
27 #include <asm/sections.h>
28 #include <asm/io.h>
29 #include <asm/prom.h>
30 #include <asm/pci-bridge.h>
31 #include <asm/machdep.h>
32 #include <asm/msi_bitmap.h>
33 #include <asm/ppc-pci.h>
34 #include <asm/opal.h>
35 #include <asm/iommu.h>
36 #include <asm/tce.h>
37 #include <asm/xics.h>
38 #include <asm/debug.h>
39 #include <asm/firmware.h>
40 
41 #include "powernv.h"
42 #include "pci.h"
43 
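/*
 * define_pe_printk_level() generates the pe_err()/pe_warn()/pe_info()
 * helpers below, which prefix each message with the owning PCI device
 * name (or domain/bus number) and the PE number.
 */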
44 #define define_pe_printk_level(func, kern_level)		\
45 static int func(const struct pnv_ioda_pe *pe, const char *fmt, ...)	\
46 {								\
47 	struct va_format vaf;					\
48 	va_list args;						\
49 	char pfix[32];						\
50 	int r;							\
51 								\
52 	va_start(args, fmt);					\
53 								\
54 	vaf.fmt = fmt;						\
55 	vaf.va = &args;						\
56 								\
57 	if (pe->pdev)						\
58 		strlcpy(pfix, dev_name(&pe->pdev->dev),		\
59 			sizeof(pfix));				\
60 	else							\
61 		sprintf(pfix, "%04x:%02x     ",			\
62 			pci_domain_nr(pe->pbus),		\
63 			pe->pbus->number);			\
64 	r = printk(kern_level "pci %s: [PE# %.3d] %pV",		\
65 		   pfix, pe->pe_number, &vaf);			\
66 								\
67 	va_end(args);						\
68 								\
69 	return r;						\
70 }								\
71 
72 define_pe_printk_level(pe_err, KERN_ERR);
73 define_pe_printk_level(pe_warn, KERN_WARNING);
74 define_pe_printk_level(pe_info, KERN_INFO);
75 
76 /*
77  * stdcix is only supposed to be used in hypervisor real mode as per
78  * the architecture spec
79  */
80 static inline void __raw_rm_writeq(u64 val, volatile void __iomem *paddr)
81 {
82 	__asm__ __volatile__("stdcix %0,0,%1"
83 		: : "r" (val), "r" (paddr) : "memory");
84 }
85 
86 static inline bool pnv_pci_is_mem_pref_64(unsigned long flags)
87 {
88 	return ((flags & (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH)) ==
89 		(IORESOURCE_MEM_64 | IORESOURCE_PREFETCH));
90 }
91 
92 static int pnv_ioda_alloc_pe(struct pnv_phb *phb)
93 {
94 	unsigned long pe;
95 
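	/*
	 * Lock-free allocation: retry if another CPU grabs the bit between
	 * find_next_zero_bit() and the atomic test_and_set_bit().
	 */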
96 	do {
97 		pe = find_next_zero_bit(phb->ioda.pe_alloc,
98 					phb->ioda.total_pe, 0);
99 		if (pe >= phb->ioda.total_pe)
100 			return IODA_INVALID_PE;
101 	} while(test_and_set_bit(pe, phb->ioda.pe_alloc));
102 
103 	phb->ioda.pe_array[pe].phb = phb;
104 	phb->ioda.pe_array[pe].pe_number = pe;
105 	return pe;
106 }
107 
108 static void pnv_ioda_free_pe(struct pnv_phb *phb, int pe)
109 {
110 	WARN_ON(phb->ioda.pe_array[pe].pdev);
111 
112 	memset(&phb->ioda.pe_array[pe], 0, sizeof(struct pnv_ioda_pe));
113 	clear_bit(pe, phb->ioda.pe_alloc);
114 }
115 
116 /* The default M64 BAR is shared by all PEs */
117 static int pnv_ioda2_init_m64(struct pnv_phb *phb)
118 {
119 	const char *desc;
120 	struct resource *r;
121 	s64 rc;
122 
123 	/* Configure the default M64 BAR */
124 	rc = opal_pci_set_phb_mem_window(phb->opal_id,
125 					 OPAL_M64_WINDOW_TYPE,
126 					 phb->ioda.m64_bar_idx,
127 					 phb->ioda.m64_base,
128 					 0, /* unused */
129 					 phb->ioda.m64_size);
130 	if (rc != OPAL_SUCCESS) {
131 		desc = "configuring";
132 		goto fail;
133 	}
134 
135 	/* Enable the default M64 BAR */
136 	rc = opal_pci_phb_mmio_enable(phb->opal_id,
137 				      OPAL_M64_WINDOW_TYPE,
138 				      phb->ioda.m64_bar_idx,
139 				      OPAL_ENABLE_M64_SPLIT);
140 	if (rc != OPAL_SUCCESS) {
141 		desc = "enabling";
142 		goto fail;
143 	}
144 
145 	/* Mark the M64 BAR assigned */
146 	set_bit(phb->ioda.m64_bar_idx, &phb->ioda.m64_bar_alloc);
147 
148 	/*
149 	 * Strip off the segment used by the reserved PE, which is
150 	 * expected to be 0 or the last PE number.
151 	 */
152 	r = &phb->hose->mem_resources[1];
153 	if (phb->ioda.reserved_pe == 0)
154 		r->start += phb->ioda.m64_segsize;
155 	else if (phb->ioda.reserved_pe == (phb->ioda.total_pe - 1))
156 		r->end -= phb->ioda.m64_segsize;
157 	else
158 		pr_warn("  Cannot strip M64 segment for reserved PE#%d\n",
159 			phb->ioda.reserved_pe);
160 
161 	return 0;
162 
163 fail:
164 	pr_warn("  Failure %lld %s M64 BAR#%d\n",
165 		rc, desc, phb->ioda.m64_bar_idx);
166 	opal_pci_phb_mmio_enable(phb->opal_id,
167 				 OPAL_M64_WINDOW_TYPE,
168 				 phb->ioda.m64_bar_idx,
169 				 OPAL_DISABLE_M64);
170 	return -EIO;
171 }
172 
173 static void pnv_ioda2_alloc_m64_pe(struct pnv_phb *phb)
174 {
175 	resource_size_t sgsz = phb->ioda.m64_segsize;
176 	struct pci_dev *pdev;
177 	struct resource *r;
178 	int base, step, i;
179 
180 	/*
181 	 * The root bus always covers the full M64 range, while the root
182 	 * ports reflect the M64 range actually in use. So we check the
183 	 * root ports instead of the root bus.
184 	 */
185 	list_for_each_entry(pdev, &phb->hose->bus->devices, bus_list) {
186 		for (i = PCI_BRIDGE_RESOURCES;
187 		     i <= PCI_BRIDGE_RESOURCE_END; i++) {
188 			r = &pdev->resource[i];
189 			if (!r->parent ||
190 			    !pnv_pci_is_mem_pref_64(r->flags))
191 				continue;
192 
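			/* Reserve the PE# backing every M64 segment this window spans */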
193 			base = (r->start - phb->ioda.m64_base) / sgsz;
194 			for (step = 0; step < resource_size(r) / sgsz; step++)
195 				set_bit(base + step, phb->ioda.pe_alloc);
196 		}
197 	}
198 }
199 
200 static int pnv_ioda2_pick_m64_pe(struct pnv_phb *phb,
201 				 struct pci_bus *bus, int all)
202 {
203 	resource_size_t segsz = phb->ioda.m64_segsize;
204 	struct pci_dev *pdev;
205 	struct resource *r;
206 	struct pnv_ioda_pe *master_pe, *pe;
207 	unsigned long size, *pe_alloc;
208 	bool found;
209 	int start, i, j;
210 
211 	/* Root bus shouldn't use M64 */
212 	if (pci_is_root_bus(bus))
213 		return IODA_INVALID_PE;
214 
215 	/* We support only one M64 window on each bus */
216 	found = false;
217 	pci_bus_for_each_resource(bus, r, i) {
218 		if (r && r->parent &&
219 		    pnv_pci_is_mem_pref_64(r->flags)) {
220 			found = true;
221 			break;
222 		}
223 	}
224 
225 	/* No M64 window found ? */
226 	if (!found)
227 		return IODA_INVALID_PE;
228 
229 	/* Allocate bitmap */
230 	size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long));
231 	pe_alloc = kzalloc(size, GFP_KERNEL);
232 	if (!pe_alloc) {
233 		pr_warn("%s: Out of memory !\n",
234 			__func__);
235 		return IODA_INVALID_PE;
236 	}
237 
238 	/*
239 	 * Figure out the PE numbers reserved by the PE
240 	 * and its child PEs.
241 	 */
242 	start = (r->start - phb->ioda.m64_base) / segsz;
243 	for (i = 0; i < resource_size(r) / segsz; i++)
244 		set_bit(start + i, pe_alloc);
245 
246 	if (all)
247 		goto done;
248 
249 	/*
250 	 * If the PE doesn't cover all subordinate buses,
251 	 * we need to subtract the children's segments from the reserved PEs.
252 	 */
253 	list_for_each_entry(pdev, &bus->devices, bus_list) {
254 		if (!pdev->subordinate)
255 			continue;
256 
257 		pci_bus_for_each_resource(pdev->subordinate, r, i) {
258 			if (!r || !r->parent ||
259 			    !pnv_pci_is_mem_pref_64(r->flags))
260 				continue;
261 
262 			start = (r->start - phb->ioda.m64_base) / segsz;
263 			for (j = 0; j < resource_size(r) / segsz ; j++)
264 				clear_bit(start + j, pe_alloc);
265 		}
266 	}
267 
268 	/*
269 	 * The current bus might not own the M64 window if it is all
270 	 * contributed by its child buses. In that case, we needn't
271 	 * pick an M64 dependent PE#.
272 	 */
273 	if (bitmap_empty(pe_alloc, phb->ioda.total_pe)) {
274 		kfree(pe_alloc);
275 		return IODA_INVALID_PE;
276 	}
277 
278 	/*
279 	 * Figure out the master PE and put all slave PEs into the master
280 	 * PE's list to form a compound PE.
281 	 */
282 done:
283 	master_pe = NULL;
284 	i = -1;
285 	while ((i = find_next_bit(pe_alloc, phb->ioda.total_pe, i + 1)) <
286 		phb->ioda.total_pe) {
287 		pe = &phb->ioda.pe_array[i];
288 		pe->phb = phb;
289 		pe->pe_number = i;
290 
291 		if (!master_pe) {
292 			pe->flags |= PNV_IODA_PE_MASTER;
293 			INIT_LIST_HEAD(&pe->slaves);
294 			master_pe = pe;
295 		} else {
296 			pe->flags |= PNV_IODA_PE_SLAVE;
297 			pe->master = master_pe;
298 			list_add_tail(&pe->list, &master_pe->slaves);
299 		}
300 	}
301 
302 	kfree(pe_alloc);
303 	return master_pe->pe_number;
304 }
305 
306 static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
307 {
308 	struct pci_controller *hose = phb->hose;
309 	struct device_node *dn = hose->dn;
310 	struct resource *res;
311 	const u32 *r;
312 	u64 pci_addr;
313 
314 	if (!firmware_has_feature(FW_FEATURE_OPALv3)) {
315 		pr_info("  Firmware too old to support M64 window\n");
316 		return;
317 	}
318 
319 	r = of_get_property(dn, "ibm,opal-m64-window", NULL);
320 	if (!r) {
321 		pr_info("  No <ibm,opal-m64-window> on %s\n",
322 			dn->full_name);
323 		return;
324 	}
325 
326 	/* FIXME: Support M64 for P7IOC */
327 	if (phb->type != PNV_PHB_IODA2) {
328 		pr_info("  M64 window not supported\n");
329 		return;
330 	}
331 
332 	res = &hose->mem_resources[1];
333 	res->start = of_translate_address(dn, r + 2);
334 	res->end = res->start + of_read_number(r + 4, 2) - 1;
335 	res->flags = (IORESOURCE_MEM | IORESOURCE_MEM_64 | IORESOURCE_PREFETCH);
336 	pci_addr = of_read_number(r, 2);
337 	hose->mem_offset[1] = res->start - pci_addr;
338 
339 	phb->ioda.m64_size = resource_size(res);
340 	phb->ioda.m64_segsize = phb->ioda.m64_size / phb->ioda.total_pe;
341 	phb->ioda.m64_base = pci_addr;
342 
343 	/* Use last M64 BAR to cover M64 window */
344 	phb->ioda.m64_bar_idx = 15;
345 	phb->init_m64 = pnv_ioda2_init_m64;
346 	phb->alloc_m64_pe = pnv_ioda2_alloc_m64_pe;
347 	phb->pick_m64_pe = pnv_ioda2_pick_m64_pe;
348 }
349 
350 static void pnv_ioda_freeze_pe(struct pnv_phb *phb, int pe_no)
351 {
352 	struct pnv_ioda_pe *pe = &phb->ioda.pe_array[pe_no];
353 	struct pnv_ioda_pe *slave;
354 	s64 rc;
355 
356 	/* Fetch master PE */
357 	if (pe->flags & PNV_IODA_PE_SLAVE) {
358 		pe = pe->master;
359 		WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER));
360 		pe_no = pe->pe_number;
361 	}
362 
363 	/* Freeze master PE */
364 	rc = opal_pci_eeh_freeze_set(phb->opal_id,
365 				     pe_no,
366 				     OPAL_EEH_ACTION_SET_FREEZE_ALL);
367 	if (rc != OPAL_SUCCESS) {
368 		pr_warn("%s: Failure %lld freezing PHB#%x-PE#%x\n",
369 			__func__, rc, phb->hose->global_number, pe_no);
370 		return;
371 	}
372 
373 	/* Freeze slave PEs */
374 	if (!(pe->flags & PNV_IODA_PE_MASTER))
375 		return;
376 
377 	list_for_each_entry(slave, &pe->slaves, list) {
378 		rc = opal_pci_eeh_freeze_set(phb->opal_id,
379 					     slave->pe_number,
380 					     OPAL_EEH_ACTION_SET_FREEZE_ALL);
381 		if (rc != OPAL_SUCCESS)
382 			pr_warn("%s: Failure %lld freezing PHB#%x-PE#%x\n",
383 				__func__, rc, phb->hose->global_number,
384 				slave->pe_number);
385 	}
386 }
387 
388 int pnv_ioda_unfreeze_pe(struct pnv_phb *phb, int pe_no, int opt)
389 {
390 	struct pnv_ioda_pe *pe, *slave;
391 	s64 rc;
392 
393 	/* Find master PE */
394 	pe = &phb->ioda.pe_array[pe_no];
395 	if (pe->flags & PNV_IODA_PE_SLAVE) {
396 		pe = pe->master;
397 		WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER));
398 		pe_no = pe->pe_number;
399 	}
400 
401 	/* Clear frozen state for master PE */
402 	rc = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no, opt);
403 	if (rc != OPAL_SUCCESS) {
404 		pr_warn("%s: Failure %lld clear %d on PHB#%x-PE#%x\n",
405 			__func__, rc, opt, phb->hose->global_number, pe_no);
406 		return -EIO;
407 	}
408 
409 	if (!(pe->flags & PNV_IODA_PE_MASTER))
410 		return 0;
411 
412 	/* Clear frozen state for slave PEs */
413 	list_for_each_entry(slave, &pe->slaves, list) {
414 		rc = opal_pci_eeh_freeze_clear(phb->opal_id,
415 					     slave->pe_number,
416 					     opt);
417 		if (rc != OPAL_SUCCESS) {
418 			pr_warn("%s: Failure %lld clear %d on PHB#%x-PE#%x\n",
419 				__func__, rc, opt, phb->hose->global_number,
420 				slave->pe_number);
421 			return -EIO;
422 		}
423 	}
424 
425 	return 0;
426 }
427 
428 static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no)
429 {
430 	struct pnv_ioda_pe *slave, *pe;
431 	u8 fstate, state;
432 	__be16 pcierr;
433 	s64 rc;
434 
435 	/* Sanity check on PE number */
436 	if (pe_no < 0 || pe_no >= phb->ioda.total_pe)
437 		return OPAL_EEH_STOPPED_PERM_UNAVAIL;
438 
439 	/*
440 	 * Fetch the master PE; note the PE instance might not be
441 	 * initialized yet.
442 	 */
443 	pe = &phb->ioda.pe_array[pe_no];
444 	if (pe->flags & PNV_IODA_PE_SLAVE) {
445 		pe = pe->master;
446 		WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER));
447 		pe_no = pe->pe_number;
448 	}
449 
450 	/* Check the master PE */
451 	rc = opal_pci_eeh_freeze_status(phb->opal_id, pe_no,
452 					&state, &pcierr, NULL);
453 	if (rc != OPAL_SUCCESS) {
454 		pr_warn("%s: Failure %lld getting "
455 			"PHB#%x-PE#%x state\n",
456 			__func__, rc,
457 			phb->hose->global_number, pe_no);
458 		return OPAL_EEH_STOPPED_TEMP_UNAVAIL;
459 	}
460 
461 	/* Check the slave PE */
462 	if (!(pe->flags & PNV_IODA_PE_MASTER))
463 		return state;
464 
465 	list_for_each_entry(slave, &pe->slaves, list) {
466 		rc = opal_pci_eeh_freeze_status(phb->opal_id,
467 						slave->pe_number,
468 						&fstate,
469 						&pcierr,
470 						NULL);
471 		if (rc != OPAL_SUCCESS) {
472 			pr_warn("%s: Failure %lld getting "
473 				"PHB#%x-PE#%x state\n",
474 				__func__, rc,
475 				phb->hose->global_number, slave->pe_number);
476 			return OPAL_EEH_STOPPED_TEMP_UNAVAIL;
477 		}
478 
479 		/*
480 		 * Override the result based on the ascending
481 		 * priority.
482 		 */
483 		if (fstate > state)
484 			state = fstate;
485 	}
486 
487 	return state;
488 }
489 
490 /* Currently this is only used when MSIs are enabled; that will change,
491  * but in the meantime, we need to protect it to avoid warnings
492  */
493 #ifdef CONFIG_PCI_MSI
494 static struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
495 {
496 	struct pci_controller *hose = pci_bus_to_host(dev->bus);
497 	struct pnv_phb *phb = hose->private_data;
498 	struct pci_dn *pdn = pci_get_pdn(dev);
499 
500 	if (!pdn)
501 		return NULL;
502 	if (pdn->pe_number == IODA_INVALID_PE)
503 		return NULL;
504 	return &phb->ioda.pe_array[pdn->pe_number];
505 }
506 #endif /* CONFIG_PCI_MSI */
507 
508 static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
509 {
510 	struct pci_dev *parent;
511 	uint8_t bcomp, dcomp, fcomp;
512 	long rc, rid_end, rid;
513 
514 	/* Bus validation ? */
515 	if (pe->pbus) {
516 		int count;
517 
518 		dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
519 		fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
520 		parent = pe->pbus->self;
521 		if (pe->flags & PNV_IODA_PE_BUS_ALL)
522 			count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1;
523 		else
524 			count = 1;
525 
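		/* Pick the bus number compare mode that covers 'count' buses */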
526 		switch(count) {
527 		case  1: bcomp = OpalPciBusAll;		break;
528 		case  2: bcomp = OpalPciBus7Bits;	break;
529 		case  4: bcomp = OpalPciBus6Bits;	break;
530 		case  8: bcomp = OpalPciBus5Bits;	break;
531 		case 16: bcomp = OpalPciBus4Bits;	break;
532 		case 32: bcomp = OpalPciBus3Bits;	break;
533 		default:
534 			pr_err("%s: Number of subordinate busses %d"
535 			       " unsupported\n",
536 			       pci_name(pe->pbus->self), count);
537 			/* Do an exact match only */
538 			bcomp = OpalPciBusAll;
539 		}
540 		rid_end = pe->rid + (count << 8);
541 	} else {
542 		parent = pe->pdev->bus->self;
543 		bcomp = OpalPciBusAll;
544 		dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
545 		fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
546 		rid_end = pe->rid + 1;
547 	}
548 
549 	/*
550 	 * Associate the PE in the PELT. We need to add the PE to the
551 	 * corresponding PELT-V as well. Otherwise, an error
552 	 * originating from the PE might propagate to other
553 	 * PEs.
554 	 */
555 	rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
556 			     bcomp, dcomp, fcomp, OPAL_MAP_PE);
557 	if (rc) {
558 		pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc);
559 		return -ENXIO;
560 	}
561 
562 	rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number,
563 				pe->pe_number, OPAL_ADD_PE_TO_DOMAIN);
564 	if (rc)
565 		pe_warn(pe, "OPAL error %ld adding self to PELTV\n", rc);
566 	opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number,
567 				  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
568 
569 	/* Add to all parents PELT-V */
570 	while (parent) {
571 		struct pci_dn *pdn = pci_get_pdn(parent);
572 		if (pdn && pdn->pe_number != IODA_INVALID_PE) {
573 			rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number,
574 						pe->pe_number, OPAL_ADD_PE_TO_DOMAIN);
575 			/* XXX What to do in case of error ? */
576 		}
577 		parent = parent->bus->self;
578 	}
579 	/* Setup reverse map */
580 	for (rid = pe->rid; rid < rid_end; rid++)
581 		phb->ioda.pe_rmap[rid] = pe->pe_number;
582 
583 	/* Setup one MVE on IODA1 */
584 	if (phb->type == PNV_PHB_IODA1) {
585 		pe->mve_number = pe->pe_number;
586 		rc = opal_pci_set_mve(phb->opal_id, pe->mve_number,
587 				      pe->pe_number);
588 		if (rc) {
589 			pe_err(pe, "OPAL error %ld setting up MVE %d\n",
590 			       rc, pe->mve_number);
591 			pe->mve_number = -1;
592 		} else {
593 			rc = opal_pci_set_mve_enable(phb->opal_id,
594 						     pe->mve_number, OPAL_ENABLE_MVE);
595 			if (rc) {
596 				pe_err(pe, "OPAL error %ld enabling MVE %d\n",
597 				       rc, pe->mve_number);
598 				pe->mve_number = -1;
599 			}
600 		}
601 	} else if (phb->type == PNV_PHB_IODA2)
602 		pe->mve_number = 0;
603 
604 	return 0;
605 }
606 
607 static void pnv_ioda_link_pe_by_weight(struct pnv_phb *phb,
608 				       struct pnv_ioda_pe *pe)
609 {
610 	struct pnv_ioda_pe *lpe;
611 
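	/* Keep the DMA PE list sorted by descending DMA weight */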
612 	list_for_each_entry(lpe, &phb->ioda.pe_dma_list, dma_link) {
613 		if (lpe->dma_weight < pe->dma_weight) {
614 			list_add_tail(&pe->dma_link, &lpe->dma_link);
615 			return;
616 		}
617 	}
618 	list_add_tail(&pe->dma_link, &phb->ioda.pe_dma_list);
619 }
620 
621 static unsigned int pnv_ioda_dma_weight(struct pci_dev *dev)
622 {
623 	/* This is quite simplistic. The "base" weight of a device
624 	 * is 10. 0 means no DMA is to be accounted for it.
625 	 */
626 
627 	/* If it's a bridge, no DMA */
628 	if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL)
629 		return 0;
630 
631 	/* Reduce the weight of slow USB controllers */
632 	if (dev->class == PCI_CLASS_SERIAL_USB_UHCI ||
633 	    dev->class == PCI_CLASS_SERIAL_USB_OHCI ||
634 	    dev->class == PCI_CLASS_SERIAL_USB_EHCI)
635 		return 3;
636 
637 	/* Increase the weight of RAID (includes Obsidian) */
638 	if ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID)
639 		return 15;
640 
641 	/* Default */
642 	return 10;
643 }
644 
645 #if 0
646 static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
647 {
648 	struct pci_controller *hose = pci_bus_to_host(dev->bus);
649 	struct pnv_phb *phb = hose->private_data;
650 	struct pci_dn *pdn = pci_get_pdn(dev);
651 	struct pnv_ioda_pe *pe;
652 	int pe_num;
653 
654 	if (!pdn) {
655 		pr_err("%s: Device tree node not associated properly\n",
656 			   pci_name(dev));
657 		return NULL;
658 	}
659 	if (pdn->pe_number != IODA_INVALID_PE)
660 		return NULL;
661 
662 	/* PE#0 has been pre-set */
663 	if (dev->bus->number == 0)
664 		pe_num = 0;
665 	else
666 		pe_num = pnv_ioda_alloc_pe(phb);
667 	if (pe_num == IODA_INVALID_PE) {
668 		pr_warning("%s: Not enough PE# available, disabling device\n",
669 			   pci_name(dev));
670 		return NULL;
671 	}
672 
673 	/* NOTE: We get only one ref to the pci_dev for the pdn, not for the
674 	 * pointer in the PE data structure, both should be destroyed at the
675 	 * same time. However, this needs to be looked at more closely again
676 	 * once we actually start removing things (Hotplug, SR-IOV, ...)
677 	 *
678 	 * At some point we want to remove the PDN completely anyways
679 	 */
680 	pe = &phb->ioda.pe_array[pe_num];
681 	pci_dev_get(dev);
682 	pdn->pcidev = dev;
683 	pdn->pe_number = pe_num;
684 	pe->pdev = dev;
685 	pe->pbus = NULL;
686 	pe->tce32_seg = -1;
687 	pe->mve_number = -1;
688 	pe->rid = dev->bus->number << 8 | pdn->devfn;
689 
690 	pe_info(pe, "Associated device to PE\n");
691 
692 	if (pnv_ioda_configure_pe(phb, pe)) {
693 		/* XXX What do we do here ? */
694 		if (pe_num)
695 			pnv_ioda_free_pe(phb, pe_num);
696 		pdn->pe_number = IODA_INVALID_PE;
697 		pe->pdev = NULL;
698 		pci_dev_put(dev);
699 		return NULL;
700 	}
701 
702 	/* Assign a DMA weight to the device */
703 	pe->dma_weight = pnv_ioda_dma_weight(dev);
704 	if (pe->dma_weight != 0) {
705 		phb->ioda.dma_weight += pe->dma_weight;
706 		phb->ioda.dma_pe_count++;
707 	}
708 
709 	/* Link the PE */
710 	pnv_ioda_link_pe_by_weight(phb, pe);
711 
712 	return pe;
713 }
714 #endif /* Useful for SRIOV case */
715 
716 static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
717 {
718 	struct pci_dev *dev;
719 
720 	list_for_each_entry(dev, &bus->devices, bus_list) {
721 		struct pci_dn *pdn = pci_get_pdn(dev);
722 
723 		if (pdn == NULL) {
724 			pr_warn("%s: No device node associated with device !\n",
725 				pci_name(dev));
726 			continue;
727 		}
728 		pdn->pcidev = dev;
729 		pdn->pe_number = pe->pe_number;
730 		pe->dma_weight += pnv_ioda_dma_weight(dev);
731 		if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
732 			pnv_ioda_setup_same_PE(dev->subordinate, pe);
733 	}
734 }
735 
736 /*
737  * There are 2 types of PCI bus sensitive PEs: one comprises a single
738  * PCI bus, the other contains the primary PCI bus and its subordinate
739  * PCI devices and buses. The second type of PE is normally created
740  * behind a PCIe-to-PCI bridge or the downstream ports of a PLX switch.
741  */
742 static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
743 {
744 	struct pci_controller *hose = pci_bus_to_host(bus);
745 	struct pnv_phb *phb = hose->private_data;
746 	struct pnv_ioda_pe *pe;
747 	int pe_num = IODA_INVALID_PE;
748 
749 	/* Check if PE is determined by M64 */
750 	if (phb->pick_m64_pe)
751 		pe_num = phb->pick_m64_pe(phb, bus, all);
752 
753 	/* The PE number isn't pinned by M64 */
754 	if (pe_num == IODA_INVALID_PE)
755 		pe_num = pnv_ioda_alloc_pe(phb);
756 
757 	if (pe_num == IODA_INVALID_PE) {
758 		pr_warning("%s: Not enough PE# available for PCI bus %04x:%02x\n",
759 			__func__, pci_domain_nr(bus), bus->number);
760 		return;
761 	}
762 
763 	pe = &phb->ioda.pe_array[pe_num];
764 	pe->flags |= (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS);
765 	pe->pbus = bus;
766 	pe->pdev = NULL;
767 	pe->tce32_seg = -1;
768 	pe->mve_number = -1;
769 	pe->rid = bus->busn_res.start << 8;
770 	pe->dma_weight = 0;
771 
772 	if (all)
773 		pe_info(pe, "Secondary bus %d..%d associated with PE#%d\n",
774 			bus->busn_res.start, bus->busn_res.end, pe_num);
775 	else
776 		pe_info(pe, "Secondary bus %d associated with PE#%d\n",
777 			bus->busn_res.start, pe_num);
778 
779 	if (pnv_ioda_configure_pe(phb, pe)) {
780 		/* XXX What do we do here ? */
781 		if (pe_num)
782 			pnv_ioda_free_pe(phb, pe_num);
783 		pe->pbus = NULL;
784 		return;
785 	}
786 
787 	/* Associate it with all child devices */
788 	pnv_ioda_setup_same_PE(bus, pe);
789 
790 	/* Put PE to the list */
791 	list_add_tail(&pe->list, &phb->ioda.pe_list);
792 
793 	/* Account for one DMA PE if at least one DMA capable device exists
794 	 * below the bridge
795 	 */
796 	if (pe->dma_weight != 0) {
797 		phb->ioda.dma_weight += pe->dma_weight;
798 		phb->ioda.dma_pe_count++;
799 	}
800 
801 	/* Link the PE */
802 	pnv_ioda_link_pe_by_weight(phb, pe);
803 }
804 
805 static void pnv_ioda_setup_PEs(struct pci_bus *bus)
806 {
807 	struct pci_dev *dev;
808 
809 	pnv_ioda_setup_bus_PE(bus, 0);
810 
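	/*
	 * A PCIe-to-PCI bridge gets a single PE covering its whole
	 * subordinate tree; any other bridge is recursed into, giving
	 * one PE per bus.
	 */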
811 	list_for_each_entry(dev, &bus->devices, bus_list) {
812 		if (dev->subordinate) {
813 			if (pci_pcie_type(dev) == PCI_EXP_TYPE_PCI_BRIDGE)
814 				pnv_ioda_setup_bus_PE(dev->subordinate, 1);
815 			else
816 				pnv_ioda_setup_PEs(dev->subordinate);
817 		}
818 	}
819 }
820 
821 /*
822  * Configure PEs so that the downstream PCI buses and devices
823  * have their associated PE#. Unfortunately, we haven't figured
824  * out a way to identify PLX bridges yet, so we simply put the
825  * PCI bus and everything subordinate to the root port into one
826  * PE# here. This is expected to change as soon as we can detect
827  * PLX bridges correctly.
828  */
829 static void pnv_pci_ioda_setup_PEs(void)
830 {
831 	struct pci_controller *hose, *tmp;
832 	struct pnv_phb *phb;
833 
834 	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
835 		phb = hose->private_data;
836 
837 		/* M64 layout might affect PE allocation */
838 		if (phb->alloc_m64_pe)
839 			phb->alloc_m64_pe(phb);
840 
841 		pnv_ioda_setup_PEs(hose->bus);
842 	}
843 }
844 
845 static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev)
846 {
847 	struct pci_dn *pdn = pci_get_pdn(pdev);
848 	struct pnv_ioda_pe *pe;
849 
850 	/*
851 	 * The function can be called while the PE#
852 	 * hasn't been assigned yet. Do nothing in that
853 	 * case.
854 	 */
855 	if (!pdn || pdn->pe_number == IODA_INVALID_PE)
856 		return;
857 
858 	pe = &phb->ioda.pe_array[pdn->pe_number];
859 	WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
860 	set_iommu_table_base_and_group(&pdev->dev, &pe->tce32_table);
861 }
862 
863 static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb,
864 				     struct pci_dev *pdev, u64 dma_mask)
865 {
866 	struct pci_dn *pdn = pci_get_pdn(pdev);
867 	struct pnv_ioda_pe *pe;
868 	uint64_t top;
869 	bool bypass = false;
870 
871 	if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
872 		return -ENODEV;
873 
874 	pe = &phb->ioda.pe_array[pdn->pe_number];
875 	if (pe->tce_bypass_enabled) {
876 		top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1;
877 		bypass = (dma_mask >= top);
878 	}
879 
880 	if (bypass) {
881 		dev_info(&pdev->dev, "Using 64-bit DMA iommu bypass\n");
882 		set_dma_ops(&pdev->dev, &dma_direct_ops);
883 		set_dma_offset(&pdev->dev, pe->tce_bypass_base);
884 	} else {
885 		dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n");
886 		set_dma_ops(&pdev->dev, &dma_iommu_ops);
887 		set_iommu_table_base(&pdev->dev, &pe->tce32_table);
888 	}
889 	*pdev->dev.dma_mask = dma_mask;
890 	return 0;
891 }
892 
893 static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
894 				   struct pci_bus *bus,
895 				   bool add_to_iommu_group)
896 {
897 	struct pci_dev *dev;
898 
899 	list_for_each_entry(dev, &bus->devices, bus_list) {
900 		if (add_to_iommu_group)
901 			set_iommu_table_base_and_group(&dev->dev,
902 						       &pe->tce32_table);
903 		else
904 			set_iommu_table_base(&dev->dev, &pe->tce32_table);
905 
906 		if (dev->subordinate)
907 			pnv_ioda_setup_bus_dma(pe, dev->subordinate,
908 					       add_to_iommu_group);
909 	}
910 }
911 
912 static void pnv_pci_ioda1_tce_invalidate(struct pnv_ioda_pe *pe,
913 					 struct iommu_table *tbl,
914 					 __be64 *startp, __be64 *endp, bool rm)
915 {
916 	__be64 __iomem *invalidate = rm ?
917 		(__be64 __iomem *)pe->tce_inval_reg_phys :
918 		(__be64 __iomem *)tbl->it_index;
919 	unsigned long start, end, inc;
920 	const unsigned shift = tbl->it_page_shift;
921 
922 	start = __pa(startp);
923 	end = __pa(endp);
924 
925 	/* BML uses this case for p6/p7/galaxy2: Shift addr and put in node */
926 	if (tbl->it_busno) {
927 		start <<= shift;
928 		end <<= shift;
929 		inc = 128ull << shift;
930 		start |= tbl->it_busno;
931 		end |= tbl->it_busno;
932 	} else if (tbl->it_type & TCE_PCI_SWINV_PAIR) {
933 		/* p7ioc-style invalidation, 2 TCEs per write */
934 		start |= (1ull << 63);
935 		end |= (1ull << 63);
936 		inc = 16;
937 	} else {
938 		/* Default (older HW) */
939 		inc = 128;
940 	}
941 
942 	end |= inc - 1;	/* round up end to be different than start */
943 
944 	mb(); /* Ensure above stores are visible */
945 	while (start <= end) {
946 		if (rm)
947 			__raw_rm_writeq(cpu_to_be64(start), invalidate);
948 		else
949 			__raw_writeq(cpu_to_be64(start), invalidate);
950 		start += inc;
951 	}
952 
953 	/*
954 	 * The iommu layer will do another mb() for us on build()
955 	 * and we don't care on free()
956 	 */
957 }
958 
959 static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe,
960 					 struct iommu_table *tbl,
961 					 __be64 *startp, __be64 *endp, bool rm)
962 {
963 	unsigned long start, end, inc;
964 	__be64 __iomem *invalidate = rm ?
965 		(__be64 __iomem *)pe->tce_inval_reg_phys :
966 		(__be64 __iomem *)tbl->it_index;
967 	const unsigned shift = tbl->it_page_shift;
968 
969 	/* We'll invalidate DMA address in PE scope */
970 	start = 0x2ull << 60;
971 	start |= (pe->pe_number & 0xFF);
972 	end = start;
973 
974 	/* Figure out the start, end and step */
975 	inc = tbl->it_offset + (((u64)startp - tbl->it_base) / sizeof(u64));
976 	start |= (inc << shift);
977 	inc = tbl->it_offset + (((u64)endp - tbl->it_base) / sizeof(u64));
978 	end |= (inc << shift);
979 	inc = (0x1ull << shift);
980 	mb();
981 
982 	while (start <= end) {
983 		if (rm)
984 			__raw_rm_writeq(cpu_to_be64(start), invalidate);
985 		else
986 			__raw_writeq(cpu_to_be64(start), invalidate);
987 		start += inc;
988 	}
989 }
990 
991 void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl,
992 				 __be64 *startp, __be64 *endp, bool rm)
993 {
994 	struct pnv_ioda_pe *pe = container_of(tbl, struct pnv_ioda_pe,
995 					      tce32_table);
996 	struct pnv_phb *phb = pe->phb;
997 
998 	if (phb->type == PNV_PHB_IODA1)
999 		pnv_pci_ioda1_tce_invalidate(pe, tbl, startp, endp, rm);
1000 	else
1001 		pnv_pci_ioda2_tce_invalidate(pe, tbl, startp, endp, rm);
1002 }
1003 
1004 static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
1005 				      struct pnv_ioda_pe *pe, unsigned int base,
1006 				      unsigned int segs)
1007 {
1008 
1009 	struct page *tce_mem = NULL;
1010 	const __be64 *swinvp;
1011 	struct iommu_table *tbl;
1012 	unsigned int i;
1013 	int64_t rc;
1014 	void *addr;
1015 
1016 	/* 256M DMA window, 4K TCE pages, 8 bytes TCE */
1017 #define TCE32_TABLE_SIZE	((0x10000000 / 0x1000) * 8)
1018 
1019 	/* XXX FIXME: Handle 64-bit only DMA devices */
1020 	/* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. */
1021 	/* XXX FIXME: Allocate multi-level tables on PHB3 */
1022 
1023 	/* We shouldn't already have a 32-bit DMA associated */
1024 	if (WARN_ON(pe->tce32_seg >= 0))
1025 		return;
1026 
1027 	/* Grab a 32-bit TCE table */
1028 	pe->tce32_seg = base;
1029 	pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n",
1030 		(base << 28), ((base + segs) << 28) - 1);
1031 
1032 	/* XXX Currently, we allocate one big contiguous table for the
1033 	 * TCEs. We only really need one chunk per 256M of TCE space
1034 	 * (ie per segment) but that's an optimization for later; it
1035 	 * requires some added smarts with our get/put_tce implementation.
1036 	 */
1037 	tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL,
1038 				   get_order(TCE32_TABLE_SIZE * segs));
1039 	if (!tce_mem) {
1040 		pe_err(pe, " Failed to allocate a 32-bit TCE memory\n");
1041 		goto fail;
1042 	}
1043 	addr = page_address(tce_mem);
1044 	memset(addr, 0, TCE32_TABLE_SIZE * segs);
1045 
1046 	/* Configure HW */
1047 	for (i = 0; i < segs; i++) {
1048 		rc = opal_pci_map_pe_dma_window(phb->opal_id,
1049 					      pe->pe_number,
1050 					      base + i, 1,
1051 					      __pa(addr) + TCE32_TABLE_SIZE * i,
1052 					      TCE32_TABLE_SIZE, 0x1000);
1053 		if (rc) {
1054 			pe_err(pe, " Failed to configure 32-bit TCE table,"
1055 			       " err %ld\n", rc);
1056 			goto fail;
1057 		}
1058 	}
1059 
1060 	/* Setup linux iommu table */
1061 	tbl = &pe->tce32_table;
1062 	pnv_pci_setup_iommu_table(tbl, addr, TCE32_TABLE_SIZE * segs,
1063 				  base << 28, IOMMU_PAGE_SHIFT_4K);
1064 
1065 	/* OPAL variant of P7IOC SW invalidated TCEs */
1066 	swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL);
1067 	if (swinvp) {
1068 		/* We need a couple more fields -- an address and a data
1069 		 * to or.  Since the bus is only printed out on table free
1070 		 * errors, and on the first pass the data will be a relative
1071 		 * bus number, print that out instead.
1072 		 */
1073 		pe->tce_inval_reg_phys = be64_to_cpup(swinvp);
1074 		tbl->it_index = (unsigned long)ioremap(pe->tce_inval_reg_phys,
1075 				8);
1076 		tbl->it_type |= (TCE_PCI_SWINV_CREATE |
1077 				 TCE_PCI_SWINV_FREE   |
1078 				 TCE_PCI_SWINV_PAIR);
1079 	}
1080 	iommu_init_table(tbl, phb->hose->node);
1081 	iommu_register_group(tbl, phb->hose->global_number, pe->pe_number);
1082 
1083 	if (pe->pdev)
1084 		set_iommu_table_base_and_group(&pe->pdev->dev, tbl);
1085 	else
1086 		pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
1087 
1088 	return;
1089  fail:
1090 	/* XXX Failure: Try to fallback to 64-bit only ? */
1091 	if (pe->tce32_seg >= 0)
1092 		pe->tce32_seg = -1;
1093 	if (tce_mem)
1094 		__free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs));
1095 }
1096 
1097 static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable)
1098 {
1099 	struct pnv_ioda_pe *pe = container_of(tbl, struct pnv_ioda_pe,
1100 					      tce32_table);
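	/* The odd TVE index (PE# << 1 | 1) carries the 64-bit bypass window */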
1101 	uint16_t window_id = (pe->pe_number << 1) + 1;
1102 	int64_t rc;
1103 
1104 	pe_info(pe, "%sabling 64-bit DMA bypass\n", enable ? "En" : "Dis");
1105 	if (enable) {
1106 		phys_addr_t top = memblock_end_of_DRAM();
1107 
1108 		top = roundup_pow_of_two(top);
1109 		rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
1110 						     pe->pe_number,
1111 						     window_id,
1112 						     pe->tce_bypass_base,
1113 						     top);
1114 	} else {
1115 		rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
1116 						     pe->pe_number,
1117 						     window_id,
1118 						     pe->tce_bypass_base,
1119 						     0);
1120 
1121 		/*
1122 		 * EEH needs the mapping between IOMMU table and group
1123 		 * of those VFIO/KVM pass-through devices. We can postpone
1124 		 * resetting DMA ops until the DMA mask is configured in
1125 		 * host side.
1126 		 */
1127 		if (pe->pdev)
1128 			set_iommu_table_base(&pe->pdev->dev, tbl);
1129 		else
1130 			pnv_ioda_setup_bus_dma(pe, pe->pbus, false);
1131 	}
1132 	if (rc)
1133 		pe_err(pe, "OPAL error %lld configuring bypass window\n", rc);
1134 	else
1135 		pe->tce_bypass_enabled = enable;
1136 }
1137 
1138 static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb,
1139 					  struct pnv_ioda_pe *pe)
1140 {
1141 	/* TVE #1 is selected by PCI address bit 59 */
1142 	pe->tce_bypass_base = 1ull << 59;
1143 
1144 	/* Install set_bypass callback for VFIO */
1145 	pe->tce32_table.set_bypass = pnv_pci_ioda2_set_bypass;
1146 
1147 	/* Enable bypass by default */
1148 	pnv_pci_ioda2_set_bypass(&pe->tce32_table, true);
1149 }
1150 
1151 static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
1152 				       struct pnv_ioda_pe *pe)
1153 {
1154 	struct page *tce_mem = NULL;
1155 	void *addr;
1156 	const __be64 *swinvp;
1157 	struct iommu_table *tbl;
1158 	unsigned int tce_table_size, end;
1159 	int64_t rc;
1160 
1161 	/* We shouldn't already have a 32-bit DMA associated */
1162 	if (WARN_ON(pe->tce32_seg >= 0))
1163 		return;
1164 
1165 	/* The PE will reserve the whole 32-bit DMA space */
1166 	pe->tce32_seg = 0;
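	/* The table holds one 8-byte TCE per 4KB page of the 32-bit DMA space */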
1167 	end = (1 << ilog2(phb->ioda.m32_pci_base));
1168 	tce_table_size = (end / 0x1000) * 8;
1169 	pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n",
1170 		end);
1171 
1172 	/* Allocate TCE table */
1173 	tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL,
1174 				   get_order(tce_table_size));
1175 	if (!tce_mem) {
1176 		pe_err(pe, "Failed to allocate a 32-bit TCE memory\n");
1177 		goto fail;
1178 	}
1179 	addr = page_address(tce_mem);
1180 	memset(addr, 0, tce_table_size);
1181 
1182 	/*
1183 	 * Map TCE table through TVT. The TVE index is the PE number
1184 	 * shifted by 1 bit for the 32-bit DMA space.
1185 	 */
1186 	rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
1187 					pe->pe_number << 1, 1, __pa(addr),
1188 					tce_table_size, 0x1000);
1189 	if (rc) {
1190 		pe_err(pe, "Failed to configure 32-bit TCE table,"
1191 		       " err %ld\n", rc);
1192 		goto fail;
1193 	}
1194 
1195 	/* Setup linux iommu table */
1196 	tbl = &pe->tce32_table;
1197 	pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0,
1198 			IOMMU_PAGE_SHIFT_4K);
1199 
1200 	/* OPAL variant of PHB3 invalidated TCEs */
1201 	swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL);
1202 	if (swinvp) {
1203 		/* We need a couple more fields -- an address and a data
1204 		 * to or.  Since the bus is only printed out on table free
1205 		 * errors, and on the first pass the data will be a relative
1206 		 * bus number, print that out instead.
1207 		 */
1208 		pe->tce_inval_reg_phys = be64_to_cpup(swinvp);
1209 		tbl->it_index = (unsigned long)ioremap(pe->tce_inval_reg_phys,
1210 				8);
1211 		tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
1212 	}
1213 	iommu_init_table(tbl, phb->hose->node);
1214 	iommu_register_group(tbl, phb->hose->global_number, pe->pe_number);
1215 
1216 	if (pe->pdev)
1217 		set_iommu_table_base_and_group(&pe->pdev->dev, tbl);
1218 	else
1219 		pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
1220 
1221 	/* Also create a bypass window */
1222 	pnv_pci_ioda2_setup_bypass_pe(phb, pe);
1223 	return;
1224 fail:
1225 	if (pe->tce32_seg >= 0)
1226 		pe->tce32_seg = -1;
1227 	if (tce_mem)
1228 		__free_pages(tce_mem, get_order(tce_table_size));
1229 }
1230 
1231 static void pnv_ioda_setup_dma(struct pnv_phb *phb)
1232 {
1233 	struct pci_controller *hose = phb->hose;
1234 	unsigned int residual, remaining, segs, tw, base;
1235 	struct pnv_ioda_pe *pe;
1236 
1237 	/* If we have more PE# than segments available, hand out one
1238 	 * per PE until we run out and let the rest fail. If not,
1239 	 * then we assign at least one segment per PE, plus more based
1240 	 * on the number of devices under that PE.
1241 	 */
1242 	if (phb->ioda.dma_pe_count > phb->ioda.tce32_count)
1243 		residual = 0;
1244 	else
1245 		residual = phb->ioda.tce32_count -
1246 			phb->ioda.dma_pe_count;
1247 
1248 	pr_info("PCI: Domain %04x has %ld available 32-bit DMA segments\n",
1249 		hose->global_number, phb->ioda.tce32_count);
1250 	pr_info("PCI: %d PE# for a total weight of %d\n",
1251 		phb->ioda.dma_pe_count, phb->ioda.dma_weight);
1252 
1253 	/* Walk our PE list and configure their DMA segments, hand them
1254 	 * out one base segment plus any residual segments based on
1255 	 * weight
1256 	 */
1257 	remaining = phb->ioda.tce32_count;
1258 	tw = phb->ioda.dma_weight;
1259 	base = 0;
1260 	list_for_each_entry(pe, &phb->ioda.pe_dma_list, dma_link) {
1261 		if (!pe->dma_weight)
1262 			continue;
1263 		if (!remaining) {
1264 			pe_warn(pe, "No DMA32 resources available\n");
1265 			continue;
1266 		}
1267 		segs = 1;
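		/* Hand out residual segments in proportion to DMA weight, rounding to nearest */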
1268 		if (residual) {
1269 			segs += ((pe->dma_weight * residual)  + (tw / 2)) / tw;
1270 			if (segs > remaining)
1271 				segs = remaining;
1272 		}
1273 
1274 		/*
1275 		 * For IODA2 compliant PHB3, we needn't care about the weight.
1276 		 * All of the available 32-bit DMA space will be assigned to
1277 		 * the specific PE.
1278 		 */
1279 		if (phb->type == PNV_PHB_IODA1) {
1280 			pe_info(pe, "DMA weight %d, assigned %d DMA32 segments\n",
1281 				pe->dma_weight, segs);
1282 			pnv_pci_ioda_setup_dma_pe(phb, pe, base, segs);
1283 		} else {
1284 			pe_info(pe, "Assign DMA32 space\n");
1285 			segs = 0;
1286 			pnv_pci_ioda2_setup_dma_pe(phb, pe);
1287 		}
1288 
1289 		remaining -= segs;
1290 		base += segs;
1291 	}
1292 }
1293 
1294 #ifdef CONFIG_PCI_MSI
1295 static void pnv_ioda2_msi_eoi(struct irq_data *d)
1296 {
1297 	unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
1298 	struct irq_chip *chip = irq_data_get_irq_chip(d);
1299 	struct pnv_phb *phb = container_of(chip, struct pnv_phb,
1300 					   ioda.irq_chip);
1301 	int64_t rc;
1302 
1303 	rc = opal_pci_msi_eoi(phb->opal_id, hw_irq);
1304 	WARN_ON_ONCE(rc);
1305 
1306 	icp_native_eoi(d);
1307 }
1308 
1309 static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
1310 				  unsigned int hwirq, unsigned int virq,
1311 				  unsigned int is_64, struct msi_msg *msg)
1312 {
1313 	struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev);
1314 	struct pci_dn *pdn = pci_get_pdn(dev);
1315 	struct irq_data *idata;
1316 	struct irq_chip *ichip;
1317 	unsigned int xive_num = hwirq - phb->msi_base;
1318 	__be32 data;
1319 	int rc;
1320 
1321 	/* No PE assigned ? bail out ... no MSI for you ! */
1322 	if (pe == NULL)
1323 		return -ENXIO;
1324 
1325 	/* Check if we have an MVE */
1326 	if (pe->mve_number < 0)
1327 		return -ENXIO;
1328 
1329 	/* Force 32-bit MSI on some broken devices */
1330 	if (pdn && pdn->force_32bit_msi)
1331 		is_64 = 0;
1332 
1333 	/* Assign XIVE to PE */
1334 	rc = opal_pci_set_xive_pe(phb->opal_id, pe->pe_number, xive_num);
1335 	if (rc) {
1336 		pr_warn("%s: OPAL error %d setting XIVE %d PE\n",
1337 			pci_name(dev), rc, xive_num);
1338 		return -EIO;
1339 	}
1340 
1341 	if (is_64) {
1342 		__be64 addr64;
1343 
1344 		rc = opal_get_msi_64(phb->opal_id, pe->mve_number, xive_num, 1,
1345 				     &addr64, &data);
1346 		if (rc) {
1347 			pr_warn("%s: OPAL error %d getting 64-bit MSI data\n",
1348 				pci_name(dev), rc);
1349 			return -EIO;
1350 		}
1351 		msg->address_hi = be64_to_cpu(addr64) >> 32;
1352 		msg->address_lo = be64_to_cpu(addr64) & 0xfffffffful;
1353 	} else {
1354 		__be32 addr32;
1355 
1356 		rc = opal_get_msi_32(phb->opal_id, pe->mve_number, xive_num, 1,
1357 				     &addr32, &data);
1358 		if (rc) {
1359 			pr_warn("%s: OPAL error %d getting 32-bit MSI data\n",
1360 				pci_name(dev), rc);
1361 			return -EIO;
1362 		}
1363 		msg->address_hi = 0;
1364 		msg->address_lo = be32_to_cpu(addr32);
1365 	}
1366 	msg->data = be32_to_cpu(data);
1367 
1368 	/*
1369 	 * Change the IRQ chip for the MSI interrupts on PHB3.
1370 	 * The corresponding IRQ chip is populated on the
1371 	 * first call.
1372 	 */
1373 	if (phb->type == PNV_PHB_IODA2) {
1374 		if (!phb->ioda.irq_chip_init) {
1375 			idata = irq_get_irq_data(virq);
1376 			ichip = irq_data_get_irq_chip(idata);
1377 			phb->ioda.irq_chip_init = 1;
1378 			phb->ioda.irq_chip = *ichip;
1379 			phb->ioda.irq_chip.irq_eoi = pnv_ioda2_msi_eoi;
1380 		}
1381 
1382 		irq_set_chip(virq, &phb->ioda.irq_chip);
1383 	}
1384 
1385 	pr_devel("%s: %s-bit MSI on hwirq %x (xive #%d),"
1386 		 " address=%x_%08x data=%x PE# %d\n",
1387 		 pci_name(dev), is_64 ? "64" : "32", hwirq, xive_num,
1388 		 msg->address_hi, msg->address_lo, data, pe->pe_number);
1389 
1390 	return 0;
1391 }
1392 
1393 static void pnv_pci_init_ioda_msis(struct pnv_phb *phb)
1394 {
1395 	unsigned int count;
1396 	const __be32 *prop = of_get_property(phb->hose->dn,
1397 					     "ibm,opal-msi-ranges", NULL);
1398 	if (!prop) {
1399 		/* BML Fallback */
1400 		prop = of_get_property(phb->hose->dn, "msi-ranges", NULL);
1401 	}
1402 	if (!prop)
1403 		return;
1404 
1405 	phb->msi_base = be32_to_cpup(prop);
1406 	count = be32_to_cpup(prop + 1);
1407 	if (msi_bitmap_alloc(&phb->msi_bmp, count, phb->hose->dn)) {
1408 		pr_err("PCI %d: Failed to allocate MSI bitmap !\n",
1409 		       phb->hose->global_number);
1410 		return;
1411 	}
1412 
1413 	phb->msi_setup = pnv_pci_ioda_msi_setup;
1414 	phb->msi32_support = 1;
1415 	pr_info("  Allocated bitmap for %d MSIs (base IRQ 0x%x)\n",
1416 		count, phb->msi_base);
1417 }
1418 #else
1419 static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) { }
1420 #endif /* CONFIG_PCI_MSI */
1421 
1422 /*
1423  * This function is supposed to be called per PE, from top
1424  * to bottom, so that the I/O or MMIO segment assigned to a
1425  * parent PE can be overridden by its child PEs if necessary.
1426  */
1427 static void pnv_ioda_setup_pe_seg(struct pci_controller *hose,
1428 				  struct pnv_ioda_pe *pe)
1429 {
1430 	struct pnv_phb *phb = hose->private_data;
1431 	struct pci_bus_region region;
1432 	struct resource *res;
1433 	int i, index;
1434 	int rc;
1435 
1436 	/*
1437 	 * NOTE: We only care about PCI bus based PEs for now. PCI
1438 	 * device based PEs, for example SRIOV sensitive VFs, should
1439 	 * be figured out later.
1440 	 */
1441 	BUG_ON(!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)));
1442 
1443 	pci_bus_for_each_resource(pe->pbus, res, i) {
1444 		if (!res || !res->flags ||
1445 		    res->start > res->end)
1446 			continue;
1447 
1448 		if (res->flags & IORESOURCE_IO) {
1449 			region.start = res->start - phb->ioda.io_pci_base;
1450 			region.end   = res->end - phb->ioda.io_pci_base;
1451 			index = region.start / phb->ioda.io_segsize;
1452 
1453 			while (index < phb->ioda.total_pe &&
1454 			       region.start <= region.end) {
1455 				phb->ioda.io_segmap[index] = pe->pe_number;
1456 				rc = opal_pci_map_pe_mmio_window(phb->opal_id,
1457 					pe->pe_number, OPAL_IO_WINDOW_TYPE, 0, index);
1458 				if (rc != OPAL_SUCCESS) {
1459 					pr_err("%s: OPAL error %d when mapping IO "
1460 					       "segment #%d to PE#%d\n",
1461 					       __func__, rc, index, pe->pe_number);
1462 					break;
1463 				}
1464 
1465 				region.start += phb->ioda.io_segsize;
1466 				index++;
1467 			}
1468 		} else if (res->flags & IORESOURCE_MEM) {
1469 			region.start = res->start -
1470 				       hose->mem_offset[0] -
1471 				       phb->ioda.m32_pci_base;
1472 			region.end   = res->end -
1473 				       hose->mem_offset[0] -
1474 				       phb->ioda.m32_pci_base;
1475 			index = region.start / phb->ioda.m32_segsize;
1476 
1477 			while (index < phb->ioda.total_pe &&
1478 			       region.start <= region.end) {
1479 				phb->ioda.m32_segmap[index] = pe->pe_number;
1480 				rc = opal_pci_map_pe_mmio_window(phb->opal_id,
1481 					pe->pe_number, OPAL_M32_WINDOW_TYPE, 0, index);
1482 				if (rc != OPAL_SUCCESS) {
1483 					pr_err("%s: OPAL error %d when mapping M32 "
1484 					       "segment#%d to PE#%d",
1485 					       __func__, rc, index, pe->pe_number);
1486 					break;
1487 				}
1488 
1489 				region.start += phb->ioda.m32_segsize;
1490 				index++;
1491 			}
1492 		}
1493 	}
1494 }
1495 
1496 static void pnv_pci_ioda_setup_seg(void)
1497 {
1498 	struct pci_controller *tmp, *hose;
1499 	struct pnv_phb *phb;
1500 	struct pnv_ioda_pe *pe;
1501 
1502 	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
1503 		phb = hose->private_data;
1504 		list_for_each_entry(pe, &phb->ioda.pe_list, list) {
1505 			pnv_ioda_setup_pe_seg(hose, pe);
1506 		}
1507 	}
1508 }
1509 
1510 static void pnv_pci_ioda_setup_DMA(void)
1511 {
1512 	struct pci_controller *hose, *tmp;
1513 	struct pnv_phb *phb;
1514 
1515 	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
1516 		pnv_ioda_setup_dma(hose->private_data);
1517 
1518 		/* Mark the PHB initialization done */
1519 		phb = hose->private_data;
1520 		phb->initialized = 1;
1521 	}
1522 }
1523 
1524 static void pnv_pci_ioda_create_dbgfs(void)
1525 {
1526 #ifdef CONFIG_DEBUG_FS
1527 	struct pci_controller *hose, *tmp;
1528 	struct pnv_phb *phb;
1529 	char name[16];
1530 
1531 	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
1532 		phb = hose->private_data;
1533 
1534 		sprintf(name, "PCI%04x", hose->global_number);
1535 		phb->dbgfs = debugfs_create_dir(name, powerpc_debugfs_root);
1536 		if (!phb->dbgfs)
1537 			pr_warning("%s: Error on creating debugfs on PHB#%x\n",
1538 				__func__, hose->global_number);
1539 	}
1540 #endif /* CONFIG_DEBUG_FS */
1541 }
1542 
1543 static void pnv_pci_ioda_fixup(void)
1544 {
1545 	pnv_pci_ioda_setup_PEs();
1546 	pnv_pci_ioda_setup_seg();
1547 	pnv_pci_ioda_setup_DMA();
1548 
1549 	pnv_pci_ioda_create_dbgfs();
1550 
1551 #ifdef CONFIG_EEH
1552 	eeh_init();
1553 	eeh_addr_cache_build();
1554 #endif
1555 }
1556 
1557 /*
1558  * Returns the alignment for I/O or memory windows for P2P
1559  * bridges. That actually depends on how PEs are segmented.
1560  * For now, we return I/O or M32 segment size for PE sensitive
1561  * P2P bridges. Otherwise, the default values (4KiB for I/O,
1562  * 1MiB for memory) will be returned.
1563  *
1564  * The current PCI bus might be put into one PE, which was
1565  * created for the parent PCI bridge. In that case, we
1566  * needn't enlarge the alignment, so that we can save some
1567  * resources.
1568  */
1569 static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus,
1570 						unsigned long type)
1571 {
1572 	struct pci_dev *bridge;
1573 	struct pci_controller *hose = pci_bus_to_host(bus);
1574 	struct pnv_phb *phb = hose->private_data;
1575 	int num_pci_bridges = 0;
1576 
1577 	bridge = bus->self;
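	/*
	 * A bus sitting below two or more PCIe-to-PCI bridges shares the
	 * PE created for its parent bridge, so no window alignment is
	 * required (see the comment above).
	 */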
1578 	while (bridge) {
1579 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) {
1580 			num_pci_bridges++;
1581 			if (num_pci_bridges >= 2)
1582 				return 1;
1583 		}
1584 
1585 		bridge = bridge->bus->self;
1586 	}
1587 
1588 	/* We fall back to M32 if M64 isn't supported */
1589 	if (phb->ioda.m64_segsize &&
1590 	    pnv_pci_is_mem_pref_64(type))
1591 		return phb->ioda.m64_segsize;
1592 	if (type & IORESOURCE_MEM)
1593 		return phb->ioda.m32_segsize;
1594 
1595 	return phb->ioda.io_segsize;
1596 }
1597 
1598 /* Prevent enabling devices for which we couldn't properly
1599  * assign a PE
1600  */
1601 static int pnv_pci_enable_device_hook(struct pci_dev *dev)
1602 {
1603 	struct pci_controller *hose = pci_bus_to_host(dev->bus);
1604 	struct pnv_phb *phb = hose->private_data;
1605 	struct pci_dn *pdn;
1606 
1607 	/* The function can be called while the PEs have
1608 	 * not been created yet, for example during resource
1609 	 * reassignment in the PCI probe period. We just skip
1610 	 * the check if the PEs aren't ready.
1611 	 */
1612 	if (!phb->initialized)
1613 		return 0;
1614 
1615 	pdn = pci_get_pdn(dev);
1616 	if (!pdn || pdn->pe_number == IODA_INVALID_PE)
1617 		return -EINVAL;
1618 
1619 	return 0;
1620 }
1621 
1622 static u32 pnv_ioda_bdfn_to_pe(struct pnv_phb *phb, struct pci_bus *bus,
1623 			       u32 devfn)
1624 {
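	/* The RID (bus << 8 | devfn) indexes the reverse map built in pnv_ioda_configure_pe() */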
1625 	return phb->ioda.pe_rmap[(bus->number << 8) | devfn];
1626 }
1627 
1628 static void pnv_pci_ioda_shutdown(struct pnv_phb *phb)
1629 {
1630 	opal_pci_reset(phb->opal_id, OPAL_PCI_IODA_TABLE_RESET,
1631 		       OPAL_ASSERT_RESET);
1632 }
1633 
1634 void __init pnv_pci_init_ioda_phb(struct device_node *np,
1635 				  u64 hub_id, int ioda_type)
1636 {
1637 	struct pci_controller *hose;
1638 	struct pnv_phb *phb;
1639 	unsigned long size, m32map_off, pemap_off, iomap_off = 0;
1640 	const __be64 *prop64;
1641 	const __be32 *prop32;
1642 	int len;
1643 	u64 phb_id;
1644 	void *aux;
1645 	long rc;
1646 
1647 	pr_info("Initializing IODA%d OPAL PHB %s\n", ioda_type, np->full_name);
1648 
1649 	prop64 = of_get_property(np, "ibm,opal-phbid", NULL);
1650 	if (!prop64) {
1651 		pr_err("  Missing \"ibm,opal-phbid\" property !\n");
1652 		return;
1653 	}
1654 	phb_id = be64_to_cpup(prop64);
1655 	pr_debug("  PHB-ID  : 0x%016llx\n", phb_id);
1656 
1657 	phb = alloc_bootmem(sizeof(struct pnv_phb));
1658 	if (!phb) {
1659 		pr_err("  Out of memory !\n");
1660 		return;
1661 	}
1662 
1663 	/* Allocate PCI controller */
1664 	memset(phb, 0, sizeof(struct pnv_phb));
1665 	phb->hose = hose = pcibios_alloc_controller(np);
1666 	if (!phb->hose) {
1667 		pr_err("  Can't allocate PCI controller for %s\n",
1668 		       np->full_name);
1669 		free_bootmem((unsigned long)phb, sizeof(struct pnv_phb));
1670 		return;
1671 	}
1672 
1673 	spin_lock_init(&phb->lock);
1674 	prop32 = of_get_property(np, "bus-range", &len);
1675 	if (prop32 && len == 8) {
1676 		hose->first_busno = be32_to_cpu(prop32[0]);
1677 		hose->last_busno = be32_to_cpu(prop32[1]);
1678 	} else {
1679 		pr_warn("  Broken <bus-range> on %s\n", np->full_name);
1680 		hose->first_busno = 0;
1681 		hose->last_busno = 0xff;
1682 	}
1683 	hose->private_data = phb;
1684 	phb->hub_id = hub_id;
1685 	phb->opal_id = phb_id;
1686 	phb->type = ioda_type;
1687 
1688 	/* Detect specific models for error handling */
1689 	if (of_device_is_compatible(np, "ibm,p7ioc-pciex"))
1690 		phb->model = PNV_PHB_MODEL_P7IOC;
1691 	else if (of_device_is_compatible(np, "ibm,power8-pciex"))
1692 		phb->model = PNV_PHB_MODEL_PHB3;
1693 	else
1694 		phb->model = PNV_PHB_MODEL_UNKNOWN;
1695 
1696 	/* Parse 32-bit and IO ranges (if any) */
1697 	pci_process_bridge_OF_ranges(hose, np, !hose->global_number);
1698 
1699 	/* Get registers */
1700 	phb->regs = of_iomap(np, 0);
1701 	if (phb->regs == NULL)
1702 		pr_err("  Failed to map registers !\n");
1703 
1704 	/* Initialize more IODA stuff */
1705 	phb->ioda.total_pe = 1;
1706 	prop32 = of_get_property(np, "ibm,opal-num-pes", NULL);
1707 	if (prop32)
1708 		phb->ioda.total_pe = be32_to_cpup(prop32);
1709 	prop32 = of_get_property(np, "ibm,opal-reserved-pe", NULL);
1710 	if (prop32)
1711 		phb->ioda.reserved_pe = be32_to_cpup(prop32);
1712 
1713 	/* Parse 64-bit MMIO range */
1714 	pnv_ioda_parse_m64_window(phb);
1715 
1716 	phb->ioda.m32_size = resource_size(&hose->mem_resources[0]);
1717 	/* FW has already cut off the top 64k of M32 space (MSI space) */
1718 	phb->ioda.m32_size += 0x10000;
1719 
1720 	phb->ioda.m32_segsize = phb->ioda.m32_size / phb->ioda.total_pe;
1721 	phb->ioda.m32_pci_base = hose->mem_resources[0].start - hose->mem_offset[0];
1722 	phb->ioda.io_size = hose->pci_io_size;
1723 	phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe;
1724 	phb->ioda.io_pci_base = 0; /* XXX calculate this ? */
1725 
1726 	/* Allocate aux data & arrays. We don't have IO ports on PHB3 */
1727 	size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long));
1728 	m32map_off = size;
1729 	size += phb->ioda.total_pe * sizeof(phb->ioda.m32_segmap[0]);
1730 	if (phb->type == PNV_PHB_IODA1) {
1731 		iomap_off = size;
1732 		size += phb->ioda.total_pe * sizeof(phb->ioda.io_segmap[0]);
1733 	}
1734 	pemap_off = size;
1735 	size += phb->ioda.total_pe * sizeof(struct pnv_ioda_pe);
1736 	aux = alloc_bootmem(size);
1737 	memset(aux, 0, size);
1738 	phb->ioda.pe_alloc = aux;
1739 	phb->ioda.m32_segmap = aux + m32map_off;
1740 	if (phb->type == PNV_PHB_IODA1)
1741 		phb->ioda.io_segmap = aux + iomap_off;
1742 	phb->ioda.pe_array = aux + pemap_off;
1743 	set_bit(phb->ioda.reserved_pe, phb->ioda.pe_alloc);
1744 
1745 	INIT_LIST_HEAD(&phb->ioda.pe_dma_list);
1746 	INIT_LIST_HEAD(&phb->ioda.pe_list);
1747 
1748 	/* Calculate how many 32-bit TCE segments we have (one per 256MB, hence the shift by 28) */
1749 	phb->ioda.tce32_count = phb->ioda.m32_pci_base >> 28;
1750 
1751 #if 0 /* We should really do that ... */
1752 	rc = opal_pci_set_phb_mem_window(opal->phb_id,
1753 					 window_type,
1754 					 window_num,
1755 					 starting_real_address,
1756 					 starting_pci_address,
1757 					 segment_size);
1758 #endif
1759 
1760 	pr_info("  %03d (%03d) PE's M32: 0x%x [segment=0x%x]\n",
1761 		phb->ioda.total_pe, phb->ioda.reserved_pe,
1762 		phb->ioda.m32_size, phb->ioda.m32_segsize);
1763 	if (phb->ioda.m64_size)
1764 		pr_info("                 M64: 0x%lx [segment=0x%lx]\n",
1765 			phb->ioda.m64_size, phb->ioda.m64_segsize);
1766 	if (phb->ioda.io_size)
1767 		pr_info("                  IO: 0x%x [segment=0x%x]\n",
1768 			phb->ioda.io_size, phb->ioda.io_segsize);
1769 
1770 
1771 	phb->hose->ops = &pnv_pci_ops;
1772 	phb->get_pe_state = pnv_ioda_get_pe_state;
1773 	phb->freeze_pe = pnv_ioda_freeze_pe;
1774 	phb->unfreeze_pe = pnv_ioda_unfreeze_pe;
1775 #ifdef CONFIG_EEH
1776 	phb->eeh_ops = &ioda_eeh_ops;
1777 #endif
1778 
1779 	/* Setup RID -> PE mapping function */
1780 	phb->bdfn_to_pe = pnv_ioda_bdfn_to_pe;
1781 
1782 	/* Setup TCEs */
1783 	phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup;
1784 	phb->dma_set_mask = pnv_pci_ioda_dma_set_mask;
1785 
1786 	/* Setup shutdown function for kexec */
1787 	phb->shutdown = pnv_pci_ioda_shutdown;
1788 
1789 	/* Setup MSI support */
1790 	pnv_pci_init_ioda_msis(phb);
1791 
1792 	/*
1793 	 * We pass the PCI probe flag PCI_REASSIGN_ALL_RSRC here
1794 	 * to let the PCI core do resource assignment. It is expected
1795 	 * that the PCI core will do correct I/O and MMIO alignment
1796 	 * for the P2P bridge BARs so that each PCI bus (excluding
1797 	 * the child P2P bridges) can form an individual PE.
1798 	 */
1799 	ppc_md.pcibios_fixup = pnv_pci_ioda_fixup;
1800 	ppc_md.pcibios_enable_device_hook = pnv_pci_enable_device_hook;
1801 	ppc_md.pcibios_window_alignment = pnv_pci_window_alignment;
1802 	ppc_md.pcibios_reset_secondary_bus = pnv_pci_reset_secondary_bus;
1803 	pci_add_flags(PCI_REASSIGN_ALL_RSRC);
1804 
1805 	/* Reset IODA tables to a clean state */
1806 	rc = opal_pci_reset(phb_id, OPAL_PCI_IODA_TABLE_RESET, OPAL_ASSERT_RESET);
1807 	if (rc)
1808 		pr_warning("  OPAL Error %ld performing IODA table reset !\n", rc);
1809 
1810 	/* If we're running in a kdump kernel, the previous kernel never
1811 	 * shut down PCI devices correctly. We already have the IODA table
1812 	 * cleaned out, so we have to issue a PHB reset to stop all PCI
1813 	 * transactions from the previous kernel.
1814 	 */
1815 	if (is_kdump_kernel()) {
1816 		pr_info("  Issue PHB reset ...\n");
1817 		ioda_eeh_phb_reset(hose, EEH_RESET_FUNDAMENTAL);
1818 		ioda_eeh_phb_reset(hose, OPAL_DEASSERT_RESET);
1819 	}
1820 
1821 	/* Configure M64 window */
1822 	if (phb->init_m64 && phb->init_m64(phb))
1823 		hose->mem_resources[1].flags = 0;
1824 }
1825 
1826 void __init pnv_pci_init_ioda2_phb(struct device_node *np)
1827 {
1828 	pnv_pci_init_ioda_phb(np, 0, PNV_PHB_IODA2);
1829 }
1830 
1831 void __init pnv_pci_init_ioda_hub(struct device_node *np)
1832 {
1833 	struct device_node *phbn;
1834 	const __be64 *prop64;
1835 	u64 hub_id;
1836 
1837 	pr_info("Probing IODA IO-Hub %s\n", np->full_name);
1838 
1839 	prop64 = of_get_property(np, "ibm,opal-hubid", NULL);
1840 	if (!prop64) {
1841 		pr_err(" Missing \"ibm,opal-hubid\" property !\n");
1842 		return;
1843 	}
1844 	hub_id = be64_to_cpup(prop64);
1845 	pr_devel(" HUB-ID : 0x%016llx\n", hub_id);
1846 
1847 	/* Count child PHBs */
1848 	for_each_child_of_node(np, phbn) {
1849 		/* Look for IODA1 PHBs */
1850 		if (of_device_is_compatible(phbn, "ibm,ioda-phb"))
1851 			pnv_pci_init_ioda_phb(phbn, hub_id, PNV_PHB_IODA1);
1852 	}
1853 }
1854