1 /*
2  * Support PCI/PCIe on PowerNV platforms
3  *
4  * Copyright 2011 Benjamin Herrenschmidt, IBM Corp.
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version
9  * 2 of the License, or (at your option) any later version.
10  */
11 
12 #undef DEBUG
13 
14 #include <linux/kernel.h>
15 #include <linux/pci.h>
16 #include <linux/crash_dump.h>
17 #include <linux/delay.h>
18 #include <linux/string.h>
19 #include <linux/init.h>
20 #include <linux/memblock.h>
21 #include <linux/irq.h>
22 #include <linux/io.h>
23 #include <linux/msi.h>
24 #include <linux/iommu.h>
25 #include <linux/rculist.h>
26 #include <linux/sizes.h>
27 
28 #include <asm/sections.h>
29 #include <asm/io.h>
30 #include <asm/prom.h>
31 #include <asm/pci-bridge.h>
32 #include <asm/machdep.h>
33 #include <asm/msi_bitmap.h>
34 #include <asm/ppc-pci.h>
35 #include <asm/opal.h>
36 #include <asm/iommu.h>
37 #include <asm/tce.h>
38 #include <asm/xics.h>
39 #include <asm/debugfs.h>
40 #include <asm/firmware.h>
41 #include <asm/pnv-pci.h>
42 #include <asm/mmzone.h>
43 
44 #include <misc/cxl-base.h>
45 
46 #include "powernv.h"
47 #include "pci.h"
48 #include "../../../../drivers/pci/pci.h"
49 
50 #define PNV_IODA1_M64_NUM	16	/* Number of M64 BARs	*/
51 #define PNV_IODA1_M64_SEGS	8	/* Segments per M64 BAR	*/
52 #define PNV_IODA1_DMA32_SEGSIZE	0x10000000
53 
54 static const char * const pnv_phb_names[] = { "IODA1", "IODA2", "NPU_NVLINK",
55 					      "NPU_OCAPI" };
56 
57 static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
58 
59 void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
60 			    const char *fmt, ...)
61 {
62 	struct va_format vaf;
63 	va_list args;
64 	char pfix[32];
65 
66 	va_start(args, fmt);
67 
68 	vaf.fmt = fmt;
69 	vaf.va = &args;
70 
71 	if (pe->flags & PNV_IODA_PE_DEV)
72 		strlcpy(pfix, dev_name(&pe->pdev->dev), sizeof(pfix));
73 	else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
74 		sprintf(pfix, "%04x:%02x     ",
75 			pci_domain_nr(pe->pbus), pe->pbus->number);
76 #ifdef CONFIG_PCI_IOV
77 	else if (pe->flags & PNV_IODA_PE_VF)
		sprintf(pfix, "%04x:%02x:%02x.%d",
79 			pci_domain_nr(pe->parent_dev->bus),
80 			(pe->rid & 0xff00) >> 8,
81 			PCI_SLOT(pe->rid), PCI_FUNC(pe->rid));
82 #endif /* CONFIG_PCI_IOV*/
83 
84 	printk("%spci %s: [PE# %.2x] %pV",
85 	       level, pfix, pe->pe_number, &vaf);
86 
87 	va_end(args);
88 }
89 
90 static bool pnv_iommu_bypass_disabled __read_mostly;
91 static bool pci_reset_phbs __read_mostly;
92 
93 static int __init iommu_setup(char *str)
94 {
95 	if (!str)
96 		return -EINVAL;
97 
98 	while (*str) {
99 		if (!strncmp(str, "nobypass", 8)) {
100 			pnv_iommu_bypass_disabled = true;
101 			pr_info("PowerNV: IOMMU bypass window disabled.\n");
102 			break;
103 		}
104 		str += strcspn(str, ",");
105 		if (*str == ',')
106 			str++;
107 	}
108 
109 	return 0;
110 }
111 early_param("iommu", iommu_setup);
112 
113 static int __init pci_reset_phbs_setup(char *str)
114 {
115 	pci_reset_phbs = true;
116 	return 0;
117 }
118 
119 early_param("ppc_pci_reset_phbs", pci_reset_phbs_setup);
120 
121 static inline bool pnv_pci_is_m64(struct pnv_phb *phb, struct resource *r)
122 {
123 	/*
124 	 * WARNING: We cannot rely on the resource flags. The Linux PCI
125 	 * allocation code sometimes decides to put a 64-bit prefetchable
126 	 * BAR in the 32-bit window, so we have to compare the addresses.
127 	 *
128 	 * For simplicity we only test resource start.
129 	 */
130 	return (r->start >= phb->ioda.m64_base &&
131 		r->start < (phb->ioda.m64_base + phb->ioda.m64_size));
132 }
133 
134 static inline bool pnv_pci_is_m64_flags(unsigned long resource_flags)
135 {
136 	unsigned long flags = (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH);
137 
138 	return (resource_flags & flags) == flags;
139 }
140 
141 static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, int pe_no)
142 {
143 	s64 rc;
144 
145 	phb->ioda.pe_array[pe_no].phb = phb;
146 	phb->ioda.pe_array[pe_no].pe_number = pe_no;
147 
	/*
	 * Clear the PE frozen state as it might have been left frozen
	 * by the last PCI remove path. It's harmless to do so when the
	 * PE is already unfrozen.
	 */
153 	rc = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no,
154 				       OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
155 	if (rc != OPAL_SUCCESS && rc != OPAL_UNSUPPORTED)
156 		pr_warn("%s: Error %lld unfreezing PHB#%x-PE#%x\n",
157 			__func__, rc, phb->hose->global_number, pe_no);
158 
159 	return &phb->ioda.pe_array[pe_no];
160 }
161 
162 static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no)
163 {
164 	if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe_num)) {
165 		pr_warn("%s: Invalid PE %x on PHB#%x\n",
166 			__func__, pe_no, phb->hose->global_number);
167 		return;
168 	}
169 
170 	if (test_and_set_bit(pe_no, phb->ioda.pe_alloc))
171 		pr_debug("%s: PE %x was reserved on PHB#%x\n",
172 			 __func__, pe_no, phb->hose->global_number);
173 
174 	pnv_ioda_init_pe(phb, pe_no);
175 }
176 
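/*
 * Allocate an unused PE number, scanning the allocation bitmap from the
 * highest PE number downwards.
 */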
177 static struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb)
178 {
179 	long pe;
180 
181 	for (pe = phb->ioda.total_pe_num - 1; pe >= 0; pe--) {
182 		if (!test_and_set_bit(pe, phb->ioda.pe_alloc))
183 			return pnv_ioda_init_pe(phb, pe);
184 	}
185 
186 	return NULL;
187 }
188 
189 static void pnv_ioda_free_pe(struct pnv_ioda_pe *pe)
190 {
191 	struct pnv_phb *phb = pe->phb;
192 	unsigned int pe_num = pe->pe_number;
193 
194 	WARN_ON(pe->pdev);
195 	WARN_ON(pe->npucomp); /* NPUs are not supposed to be freed */
196 	kfree(pe->npucomp);
197 	memset(pe, 0, sizeof(struct pnv_ioda_pe));
198 	clear_bit(pe_num, phb->ioda.pe_alloc);
199 }
200 
201 /* The default M64 BAR is shared by all PEs */
202 static int pnv_ioda2_init_m64(struct pnv_phb *phb)
203 {
204 	const char *desc;
205 	struct resource *r;
206 	s64 rc;
207 
208 	/* Configure the default M64 BAR */
209 	rc = opal_pci_set_phb_mem_window(phb->opal_id,
210 					 OPAL_M64_WINDOW_TYPE,
211 					 phb->ioda.m64_bar_idx,
212 					 phb->ioda.m64_base,
213 					 0, /* unused */
214 					 phb->ioda.m64_size);
215 	if (rc != OPAL_SUCCESS) {
216 		desc = "configuring";
217 		goto fail;
218 	}
219 
220 	/* Enable the default M64 BAR */
221 	rc = opal_pci_phb_mmio_enable(phb->opal_id,
222 				      OPAL_M64_WINDOW_TYPE,
223 				      phb->ioda.m64_bar_idx,
224 				      OPAL_ENABLE_M64_SPLIT);
225 	if (rc != OPAL_SUCCESS) {
226 		desc = "enabling";
227 		goto fail;
228 	}
229 
	/*
	 * Exclude the segments used by the reserved PE and the root
	 * bus PE, which are the first or last two PEs.
	 */
234 	r = &phb->hose->mem_resources[1];
235 	if (phb->ioda.reserved_pe_idx == 0)
236 		r->start += (2 * phb->ioda.m64_segsize);
237 	else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1))
238 		r->end -= (2 * phb->ioda.m64_segsize);
239 	else
240 		pr_warn("  Cannot strip M64 segment for reserved PE#%x\n",
241 			phb->ioda.reserved_pe_idx);
242 
243 	return 0;
244 
245 fail:
246 	pr_warn("  Failure %lld %s M64 BAR#%d\n",
247 		rc, desc, phb->ioda.m64_bar_idx);
248 	opal_pci_phb_mmio_enable(phb->opal_id,
249 				 OPAL_M64_WINDOW_TYPE,
250 				 phb->ioda.m64_bar_idx,
251 				 OPAL_DISABLE_M64);
252 	return -EIO;
253 }
254 
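/*
 * Reserve the PE numbers backing the M64 segments covered by the device's
 * M64 BARs, either by marking them in the supplied bitmap or, when no
 * bitmap is given, by reserving the PEs directly.
 */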
255 static void pnv_ioda_reserve_dev_m64_pe(struct pci_dev *pdev,
256 					 unsigned long *pe_bitmap)
257 {
258 	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
259 	struct pnv_phb *phb = hose->private_data;
260 	struct resource *r;
261 	resource_size_t base, sgsz, start, end;
262 	int segno, i;
263 
264 	base = phb->ioda.m64_base;
265 	sgsz = phb->ioda.m64_segsize;
266 	for (i = 0; i <= PCI_ROM_RESOURCE; i++) {
267 		r = &pdev->resource[i];
268 		if (!r->parent || !pnv_pci_is_m64(phb, r))
269 			continue;
270 
271 		start = _ALIGN_DOWN(r->start - base, sgsz);
272 		end = _ALIGN_UP(r->end - base, sgsz);
273 		for (segno = start / sgsz; segno < end / sgsz; segno++) {
274 			if (pe_bitmap)
275 				set_bit(segno, pe_bitmap);
276 			else
277 				pnv_ioda_reserve_pe(phb, segno);
278 		}
279 	}
280 }
281 
282 static int pnv_ioda1_init_m64(struct pnv_phb *phb)
283 {
284 	struct resource *r;
285 	int index;
286 
287 	/*
288 	 * There are 16 M64 BARs, each of which has 8 segments. So
289 	 * there are as many M64 segments as the maximum number of
290 	 * PEs, which is 128.
291 	 */
292 	for (index = 0; index < PNV_IODA1_M64_NUM; index++) {
293 		unsigned long base, segsz = phb->ioda.m64_segsize;
294 		int64_t rc;
295 
296 		base = phb->ioda.m64_base +
297 		       index * PNV_IODA1_M64_SEGS * segsz;
298 		rc = opal_pci_set_phb_mem_window(phb->opal_id,
299 				OPAL_M64_WINDOW_TYPE, index, base, 0,
300 				PNV_IODA1_M64_SEGS * segsz);
301 		if (rc != OPAL_SUCCESS) {
302 			pr_warn("  Error %lld setting M64 PHB#%x-BAR#%d\n",
303 				rc, phb->hose->global_number, index);
304 			goto fail;
305 		}
306 
307 		rc = opal_pci_phb_mmio_enable(phb->opal_id,
308 				OPAL_M64_WINDOW_TYPE, index,
309 				OPAL_ENABLE_M64_SPLIT);
310 		if (rc != OPAL_SUCCESS) {
311 			pr_warn("  Error %lld enabling M64 PHB#%x-BAR#%d\n",
312 				rc, phb->hose->global_number, index);
313 			goto fail;
314 		}
315 	}
316 
	/*
	 * Exclude the segments used by the reserved PE and the root
	 * bus PE, which are the first or last two PEs.
	 */
321 	r = &phb->hose->mem_resources[1];
322 	if (phb->ioda.reserved_pe_idx == 0)
323 		r->start += (2 * phb->ioda.m64_segsize);
324 	else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1))
325 		r->end -= (2 * phb->ioda.m64_segsize);
326 	else
327 		WARN(1, "Wrong reserved PE#%x on PHB#%x\n",
328 		     phb->ioda.reserved_pe_idx, phb->hose->global_number);
329 
330 	return 0;
331 
332 fail:
333 	for ( ; index >= 0; index--)
334 		opal_pci_phb_mmio_enable(phb->opal_id,
335 			OPAL_M64_WINDOW_TYPE, index, OPAL_DISABLE_M64);
336 
337 	return -EIO;
338 }
339 
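/*
 * Reserve the M64 segment PEs used by every device on the bus and, when
 * "all" is set, on its subordinate buses as well.
 */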
340 static void pnv_ioda_reserve_m64_pe(struct pci_bus *bus,
341 				    unsigned long *pe_bitmap,
342 				    bool all)
343 {
344 	struct pci_dev *pdev;
345 
346 	list_for_each_entry(pdev, &bus->devices, bus_list) {
347 		pnv_ioda_reserve_dev_m64_pe(pdev, pe_bitmap);
348 
349 		if (all && pdev->subordinate)
350 			pnv_ioda_reserve_m64_pe(pdev->subordinate,
351 						pe_bitmap, all);
352 	}
353 }
354 
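/*
 * Pick the PE that owns the M64 segments consumed by the bus. The PE of
 * the first segment becomes the master and the remaining segment PEs are
 * attached to it as slaves, forming a compound PE.
 */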
355 static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all)
356 {
357 	struct pci_controller *hose = pci_bus_to_host(bus);
358 	struct pnv_phb *phb = hose->private_data;
359 	struct pnv_ioda_pe *master_pe, *pe;
360 	unsigned long size, *pe_alloc;
361 	int i;
362 
363 	/* Root bus shouldn't use M64 */
364 	if (pci_is_root_bus(bus))
365 		return NULL;
366 
367 	/* Allocate bitmap */
368 	size = _ALIGN_UP(phb->ioda.total_pe_num / 8, sizeof(unsigned long));
369 	pe_alloc = kzalloc(size, GFP_KERNEL);
370 	if (!pe_alloc) {
371 		pr_warn("%s: Out of memory !\n",
372 			__func__);
373 		return NULL;
374 	}
375 
	/* Mark the PE numbers used by the bus's M64 segments in the bitmap */
377 	pnv_ioda_reserve_m64_pe(bus, pe_alloc, all);
378 
	/*
	 * The current bus might not own any M64 windows itself; they
	 * may all be contributed by its child buses. In that case we
	 * don't need to pick an M64 dependent PE#.
	 */
384 	if (bitmap_empty(pe_alloc, phb->ioda.total_pe_num)) {
385 		kfree(pe_alloc);
386 		return NULL;
387 	}
388 
	/*
	 * Figure out the master PE and put all slave PEs on the
	 * master PE's list to form a compound PE.
	 */
393 	master_pe = NULL;
394 	i = -1;
395 	while ((i = find_next_bit(pe_alloc, phb->ioda.total_pe_num, i + 1)) <
396 		phb->ioda.total_pe_num) {
397 		pe = &phb->ioda.pe_array[i];
398 
399 		phb->ioda.m64_segmap[pe->pe_number] = pe->pe_number;
400 		if (!master_pe) {
401 			pe->flags |= PNV_IODA_PE_MASTER;
402 			INIT_LIST_HEAD(&pe->slaves);
403 			master_pe = pe;
404 		} else {
405 			pe->flags |= PNV_IODA_PE_SLAVE;
406 			pe->master = master_pe;
407 			list_add_tail(&pe->list, &master_pe->slaves);
408 		}
409 
		/*
		 * P7IOC supports M64DT, which maps an M64 segment to one
		 * particular PE#. PHB3, however, has a fixed mapping
		 * between M64 segments and PE#s. In order to share the
		 * same logic between P7IOC and PHB3, we enforce the fixed
		 * mapping on P7IOC as well.
		 */
417 		if (phb->type == PNV_PHB_IODA1) {
418 			int64_t rc;
419 
420 			rc = opal_pci_map_pe_mmio_window(phb->opal_id,
421 					pe->pe_number, OPAL_M64_WINDOW_TYPE,
422 					pe->pe_number / PNV_IODA1_M64_SEGS,
423 					pe->pe_number % PNV_IODA1_M64_SEGS);
424 			if (rc != OPAL_SUCCESS)
425 				pr_warn("%s: Error %lld mapping M64 for PHB#%x-PE#%x\n",
426 					__func__, rc, phb->hose->global_number,
427 					pe->pe_number);
428 		}
429 	}
430 
431 	kfree(pe_alloc);
432 	return master_pe;
433 }
434 
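/*
 * Parse the "ibm,opal-m64-window" property and set up the M64 window for
 * the PHB: MMIO resource, segment size, BAR allocation bitmap and the
 * per-IODA M64 init hook.
 */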
435 static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
436 {
437 	struct pci_controller *hose = phb->hose;
438 	struct device_node *dn = hose->dn;
439 	struct resource *res;
440 	u32 m64_range[2], i;
441 	const __be32 *r;
442 	u64 pci_addr;
443 
444 	if (phb->type != PNV_PHB_IODA1 && phb->type != PNV_PHB_IODA2) {
		pr_info("  M64 window not supported\n");
446 		return;
447 	}
448 
449 	if (!firmware_has_feature(FW_FEATURE_OPAL)) {
450 		pr_info("  Firmware too old to support M64 window\n");
451 		return;
452 	}
453 
454 	r = of_get_property(dn, "ibm,opal-m64-window", NULL);
455 	if (!r) {
456 		pr_info("  No <ibm,opal-m64-window> on %pOF\n",
457 			dn);
458 		return;
459 	}
460 
	/*
	 * Find the available M64 BAR range and pick the last one to
	 * cover the whole 64-bit space. We support only one range.
	 */
465 	if (of_property_read_u32_array(dn, "ibm,opal-available-m64-ranges",
466 				       m64_range, 2)) {
467 		/* In absence of the property, assume 0..15 */
468 		m64_range[0] = 0;
469 		m64_range[1] = 16;
470 	}
	/* Our BAR allocation bitmap can only track 64 BARs */
472 	if (m64_range[1] > 63) {
473 		pr_warn("%s: Limiting M64 range to 63 (from %d) on PHB#%x\n",
474 			__func__, m64_range[1], phb->hose->global_number);
475 		m64_range[1] = 63;
476 	}
477 	/* Empty range, no m64 */
478 	if (m64_range[1] <= m64_range[0]) {
479 		pr_warn("%s: M64 empty, disabling M64 usage on PHB#%x\n",
480 			__func__, phb->hose->global_number);
481 		return;
482 	}
483 
	/* Configure M64 information */
485 	res = &hose->mem_resources[1];
486 	res->name = dn->full_name;
487 	res->start = of_translate_address(dn, r + 2);
488 	res->end = res->start + of_read_number(r + 4, 2) - 1;
489 	res->flags = (IORESOURCE_MEM | IORESOURCE_MEM_64 | IORESOURCE_PREFETCH);
490 	pci_addr = of_read_number(r, 2);
491 	hose->mem_offset[1] = res->start - pci_addr;
492 
493 	phb->ioda.m64_size = resource_size(res);
494 	phb->ioda.m64_segsize = phb->ioda.m64_size / phb->ioda.total_pe_num;
495 	phb->ioda.m64_base = pci_addr;
496 
497 	/* This lines up nicely with the display from processing OF ranges */
498 	pr_info(" MEM 0x%016llx..0x%016llx -> 0x%016llx (M64 #%d..%d)\n",
499 		res->start, res->end, pci_addr, m64_range[0],
500 		m64_range[0] + m64_range[1] - 1);
501 
	/* Mark all M64 BARs as used by default */
503 	phb->ioda.m64_bar_alloc = (unsigned long)-1;
504 
505 	/* Use last M64 BAR to cover M64 window */
506 	m64_range[1]--;
507 	phb->ioda.m64_bar_idx = m64_range[0] + m64_range[1];
508 
509 	pr_info(" Using M64 #%d as default window\n", phb->ioda.m64_bar_idx);
510 
511 	/* Mark remaining ones free */
512 	for (i = m64_range[0]; i < m64_range[1]; i++)
513 		clear_bit(i, &phb->ioda.m64_bar_alloc);
514 
	/*
	 * Set up the M64 init function based on the IODA version;
	 * IODA3 uses the IODA2 code.
	 */
519 	if (phb->type == PNV_PHB_IODA1)
520 		phb->init_m64 = pnv_ioda1_init_m64;
521 	else
522 		phb->init_m64 = pnv_ioda2_init_m64;
523 }
524 
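/*
 * Freeze the given PE. For a compound PE, the master and all of its slave
 * PEs are frozen.
 */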
525 static void pnv_ioda_freeze_pe(struct pnv_phb *phb, int pe_no)
526 {
527 	struct pnv_ioda_pe *pe = &phb->ioda.pe_array[pe_no];
528 	struct pnv_ioda_pe *slave;
529 	s64 rc;
530 
531 	/* Fetch master PE */
532 	if (pe->flags & PNV_IODA_PE_SLAVE) {
533 		pe = pe->master;
534 		if (WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER)))
535 			return;
536 
537 		pe_no = pe->pe_number;
538 	}
539 
540 	/* Freeze master PE */
541 	rc = opal_pci_eeh_freeze_set(phb->opal_id,
542 				     pe_no,
543 				     OPAL_EEH_ACTION_SET_FREEZE_ALL);
544 	if (rc != OPAL_SUCCESS) {
545 		pr_warn("%s: Failure %lld freezing PHB#%x-PE#%x\n",
546 			__func__, rc, phb->hose->global_number, pe_no);
547 		return;
548 	}
549 
550 	/* Freeze slave PEs */
551 	if (!(pe->flags & PNV_IODA_PE_MASTER))
552 		return;
553 
554 	list_for_each_entry(slave, &pe->slaves, list) {
555 		rc = opal_pci_eeh_freeze_set(phb->opal_id,
556 					     slave->pe_number,
557 					     OPAL_EEH_ACTION_SET_FREEZE_ALL);
558 		if (rc != OPAL_SUCCESS)
559 			pr_warn("%s: Failure %lld freezing PHB#%x-PE#%x\n",
560 				__func__, rc, phb->hose->global_number,
561 				slave->pe_number);
562 	}
563 }
564 
565 static int pnv_ioda_unfreeze_pe(struct pnv_phb *phb, int pe_no, int opt)
566 {
567 	struct pnv_ioda_pe *pe, *slave;
568 	s64 rc;
569 
570 	/* Find master PE */
571 	pe = &phb->ioda.pe_array[pe_no];
572 	if (pe->flags & PNV_IODA_PE_SLAVE) {
573 		pe = pe->master;
574 		WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER));
575 		pe_no = pe->pe_number;
576 	}
577 
578 	/* Clear frozen state for master PE */
579 	rc = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no, opt);
580 	if (rc != OPAL_SUCCESS) {
581 		pr_warn("%s: Failure %lld clear %d on PHB#%x-PE#%x\n",
582 			__func__, rc, opt, phb->hose->global_number, pe_no);
583 		return -EIO;
584 	}
585 
586 	if (!(pe->flags & PNV_IODA_PE_MASTER))
587 		return 0;
588 
589 	/* Clear frozen state for slave PEs */
590 	list_for_each_entry(slave, &pe->slaves, list) {
591 		rc = opal_pci_eeh_freeze_clear(phb->opal_id,
592 					     slave->pe_number,
593 					     opt);
594 		if (rc != OPAL_SUCCESS) {
595 			pr_warn("%s: Failure %lld clear %d on PHB#%x-PE#%x\n",
596 				__func__, rc, opt, phb->hose->global_number,
597 				slave->pe_number);
598 			return -EIO;
599 		}
600 	}
601 
602 	return 0;
603 }
604 
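/*
 * Retrieve the frozen state of a PE. For a compound PE, the most severe
 * state reported by the master or any of its slaves is returned.
 */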
605 static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no)
606 {
607 	struct pnv_ioda_pe *slave, *pe;
608 	u8 fstate = 0, state;
609 	__be16 pcierr = 0;
610 	s64 rc;
611 
612 	/* Sanity check on PE number */
613 	if (pe_no < 0 || pe_no >= phb->ioda.total_pe_num)
614 		return OPAL_EEH_STOPPED_PERM_UNAVAIL;
615 
	/*
	 * Fetch the master PE; the PE instance might not be
	 * initialized yet.
	 */
620 	pe = &phb->ioda.pe_array[pe_no];
621 	if (pe->flags & PNV_IODA_PE_SLAVE) {
622 		pe = pe->master;
623 		WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER));
624 		pe_no = pe->pe_number;
625 	}
626 
627 	/* Check the master PE */
628 	rc = opal_pci_eeh_freeze_status(phb->opal_id, pe_no,
629 					&state, &pcierr, NULL);
630 	if (rc != OPAL_SUCCESS) {
		pr_warn("%s: Failure %lld getting PHB#%x-PE#%x state\n",
			__func__, rc, phb->hose->global_number, pe_no);
635 		return OPAL_EEH_STOPPED_TEMP_UNAVAIL;
636 	}
637 
638 	/* Check the slave PE */
639 	if (!(pe->flags & PNV_IODA_PE_MASTER))
640 		return state;
641 
642 	list_for_each_entry(slave, &pe->slaves, list) {
643 		rc = opal_pci_eeh_freeze_status(phb->opal_id,
644 						slave->pe_number,
645 						&fstate,
646 						&pcierr,
647 						NULL);
648 		if (rc != OPAL_SUCCESS) {
			pr_warn("%s: Failure %lld getting PHB#%x-PE#%x state\n",
				__func__, rc, phb->hose->global_number,
				slave->pe_number);
653 			return OPAL_EEH_STOPPED_TEMP_UNAVAIL;
654 		}
655 
		/*
		 * Keep the most severe state: the freeze states are
		 * defined in ascending priority.
		 */
660 		if (fstate > state)
661 			state = fstate;
662 	}
663 
664 	return state;
665 }
666 
667 struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
668 {
669 	struct pci_controller *hose = pci_bus_to_host(dev->bus);
670 	struct pnv_phb *phb = hose->private_data;
671 	struct pci_dn *pdn = pci_get_pdn(dev);
672 
673 	if (!pdn)
674 		return NULL;
675 	if (pdn->pe_number == IODA_INVALID_PE)
676 		return NULL;
677 	return &phb->ioda.pe_array[pdn->pe_number];
678 }
679 
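/*
 * Add a single child PE to (or remove it from) the parent PE's PELT-V,
 * including the child's slave PEs when the child is a compound master.
 */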
680 static int pnv_ioda_set_one_peltv(struct pnv_phb *phb,
681 				  struct pnv_ioda_pe *parent,
682 				  struct pnv_ioda_pe *child,
683 				  bool is_add)
684 {
685 	const char *desc = is_add ? "adding" : "removing";
686 	uint8_t op = is_add ? OPAL_ADD_PE_TO_DOMAIN :
687 			      OPAL_REMOVE_PE_FROM_DOMAIN;
688 	struct pnv_ioda_pe *slave;
689 	long rc;
690 
691 	/* Parent PE affects child PE */
692 	rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
693 				child->pe_number, op);
694 	if (rc != OPAL_SUCCESS) {
695 		pe_warn(child, "OPAL error %ld %s to parent PELTV\n",
696 			rc, desc);
697 		return -ENXIO;
698 	}
699 
700 	if (!(child->flags & PNV_IODA_PE_MASTER))
701 		return 0;
702 
703 	/* Compound case: parent PE affects slave PEs */
704 	list_for_each_entry(slave, &child->slaves, list) {
705 		rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
706 					slave->pe_number, op);
707 		if (rc != OPAL_SUCCESS) {
708 			pe_warn(slave, "OPAL error %ld %s to parent PELTV\n",
709 				rc, desc);
710 			return -ENXIO;
711 		}
712 	}
713 
714 	return 0;
715 }
716 
717 static int pnv_ioda_set_peltv(struct pnv_phb *phb,
718 			      struct pnv_ioda_pe *pe,
719 			      bool is_add)
720 {
721 	struct pnv_ioda_pe *slave;
722 	struct pci_dev *pdev = NULL;
723 	int ret;
724 
	/*
	 * Clear the PE frozen state. If it's a master PE, we need
	 * to clear the slave PEs' frozen state as well.
	 */
729 	if (is_add) {
730 		opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number,
731 					  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
732 		if (pe->flags & PNV_IODA_PE_MASTER) {
733 			list_for_each_entry(slave, &pe->slaves, list)
734 				opal_pci_eeh_freeze_clear(phb->opal_id,
735 							  slave->pe_number,
736 							  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
737 		}
738 	}
739 
	/*
	 * Associate the PE in the PELT. We need to add the PE to the
	 * corresponding PELT-V as well. Otherwise, an error
	 * originating from the PE might spread to other PEs.
	 */
746 	ret = pnv_ioda_set_one_peltv(phb, pe, pe, is_add);
747 	if (ret)
748 		return ret;
749 
750 	/* For compound PEs, any one affects all of them */
751 	if (pe->flags & PNV_IODA_PE_MASTER) {
752 		list_for_each_entry(slave, &pe->slaves, list) {
753 			ret = pnv_ioda_set_one_peltv(phb, slave, pe, is_add);
754 			if (ret)
755 				return ret;
756 		}
757 	}
758 
759 	if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS))
760 		pdev = pe->pbus->self;
761 	else if (pe->flags & PNV_IODA_PE_DEV)
762 		pdev = pe->pdev->bus->self;
763 #ifdef CONFIG_PCI_IOV
764 	else if (pe->flags & PNV_IODA_PE_VF)
765 		pdev = pe->parent_dev;
766 #endif /* CONFIG_PCI_IOV */
767 	while (pdev) {
768 		struct pci_dn *pdn = pci_get_pdn(pdev);
769 		struct pnv_ioda_pe *parent;
770 
771 		if (pdn && pdn->pe_number != IODA_INVALID_PE) {
772 			parent = &phb->ioda.pe_array[pdn->pe_number];
773 			ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add);
774 			if (ret)
775 				return ret;
776 		}
777 
778 		pdev = pdev->bus->self;
779 	}
780 
781 	return 0;
782 }
783 
784 static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
785 {
786 	struct pci_dev *parent;
787 	uint8_t bcomp, dcomp, fcomp;
788 	int64_t rc;
789 	long rid_end, rid;
790 
	/* Currently, we just deconfigure VF PEs. Bus PEs are always there. */
792 	if (pe->pbus) {
793 		int count;
794 
795 		dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
796 		fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
797 		parent = pe->pbus->self;
798 		if (pe->flags & PNV_IODA_PE_BUS_ALL)
799 			count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1;
800 		else
801 			count = 1;
802 
803 		switch(count) {
804 		case  1: bcomp = OpalPciBusAll;         break;
805 		case  2: bcomp = OpalPciBus7Bits;       break;
806 		case  4: bcomp = OpalPciBus6Bits;       break;
807 		case  8: bcomp = OpalPciBus5Bits;       break;
808 		case 16: bcomp = OpalPciBus4Bits;       break;
809 		case 32: bcomp = OpalPciBus3Bits;       break;
810 		default:
811 			dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n",
812 			        count);
813 			/* Do an exact match only */
814 			bcomp = OpalPciBusAll;
815 		}
816 		rid_end = pe->rid + (count << 8);
817 	} else {
818 #ifdef CONFIG_PCI_IOV
819 		if (pe->flags & PNV_IODA_PE_VF)
820 			parent = pe->parent_dev;
821 		else
822 #endif
823 			parent = pe->pdev->bus->self;
824 		bcomp = OpalPciBusAll;
825 		dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
826 		fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
827 		rid_end = pe->rid + 1;
828 	}
829 
830 	/* Clear the reverse map */
831 	for (rid = pe->rid; rid < rid_end; rid++)
832 		phb->ioda.pe_rmap[rid] = IODA_INVALID_PE;
833 
834 	/* Release from all parents PELT-V */
835 	while (parent) {
836 		struct pci_dn *pdn = pci_get_pdn(parent);
837 		if (pdn && pdn->pe_number != IODA_INVALID_PE) {
838 			rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number,
839 						pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
840 			/* XXX What to do in case of error ? */
841 		}
842 		parent = parent->bus->self;
843 	}
844 
845 	opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number,
846 				  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
847 
848 	/* Disassociate PE in PELT */
849 	rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number,
850 				pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
851 	if (rc)
		pe_warn(pe, "OPAL error %lld removing self from PELTV\n", rc);
853 	rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
854 			     bcomp, dcomp, fcomp, OPAL_UNMAP_PE);
855 	if (rc)
856 		pe_err(pe, "OPAL error %lld trying to setup PELT table\n", rc);
857 
858 	pe->pbus = NULL;
859 	pe->pdev = NULL;
860 #ifdef CONFIG_PCI_IOV
861 	pe->parent_dev = NULL;
862 #endif
863 
864 	return 0;
865 }
866 
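/*
 * Program the PE into the PHB: map its RID range, set up the PELT-V
 * associations and the reverse RID-to-PE map, and configure an MVE on
 * IODA1.
 */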
867 static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
868 {
869 	struct pci_dev *parent;
870 	uint8_t bcomp, dcomp, fcomp;
871 	long rc, rid_end, rid;
872 
873 	/* Bus validation ? */
874 	if (pe->pbus) {
875 		int count;
876 
877 		dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
878 		fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
879 		parent = pe->pbus->self;
880 		if (pe->flags & PNV_IODA_PE_BUS_ALL)
881 			count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1;
882 		else
883 			count = 1;
884 
885 		switch(count) {
886 		case  1: bcomp = OpalPciBusAll;		break;
887 		case  2: bcomp = OpalPciBus7Bits;	break;
888 		case  4: bcomp = OpalPciBus6Bits;	break;
889 		case  8: bcomp = OpalPciBus5Bits;	break;
890 		case 16: bcomp = OpalPciBus4Bits;	break;
891 		case 32: bcomp = OpalPciBus3Bits;	break;
892 		default:
893 			dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n",
894 			        count);
895 			/* Do an exact match only */
896 			bcomp = OpalPciBusAll;
897 		}
898 		rid_end = pe->rid + (count << 8);
899 	} else {
900 #ifdef CONFIG_PCI_IOV
901 		if (pe->flags & PNV_IODA_PE_VF)
902 			parent = pe->parent_dev;
903 		else
904 #endif /* CONFIG_PCI_IOV */
905 			parent = pe->pdev->bus->self;
906 		bcomp = OpalPciBusAll;
907 		dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
908 		fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
909 		rid_end = pe->rid + 1;
910 	}
911 
	/*
	 * Associate the PE in the PELT. We need to add the PE to the
	 * corresponding PELT-V as well. Otherwise, an error
	 * originating from the PE might spread to other PEs.
	 */
918 	rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
919 			     bcomp, dcomp, fcomp, OPAL_MAP_PE);
920 	if (rc) {
921 		pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc);
922 		return -ENXIO;
923 	}
924 
925 	/*
926 	 * Configure PELTV. NPUs don't have a PELTV table so skip
927 	 * configuration on them.
928 	 */
929 	if (phb->type != PNV_PHB_NPU_NVLINK && phb->type != PNV_PHB_NPU_OCAPI)
930 		pnv_ioda_set_peltv(phb, pe, true);
931 
932 	/* Setup reverse map */
933 	for (rid = pe->rid; rid < rid_end; rid++)
934 		phb->ioda.pe_rmap[rid] = pe->pe_number;
935 
	/* Set up an MVE on IODA1; other IODA versions don't need one */
937 	if (phb->type != PNV_PHB_IODA1) {
938 		pe->mve_number = 0;
939 		goto out;
940 	}
941 
942 	pe->mve_number = pe->pe_number;
943 	rc = opal_pci_set_mve(phb->opal_id, pe->mve_number, pe->pe_number);
944 	if (rc != OPAL_SUCCESS) {
945 		pe_err(pe, "OPAL error %ld setting up MVE %x\n",
946 		       rc, pe->mve_number);
947 		pe->mve_number = -1;
948 	} else {
949 		rc = opal_pci_set_mve_enable(phb->opal_id,
950 					     pe->mve_number, OPAL_ENABLE_MVE);
951 		if (rc) {
952 			pe_err(pe, "OPAL error %ld enabling MVE %x\n",
953 			       rc, pe->mve_number);
954 			pe->mve_number = -1;
955 		}
956 	}
957 
958 out:
959 	return 0;
960 }
961 
962 #ifdef CONFIG_PCI_IOV
963 static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
964 {
965 	struct pci_dn *pdn = pci_get_pdn(dev);
966 	int i;
967 	struct resource *res, res2;
968 	resource_size_t size;
969 	u16 num_vfs;
970 
971 	if (!dev->is_physfn)
972 		return -EINVAL;
973 
974 	/*
975 	 * "offset" is in VFs.  The M64 windows are sized so that when they
976 	 * are segmented, each segment is the same size as the IOV BAR.
977 	 * Each segment is in a separate PE, and the high order bits of the
978 	 * address are the PE number.  Therefore, each VF's BAR is in a
979 	 * separate PE, and changing the IOV BAR start address changes the
980 	 * range of PEs the VFs are in.
981 	 */
982 	num_vfs = pdn->num_vfs;
983 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
984 		res = &dev->resource[i + PCI_IOV_RESOURCES];
985 		if (!res->flags || !res->parent)
986 			continue;
987 
		/*
		 * The actual IOV BAR range is determined by the start address
		 * and the actual size needed for num_vfs VF BARs.  This check
		 * makes sure that, after shifting, the range will not overlap
		 * with another device.
		 */
994 		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
995 		res2.flags = res->flags;
996 		res2.start = res->start + (size * offset);
997 		res2.end = res2.start + (size * num_vfs) - 1;
998 
999 		if (res2.end > res->end) {
1000 			dev_err(&dev->dev, "VF BAR%d: %pR would extend past %pR (trying to enable %d VFs shifted by %d)\n",
1001 				i, &res2, res, num_vfs, offset);
1002 			return -EBUSY;
1003 		}
1004 	}
1005 
	/*
	 * Since the M64 BAR shares segments among all possible 256 PEs,
	 * we have to shift the beginning of the PF IOV BAR so it starts from
	 * the segment which belongs to the PE number assigned to the first VF.
	 * This creates a "hole" in /proc/iomem which could otherwise be used
	 * for allocating other resources, so we reserve this area below and
	 * release it when IOV is disabled.
	 */
1014 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
1015 		res = &dev->resource[i + PCI_IOV_RESOURCES];
1016 		if (!res->flags || !res->parent)
1017 			continue;
1018 
1019 		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
1020 		res2 = *res;
1021 		res->start += size * offset;
1022 
1023 		dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (%sabling %d VFs shifted by %d)\n",
1024 			 i, &res2, res, (offset > 0) ? "En" : "Dis",
1025 			 num_vfs, offset);
1026 
1027 		if (offset < 0) {
1028 			devm_release_resource(&dev->dev, &pdn->holes[i]);
1029 			memset(&pdn->holes[i], 0, sizeof(pdn->holes[i]));
1030 		}
1031 
1032 		pci_update_resource(dev, i + PCI_IOV_RESOURCES);
1033 
1034 		if (offset > 0) {
1035 			pdn->holes[i].start = res2.start;
1036 			pdn->holes[i].end = res2.start + size * offset - 1;
1037 			pdn->holes[i].flags = IORESOURCE_BUS;
1038 			pdn->holes[i].name = "pnv_iov_reserved";
1039 			devm_request_resource(&dev->dev, res->parent,
1040 					&pdn->holes[i]);
1041 		}
1042 	}
1043 	return 0;
1044 }
1045 #endif /* CONFIG_PCI_IOV */
1046 
1047 static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
1048 {
1049 	struct pci_controller *hose = pci_bus_to_host(dev->bus);
1050 	struct pnv_phb *phb = hose->private_data;
1051 	struct pci_dn *pdn = pci_get_pdn(dev);
1052 	struct pnv_ioda_pe *pe;
1053 
1054 	if (!pdn) {
1055 		pr_err("%s: Device tree node not associated properly\n",
1056 			   pci_name(dev));
1057 		return NULL;
1058 	}
1059 	if (pdn->pe_number != IODA_INVALID_PE)
1060 		return NULL;
1061 
1062 	pe = pnv_ioda_alloc_pe(phb);
1063 	if (!pe) {
1064 		pr_warn("%s: Not enough PE# available, disabling device\n",
1065 			pci_name(dev));
1066 		return NULL;
1067 	}
1068 
	/* NOTE: We only get one ref to the pci_dev for the pdn, not for the
	 * pointer in the PE data structure; both should be destroyed at the
	 * same time. However, this needs to be looked at more closely again
	 * once we actually start removing things (Hotplug, SR-IOV, ...)
	 *
	 * At some point we want to remove the PDN completely anyway.
	 */
1076 	pci_dev_get(dev);
1077 	pdn->pe_number = pe->pe_number;
1078 	pe->flags = PNV_IODA_PE_DEV;
1079 	pe->pdev = dev;
1080 	pe->pbus = NULL;
1081 	pe->mve_number = -1;
1082 	pe->rid = dev->bus->number << 8 | pdn->devfn;
1083 
1084 	pe_info(pe, "Associated device to PE\n");
1085 
1086 	if (pnv_ioda_configure_pe(phb, pe)) {
1087 		/* XXX What do we do here ? */
1088 		pnv_ioda_free_pe(pe);
1089 		pdn->pe_number = IODA_INVALID_PE;
1090 		pe->pdev = NULL;
1091 		pci_dev_put(dev);
1092 		return NULL;
1093 	}
1094 
1095 	/* Put PE to the list */
1096 	list_add_tail(&pe->list, &phb->ioda.pe_list);
1097 
1098 	return pe;
1099 }
1100 
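/*
 * Associate every device on the bus (and, for bus-all PEs, on its
 * subordinate buses) with the given PE.
 */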
1101 static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
1102 {
1103 	struct pci_dev *dev;
1104 
1105 	list_for_each_entry(dev, &bus->devices, bus_list) {
1106 		struct pci_dn *pdn = pci_get_pdn(dev);
1107 
1108 		if (pdn == NULL) {
1109 			pr_warn("%s: No device node associated with device !\n",
1110 				pci_name(dev));
1111 			continue;
1112 		}
1113 
		/*
		 * In the partial hotplug case, the PCI device might still
		 * be associated with the PE and doesn't need to be attached
		 * to it again.
		 */
1119 		if (pdn->pe_number != IODA_INVALID_PE)
1120 			continue;
1121 
1122 		pe->device_count++;
1123 		pdn->pe_number = pe->pe_number;
1124 		if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
1125 			pnv_ioda_setup_same_PE(dev->subordinate, pe);
1126 	}
1127 }
1128 
/*
 * There are 2 types of PCI-bus-sensitive PEs: one that comprises a
 * single PCI bus, and another that contains the primary PCI bus and its
 * subordinate PCI devices and buses. The second type of PE is normally
 * created for a PCIe-to-PCI bridge or a PLX switch downstream port.
 */
1135 static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
1136 {
1137 	struct pci_controller *hose = pci_bus_to_host(bus);
1138 	struct pnv_phb *phb = hose->private_data;
1139 	struct pnv_ioda_pe *pe = NULL;
1140 	unsigned int pe_num;
1141 
	/*
	 * In the partial hotplug case, the PE instance might still be
	 * alive. We should reuse it instead of allocating a new one.
	 */
1146 	pe_num = phb->ioda.pe_rmap[bus->number << 8];
1147 	if (pe_num != IODA_INVALID_PE) {
1148 		pe = &phb->ioda.pe_array[pe_num];
1149 		pnv_ioda_setup_same_PE(bus, pe);
1150 		return NULL;
1151 	}
1152 
1153 	/* PE number for root bus should have been reserved */
1154 	if (pci_is_root_bus(bus) &&
1155 	    phb->ioda.root_pe_idx != IODA_INVALID_PE)
1156 		pe = &phb->ioda.pe_array[phb->ioda.root_pe_idx];
1157 
1158 	/* Check if PE is determined by M64 */
1159 	if (!pe)
1160 		pe = pnv_ioda_pick_m64_pe(bus, all);
1161 
1162 	/* The PE number isn't pinned by M64 */
1163 	if (!pe)
1164 		pe = pnv_ioda_alloc_pe(phb);
1165 
1166 	if (!pe) {
1167 		pr_warn("%s: Not enough PE# available for PCI bus %04x:%02x\n",
1168 			__func__, pci_domain_nr(bus), bus->number);
1169 		return NULL;
1170 	}
1171 
1172 	pe->flags |= (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS);
1173 	pe->pbus = bus;
1174 	pe->pdev = NULL;
1175 	pe->mve_number = -1;
1176 	pe->rid = bus->busn_res.start << 8;
1177 
1178 	if (all)
1179 		pe_info(pe, "Secondary bus %pad..%pad associated with PE#%x\n",
1180 			&bus->busn_res.start, &bus->busn_res.end,
1181 			pe->pe_number);
1182 	else
1183 		pe_info(pe, "Secondary bus %pad associated with PE#%x\n",
1184 			&bus->busn_res.start, pe->pe_number);
1185 
1186 	if (pnv_ioda_configure_pe(phb, pe)) {
1187 		/* XXX What do we do here ? */
1188 		pnv_ioda_free_pe(pe);
1189 		pe->pbus = NULL;
1190 		return NULL;
1191 	}
1192 
1193 	/* Associate it with all child devices */
1194 	pnv_ioda_setup_same_PE(bus, pe);
1195 
1196 	/* Put PE to the list */
1197 	list_add_tail(&pe->list, &phb->ioda.pe_list);
1198 
1199 	return pe;
1200 }
1201 
1202 static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev)
1203 {
1204 	int pe_num, found_pe = false, rc;
1205 	long rid;
1206 	struct pnv_ioda_pe *pe;
1207 	struct pci_dev *gpu_pdev;
1208 	struct pci_dn *npu_pdn;
1209 	struct pci_controller *hose = pci_bus_to_host(npu_pdev->bus);
1210 	struct pnv_phb *phb = hose->private_data;
1211 
1212 	/*
	 * Due to a hardware erratum PE#0 on the NPU is reserved for
1214 	 * error handling. This means we only have three PEs remaining
1215 	 * which need to be assigned to four links, implying some
1216 	 * links must share PEs.
1217 	 *
1218 	 * To achieve this we assign PEs such that NPUs linking the
1219 	 * same GPU get assigned the same PE.
1220 	 */
1221 	gpu_pdev = pnv_pci_get_gpu_dev(npu_pdev);
1222 	for (pe_num = 0; pe_num < phb->ioda.total_pe_num; pe_num++) {
1223 		pe = &phb->ioda.pe_array[pe_num];
1224 		if (!pe->pdev)
1225 			continue;
1226 
1227 		if (pnv_pci_get_gpu_dev(pe->pdev) == gpu_pdev) {
1228 			/*
1229 			 * This device has the same peer GPU so should
1230 			 * be assigned the same PE as the existing
1231 			 * peer NPU.
1232 			 */
1233 			dev_info(&npu_pdev->dev,
1234 				"Associating to existing PE %x\n", pe_num);
1235 			pci_dev_get(npu_pdev);
1236 			npu_pdn = pci_get_pdn(npu_pdev);
1237 			rid = npu_pdev->bus->number << 8 | npu_pdn->devfn;
1238 			npu_pdn->pe_number = pe_num;
1239 			phb->ioda.pe_rmap[rid] = pe->pe_number;
1240 
1241 			/* Map the PE to this link */
1242 			rc = opal_pci_set_pe(phb->opal_id, pe_num, rid,
1243 					OpalPciBusAll,
1244 					OPAL_COMPARE_RID_DEVICE_NUMBER,
1245 					OPAL_COMPARE_RID_FUNCTION_NUMBER,
1246 					OPAL_MAP_PE);
1247 			WARN_ON(rc != OPAL_SUCCESS);
1248 			found_pe = true;
1249 			break;
1250 		}
1251 	}
1252 
1253 	if (!found_pe)
1254 		/*
1255 		 * Could not find an existing PE so allocate a new
1256 		 * one.
1257 		 */
1258 		return pnv_ioda_setup_dev_PE(npu_pdev);
1259 	else
1260 		return pe;
1261 }
1262 
1263 static void pnv_ioda_setup_npu_PEs(struct pci_bus *bus)
1264 {
1265 	struct pci_dev *pdev;
1266 
1267 	list_for_each_entry(pdev, &bus->devices, bus_list)
1268 		pnv_ioda_setup_npu_PE(pdev);
1269 }
1270 
1271 static void pnv_pci_ioda_setup_PEs(void)
1272 {
1273 	struct pci_controller *hose;
1274 	struct pnv_phb *phb;
1275 	struct pci_bus *bus;
1276 	struct pci_dev *pdev;
1277 	struct pnv_ioda_pe *pe;
1278 
1279 	list_for_each_entry(hose, &hose_list, list_node) {
1280 		phb = hose->private_data;
1281 		if (phb->type == PNV_PHB_NPU_NVLINK) {
1282 			/* PE#0 is needed for error reporting */
1283 			pnv_ioda_reserve_pe(phb, 0);
1284 			pnv_ioda_setup_npu_PEs(hose->bus);
1285 			if (phb->model == PNV_PHB_MODEL_NPU2)
1286 				WARN_ON_ONCE(pnv_npu2_init(hose));
1287 		}
1288 		if (phb->type == PNV_PHB_NPU_OCAPI) {
1289 			bus = hose->bus;
1290 			list_for_each_entry(pdev, &bus->devices, bus_list)
1291 				pnv_ioda_setup_dev_PE(pdev);
1292 		}
1293 	}
1294 	list_for_each_entry(hose, &hose_list, list_node) {
1295 		phb = hose->private_data;
1296 		if (phb->type != PNV_PHB_IODA2)
1297 			continue;
1298 
1299 		list_for_each_entry(pe, &phb->ioda.pe_list, list)
1300 			pnv_npu2_map_lpar(pe, MSR_DR | MSR_PR | MSR_HV);
1301 	}
1302 }
1303 
1304 #ifdef CONFIG_PCI_IOV
1305 static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs)
1306 {
1307 	struct pci_bus        *bus;
1308 	struct pci_controller *hose;
1309 	struct pnv_phb        *phb;
1310 	struct pci_dn         *pdn;
1311 	int                    i, j;
1312 	int                    m64_bars;
1313 
1314 	bus = pdev->bus;
1315 	hose = pci_bus_to_host(bus);
1316 	phb = hose->private_data;
1317 	pdn = pci_get_pdn(pdev);
1318 
1319 	if (pdn->m64_single_mode)
1320 		m64_bars = num_vfs;
1321 	else
1322 		m64_bars = 1;
1323 
1324 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
1325 		for (j = 0; j < m64_bars; j++) {
1326 			if (pdn->m64_map[j][i] == IODA_INVALID_M64)
1327 				continue;
1328 			opal_pci_phb_mmio_enable(phb->opal_id,
1329 				OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 0);
1330 			clear_bit(pdn->m64_map[j][i], &phb->ioda.m64_bar_alloc);
1331 			pdn->m64_map[j][i] = IODA_INVALID_M64;
1332 		}
1333 
1334 	kfree(pdn->m64_map);
1335 	return 0;
1336 }
1337 
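/*
 * Assign M64 BARs to the PF's IOV BARs: one segmented M64 BAR per IOV BAR
 * in shared mode, or one M64 BAR per VF per IOV BAR in single PE mode.
 */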
1338 static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
1339 {
1340 	struct pci_bus        *bus;
1341 	struct pci_controller *hose;
1342 	struct pnv_phb        *phb;
1343 	struct pci_dn         *pdn;
1344 	unsigned int           win;
1345 	struct resource       *res;
1346 	int                    i, j;
1347 	int64_t                rc;
1348 	int                    total_vfs;
1349 	resource_size_t        size, start;
1350 	int                    pe_num;
1351 	int                    m64_bars;
1352 
1353 	bus = pdev->bus;
1354 	hose = pci_bus_to_host(bus);
1355 	phb = hose->private_data;
1356 	pdn = pci_get_pdn(pdev);
1357 	total_vfs = pci_sriov_get_totalvfs(pdev);
1358 
1359 	if (pdn->m64_single_mode)
1360 		m64_bars = num_vfs;
1361 	else
1362 		m64_bars = 1;
1363 
1364 	pdn->m64_map = kmalloc_array(m64_bars,
1365 				     sizeof(*pdn->m64_map),
1366 				     GFP_KERNEL);
1367 	if (!pdn->m64_map)
1368 		return -ENOMEM;
1369 	/* Initialize the m64_map to IODA_INVALID_M64 */
1370 	for (i = 0; i < m64_bars ; i++)
1371 		for (j = 0; j < PCI_SRIOV_NUM_BARS; j++)
1372 			pdn->m64_map[i][j] = IODA_INVALID_M64;
1373 
1375 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
1376 		res = &pdev->resource[i + PCI_IOV_RESOURCES];
1377 		if (!res->flags || !res->parent)
1378 			continue;
1379 
1380 		for (j = 0; j < m64_bars; j++) {
1381 			do {
1382 				win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
1383 						phb->ioda.m64_bar_idx + 1, 0);
1384 
1385 				if (win >= phb->ioda.m64_bar_idx + 1)
1386 					goto m64_failed;
1387 			} while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));
1388 
1389 			pdn->m64_map[j][i] = win;
1390 
1391 			if (pdn->m64_single_mode) {
1392 				size = pci_iov_resource_size(pdev,
1393 							PCI_IOV_RESOURCES + i);
1394 				start = res->start + size * j;
1395 			} else {
1396 				size = resource_size(res);
1397 				start = res->start;
1398 			}
1399 
1400 			/* Map the M64 here */
1401 			if (pdn->m64_single_mode) {
1402 				pe_num = pdn->pe_num_map[j];
1403 				rc = opal_pci_map_pe_mmio_window(phb->opal_id,
1404 						pe_num, OPAL_M64_WINDOW_TYPE,
1405 						pdn->m64_map[j][i], 0);
1406 			}
1407 
1408 			rc = opal_pci_set_phb_mem_window(phb->opal_id,
1409 						 OPAL_M64_WINDOW_TYPE,
1410 						 pdn->m64_map[j][i],
1411 						 start,
1412 						 0, /* unused */
1413 						 size);
1414 
1416 			if (rc != OPAL_SUCCESS) {
1417 				dev_err(&pdev->dev, "Failed to map M64 window #%d: %lld\n",
1418 					win, rc);
1419 				goto m64_failed;
1420 			}
1421 
1422 			if (pdn->m64_single_mode)
1423 				rc = opal_pci_phb_mmio_enable(phb->opal_id,
1424 				     OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 2);
1425 			else
1426 				rc = opal_pci_phb_mmio_enable(phb->opal_id,
1427 				     OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 1);
1428 
1429 			if (rc != OPAL_SUCCESS) {
				dev_err(&pdev->dev, "Failed to enable M64 window #%d: %lld\n",
					win, rc);
1432 				goto m64_failed;
1433 			}
1434 		}
1435 	}
1436 	return 0;
1437 
1438 m64_failed:
1439 	pnv_pci_vf_release_m64(pdev, num_vfs);
1440 	return -EBUSY;
1441 }
1442 
1443 static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
1444 		int num);
1445 
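/*
 * Tear down the DMA setup of a PE: remove its default DMA window, disable
 * TCE bypass and release the IOMMU group and table.
 */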
1446 static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe)
1447 {
1448 	struct iommu_table    *tbl;
1449 	int64_t               rc;
1450 
1451 	tbl = pe->table_group.tables[0];
1452 	rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
1453 	if (rc)
		pe_warn(pe, "OPAL error %lld releasing DMA window\n", rc);
1455 
1456 	pnv_pci_ioda2_set_bypass(pe, false);
1457 	if (pe->table_group.group) {
1458 		iommu_group_put(pe->table_group.group);
1459 		BUG_ON(pe->table_group.group);
1460 	}
1461 	iommu_tce_table_put(tbl);
1462 }
1463 
1464 static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
1465 {
1466 	struct pci_bus        *bus;
1467 	struct pci_controller *hose;
1468 	struct pnv_phb        *phb;
1469 	struct pnv_ioda_pe    *pe, *pe_n;
1470 	struct pci_dn         *pdn;
1471 
1472 	bus = pdev->bus;
1473 	hose = pci_bus_to_host(bus);
1474 	phb = hose->private_data;
1475 	pdn = pci_get_pdn(pdev);
1476 
1477 	if (!pdev->is_physfn)
1478 		return;
1479 
1480 	list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) {
1481 		if (pe->parent_dev != pdev)
1482 			continue;
1483 
1484 		pnv_pci_ioda2_release_dma_pe(pdev, pe);
1485 
1486 		/* Remove from list */
1487 		mutex_lock(&phb->ioda.pe_list_mutex);
1488 		list_del(&pe->list);
1489 		mutex_unlock(&phb->ioda.pe_list_mutex);
1490 
1491 		pnv_ioda_deconfigure_pe(phb, pe);
1492 
1493 		pnv_ioda_free_pe(pe);
1494 	}
1495 }
1496 
1497 void pnv_pci_sriov_disable(struct pci_dev *pdev)
1498 {
1499 	struct pci_bus        *bus;
1500 	struct pci_controller *hose;
1501 	struct pnv_phb        *phb;
1502 	struct pnv_ioda_pe    *pe;
1503 	struct pci_dn         *pdn;
1504 	u16                    num_vfs, i;
1505 
1506 	bus = pdev->bus;
1507 	hose = pci_bus_to_host(bus);
1508 	phb = hose->private_data;
1509 	pdn = pci_get_pdn(pdev);
1510 	num_vfs = pdn->num_vfs;
1511 
1512 	/* Release VF PEs */
1513 	pnv_ioda_release_vf_PE(pdev);
1514 
1515 	if (phb->type == PNV_PHB_IODA2) {
1516 		if (!pdn->m64_single_mode)
1517 			pnv_pci_vf_resource_shift(pdev, -*pdn->pe_num_map);
1518 
1519 		/* Release M64 windows */
1520 		pnv_pci_vf_release_m64(pdev, num_vfs);
1521 
1522 		/* Release PE numbers */
1523 		if (pdn->m64_single_mode) {
1524 			for (i = 0; i < num_vfs; i++) {
1525 				if (pdn->pe_num_map[i] == IODA_INVALID_PE)
1526 					continue;
1527 
1528 				pe = &phb->ioda.pe_array[pdn->pe_num_map[i]];
1529 				pnv_ioda_free_pe(pe);
1530 			}
1531 		} else
1532 			bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
1533 		/* Releasing pe_num_map */
1534 		kfree(pdn->pe_num_map);
1535 	}
1536 }
1537 
1538 static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
1539 				       struct pnv_ioda_pe *pe);
1540 #ifdef CONFIG_IOMMU_API
1541 static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe,
1542 		struct iommu_table_group *table_group, struct pci_bus *bus);
1543 
1544 #endif
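/*
 * Create a PE for each enabled VF: initialise the PE, configure it in the
 * PHB, add it to the PE list and set up its DMA window and IOMMU group.
 */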
1545 static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
1546 {
1547 	struct pci_bus        *bus;
1548 	struct pci_controller *hose;
1549 	struct pnv_phb        *phb;
1550 	struct pnv_ioda_pe    *pe;
1551 	int                    pe_num;
1552 	u16                    vf_index;
1553 	struct pci_dn         *pdn;
1554 
1555 	bus = pdev->bus;
1556 	hose = pci_bus_to_host(bus);
1557 	phb = hose->private_data;
1558 	pdn = pci_get_pdn(pdev);
1559 
1560 	if (!pdev->is_physfn)
1561 		return;
1562 
1563 	/* Reserve PE for each VF */
1564 	for (vf_index = 0; vf_index < num_vfs; vf_index++) {
1565 		if (pdn->m64_single_mode)
1566 			pe_num = pdn->pe_num_map[vf_index];
1567 		else
1568 			pe_num = *pdn->pe_num_map + vf_index;
1569 
1570 		pe = &phb->ioda.pe_array[pe_num];
1571 		pe->pe_number = pe_num;
1572 		pe->phb = phb;
1573 		pe->flags = PNV_IODA_PE_VF;
1574 		pe->pbus = NULL;
1575 		pe->parent_dev = pdev;
1576 		pe->mve_number = -1;
1577 		pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) |
1578 			   pci_iov_virtfn_devfn(pdev, vf_index);
1579 
1580 		pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%x\n",
1581 			hose->global_number, pdev->bus->number,
1582 			PCI_SLOT(pci_iov_virtfn_devfn(pdev, vf_index)),
1583 			PCI_FUNC(pci_iov_virtfn_devfn(pdev, vf_index)), pe_num);
1584 
1585 		if (pnv_ioda_configure_pe(phb, pe)) {
1586 			/* XXX What do we do here ? */
1587 			pnv_ioda_free_pe(pe);
1588 			pe->pdev = NULL;
1589 			continue;
1590 		}
1591 
1592 		/* Put PE to the list */
1593 		mutex_lock(&phb->ioda.pe_list_mutex);
1594 		list_add_tail(&pe->list, &phb->ioda.pe_list);
1595 		mutex_unlock(&phb->ioda.pe_list_mutex);
1596 
1597 		pnv_pci_ioda2_setup_dma_pe(phb, pe);
1598 #ifdef CONFIG_IOMMU_API
1599 		iommu_register_group(&pe->table_group,
1600 				pe->phb->hose->global_number, pe->pe_number);
1601 		pnv_ioda_setup_bus_iommu_group(pe, &pe->table_group, NULL);
1602 #endif
1603 	}
1604 }
1605 
1606 int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
1607 {
1608 	struct pci_bus        *bus;
1609 	struct pci_controller *hose;
1610 	struct pnv_phb        *phb;
1611 	struct pnv_ioda_pe    *pe;
1612 	struct pci_dn         *pdn;
1613 	int                    ret;
1614 	u16                    i;
1615 
1616 	bus = pdev->bus;
1617 	hose = pci_bus_to_host(bus);
1618 	phb = hose->private_data;
1619 	pdn = pci_get_pdn(pdev);
1620 
1621 	if (phb->type == PNV_PHB_IODA2) {
1622 		if (!pdn->vfs_expanded) {
			dev_info(&pdev->dev, "SR-IOV is not supported on this device: the IOV BAR is not 64-bit prefetchable\n");
1625 			return -ENOSPC;
1626 		}
1627 
		/*
		 * When the M64 BARs function in Single PE mode, the number of
		 * VFs that can be enabled must not exceed the number of
		 * available M64 BARs.
		 */
1632 		if (pdn->m64_single_mode && num_vfs > phb->ioda.m64_bar_idx) {
1633 			dev_info(&pdev->dev, "Not enough M64 BAR for VFs\n");
1634 			return -EBUSY;
1635 		}
1636 
1637 		/* Allocating pe_num_map */
1638 		if (pdn->m64_single_mode)
1639 			pdn->pe_num_map = kmalloc_array(num_vfs,
1640 							sizeof(*pdn->pe_num_map),
1641 							GFP_KERNEL);
1642 		else
1643 			pdn->pe_num_map = kmalloc(sizeof(*pdn->pe_num_map), GFP_KERNEL);
1644 
1645 		if (!pdn->pe_num_map)
1646 			return -ENOMEM;
1647 
1648 		if (pdn->m64_single_mode)
1649 			for (i = 0; i < num_vfs; i++)
1650 				pdn->pe_num_map[i] = IODA_INVALID_PE;
1651 
1652 		/* Calculate available PE for required VFs */
1653 		if (pdn->m64_single_mode) {
1654 			for (i = 0; i < num_vfs; i++) {
1655 				pe = pnv_ioda_alloc_pe(phb);
1656 				if (!pe) {
1657 					ret = -EBUSY;
1658 					goto m64_failed;
1659 				}
1660 
1661 				pdn->pe_num_map[i] = pe->pe_number;
1662 			}
1663 		} else {
1664 			mutex_lock(&phb->ioda.pe_alloc_mutex);
1665 			*pdn->pe_num_map = bitmap_find_next_zero_area(
1666 				phb->ioda.pe_alloc, phb->ioda.total_pe_num,
1667 				0, num_vfs, 0);
1668 			if (*pdn->pe_num_map >= phb->ioda.total_pe_num) {
1669 				mutex_unlock(&phb->ioda.pe_alloc_mutex);
1670 				dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs);
1671 				kfree(pdn->pe_num_map);
1672 				return -EBUSY;
1673 			}
1674 			bitmap_set(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
1675 			mutex_unlock(&phb->ioda.pe_alloc_mutex);
1676 		}
1677 		pdn->num_vfs = num_vfs;
1678 
1679 		/* Assign M64 window accordingly */
1680 		ret = pnv_pci_vf_assign_m64(pdev, num_vfs);
1681 		if (ret) {
1682 			dev_info(&pdev->dev, "Not enough M64 window resources\n");
1683 			goto m64_failed;
1684 		}
1685 
1686 		/*
1687 		 * When using one M64 BAR to map one IOV BAR, we need to shift
1688 		 * the IOV BAR according to the PE# allocated to the VFs.
1689 		 * Otherwise, the PE# for the VF will conflict with others.
1690 		 */
1691 		if (!pdn->m64_single_mode) {
1692 			ret = pnv_pci_vf_resource_shift(pdev, *pdn->pe_num_map);
1693 			if (ret)
1694 				goto m64_failed;
1695 		}
1696 	}
1697 
1698 	/* Setup VF PEs */
1699 	pnv_ioda_setup_vf_PE(pdev, num_vfs);
1700 
1701 	return 0;
1702 
1703 m64_failed:
1704 	if (pdn->m64_single_mode) {
1705 		for (i = 0; i < num_vfs; i++) {
1706 			if (pdn->pe_num_map[i] == IODA_INVALID_PE)
1707 				continue;
1708 
1709 			pe = &phb->ioda.pe_array[pdn->pe_num_map[i]];
1710 			pnv_ioda_free_pe(pe);
1711 		}
1712 	} else
1713 		bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
1714 
1715 	/* Releasing pe_num_map */
1716 	kfree(pdn->pe_num_map);
1717 
1718 	return ret;
1719 }
1720 
1721 int pnv_pcibios_sriov_disable(struct pci_dev *pdev)
1722 {
1723 	pnv_pci_sriov_disable(pdev);
1724 
1725 	/* Release PCI data */
1726 	remove_dev_pci_data(pdev);
1727 	return 0;
1728 }
1729 
1730 int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
1731 {
1732 	/* Allocate PCI data */
1733 	add_dev_pci_data(pdev);
1734 
1735 	return pnv_pci_sriov_enable(pdev, num_vfs);
1736 }
1737 #endif /* CONFIG_PCI_IOV */
1738 
1739 static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev)
1740 {
1741 	struct pci_dn *pdn = pci_get_pdn(pdev);
1742 	struct pnv_ioda_pe *pe;
1743 
	/*
	 * This function can be called before the PE#
	 * has been assigned. Do nothing in that case.
	 */
1749 	if (!pdn || pdn->pe_number == IODA_INVALID_PE)
1750 		return;
1751 
1752 	pe = &phb->ioda.pe_array[pdn->pe_number];
1753 	WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
1754 	pdev->dev.archdata.dma_offset = pe->tce_bypass_base;
1755 	set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]);
1756 	/*
1757 	 * Note: iommu_add_device() will fail here as
1758 	 * for physical PE: the device is already added by now;
1759 	 * for virtual PE: sysfs entries are not ready yet and
1760 	 * tce_iommu_bus_notifier will add the device to a group later.
1761 	 */
1762 }
1763 
1764 /*
1765  * Reconfigure TVE#0 to be usable as 64-bit DMA space.
1766  *
1767  * The first 4GB of virtual memory for a PE is reserved for 32-bit accesses.
1768  * Devices can only access more than that if bit 59 of the PCI address is set
1769  * by hardware, which indicates TVE#1 should be used instead of TVE#0.
1770  * Many PCI devices are not capable of addressing that many bits, and as a
1771  * result are limited to the 4GB of virtual memory made available to 32-bit
1772  * devices in TVE#0.
1773  *
 * To work around this, reconfigure TVE#0 to be suitable for 64-bit devices:
 * leave the first 4GB of DMA space unmapped and map all of system memory
 * above it, so that 64-bit DMAs (offset by 4GB) are translated through
 * TVE#0.  This should only be used by devices that want more than 4GB, and
 * only on PEs that have no 32-bit devices.
1778  *
1779  * Currently this will only work on PHB3 (POWER8).
1780  */
1781 static int pnv_pci_ioda_dma_64bit_bypass(struct pnv_ioda_pe *pe)
1782 {
1783 	u64 window_size, table_size, tce_count, addr;
1784 	struct page *table_pages;
1785 	u64 tce_order = 28; /* 256MB TCEs */
1786 	__be64 *tces;
1787 	s64 rc;
1788 
1789 	/*
1790 	 * Window size needs to be a power of two, but needs to account for
1791 	 * shifting memory by the 4GB offset required to skip 32bit space.
1792 	 */
1793 	window_size = roundup_pow_of_two(memory_hotplug_max() + (1ULL << 32));
1794 	tce_count = window_size >> tce_order;
1795 	table_size = tce_count << 3;
1796 
1797 	if (table_size < PAGE_SIZE)
1798 		table_size = PAGE_SIZE;
1799 
1800 	table_pages = alloc_pages_node(pe->phb->hose->node, GFP_KERNEL,
1801 				       get_order(table_size));
1802 	if (!table_pages)
1803 		goto err;
1804 
1805 	tces = page_address(table_pages);
1806 	if (!tces)
1807 		goto err;
1808 
1809 	memset(tces, 0, table_size);
1810 
1811 	for (addr = 0; addr < memory_hotplug_max(); addr += (1 << tce_order)) {
1812 		tces[(addr + (1ULL << 32)) >> tce_order] =
1813 			cpu_to_be64(addr | TCE_PCI_READ | TCE_PCI_WRITE);
1814 	}
1815 
1816 	rc = opal_pci_map_pe_dma_window(pe->phb->opal_id,
1817 					pe->pe_number,
1818 					/* reconfigure window 0 */
1819 					(pe->pe_number << 1) + 0,
1820 					1,
1821 					__pa(tces),
1822 					table_size,
1823 					1 << tce_order);
1824 	if (rc == OPAL_SUCCESS) {
1825 		pe_info(pe, "Using 64-bit DMA iommu bypass (through TVE#0)\n");
1826 		return 0;
1827 	}
1828 err:
1829 	pe_err(pe, "Error configuring 64-bit DMA bypass\n");
1830 	return -EIO;
1831 }
1832 
1833 static bool pnv_pci_ioda_iommu_bypass_supported(struct pci_dev *pdev,
1834 		u64 dma_mask)
1835 {
1836 	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
1837 	struct pnv_phb *phb = hose->private_data;
1838 	struct pci_dn *pdn = pci_get_pdn(pdev);
1839 	struct pnv_ioda_pe *pe;
1840 
1841 	if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
1842 		return false;
1843 
1844 	pe = &phb->ioda.pe_array[pdn->pe_number];
1845 	if (pe->tce_bypass_enabled) {
1846 		u64 top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1;
1847 		if (dma_mask >= top)
1848 			return true;
1849 	}
1850 
1851 	/*
1852 	 * If the device can't set the TCE bypass bit but still wants
1853 	 * to access 4GB or more, on PHB3 we can reconfigure TVE#0 to
1854 	 * bypass the 32-bit region and be usable for 64-bit DMAs.
1855 	 * The device needs to be able to address all of this space.
1856 	 */
1857 	if (dma_mask >> 32 &&
1858 	    dma_mask > (memory_hotplug_max() + (1ULL << 32)) &&
1859 	    /* pe->pdev should be set if it's a single device, pe->pbus if not */
1860 	    (pe->device_count == 1 || !pe->pbus) &&
1861 	    phb->model == PNV_PHB_MODEL_PHB3) {
1862 		/* Configure the bypass mode */
1863 		s64 rc = pnv_pci_ioda_dma_64bit_bypass(pe);
1864 		if (rc)
1865 			return false;
1866 		/* 4GB offset bypasses 32-bit space */
1867 		pdev->dev.archdata.dma_offset = (1ULL << 32);
1868 		return true;
1869 	}
1870 
1871 	return false;
1872 }
1873 
1874 static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus)
1875 {
1876 	struct pci_dev *dev;
1877 
1878 	list_for_each_entry(dev, &bus->devices, bus_list) {
1879 		set_iommu_table_base(&dev->dev, pe->table_group.tables[0]);
1880 		dev->dev.archdata.dma_offset = pe->tce_bypass_base;
1881 
1882 		if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
1883 			pnv_ioda_setup_bus_dma(pe, dev->subordinate);
1884 	}
1885 }
1886 
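/*
 * Both the P7IOC and PHB3 invalidation paths write to the register at
 * offset 0x210 of the PHB register space; real-mode callers need the
 * physical mapping rather than the ioremapped one.
 */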
1887 static inline __be64 __iomem *pnv_ioda_get_inval_reg(struct pnv_phb *phb,
1888 						     bool real_mode)
1889 {
1890 	return real_mode ? (__be64 __iomem *)(phb->regs_phys + 0x210) :
1891 		(phb->regs + 0x210);
1892 }
1893 
1894 static void pnv_pci_p7ioc_tce_invalidate(struct iommu_table *tbl,
1895 		unsigned long index, unsigned long npages, bool rm)
1896 {
1897 	struct iommu_table_group_link *tgl = list_first_entry_or_null(
1898 			&tbl->it_group_list, struct iommu_table_group_link,
1899 			next);
1900 	struct pnv_ioda_pe *pe = container_of(tgl->table_group,
1901 			struct pnv_ioda_pe, table_group);
1902 	__be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, rm);
1903 	unsigned long start, end, inc;
1904 
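	/* Physical addresses of the first and last TCE entries to invalidate */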
1905 	start = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset);
1906 	end = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset +
1907 			npages - 1);
1908 
1909 	/* p7ioc-style invalidation, 2 TCEs per write */
1910 	start |= (1ull << 63);
1911 	end |= (1ull << 63);
1912 	inc = 16;
	end |= inc - 1;	/* round up end to be different than start */

	mb(); /* Ensure above stores are visible */
	while (start <= end) {
		if (rm)
			__raw_rm_writeq_be(start, invalidate);
		else
			__raw_writeq_be(start, invalidate);

		start += inc;
	}
1924 
1925 	/*
1926 	 * The iommu layer will do another mb() for us on build()
1927 	 * and we don't care on free()
1928 	 */
1929 }
1930 
1931 static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index,
1932 		long npages, unsigned long uaddr,
1933 		enum dma_data_direction direction,
1934 		unsigned long attrs)
1935 {
1936 	int ret = pnv_tce_build(tbl, index, npages, uaddr, direction,
1937 			attrs);
1938 
1939 	if (!ret)
1940 		pnv_pci_p7ioc_tce_invalidate(tbl, index, npages, false);
1941 
1942 	return ret;
1943 }
1944 
1945 #ifdef CONFIG_IOMMU_API
1946 static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index,
1947 		unsigned long *hpa, enum dma_data_direction *direction)
1948 {
1949 	long ret = pnv_tce_xchg(tbl, index, hpa, direction, true);
1950 
1951 	if (!ret)
1952 		pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, false);
1953 
1954 	return ret;
1955 }
1956 
1957 static int pnv_ioda1_tce_xchg_rm(struct iommu_table *tbl, long index,
1958 		unsigned long *hpa, enum dma_data_direction *direction)
1959 {
1960 	long ret = pnv_tce_xchg(tbl, index, hpa, direction, false);
1961 
1962 	if (!ret)
1963 		pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, true);
1964 
1965 	return ret;
1966 }
1967 #endif
1968 
1969 static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index,
1970 		long npages)
1971 {
1972 	pnv_tce_free(tbl, index, npages);
1973 
1974 	pnv_pci_p7ioc_tce_invalidate(tbl, index, npages, false);
1975 }
1976 
1977 static struct iommu_table_ops pnv_ioda1_iommu_ops = {
1978 	.set = pnv_ioda1_tce_build,
1979 #ifdef CONFIG_IOMMU_API
1980 	.exchange = pnv_ioda1_tce_xchg,
1981 	.exchange_rm = pnv_ioda1_tce_xchg_rm,
1982 	.useraddrptr = pnv_tce_useraddrptr,
1983 #endif
1984 	.clear = pnv_ioda1_tce_free,
1985 	.get = pnv_tce_get,
1986 };
1987 
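/*
 * PHB3 "TCE kill" register flags: invalidate the entire TCE cache, all
 * entries belonging to a PE, or a single range of entries within a PE.
 */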
1988 #define PHB3_TCE_KILL_INVAL_ALL		PPC_BIT(0)
1989 #define PHB3_TCE_KILL_INVAL_PE		PPC_BIT(1)
1990 #define PHB3_TCE_KILL_INVAL_ONE		PPC_BIT(2)
1991 
1992 static void pnv_pci_phb3_tce_invalidate_entire(struct pnv_phb *phb, bool rm)
1993 {
1994 	__be64 __iomem *invalidate = pnv_ioda_get_inval_reg(phb, rm);
1995 	const unsigned long val = PHB3_TCE_KILL_INVAL_ALL;
1996 
1997 	mb(); /* Ensure previous TCE table stores are visible */
1998 	if (rm)
1999 		__raw_rm_writeq_be(val, invalidate);
2000 	else
2001 		__raw_writeq_be(val, invalidate);
2002 }
2003 
2004 static inline void pnv_pci_phb3_tce_invalidate_pe(struct pnv_ioda_pe *pe)
2005 {
2006 	/* 01xb - invalidate TCEs that match the specified PE# */
2007 	__be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, false);
2008 	unsigned long val = PHB3_TCE_KILL_INVAL_PE | (pe->pe_number & 0xFF);
2009 
2010 	mb(); /* Ensure above stores are visible */
2011 	__raw_writeq_be(val, invalidate);
2012 }
2013 
2014 static void pnv_pci_phb3_tce_invalidate(struct pnv_ioda_pe *pe, bool rm,
2015 					unsigned shift, unsigned long index,
2016 					unsigned long npages)
2017 {
2018 	__be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, rm);
2019 	unsigned long start, end, inc;
2020 
	/* We'll invalidate DMA addresses in PE scope */
2022 	start = PHB3_TCE_KILL_INVAL_ONE;
2023 	start |= (pe->pe_number & 0xFF);
2024 	end = start;
2025 
2026 	/* Figure out the start, end and step */
2027 	start |= (index << shift);
2028 	end |= ((index + npages - 1) << shift);
2029 	inc = (0x1ull << shift);
2030 	mb();
2031 
2032 	while (start <= end) {
2033 		if (rm)
2034 			__raw_rm_writeq_be(start, invalidate);
2035 		else
2036 			__raw_writeq_be(start, invalidate);
2037 		start += inc;
2038 	}
2039 }
2040 
2041 static inline void pnv_pci_ioda2_tce_invalidate_pe(struct pnv_ioda_pe *pe)
2042 {
2043 	struct pnv_phb *phb = pe->phb;
2044 
2045 	if (phb->model == PNV_PHB_MODEL_PHB3 && phb->regs)
2046 		pnv_pci_phb3_tce_invalidate_pe(pe);
2047 	else
2048 		opal_pci_tce_kill(phb->opal_id, OPAL_PCI_TCE_KILL_PE,
2049 				  pe->pe_number, 0, 0, 0);
2050 }
2051 
2052 static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
2053 		unsigned long index, unsigned long npages, bool rm)
2054 {
2055 	struct iommu_table_group_link *tgl;
2056 
2057 	list_for_each_entry_lockless(tgl, &tbl->it_group_list, next) {
2058 		struct pnv_ioda_pe *pe = container_of(tgl->table_group,
2059 				struct pnv_ioda_pe, table_group);
2060 		struct pnv_phb *phb = pe->phb;
2061 		unsigned int shift = tbl->it_page_shift;
2062 
2063 		/*
2064 		 * NVLink1 can use the TCE kill register directly as
2065 		 * it's the same as PHB3. NVLink2 is different and
2066 		 * should go via the OPAL call.
2067 		 */
2068 		if (phb->model == PNV_PHB_MODEL_NPU) {
2069 			/*
2070 			 * The NVLink hardware does not support TCE kill
2071 			 * per TCE entry so we have to invalidate
2072 			 * the entire cache for it.
2073 			 */
2074 			pnv_pci_phb3_tce_invalidate_entire(phb, rm);
2075 			continue;
2076 		}
2077 		if (phb->model == PNV_PHB_MODEL_PHB3 && phb->regs)
2078 			pnv_pci_phb3_tce_invalidate(pe, rm, shift,
2079 						    index, npages);
2080 		else
2081 			opal_pci_tce_kill(phb->opal_id,
2082 					  OPAL_PCI_TCE_KILL_PAGES,
2083 					  pe->pe_number, 1u << shift,
2084 					  index << shift, npages);
2085 	}
2086 }
2087 
2088 void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm)
2089 {
2090 	if (phb->model == PNV_PHB_MODEL_NPU || phb->model == PNV_PHB_MODEL_PHB3)
2091 		pnv_pci_phb3_tce_invalidate_entire(phb, rm);
2092 	else
2093 		opal_pci_tce_kill(phb->opal_id, OPAL_PCI_TCE_KILL, 0, 0, 0, 0);
2094 }
2095 
2096 static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index,
2097 		long npages, unsigned long uaddr,
2098 		enum dma_data_direction direction,
2099 		unsigned long attrs)
2100 {
2101 	int ret = pnv_tce_build(tbl, index, npages, uaddr, direction,
2102 			attrs);
2103 
2104 	if (!ret)
2105 		pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
2106 
2107 	return ret;
2108 }
2109 
2110 #ifdef CONFIG_IOMMU_API
2111 static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index,
2112 		unsigned long *hpa, enum dma_data_direction *direction)
2113 {
2114 	long ret = pnv_tce_xchg(tbl, index, hpa, direction, true);
2115 
2116 	if (!ret)
2117 		pnv_pci_ioda2_tce_invalidate(tbl, index, 1, false);
2118 
2119 	return ret;
2120 }
2121 
2122 static int pnv_ioda2_tce_xchg_rm(struct iommu_table *tbl, long index,
2123 		unsigned long *hpa, enum dma_data_direction *direction)
2124 {
2125 	long ret = pnv_tce_xchg(tbl, index, hpa, direction, false);
2126 
2127 	if (!ret)
2128 		pnv_pci_ioda2_tce_invalidate(tbl, index, 1, true);
2129 
2130 	return ret;
2131 }
2132 #endif
2133 
2134 static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
2135 		long npages)
2136 {
2137 	pnv_tce_free(tbl, index, npages);
2138 
2139 	pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
2140 }
2141 
2142 static struct iommu_table_ops pnv_ioda2_iommu_ops = {
2143 	.set = pnv_ioda2_tce_build,
2144 #ifdef CONFIG_IOMMU_API
2145 	.exchange = pnv_ioda2_tce_xchg,
2146 	.exchange_rm = pnv_ioda2_tce_xchg_rm,
2147 	.useraddrptr = pnv_tce_useraddrptr,
2148 #endif
2149 	.clear = pnv_ioda2_tce_free,
2150 	.get = pnv_tce_get,
2151 	.free = pnv_pci_ioda2_table_free_pages,
2152 };
2153 
2154 static int pnv_pci_ioda_dev_dma_weight(struct pci_dev *dev, void *data)
2155 {
2156 	unsigned int *weight = (unsigned int *)data;
2157 
	/*
	 * This is quite simplistic. The "base" weight of a device
	 * is 10. 0 means no DMA is to be accounted for it.
	 */
2161 	if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL)
2162 		return 0;
2163 
2164 	if (dev->class == PCI_CLASS_SERIAL_USB_UHCI ||
2165 	    dev->class == PCI_CLASS_SERIAL_USB_OHCI ||
2166 	    dev->class == PCI_CLASS_SERIAL_USB_EHCI)
2167 		*weight += 3;
2168 	else if ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID)
2169 		*weight += 15;
2170 	else
2171 		*weight += 10;
2172 
2173 	return 0;
2174 }
2175 
2176 static unsigned int pnv_pci_ioda_pe_dma_weight(struct pnv_ioda_pe *pe)
2177 {
2178 	unsigned int weight = 0;
2179 
2180 	/* SRIOV VF has same DMA32 weight as its PF */
2181 #ifdef CONFIG_PCI_IOV
2182 	if ((pe->flags & PNV_IODA_PE_VF) && pe->parent_dev) {
2183 		pnv_pci_ioda_dev_dma_weight(pe->parent_dev, &weight);
2184 		return weight;
2185 	}
2186 #endif
2187 
2188 	if ((pe->flags & PNV_IODA_PE_DEV) && pe->pdev) {
2189 		pnv_pci_ioda_dev_dma_weight(pe->pdev, &weight);
2190 	} else if ((pe->flags & PNV_IODA_PE_BUS) && pe->pbus) {
2191 		struct pci_dev *pdev;
2192 
2193 		list_for_each_entry(pdev, &pe->pbus->devices, bus_list)
2194 			pnv_pci_ioda_dev_dma_weight(pdev, &weight);
2195 	} else if ((pe->flags & PNV_IODA_PE_BUS_ALL) && pe->pbus) {
2196 		pci_walk_bus(pe->pbus, pnv_pci_ioda_dev_dma_weight, &weight);
2197 	}
2198 
2199 	return weight;
2200 }
2201 
2202 static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb,
2203 				       struct pnv_ioda_pe *pe)
2204 {
2205 
2206 	struct page *tce_mem = NULL;
2207 	struct iommu_table *tbl;
2208 	unsigned int weight, total_weight = 0;
2209 	unsigned int tce32_segsz, base, segs, avail, i;
2210 	int64_t rc;
2211 	void *addr;
2212 
2213 	/* XXX FIXME: Handle 64-bit only DMA devices */
2214 	/* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. */
2215 	/* XXX FIXME: Allocate multi-level tables on PHB3 */
2216 	weight = pnv_pci_ioda_pe_dma_weight(pe);
2217 	if (!weight)
2218 		return;
2219 
2220 	pci_walk_bus(phb->hose->bus, pnv_pci_ioda_dev_dma_weight,
2221 		     &total_weight);
2222 	segs = (weight * phb->ioda.dma32_count) / total_weight;
2223 	if (!segs)
2224 		segs = 1;
2225 
2226 	/*
2227 	 * Allocate contiguous DMA32 segments. We begin with the expected
2228 	 * number of segments. With one more attempt, the number of DMA32
2229 	 * segments to be allocated is decreased by one until one segment
2230 	 * is allocated successfully.
2231 	 */
2232 	do {
2233 		for (base = 0; base <= phb->ioda.dma32_count - segs; base++) {
2234 			for (avail = 0, i = base; i < base + segs; i++) {
2235 				if (phb->ioda.dma32_segmap[i] ==
2236 				    IODA_INVALID_PE)
2237 					avail++;
2238 			}
2239 
2240 			if (avail == segs)
2241 				goto found;
2242 		}
2243 	} while (--segs);
2244 
2245 	if (!segs) {
2246 		pe_warn(pe, "No available DMA32 segments\n");
2247 		return;
2248 	}
2249 
2250 found:
2251 	tbl = pnv_pci_table_alloc(phb->hose->node);
2252 	if (WARN_ON(!tbl))
2253 		return;
2254 
2255 	iommu_register_group(&pe->table_group, phb->hose->global_number,
2256 			pe->pe_number);
2257 	pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group);
2258 
2259 	/* Grab a 32-bit TCE table */
2260 	pe_info(pe, "DMA weight %d (%d), assigned (%d) %d DMA32 segments\n",
2261 		weight, total_weight, base, segs);
2262 	pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n",
2263 		base * PNV_IODA1_DMA32_SEGSIZE,
2264 		(base + segs) * PNV_IODA1_DMA32_SEGSIZE - 1);
2265 
	/*
	 * XXX Currently, we allocate one big contiguous table for the
	 * TCEs. We only really need one chunk per 256M of TCE space
	 * (i.e. per segment), but that's an optimization for later; it
	 * requires some added smarts in our get/put_tce implementation.
	 *
	 * Each TCE page is 4KB in size and each TCE entry occupies 8
	 * bytes.
2273 	 */
2274 	tce32_segsz = PNV_IODA1_DMA32_SEGSIZE >> (IOMMU_PAGE_SHIFT_4K - 3);
2275 	tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL,
2276 				   get_order(tce32_segsz * segs));
2277 	if (!tce_mem) {
2278 		pe_err(pe, " Failed to allocate a 32-bit TCE memory\n");
2279 		goto fail;
2280 	}
2281 	addr = page_address(tce_mem);
2282 	memset(addr, 0, tce32_segsz * segs);
2283 
2284 	/* Configure HW */
2285 	for (i = 0; i < segs; i++) {
2286 		rc = opal_pci_map_pe_dma_window(phb->opal_id,
2287 					      pe->pe_number,
2288 					      base + i, 1,
2289 					      __pa(addr) + tce32_segsz * i,
2290 					      tce32_segsz, IOMMU_PAGE_SIZE_4K);
2291 		if (rc) {
2292 			pe_err(pe, " Failed to configure 32-bit TCE table, err %lld\n",
2293 			       rc);
2294 			goto fail;
2295 		}
2296 	}
2297 
2298 	/* Setup DMA32 segment mapping */
2299 	for (i = base; i < base + segs; i++)
2300 		phb->ioda.dma32_segmap[i] = pe->pe_number;
2301 
2302 	/* Setup linux iommu table */
2303 	pnv_pci_setup_iommu_table(tbl, addr, tce32_segsz * segs,
2304 				  base * PNV_IODA1_DMA32_SEGSIZE,
2305 				  IOMMU_PAGE_SHIFT_4K);
2306 
2307 	tbl->it_ops = &pnv_ioda1_iommu_ops;
2308 	pe->table_group.tce32_start = tbl->it_offset << tbl->it_page_shift;
2309 	pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift;
2310 	iommu_init_table(tbl, phb->hose->node);
2311 
2312 	if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
2313 		pnv_ioda_setup_bus_dma(pe, pe->pbus);
2314 
2315 	return;
2316  fail:
2317 	/* XXX Failure: Try to fallback to 64-bit only ? */
2318 	if (tce_mem)
2319 		__free_pages(tce_mem, get_order(tce32_segsz * segs));
2320 	if (tbl) {
2321 		pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
2322 		iommu_tce_table_put(tbl);
2323 	}
2324 }
2325 
2326 static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
2327 		int num, struct iommu_table *tbl)
2328 {
2329 	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
2330 			table_group);
2331 	struct pnv_phb *phb = pe->phb;
2332 	int64_t rc;
2333 	const unsigned long size = tbl->it_indirect_levels ?
2334 			tbl->it_level_size : tbl->it_size;
2335 	const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
2336 	const __u64 win_size = tbl->it_size << tbl->it_page_shift;
2337 
2338 	pe_info(pe, "Setting up window#%d %llx..%llx pg=%lx\n",
2339 		num, start_addr, start_addr + win_size - 1,
2340 		IOMMU_PAGE_SIZE(tbl));
2341 
2342 	/*
2343 	 * Map TCE table through TVT. The TVE index is the PE number
	 * shifted left by 1 bit for the 32-bit DMA space.
2345 	 */
2346 	rc = opal_pci_map_pe_dma_window(phb->opal_id,
2347 			pe->pe_number,
2348 			(pe->pe_number << 1) + num,
2349 			tbl->it_indirect_levels + 1,
2350 			__pa(tbl->it_base),
2351 			size << 3,
2352 			IOMMU_PAGE_SIZE(tbl));
2353 	if (rc) {
2354 		pe_err(pe, "Failed to configure TCE table, err %lld\n", rc);
2355 		return rc;
2356 	}
2357 
2358 	pnv_pci_link_table_and_group(phb->hose->node, num,
2359 			tbl, &pe->table_group);
2360 	pnv_pci_ioda2_tce_invalidate_pe(pe);
2361 
2362 	return 0;
2363 }
2364 
2365 static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
2366 {
	uint16_t window_id = (pe->pe_number << 1) + 1;
2368 	int64_t rc;
2369 
2370 	pe_info(pe, "%sabling 64-bit DMA bypass\n", enable ? "En" : "Dis");
2371 	if (enable) {
2372 		phys_addr_t top = memblock_end_of_DRAM();
2373 
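		/*
		 * Round the top of DRAM up so the (power-of-two sized)
		 * bypass window covers all of memory.
		 */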
2374 		top = roundup_pow_of_two(top);
2375 		rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
2376 						     pe->pe_number,
2377 						     window_id,
2378 						     pe->tce_bypass_base,
2379 						     top);
2380 	} else {
2381 		rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
2382 						     pe->pe_number,
2383 						     window_id,
2384 						     pe->tce_bypass_base,
2385 						     0);
2386 	}
2387 	if (rc)
2388 		pe_err(pe, "OPAL error %lld configuring bypass window\n", rc);
2389 	else
2390 		pe->tce_bypass_enabled = enable;
2391 }
2392 
2393 static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group,
2394 		int num, __u32 page_shift, __u64 window_size, __u32 levels,
2395 		bool alloc_userspace_copy, struct iommu_table **ptbl)
2396 {
2397 	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
2398 			table_group);
2399 	int nid = pe->phb->hose->node;
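	/*
	 * Window 0 covers the 32-bit DMA space; any other window starts
	 * at the bypass base (PCI address bit 59 set).
	 */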
2400 	__u64 bus_offset = num ? pe->tce_bypass_base : table_group->tce32_start;
2401 	long ret;
2402 	struct iommu_table *tbl;
2403 
2404 	tbl = pnv_pci_table_alloc(nid);
2405 	if (!tbl)
2406 		return -ENOMEM;
2407 
2408 	tbl->it_ops = &pnv_ioda2_iommu_ops;
2409 
2410 	ret = pnv_pci_ioda2_table_alloc_pages(nid,
2411 			bus_offset, page_shift, window_size,
2412 			levels, alloc_userspace_copy, tbl);
2413 	if (ret) {
2414 		iommu_tce_table_put(tbl);
2415 		return ret;
2416 	}
2417 
2418 	*ptbl = tbl;
2419 
2420 	return 0;
2421 }
2422 
2423 static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
2424 {
2425 	struct iommu_table *tbl = NULL;
2426 	long rc;
2427 
2428 	/*
2429 	 * crashkernel= specifies the kdump kernel's maximum memory at
	 * some offset and there is no guarantee that the result is a power
2431 	 * of 2, which will cause errors later.
2432 	 */
2433 	const u64 max_memory = __rounddown_pow_of_two(memory_hotplug_max());
2434 
2435 	/*
2436 	 * In memory constrained environments, e.g. kdump kernel, the
2437 	 * DMA window can be larger than available memory, which will
2438 	 * cause errors later.
2439 	 */
2440 	const u64 window_size = min((u64)pe->table_group.tce32_size, max_memory);
2441 
2442 	rc = pnv_pci_ioda2_create_table(&pe->table_group, 0,
2443 			IOMMU_PAGE_SHIFT_4K,
2444 			window_size,
2445 			POWERNV_IOMMU_DEFAULT_LEVELS, false, &tbl);
2446 	if (rc) {
2447 		pe_err(pe, "Failed to create 32-bit TCE table, err %ld",
2448 				rc);
2449 		return rc;
2450 	}
2451 
2452 	iommu_init_table(tbl, pe->phb->hose->node);
2453 
2454 	rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
2455 	if (rc) {
2456 		pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n",
2457 				rc);
2458 		iommu_tce_table_put(tbl);
2459 		return rc;
2460 	}
2461 
2462 	if (!pnv_iommu_bypass_disabled)
2463 		pnv_pci_ioda2_set_bypass(pe, true);
2464 
2465 	return 0;
2466 }
2467 
2468 #if defined(CONFIG_IOMMU_API) || defined(CONFIG_PCI_IOV)
2469 static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
2470 		int num)
2471 {
2472 	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
2473 			table_group);
2474 	struct pnv_phb *phb = pe->phb;
2475 	long ret;
2476 
2477 	pe_info(pe, "Removing DMA window #%d\n", num);
2478 
2479 	ret = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
2480 			(pe->pe_number << 1) + num,
2481 			0/* levels */, 0/* table address */,
2482 			0/* table size */, 0/* page size */);
2483 	if (ret)
2484 		pe_warn(pe, "Unmapping failed, ret = %ld\n", ret);
2485 	else
2486 		pnv_pci_ioda2_tce_invalidate_pe(pe);
2487 
2488 	pnv_pci_unlink_table_and_group(table_group->tables[num], table_group);
2489 
2490 	return ret;
2491 }
2492 #endif
2493 
2494 #ifdef CONFIG_IOMMU_API
2495 unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
2496 		__u64 window_size, __u32 levels)
2497 {
2498 	unsigned long bytes = 0;
2499 	const unsigned window_shift = ilog2(window_size);
2500 	unsigned entries_shift = window_shift - page_shift;
2501 	unsigned table_shift = entries_shift + 3;
2502 	unsigned long tce_table_size = max(0x1000UL, 1UL << table_shift);
2503 	unsigned long direct_table_size;
2504 
2505 	if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS) ||
2506 			!is_power_of_2(window_size))
2507 		return 0;
2508 
2509 	/* Calculate a direct table size from window_size and levels */
2510 	entries_shift = (entries_shift + levels - 1) / levels;
2511 	table_shift = entries_shift + 3;
2512 	table_shift = max_t(unsigned, table_shift, PAGE_SHIFT);
2513 	direct_table_size =  1UL << table_shift;
2514 
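	/*
	 * Walk the levels from the bottom up: each level needs one 8-byte
	 * pointer per table at the level below, with a minimum of one
	 * direct-sized table per level.
	 */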
2515 	for ( ; levels; --levels) {
2516 		bytes += _ALIGN_UP(tce_table_size, direct_table_size);
2517 
2518 		tce_table_size /= direct_table_size;
2519 		tce_table_size <<= 3;
2520 		tce_table_size = max_t(unsigned long,
2521 				tce_table_size, direct_table_size);
2522 	}
2523 
2524 	return bytes + bytes; /* one for HW table, one for userspace copy */
2525 }
2526 
2527 static long pnv_pci_ioda2_create_table_userspace(
2528 		struct iommu_table_group *table_group,
2529 		int num, __u32 page_shift, __u64 window_size, __u32 levels,
2530 		struct iommu_table **ptbl)
2531 {
2532 	long ret = pnv_pci_ioda2_create_table(table_group,
2533 			num, page_shift, window_size, levels, true, ptbl);
2534 
2535 	if (!ret)
2536 		(*ptbl)->it_allocated_size = pnv_pci_ioda2_get_table_size(
2537 				page_shift, window_size, levels);
2538 	return ret;
2539 }
2540 
2541 static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group)
2542 {
2543 	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
2544 						table_group);
2545 	/* Store @tbl as pnv_pci_ioda2_unset_window() resets it */
2546 	struct iommu_table *tbl = pe->table_group.tables[0];
2547 
2548 	pnv_pci_ioda2_set_bypass(pe, false);
2549 	pnv_pci_ioda2_unset_window(&pe->table_group, 0);
2550 	if (pe->pbus)
2551 		pnv_ioda_setup_bus_dma(pe, pe->pbus);
2552 	iommu_tce_table_put(tbl);
2553 }
2554 
2555 static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group)
2556 {
2557 	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
2558 						table_group);
2559 
2560 	pnv_pci_ioda2_setup_default_config(pe);
2561 	if (pe->pbus)
2562 		pnv_ioda_setup_bus_dma(pe, pe->pbus);
2563 }
2564 
2565 static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
2566 	.get_table_size = pnv_pci_ioda2_get_table_size,
2567 	.create_table = pnv_pci_ioda2_create_table_userspace,
2568 	.set_window = pnv_pci_ioda2_set_window,
2569 	.unset_window = pnv_pci_ioda2_unset_window,
2570 	.take_ownership = pnv_ioda2_take_ownership,
2571 	.release_ownership = pnv_ioda2_release_ownership,
2572 };
2573 
2574 static void pnv_ioda_setup_bus_iommu_group_add_devices(struct pnv_ioda_pe *pe,
2575 		struct iommu_table_group *table_group,
2576 		struct pci_bus *bus)
2577 {
2578 	struct pci_dev *dev;
2579 
2580 	list_for_each_entry(dev, &bus->devices, bus_list) {
2581 		iommu_add_device(table_group, &dev->dev);
2582 
2583 		if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
2584 			pnv_ioda_setup_bus_iommu_group_add_devices(pe,
2585 					table_group, dev->subordinate);
2586 	}
2587 }
2588 
2589 static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe,
2590 		struct iommu_table_group *table_group, struct pci_bus *bus)
2591 {
2592 
2593 	if (pe->flags & PNV_IODA_PE_DEV)
2594 		iommu_add_device(table_group, &pe->pdev->dev);
2595 
2596 	if ((pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) || bus)
2597 		pnv_ioda_setup_bus_iommu_group_add_devices(pe, table_group,
2598 				bus);
2599 }
2600 
2601 static unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb);
2602 
2603 static void pnv_pci_ioda_setup_iommu_api(void)
2604 {
2605 	struct pci_controller *hose;
2606 	struct pnv_phb *phb;
2607 	struct pnv_ioda_pe *pe;
2608 
2609 	/*
2610 	 * There are 4 types of PEs:
2611 	 * - PNV_IODA_PE_BUS: a downstream port with an adapter,
2612 	 *   created from pnv_pci_setup_bridge();
2613 	 * - PNV_IODA_PE_BUS_ALL: a PCI-PCIX bridge with devices behind it,
2614 	 *   created from pnv_pci_setup_bridge();
2615 	 * - PNV_IODA_PE_VF: a SRIOV virtual function,
2616 	 *   created from pnv_pcibios_sriov_enable();
2617 	 * - PNV_IODA_PE_DEV: an NPU or OCAPI device,
2618 	 *   created from pnv_pci_ioda_fixup().
2619 	 *
	 * Normally a PE is represented by an IOMMU group; however, for
	 * devices with side channels the groups need to be more strict.
2622 	 */
2623 	list_for_each_entry(hose, &hose_list, list_node) {
2624 		phb = hose->private_data;
2625 
2626 		if (phb->type == PNV_PHB_NPU_NVLINK ||
2627 		    phb->type == PNV_PHB_NPU_OCAPI)
2628 			continue;
2629 
2630 		list_for_each_entry(pe, &phb->ioda.pe_list, list) {
2631 			struct iommu_table_group *table_group;
2632 
2633 			table_group = pnv_try_setup_npu_table_group(pe);
2634 			if (!table_group) {
2635 				if (!pnv_pci_ioda_pe_dma_weight(pe))
2636 					continue;
2637 
2638 				table_group = &pe->table_group;
2639 				iommu_register_group(&pe->table_group,
2640 						pe->phb->hose->global_number,
2641 						pe->pe_number);
2642 			}
2643 			pnv_ioda_setup_bus_iommu_group(pe, table_group,
2644 					pe->pbus);
2645 		}
2646 	}
2647 
2648 	/*
2649 	 * Now we have all PHBs discovered, time to add NPU devices to
2650 	 * the corresponding IOMMU groups.
2651 	 */
2652 	list_for_each_entry(hose, &hose_list, list_node) {
2653 		unsigned long  pgsizes;
2654 
2655 		phb = hose->private_data;
2656 
2657 		if (phb->type != PNV_PHB_NPU_NVLINK)
2658 			continue;
2659 
2660 		pgsizes = pnv_ioda_parse_tce_sizes(phb);
2661 		list_for_each_entry(pe, &phb->ioda.pe_list, list) {
2662 			/*
2663 			 * IODA2 bridges get this set up from
2664 			 * pci_controller_ops::setup_bridge but NPU bridges
2665 			 * do not have this hook defined so we do it here.
2666 			 */
2667 			pe->table_group.pgsizes = pgsizes;
2668 			pnv_npu_compound_attach(pe);
2669 		}
2670 	}
2671 }
2672 #else /* !CONFIG_IOMMU_API */
static void pnv_pci_ioda_setup_iommu_api(void) { }
2674 #endif
2675 
2676 static unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb)
2677 {
2678 	struct pci_controller *hose = phb->hose;
2679 	struct device_node *dn = hose->dn;
2680 	unsigned long mask = 0;
2681 	int i, rc, count;
2682 	u32 val;
2683 
2684 	count = of_property_count_u32_elems(dn, "ibm,supported-tce-sizes");
2685 	if (count <= 0) {
2686 		mask = SZ_4K | SZ_64K;
		/* Add 16M and 256M for POWER8 by default */
2688 		if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
2689 				!cpu_has_feature(CPU_FTR_ARCH_300))
2690 			mask |= SZ_16M | SZ_256M;
2691 		return mask;
2692 	}
2693 
2694 	for (i = 0; i < count; i++) {
2695 		rc = of_property_read_u32_index(dn, "ibm,supported-tce-sizes",
2696 						i, &val);
2697 		if (rc == 0)
2698 			mask |= 1ULL << val;
2699 	}
2700 
2701 	return mask;
2702 }
2703 
2704 static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
2705 				       struct pnv_ioda_pe *pe)
2706 {
2707 	int64_t rc;
2708 
2709 	if (!pnv_pci_ioda_pe_dma_weight(pe))
2710 		return;
2711 
2712 	/* TVE #1 is selected by PCI address bit 59 */
2713 	pe->tce_bypass_base = 1ull << 59;
2714 
	/* The PE will reserve all possible 32-bit space */
2716 	pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n",
2717 		phb->ioda.m32_pci_base);
2718 
2719 	/* Setup linux iommu table */
2720 	pe->table_group.tce32_start = 0;
2721 	pe->table_group.tce32_size = phb->ioda.m32_pci_base;
2722 	pe->table_group.max_dynamic_windows_supported =
2723 			IOMMU_TABLE_GROUP_MAX_TABLES;
2724 	pe->table_group.max_levels = POWERNV_IOMMU_MAX_LEVELS;
2725 	pe->table_group.pgsizes = pnv_ioda_parse_tce_sizes(phb);
2726 #ifdef CONFIG_IOMMU_API
2727 	pe->table_group.ops = &pnv_pci_ioda2_ops;
2728 #endif
2729 
2730 	rc = pnv_pci_ioda2_setup_default_config(pe);
2731 	if (rc)
2732 		return;
2733 
2734 	if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
2735 		pnv_ioda_setup_bus_dma(pe, pe->pbus);
2736 }
2737 
2738 int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq)
2739 {
2740 	struct pnv_phb *phb = container_of(chip, struct pnv_phb,
2741 					   ioda.irq_chip);
2742 
2743 	return opal_pci_msi_eoi(phb->opal_id, hw_irq);
2744 }
2745 
2746 static void pnv_ioda2_msi_eoi(struct irq_data *d)
2747 {
2748 	int64_t rc;
2749 	unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
2750 	struct irq_chip *chip = irq_data_get_irq_chip(d);
2751 
2752 	rc = pnv_opal_pci_msi_eoi(chip, hw_irq);
2753 	WARN_ON_ONCE(rc);
2754 
2755 	icp_native_eoi(d);
2756 }
2757 
2758 
2759 void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq)
2760 {
2761 	struct irq_data *idata;
2762 	struct irq_chip *ichip;
2763 
2764 	/* The MSI EOI OPAL call is only needed on PHB3 */
2765 	if (phb->model != PNV_PHB_MODEL_PHB3)
2766 		return;
2767 
2768 	if (!phb->ioda.irq_chip_init) {
2769 		/*
2770 		 * First time we setup an MSI IRQ, we need to setup the
2771 		 * corresponding IRQ chip to route correctly.
2772 		 */
2773 		idata = irq_get_irq_data(virq);
2774 		ichip = irq_data_get_irq_chip(idata);
2775 		phb->ioda.irq_chip_init = 1;
2776 		phb->ioda.irq_chip = *ichip;
2777 		phb->ioda.irq_chip.irq_eoi = pnv_ioda2_msi_eoi;
2778 	}
2779 	irq_set_chip(virq, &phb->ioda.irq_chip);
2780 }
2781 
2782 /*
2783  * Returns true iff chip is something that we could call
2784  * pnv_opal_pci_msi_eoi for.
2785  */
2786 bool is_pnv_opal_msi(struct irq_chip *chip)
2787 {
2788 	return chip->irq_eoi == pnv_ioda2_msi_eoi;
2789 }
2790 EXPORT_SYMBOL_GPL(is_pnv_opal_msi);
2791 
2792 static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
2793 				  unsigned int hwirq, unsigned int virq,
2794 				  unsigned int is_64, struct msi_msg *msg)
2795 {
2796 	struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev);
2797 	unsigned int xive_num = hwirq - phb->msi_base;
2798 	__be32 data;
2799 	int rc;
2800 
2801 	/* No PE assigned ? bail out ... no MSI for you ! */
2802 	if (pe == NULL)
2803 		return -ENXIO;
2804 
2805 	/* Check if we have an MVE */
2806 	if (pe->mve_number < 0)
2807 		return -ENXIO;
2808 
2809 	/* Force 32-bit MSI on some broken devices */
2810 	if (dev->no_64bit_msi)
2811 		is_64 = 0;
2812 
2813 	/* Assign XIVE to PE */
2814 	rc = opal_pci_set_xive_pe(phb->opal_id, pe->pe_number, xive_num);
2815 	if (rc) {
2816 		pr_warn("%s: OPAL error %d setting XIVE %d PE\n",
2817 			pci_name(dev), rc, xive_num);
2818 		return -EIO;
2819 	}
2820 
2821 	if (is_64) {
2822 		__be64 addr64;
2823 
2824 		rc = opal_get_msi_64(phb->opal_id, pe->mve_number, xive_num, 1,
2825 				     &addr64, &data);
2826 		if (rc) {
2827 			pr_warn("%s: OPAL error %d getting 64-bit MSI data\n",
2828 				pci_name(dev), rc);
2829 			return -EIO;
2830 		}
2831 		msg->address_hi = be64_to_cpu(addr64) >> 32;
2832 		msg->address_lo = be64_to_cpu(addr64) & 0xfffffffful;
2833 	} else {
2834 		__be32 addr32;
2835 
2836 		rc = opal_get_msi_32(phb->opal_id, pe->mve_number, xive_num, 1,
2837 				     &addr32, &data);
2838 		if (rc) {
2839 			pr_warn("%s: OPAL error %d getting 32-bit MSI data\n",
2840 				pci_name(dev), rc);
2841 			return -EIO;
2842 		}
2843 		msg->address_hi = 0;
2844 		msg->address_lo = be32_to_cpu(addr32);
2845 	}
2846 	msg->data = be32_to_cpu(data);
2847 
2848 	pnv_set_msi_irq_chip(phb, virq);
2849 
2850 	pr_devel("%s: %s-bit MSI on hwirq %x (xive #%d),"
2851 		 " address=%x_%08x data=%x PE# %x\n",
2852 		 pci_name(dev), is_64 ? "64" : "32", hwirq, xive_num,
2853 		 msg->address_hi, msg->address_lo, data, pe->pe_number);
2854 
2855 	return 0;
2856 }
2857 
2858 static void pnv_pci_init_ioda_msis(struct pnv_phb *phb)
2859 {
2860 	unsigned int count;
2861 	const __be32 *prop = of_get_property(phb->hose->dn,
2862 					     "ibm,opal-msi-ranges", NULL);
2863 	if (!prop) {
2864 		/* BML Fallback */
2865 		prop = of_get_property(phb->hose->dn, "msi-ranges", NULL);
2866 	}
2867 	if (!prop)
2868 		return;
2869 
2870 	phb->msi_base = be32_to_cpup(prop);
2871 	count = be32_to_cpup(prop + 1);
2872 	if (msi_bitmap_alloc(&phb->msi_bmp, count, phb->hose->dn)) {
2873 		pr_err("PCI %d: Failed to allocate MSI bitmap !\n",
2874 		       phb->hose->global_number);
2875 		return;
2876 	}
2877 
2878 	phb->msi_setup = pnv_pci_ioda_msi_setup;
2879 	phb->msi32_support = 1;
2880 	pr_info("  Allocated bitmap for %d MSIs (base IRQ 0x%x)\n",
2881 		count, phb->msi_base);
2882 }
2883 
2884 #ifdef CONFIG_PCI_IOV
2885 static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
2886 {
2887 	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
2888 	struct pnv_phb *phb = hose->private_data;
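	/*
	 * Once the total VF BAR size exceeds this gate (a quarter of an
	 * M64 segment), switch to single PE mode; see the comment below.
	 */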
2889 	const resource_size_t gate = phb->ioda.m64_segsize >> 2;
2890 	struct resource *res;
2891 	int i;
2892 	resource_size_t size, total_vf_bar_sz;
2893 	struct pci_dn *pdn;
2894 	int mul, total_vfs;
2895 
2896 	if (!pdev->is_physfn || pci_dev_is_added(pdev))
2897 		return;
2898 
2899 	pdn = pci_get_pdn(pdev);
2900 	pdn->vfs_expanded = 0;
2901 	pdn->m64_single_mode = false;
2902 
2903 	total_vfs = pci_sriov_get_totalvfs(pdev);
2904 	mul = phb->ioda.total_pe_num;
2905 	total_vf_bar_sz = 0;
2906 
2907 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
2908 		res = &pdev->resource[i + PCI_IOV_RESOURCES];
2909 		if (!res->flags || res->parent)
2910 			continue;
2911 		if (!pnv_pci_is_m64_flags(res->flags)) {
2912 			dev_warn(&pdev->dev, "Don't support SR-IOV with"
2913 					" non M64 VF BAR%d: %pR. \n",
2914 				 i, res);
2915 			goto truncate_iov;
2916 		}
2917 
2918 		total_vf_bar_sz += pci_iov_resource_size(pdev,
2919 				i + PCI_IOV_RESOURCES);
2920 
2921 		/*
		 * If bigger than a quarter of the M64 segment size, just
		 * round up to a power of two.
		 *
		 * Generally, one M64 BAR maps one IOV BAR. To avoid conflicts
		 * with other devices, the IOV BAR size is expanded to
		 * (total_pe * VF_BAR_size).  When VF_BAR_size is half of the
		 * M64 segment size, the expanded size would equal half of the
		 * whole M64 space size, which would exhaust the M64 space and
		 * limit system flexibility.  This is a design decision to
		 * set the boundary to a quarter of the M64 segment size.
2932 		 */
2933 		if (total_vf_bar_sz > gate) {
2934 			mul = roundup_pow_of_two(total_vfs);
2935 			dev_info(&pdev->dev,
2936 				"VF BAR Total IOV size %llx > %llx, roundup to %d VFs\n",
2937 				total_vf_bar_sz, gate, mul);
2938 			pdn->m64_single_mode = true;
2939 			break;
2940 		}
2941 	}
2942 
2943 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
2944 		res = &pdev->resource[i + PCI_IOV_RESOURCES];
2945 		if (!res->flags || res->parent)
2946 			continue;
2947 
2948 		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
2949 		/*
2950 		 * On PHB3, the minimum size alignment of M64 BAR in single
2951 		 * mode is 32MB.
2952 		 */
2953 		if (pdn->m64_single_mode && (size < SZ_32M))
2954 			goto truncate_iov;
2955 		dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
2956 		res->end = res->start + size * mul - 1;
2957 		dev_dbg(&pdev->dev, "                       %pR\n", res);
2958 		dev_info(&pdev->dev, "VF BAR%d: %pR (expanded to %d VFs for PE alignment)",
2959 			 i, res, mul);
2960 	}
2961 	pdn->vfs_expanded = mul;
2962 
2963 	return;
2964 
2965 truncate_iov:
2966 	/* To save MMIO space, IOV BAR is truncated. */
2967 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
2968 		res = &pdev->resource[i + PCI_IOV_RESOURCES];
2969 		res->flags = 0;
2970 		res->end = res->start - 1;
2971 	}
2972 }
2973 #endif /* CONFIG_PCI_IOV */
2974 
2975 static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe,
2976 				  struct resource *res)
2977 {
2978 	struct pnv_phb *phb = pe->phb;
2979 	struct pci_bus_region region;
2980 	int index;
2981 	int64_t rc;
2982 
2983 	if (!res || !res->flags || res->start > res->end)
2984 		return;
2985 
2986 	if (res->flags & IORESOURCE_IO) {
2987 		region.start = res->start - phb->ioda.io_pci_base;
2988 		region.end   = res->end - phb->ioda.io_pci_base;
2989 		index = region.start / phb->ioda.io_segsize;
2990 
2991 		while (index < phb->ioda.total_pe_num &&
2992 		       region.start <= region.end) {
2993 			phb->ioda.io_segmap[index] = pe->pe_number;
2994 			rc = opal_pci_map_pe_mmio_window(phb->opal_id,
2995 				pe->pe_number, OPAL_IO_WINDOW_TYPE, 0, index);
2996 			if (rc != OPAL_SUCCESS) {
2997 				pr_err("%s: Error %lld mapping IO segment#%d to PE#%x\n",
2998 				       __func__, rc, index, pe->pe_number);
2999 				break;
3000 			}
3001 
3002 			region.start += phb->ioda.io_segsize;
3003 			index++;
3004 		}
3005 	} else if ((res->flags & IORESOURCE_MEM) &&
3006 		   !pnv_pci_is_m64(phb, res)) {
3007 		region.start = res->start -
3008 			       phb->hose->mem_offset[0] -
3009 			       phb->ioda.m32_pci_base;
3010 		region.end   = res->end -
3011 			       phb->hose->mem_offset[0] -
3012 			       phb->ioda.m32_pci_base;
3013 		index = region.start / phb->ioda.m32_segsize;
3014 
3015 		while (index < phb->ioda.total_pe_num &&
3016 		       region.start <= region.end) {
3017 			phb->ioda.m32_segmap[index] = pe->pe_number;
3018 			rc = opal_pci_map_pe_mmio_window(phb->opal_id,
3019 				pe->pe_number, OPAL_M32_WINDOW_TYPE, 0, index);
3020 			if (rc != OPAL_SUCCESS) {
3021 				pr_err("%s: Error %lld mapping M32 segment#%d to PE#%x",
3022 				       __func__, rc, index, pe->pe_number);
3023 				break;
3024 			}
3025 
3026 			region.start += phb->ioda.m32_segsize;
3027 			index++;
3028 		}
3029 	}
3030 }
3031 
3032 /*
 * This function is supposed to be called on PEs from top to bottom,
 * so the I/O or MMIO segment assigned to a parent PE can be
 * overridden by its child PEs if necessary.
3036  */
3037 static void pnv_ioda_setup_pe_seg(struct pnv_ioda_pe *pe)
3038 {
3039 	struct pci_dev *pdev;
3040 	int i;
3041 
3042 	/*
	 * NOTE: We only care about PCI-bus-based PEs for now. PEs based
	 * on PCI devices, for example SR-IOV VFs, will be handled later.
3046 	 */
3047 	BUG_ON(!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)));
3048 
3049 	list_for_each_entry(pdev, &pe->pbus->devices, bus_list) {
3050 		for (i = 0; i <= PCI_ROM_RESOURCE; i++)
3051 			pnv_ioda_setup_pe_res(pe, &pdev->resource[i]);
3052 
3053 		/*
3054 		 * If the PE contains all subordinate PCI buses, the
3055 		 * windows of the child bridges should be mapped to
3056 		 * the PE as well.
3057 		 */
3058 		if (!(pe->flags & PNV_IODA_PE_BUS_ALL) || !pci_is_bridge(pdev))
3059 			continue;
3060 		for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++)
3061 			pnv_ioda_setup_pe_res(pe,
3062 				&pdev->resource[PCI_BRIDGE_RESOURCES + i]);
3063 	}
3064 }
3065 
3066 #ifdef CONFIG_DEBUG_FS
3067 static int pnv_pci_diag_data_set(void *data, u64 val)
3068 {
3069 	struct pci_controller *hose;
3070 	struct pnv_phb *phb;
3071 	s64 ret;
3072 
3073 	if (val != 1ULL)
3074 		return -EINVAL;
3075 
3076 	hose = (struct pci_controller *)data;
3077 	if (!hose || !hose->private_data)
3078 		return -ENODEV;
3079 
3080 	phb = hose->private_data;
3081 
3082 	/* Retrieve the diag data from firmware */
3083 	ret = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag_data,
3084 					  phb->diag_data_size);
3085 	if (ret != OPAL_SUCCESS)
3086 		return -EIO;
3087 
3088 	/* Print the diag data to the kernel log */
3089 	pnv_pci_dump_phb_diag_data(phb->hose, phb->diag_data);
3090 	return 0;
3091 }
3092 
3093 DEFINE_SIMPLE_ATTRIBUTE(pnv_pci_diag_data_fops, NULL,
3094 			pnv_pci_diag_data_set, "%llu\n");
3095 
3096 #endif /* CONFIG_DEBUG_FS */
3097 
3098 static void pnv_pci_ioda_create_dbgfs(void)
3099 {
3100 #ifdef CONFIG_DEBUG_FS
3101 	struct pci_controller *hose, *tmp;
3102 	struct pnv_phb *phb;
3103 	char name[16];
3104 
3105 	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
3106 		phb = hose->private_data;
3107 
		/* Mark PHB initialization as complete */
3109 		phb->initialized = 1;
3110 
3111 		sprintf(name, "PCI%04x", hose->global_number);
3112 		phb->dbgfs = debugfs_create_dir(name, powerpc_debugfs_root);
3113 		if (!phb->dbgfs) {
3114 			pr_warn("%s: Error on creating debugfs on PHB#%x\n",
3115 				__func__, hose->global_number);
3116 			continue;
3117 		}
3118 
3119 		debugfs_create_file("dump_diag_regs", 0200, phb->dbgfs, hose,
3120 				    &pnv_pci_diag_data_fops);
3121 	}
3122 #endif /* CONFIG_DEBUG_FS */
3123 }
3124 
3125 static void pnv_pci_enable_bridge(struct pci_bus *bus)
3126 {
3127 	struct pci_dev *dev = bus->self;
3128 	struct pci_bus *child;
3129 
3130 	/* Empty bus ? bail */
3131 	if (list_empty(&bus->devices))
3132 		return;
3133 
3134 	/*
	 * If there's a bridge associated with that bus, enable it. This works
3136 	 * around races in the generic code if the enabling is done during
3137 	 * parallel probing. This can be removed once those races have been
3138 	 * fixed.
3139 	 */
3140 	if (dev) {
3141 		int rc = pci_enable_device(dev);
3142 		if (rc)
3143 			pci_err(dev, "Error enabling bridge (%d)\n", rc);
3144 		pci_set_master(dev);
3145 	}
3146 
3147 	/* Perform the same to child busses */
3148 	list_for_each_entry(child, &bus->children, node)
3149 		pnv_pci_enable_bridge(child);
3150 }
3151 
3152 static void pnv_pci_enable_bridges(void)
3153 {
3154 	struct pci_controller *hose;
3155 
3156 	list_for_each_entry(hose, &hose_list, list_node)
3157 		pnv_pci_enable_bridge(hose->bus);
3158 }
3159 
3160 static void pnv_pci_ioda_fixup(void)
3161 {
3162 	pnv_pci_ioda_setup_PEs();
3163 	pnv_pci_ioda_setup_iommu_api();
3164 	pnv_pci_ioda_create_dbgfs();
3165 
3166 	pnv_pci_enable_bridges();
3167 
3168 #ifdef CONFIG_EEH
3169 	pnv_eeh_post_init();
3170 #endif
3171 }
3172 
3173 /*
3174  * Returns the alignment for I/O or memory windows for P2P
3175  * bridges. That actually depends on how PEs are segmented.
 * For now, we return the I/O or M32 segment size for PE-sensitive
 * P2P bridges. Otherwise, the default values (4KiB for I/O,
 * 1MiB for memory) will be returned.
 *
 * The current PCI bus might be put into one PE, which was
 * created against the parent PCI bridge. In that case, we
 * needn't enlarge the alignment, so that we can save some
 * resources.
3184  */
3185 static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus,
3186 						unsigned long type)
3187 {
3188 	struct pci_dev *bridge;
3189 	struct pci_controller *hose = pci_bus_to_host(bus);
3190 	struct pnv_phb *phb = hose->private_data;
3191 	int num_pci_bridges = 0;
3192 
3193 	bridge = bus->self;
3194 	while (bridge) {
3195 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) {
3196 			num_pci_bridges++;
3197 			if (num_pci_bridges >= 2)
3198 				return 1;
3199 		}
3200 
3201 		bridge = bridge->bus->self;
3202 	}
3203 
3204 	/*
3205 	 * We fall back to M32 if M64 isn't supported. We enforce the M64
3206 	 * alignment for any 64-bit resource, PCIe doesn't care and
3207 	 * bridges only do 64-bit prefetchable anyway.
3208 	 */
3209 	if (phb->ioda.m64_segsize && pnv_pci_is_m64_flags(type))
3210 		return phb->ioda.m64_segsize;
3211 	if (type & IORESOURCE_MEM)
3212 		return phb->ioda.m32_segsize;
3213 
3214 	return phb->ioda.io_segsize;
3215 }
3216 
3217 /*
3218  * We are updating root port or the upstream port of the
3219  * bridge behind the root port with PHB's windows in order
3220  * to accommodate the changes on required resources during
3221  * PCI (slot) hotplug, which is connected to either root
3222  * port or the downstream ports of PCIe switch behind the
3223  * root port.
3224  */
3225 static void pnv_pci_fixup_bridge_resources(struct pci_bus *bus,
3226 					   unsigned long type)
3227 {
3228 	struct pci_controller *hose = pci_bus_to_host(bus);
3229 	struct pnv_phb *phb = hose->private_data;
3230 	struct pci_dev *bridge = bus->self;
3231 	struct resource *r, *w;
3232 	bool msi_region = false;
3233 	int i;
3234 
	/* Check if we need to apply the fixup to the bridge's windows */
3236 	if (!pci_is_root_bus(bridge->bus) &&
3237 	    !pci_is_root_bus(bridge->bus->self->bus))
3238 		return;
3239 
3240 	/* Fixup the resources */
3241 	for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++) {
3242 		r = &bridge->resource[PCI_BRIDGE_RESOURCES + i];
3243 		if (!r->flags || !r->parent)
3244 			continue;
3245 
3246 		w = NULL;
3247 		if (r->flags & type & IORESOURCE_IO)
3248 			w = &hose->io_resource;
3249 		else if (pnv_pci_is_m64(phb, r) &&
3250 			 (type & IORESOURCE_PREFETCH) &&
3251 			 phb->ioda.m64_segsize)
3252 			w = &hose->mem_resources[1];
3253 		else if (r->flags & type & IORESOURCE_MEM) {
3254 			w = &hose->mem_resources[0];
3255 			msi_region = true;
3256 		}
3257 
3258 		r->start = w->start;
3259 		r->end = w->end;
3260 
		/*
		 * The 64KB 32-bit MSI region shouldn't be included in
		 * the 32-bit bridge window. Otherwise, we can see strange
		 * issues, one of them being an EEH error observed on Garrison.
		 *
		 * Exclude the top 1MB region, which is the minimal alignment
		 * of the 32-bit bridge window.
		 */
3268 		if (msi_region) {
3269 			r->end += 0x10000;
3270 			r->end -= 0x100000;
3271 		}
3272 	}
3273 }
3274 
3275 static void pnv_pci_setup_bridge(struct pci_bus *bus, unsigned long type)
3276 {
3277 	struct pci_controller *hose = pci_bus_to_host(bus);
3278 	struct pnv_phb *phb = hose->private_data;
3279 	struct pci_dev *bridge = bus->self;
3280 	struct pnv_ioda_pe *pe;
3281 	bool all = (pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE);
3282 
3283 	/* Extend bridge's windows if necessary */
3284 	pnv_pci_fixup_bridge_resources(bus, type);
3285 
	/* The PE for the root bus should be realized before any others */
3287 	if (!phb->ioda.root_pe_populated) {
3288 		pe = pnv_ioda_setup_bus_PE(phb->hose->bus, false);
3289 		if (pe) {
3290 			phb->ioda.root_pe_idx = pe->pe_number;
3291 			phb->ioda.root_pe_populated = true;
3292 		}
3293 	}
3294 
	/* Don't assign a PE to a PCI bus that has no subordinate devices */
3296 	if (list_empty(&bus->devices))
3297 		return;
3298 
3299 	/* Reserve PEs according to used M64 resources */
3300 	pnv_ioda_reserve_m64_pe(bus, NULL, all);
3301 
3302 	/*
3303 	 * Assign PE. We might run here because of partial hotplug.
	 * In that case, we just pick up the existing PE and should
3305 	 * not allocate resources again.
3306 	 */
3307 	pe = pnv_ioda_setup_bus_PE(bus, all);
3308 	if (!pe)
3309 		return;
3310 
3311 	pnv_ioda_setup_pe_seg(pe);
3312 	switch (phb->type) {
3313 	case PNV_PHB_IODA1:
3314 		pnv_pci_ioda1_setup_dma_pe(phb, pe);
3315 		break;
3316 	case PNV_PHB_IODA2:
3317 		pnv_pci_ioda2_setup_dma_pe(phb, pe);
3318 		break;
3319 	default:
3320 		pr_warn("%s: No DMA for PHB#%x (type %d)\n",
3321 			__func__, phb->hose->global_number, phb->type);
3322 	}
3323 }
3324 
3325 static resource_size_t pnv_pci_default_alignment(void)
3326 {
3327 	return PAGE_SIZE;
3328 }
3329 
3330 #ifdef CONFIG_PCI_IOV
3331 static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
3332 						      int resno)
3333 {
3334 	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
3335 	struct pnv_phb *phb = hose->private_data;
3336 	struct pci_dn *pdn = pci_get_pdn(pdev);
3337 	resource_size_t align;
3338 
3339 	/*
	 * On the PowerNV platform, the IOV BAR is mapped by an M64 BAR to
	 * enable SR-IOV, and from the hardware perspective the range mapped
	 * by an M64 BAR must be size-aligned.
	 *
	 * When the IOV BAR is mapped with an M64 BAR in Single PE mode, this
	 * extra powernv-specific hardware restriction is gone. But if we just
	 * use the VF BAR size as the alignment, the PF BAR / VF BAR may be
	 * allocated within one segment of M64 #15, which introduces a PE
	 * conflict between PF and VF. Based on this, the minimum alignment
	 * of an IOV BAR is m64_segsize.
	 *
	 * This function returns the total IOV BAR size if the M64 BAR is in
	 * Shared PE mode, or just the VF BAR size if not.
	 * If the M64 BAR is in Single PE mode, it returns the VF BAR size or
	 * the M64 segment size if the IOV BAR size is less.
3355 	 */
3356 	align = pci_iov_resource_size(pdev, resno);
3357 	if (!pdn->vfs_expanded)
3358 		return align;
3359 	if (pdn->m64_single_mode)
3360 		return max(align, (resource_size_t)phb->ioda.m64_segsize);
3361 
3362 	return pdn->vfs_expanded * align;
3363 }
3364 #endif /* CONFIG_PCI_IOV */
3365 
3366 /* Prevent enabling devices for which we couldn't properly
3367  * assign a PE
3368  */
3369 static bool pnv_pci_enable_device_hook(struct pci_dev *dev)
3370 {
3371 	struct pci_controller *hose = pci_bus_to_host(dev->bus);
3372 	struct pnv_phb *phb = hose->private_data;
3373 	struct pci_dn *pdn;
3374 
	/*
	 * The function is probably called while the PEs have not
	 * been created yet, for example during resource reassignment
	 * in the PCI probe period. We just skip the check if the
	 * PEs aren't ready.
	 */
3380 	if (!phb->initialized)
3381 		return true;
3382 
3383 	pdn = pci_get_pdn(dev);
3384 	if (!pdn || pdn->pe_number == IODA_INVALID_PE)
3385 		return false;
3386 
3387 	return true;
3388 }
3389 
3390 static long pnv_pci_ioda1_unset_window(struct iommu_table_group *table_group,
3391 				       int num)
3392 {
3393 	struct pnv_ioda_pe *pe = container_of(table_group,
3394 					      struct pnv_ioda_pe, table_group);
3395 	struct pnv_phb *phb = pe->phb;
3396 	unsigned int idx;
3397 	long rc;
3398 
3399 	pe_info(pe, "Removing DMA window #%d\n", num);
3400 	for (idx = 0; idx < phb->ioda.dma32_count; idx++) {
3401 		if (phb->ioda.dma32_segmap[idx] != pe->pe_number)
3402 			continue;
3403 
3404 		rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
3405 						idx, 0, 0ul, 0ul, 0ul);
3406 		if (rc != OPAL_SUCCESS) {
3407 			pe_warn(pe, "Failure %ld unmapping DMA32 segment#%d\n",
3408 				rc, idx);
3409 			return rc;
3410 		}
3411 
3412 		phb->ioda.dma32_segmap[idx] = IODA_INVALID_PE;
3413 	}
3414 
3415 	pnv_pci_unlink_table_and_group(table_group->tables[num], table_group);
3416 	return OPAL_SUCCESS;
3417 }
3418 
3419 static void pnv_pci_ioda1_release_pe_dma(struct pnv_ioda_pe *pe)
3420 {
3421 	unsigned int weight = pnv_pci_ioda_pe_dma_weight(pe);
3422 	struct iommu_table *tbl = pe->table_group.tables[0];
3423 	int64_t rc;
3424 
3425 	if (!weight)
3426 		return;
3427 
3428 	rc = pnv_pci_ioda1_unset_window(&pe->table_group, 0);
3429 	if (rc != OPAL_SUCCESS)
3430 		return;
3431 
3432 	pnv_pci_p7ioc_tce_invalidate(tbl, tbl->it_offset, tbl->it_size, false);
3433 	if (pe->table_group.group) {
3434 		iommu_group_put(pe->table_group.group);
3435 		WARN_ON(pe->table_group.group);
3436 	}
3437 
3438 	free_pages(tbl->it_base, get_order(tbl->it_size << 3));
3439 	iommu_tce_table_put(tbl);
3440 }
3441 
3442 static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe)
3443 {
3444 	struct iommu_table *tbl = pe->table_group.tables[0];
3445 	unsigned int weight = pnv_pci_ioda_pe_dma_weight(pe);
3446 #ifdef CONFIG_IOMMU_API
3447 	int64_t rc;
3448 #endif
3449 
3450 	if (!weight)
3451 		return;
3452 
3453 #ifdef CONFIG_IOMMU_API
3454 	rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
3455 	if (rc)
3456 		pe_warn(pe, "OPAL error %lld release DMA window\n", rc);
3457 #endif
3458 
3459 	pnv_pci_ioda2_set_bypass(pe, false);
3460 	if (pe->table_group.group) {
3461 		iommu_group_put(pe->table_group.group);
3462 		WARN_ON(pe->table_group.group);
3463 	}
3464 
3465 	iommu_tce_table_put(tbl);
3466 }
3467 
3468 static void pnv_ioda_free_pe_seg(struct pnv_ioda_pe *pe,
3469 				 unsigned short win,
3470 				 unsigned int *map)
3471 {
3472 	struct pnv_phb *phb = pe->phb;
3473 	int idx;
3474 	int64_t rc;
3475 
3476 	for (idx = 0; idx < phb->ioda.total_pe_num; idx++) {
3477 		if (map[idx] != pe->pe_number)
3478 			continue;
3479 
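		/*
		 * IODA1 M64 space is carved into BARs of PNV_IODA1_M64_SEGS
		 * segments each, so convert the linear segment index into a
		 * (BAR, segment) pair.
		 */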
3480 		if (win == OPAL_M64_WINDOW_TYPE)
3481 			rc = opal_pci_map_pe_mmio_window(phb->opal_id,
3482 					phb->ioda.reserved_pe_idx, win,
3483 					idx / PNV_IODA1_M64_SEGS,
3484 					idx % PNV_IODA1_M64_SEGS);
3485 		else
3486 			rc = opal_pci_map_pe_mmio_window(phb->opal_id,
3487 					phb->ioda.reserved_pe_idx, win, 0, idx);
3488 
3489 		if (rc != OPAL_SUCCESS)
3490 			pe_warn(pe, "Error %lld unmapping (%d) segment#%d\n",
3491 				rc, win, idx);
3492 
3493 		map[idx] = IODA_INVALID_PE;
3494 	}
3495 }
3496 
3497 static void pnv_ioda_release_pe_seg(struct pnv_ioda_pe *pe)
3498 {
3499 	struct pnv_phb *phb = pe->phb;
3500 
3501 	if (phb->type == PNV_PHB_IODA1) {
3502 		pnv_ioda_free_pe_seg(pe, OPAL_IO_WINDOW_TYPE,
3503 				     phb->ioda.io_segmap);
3504 		pnv_ioda_free_pe_seg(pe, OPAL_M32_WINDOW_TYPE,
3505 				     phb->ioda.m32_segmap);
3506 		pnv_ioda_free_pe_seg(pe, OPAL_M64_WINDOW_TYPE,
3507 				     phb->ioda.m64_segmap);
3508 	} else if (phb->type == PNV_PHB_IODA2) {
3509 		pnv_ioda_free_pe_seg(pe, OPAL_M32_WINDOW_TYPE,
3510 				     phb->ioda.m32_segmap);
3511 	}
3512 }
3513 
3514 static void pnv_ioda_release_pe(struct pnv_ioda_pe *pe)
3515 {
3516 	struct pnv_phb *phb = pe->phb;
3517 	struct pnv_ioda_pe *slave, *tmp;
3518 
3519 	list_del(&pe->list);
3520 	switch (phb->type) {
3521 	case PNV_PHB_IODA1:
3522 		pnv_pci_ioda1_release_pe_dma(pe);
3523 		break;
3524 	case PNV_PHB_IODA2:
3525 		pnv_pci_ioda2_release_pe_dma(pe);
3526 		break;
3527 	default:
3528 		WARN_ON(1);
3529 	}
3530 
3531 	pnv_ioda_release_pe_seg(pe);
3532 	pnv_ioda_deconfigure_pe(pe->phb, pe);
3533 
3534 	/* Release slave PEs in the compound PE */
3535 	if (pe->flags & PNV_IODA_PE_MASTER) {
3536 		list_for_each_entry_safe(slave, tmp, &pe->slaves, list) {
3537 			list_del(&slave->list);
3538 			pnv_ioda_free_pe(slave);
3539 		}
3540 	}
3541 
3542 	/*
	 * The PE for the root bus can be removed because of hotplug in EEH
	 * recovery for a fenced PHB error. We need to mark the PE dead so
	 * that it can be populated again in the PCI hot-add path. The PE
	 * shouldn't be destroyed as it's a globally reserved resource.
3547 	 */
3548 	if (phb->ioda.root_pe_populated &&
3549 	    phb->ioda.root_pe_idx == pe->pe_number)
3550 		phb->ioda.root_pe_populated = false;
3551 	else
3552 		pnv_ioda_free_pe(pe);
3553 }
3554 
3555 static void pnv_pci_release_device(struct pci_dev *pdev)
3556 {
3557 	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
3558 	struct pnv_phb *phb = hose->private_data;
3559 	struct pci_dn *pdn = pci_get_pdn(pdev);
3560 	struct pnv_ioda_pe *pe;
3561 
3562 	if (pdev->is_virtfn)
3563 		return;
3564 
3565 	if (!pdn || pdn->pe_number == IODA_INVALID_PE)
3566 		return;
3567 
3568 	/*
	 * PCI hotplug can happen as part of EEH error recovery. The @pdn
	 * isn't removed and re-added in this scenario, so we should set
	 * the PE number in @pdn to an invalid one. Otherwise, the PE's
	 * device count is decreased when removing devices but never
	 * increased when adding them back, leading to an unbalanced PE
	 * device count that eventually breaks the normal PCI hotplug path.
3575 	 */
3576 	pe = &phb->ioda.pe_array[pdn->pe_number];
3577 	pdn->pe_number = IODA_INVALID_PE;
3578 
3579 	WARN_ON(--pe->device_count < 0);
3580 	if (pe->device_count == 0)
3581 		pnv_ioda_release_pe(pe);
3582 }
3583 
3584 static void pnv_npu_disable_device(struct pci_dev *pdev)
3585 {
3586 	struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev);
3587 	struct eeh_pe *eehpe = edev ? edev->pe : NULL;
3588 
3589 	if (eehpe && eeh_ops && eeh_ops->reset)
3590 		eeh_ops->reset(eehpe, EEH_RESET_HOT);
3591 }
3592 
3593 static void pnv_pci_ioda_shutdown(struct pci_controller *hose)
3594 {
3595 	struct pnv_phb *phb = hose->private_data;
3596 
3597 	opal_pci_reset(phb->opal_id, OPAL_RESET_PCI_IODA_TABLE,
3598 		       OPAL_ASSERT_RESET);
3599 }
3600 
3601 static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
3602 	.dma_dev_setup		= pnv_pci_dma_dev_setup,
3603 	.dma_bus_setup		= pnv_pci_dma_bus_setup,
3604 	.iommu_bypass_supported	= pnv_pci_ioda_iommu_bypass_supported,
3605 	.setup_msi_irqs		= pnv_setup_msi_irqs,
3606 	.teardown_msi_irqs	= pnv_teardown_msi_irqs,
3607 	.enable_device_hook	= pnv_pci_enable_device_hook,
3608 	.release_device		= pnv_pci_release_device,
3609 	.window_alignment	= pnv_pci_window_alignment,
3610 	.setup_bridge		= pnv_pci_setup_bridge,
3611 	.reset_secondary_bus	= pnv_pci_reset_secondary_bus,
3612 	.shutdown		= pnv_pci_ioda_shutdown,
3613 };
3614 
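/*
 * NVLink NPU PHBs use a reduced set of ops: no bus-level DMA setup,
 * IOMMU bypass, bridge setup or device release, but they gain a
 * .disable_device hook that hot-resets the EEH PE.
 */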
3615 static const struct pci_controller_ops pnv_npu_ioda_controller_ops = {
3616 	.dma_dev_setup		= pnv_pci_dma_dev_setup,
3617 	.setup_msi_irqs		= pnv_setup_msi_irqs,
3618 	.teardown_msi_irqs	= pnv_teardown_msi_irqs,
3619 	.enable_device_hook	= pnv_pci_enable_device_hook,
3620 	.window_alignment	= pnv_pci_window_alignment,
3621 	.reset_secondary_bus	= pnv_pci_reset_secondary_bus,
3622 	.shutdown		= pnv_pci_ioda_shutdown,
3623 	.disable_device		= pnv_npu_disable_device,
3624 };
3625 
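/* OpenCAPI NPU PHBs keep only the MMIO-related hooks; no MSI or DMA setup */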
3626 static const struct pci_controller_ops pnv_npu_ocapi_ioda_controller_ops = {
3627 	.enable_device_hook	= pnv_pci_enable_device_hook,
3628 	.window_alignment	= pnv_pci_window_alignment,
3629 	.reset_secondary_bus	= pnv_pci_reset_secondary_bus,
3630 	.shutdown		= pnv_pci_ioda_shutdown,
3631 };
3632 
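/*
 * Build and initialize a pnv_phb from its device tree node: allocate
 * the pci_controller and PE bookkeeping arrays, parse the MMIO/IO
 * windows, set up MSIs, install the controller ops and reset the IODA
 * tables via OPAL.
 */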
3633 static void __init pnv_pci_init_ioda_phb(struct device_node *np,
3634 					 u64 hub_id, int ioda_type)
3635 {
3636 	struct pci_controller *hose;
3637 	struct pnv_phb *phb;
3638 	unsigned long size, m64map_off, m32map_off, pemap_off;
3639 	unsigned long iomap_off = 0, dma32map_off = 0;
3640 	struct resource r;
3641 	const __be64 *prop64;
3642 	const __be32 *prop32;
3643 	int len;
3644 	unsigned int segno;
3645 	u64 phb_id;
3646 	void *aux;
3647 	long rc;
3648 
3649 	if (!of_device_is_available(np))
3650 		return;
3651 
	pr_info("Initializing %s PHB (%pOF)\n", pnv_phb_names[ioda_type], np);
3653 
3654 	prop64 = of_get_property(np, "ibm,opal-phbid", NULL);
3655 	if (!prop64) {
3656 		pr_err("  Missing \"ibm,opal-phbid\" property !\n");
3657 		return;
3658 	}
3659 	phb_id = be64_to_cpup(prop64);
3660 	pr_debug("  PHB-ID  : 0x%016llx\n", phb_id);
3661 
3662 	phb = memblock_alloc(sizeof(*phb), SMP_CACHE_BYTES);
3663 	if (!phb)
3664 		panic("%s: Failed to allocate %zu bytes\n", __func__,
3665 		      sizeof(*phb));
3666 
3667 	/* Allocate PCI controller */
3668 	phb->hose = hose = pcibios_alloc_controller(np);
3669 	if (!phb->hose) {
3670 		pr_err("  Can't allocate PCI controller for %pOF\n",
3671 		       np);
3672 		memblock_free(__pa(phb), sizeof(struct pnv_phb));
3673 		return;
3674 	}
3675 
3676 	spin_lock_init(&phb->lock);
3677 	prop32 = of_get_property(np, "bus-range", &len);
3678 	if (prop32 && len == 8) {
3679 		hose->first_busno = be32_to_cpu(prop32[0]);
3680 		hose->last_busno = be32_to_cpu(prop32[1]);
3681 	} else {
3682 		pr_warn("  Broken <bus-range> on %pOF\n", np);
3683 		hose->first_busno = 0;
3684 		hose->last_busno = 0xff;
3685 	}
3686 	hose->private_data = phb;
3687 	phb->hub_id = hub_id;
3688 	phb->opal_id = phb_id;
3689 	phb->type = ioda_type;
3690 	mutex_init(&phb->ioda.pe_alloc_mutex);
3691 
3692 	/* Detect specific models for error handling */
3693 	if (of_device_is_compatible(np, "ibm,p7ioc-pciex"))
3694 		phb->model = PNV_PHB_MODEL_P7IOC;
3695 	else if (of_device_is_compatible(np, "ibm,power8-pciex"))
3696 		phb->model = PNV_PHB_MODEL_PHB3;
3697 	else if (of_device_is_compatible(np, "ibm,power8-npu-pciex"))
3698 		phb->model = PNV_PHB_MODEL_NPU;
3699 	else if (of_device_is_compatible(np, "ibm,power9-npu-pciex"))
3700 		phb->model = PNV_PHB_MODEL_NPU2;
3701 	else
3702 		phb->model = PNV_PHB_MODEL_UNKNOWN;
3703 
3704 	/* Initialize diagnostic data buffer */
3705 	prop32 = of_get_property(np, "ibm,phb-diag-data-size", NULL);
3706 	if (prop32)
3707 		phb->diag_data_size = be32_to_cpup(prop32);
3708 	else
3709 		phb->diag_data_size = PNV_PCI_DIAG_BUF_SIZE;
3710 
3711 	phb->diag_data = memblock_alloc(phb->diag_data_size, SMP_CACHE_BYTES);
3712 	if (!phb->diag_data)
3713 		panic("%s: Failed to allocate %u bytes\n", __func__,
3714 		      phb->diag_data_size);
3715 
3716 	/* Parse 32-bit and IO ranges (if any) */
3717 	pci_process_bridge_OF_ranges(hose, np, !hose->global_number);
3718 
3719 	/* Get registers */
3720 	if (!of_address_to_resource(np, 0, &r)) {
3721 		phb->regs_phys = r.start;
3722 		phb->regs = ioremap(r.start, resource_size(&r));
3723 		if (phb->regs == NULL)
3724 			pr_err("  Failed to map registers !\n");
3725 	}
3726 
3727 	/* Initialize more IODA stuff */
3728 	phb->ioda.total_pe_num = 1;
3729 	prop32 = of_get_property(np, "ibm,opal-num-pes", NULL);
3730 	if (prop32)
3731 		phb->ioda.total_pe_num = be32_to_cpup(prop32);
3732 	prop32 = of_get_property(np, "ibm,opal-reserved-pe", NULL);
3733 	if (prop32)
3734 		phb->ioda.reserved_pe_idx = be32_to_cpup(prop32);
3735 
3736 	/* Invalidate RID to PE# mapping */
3737 	for (segno = 0; segno < ARRAY_SIZE(phb->ioda.pe_rmap); segno++)
3738 		phb->ioda.pe_rmap[segno] = IODA_INVALID_PE;
3739 
3740 	/* Parse 64-bit MMIO range */
3741 	pnv_ioda_parse_m64_window(phb);
3742 
3743 	phb->ioda.m32_size = resource_size(&hose->mem_resources[0]);
	/* FW has already carved off the top 64K of M32 space (MSI space) */
3745 	phb->ioda.m32_size += 0x10000;
3746 
3747 	phb->ioda.m32_segsize = phb->ioda.m32_size / phb->ioda.total_pe_num;
3748 	phb->ioda.m32_pci_base = hose->mem_resources[0].start - hose->mem_offset[0];
3749 	phb->ioda.io_size = hose->pci_io_size;
3750 	phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe_num;
3751 	phb->ioda.io_pci_base = 0; /* XXX calculate this ? */
3752 
3753 	/* Calculate how many 32-bit TCE segments we have */
3754 	phb->ioda.dma32_count = phb->ioda.m32_pci_base /
3755 				PNV_IODA1_DMA32_SEGSIZE;
3756 
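	/*
	 * All of the per-PE bookkeeping below lives in a single memblock
	 * allocation, laid out as:
	 *
	 *   [ PE alloc bitmap ][ m64_segmap ][ m32_segmap ]
	 *   [ io_segmap ][ dma32_segmap ]	(IODA1 only)
	 *   [ pe_array ]
	 */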
3757 	/* Allocate aux data & arrays. We don't have IO ports on PHB3 */
3758 	size = _ALIGN_UP(max_t(unsigned, phb->ioda.total_pe_num, 8) / 8,
3759 			sizeof(unsigned long));
3760 	m64map_off = size;
3761 	size += phb->ioda.total_pe_num * sizeof(phb->ioda.m64_segmap[0]);
3762 	m32map_off = size;
3763 	size += phb->ioda.total_pe_num * sizeof(phb->ioda.m32_segmap[0]);
3764 	if (phb->type == PNV_PHB_IODA1) {
3765 		iomap_off = size;
3766 		size += phb->ioda.total_pe_num * sizeof(phb->ioda.io_segmap[0]);
3767 		dma32map_off = size;
3768 		size += phb->ioda.dma32_count *
3769 			sizeof(phb->ioda.dma32_segmap[0]);
3770 	}
3771 	pemap_off = size;
3772 	size += phb->ioda.total_pe_num * sizeof(struct pnv_ioda_pe);
3773 	aux = memblock_alloc(size, SMP_CACHE_BYTES);
3774 	if (!aux)
3775 		panic("%s: Failed to allocate %lu bytes\n", __func__, size);
3776 	phb->ioda.pe_alloc = aux;
3777 	phb->ioda.m64_segmap = aux + m64map_off;
3778 	phb->ioda.m32_segmap = aux + m32map_off;
3779 	for (segno = 0; segno < phb->ioda.total_pe_num; segno++) {
3780 		phb->ioda.m64_segmap[segno] = IODA_INVALID_PE;
3781 		phb->ioda.m32_segmap[segno] = IODA_INVALID_PE;
3782 	}
3783 	if (phb->type == PNV_PHB_IODA1) {
3784 		phb->ioda.io_segmap = aux + iomap_off;
3785 		for (segno = 0; segno < phb->ioda.total_pe_num; segno++)
3786 			phb->ioda.io_segmap[segno] = IODA_INVALID_PE;
3787 
3788 		phb->ioda.dma32_segmap = aux + dma32map_off;
3789 		for (segno = 0; segno < phb->ioda.dma32_count; segno++)
3790 			phb->ioda.dma32_segmap[segno] = IODA_INVALID_PE;
3791 	}
3792 	phb->ioda.pe_array = aux + pemap_off;
3793 
3794 	/*
	 * Choose the PE number for the root bus, which shouldn't have
	 * M64 resources consumed by its child devices. We pick the PE
	 * number adjacent to the reserved one if possible.
3798 	 */
3799 	pnv_ioda_reserve_pe(phb, phb->ioda.reserved_pe_idx);
3800 	if (phb->ioda.reserved_pe_idx == 0) {
3801 		phb->ioda.root_pe_idx = 1;
3802 		pnv_ioda_reserve_pe(phb, phb->ioda.root_pe_idx);
3803 	} else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1)) {
3804 		phb->ioda.root_pe_idx = phb->ioda.reserved_pe_idx - 1;
3805 		pnv_ioda_reserve_pe(phb, phb->ioda.root_pe_idx);
3806 	} else {
3807 		phb->ioda.root_pe_idx = IODA_INVALID_PE;
3808 	}
3809 
3810 	INIT_LIST_HEAD(&phb->ioda.pe_list);
3811 	mutex_init(&phb->ioda.pe_list_mutex);
3812 
3817 #if 0 /* We should really do that ... */
3818 	rc = opal_pci_set_phb_mem_window(opal->phb_id,
3819 					 window_type,
3820 					 window_num,
3821 					 starting_real_address,
3822 					 starting_pci_address,
3823 					 segment_size);
3824 #endif
3825 
3826 	pr_info("  %03d (%03d) PE's M32: 0x%x [segment=0x%x]\n",
3827 		phb->ioda.total_pe_num, phb->ioda.reserved_pe_idx,
3828 		phb->ioda.m32_size, phb->ioda.m32_segsize);
3829 	if (phb->ioda.m64_size)
3830 		pr_info("                 M64: 0x%lx [segment=0x%lx]\n",
3831 			phb->ioda.m64_size, phb->ioda.m64_segsize);
3832 	if (phb->ioda.io_size)
3833 		pr_info("                  IO: 0x%x [segment=0x%x]\n",
			phb->ioda.io_size, phb->ioda.io_segsize);

3837 	phb->hose->ops = &pnv_pci_ops;
3838 	phb->get_pe_state = pnv_ioda_get_pe_state;
3839 	phb->freeze_pe = pnv_ioda_freeze_pe;
3840 	phb->unfreeze_pe = pnv_ioda_unfreeze_pe;
3841 
3842 	/* Setup MSI support */
3843 	pnv_pci_init_ioda_msis(phb);
3844 
3845 	/*
	 * We pass the PCI probe flag PCI_REASSIGN_ALL_RSRC here
	 * to let the PCI core do resource assignment. The expectation
	 * is that the PCI core will do correct I/O and MMIO alignment
	 * for the P2P bridge BARs so that each PCI bus (excluding
	 * the child P2P bridges) can form an individual PE.
3851 	 */
3852 	ppc_md.pcibios_fixup = pnv_pci_ioda_fixup;
3853 
3854 	switch (phb->type) {
3855 	case PNV_PHB_NPU_NVLINK:
3856 		hose->controller_ops = pnv_npu_ioda_controller_ops;
3857 		break;
3858 	case PNV_PHB_NPU_OCAPI:
3859 		hose->controller_ops = pnv_npu_ocapi_ioda_controller_ops;
3860 		break;
3861 	default:
3862 		phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup;
3863 		hose->controller_ops = pnv_pci_ioda_controller_ops;
3864 	}
3865 
3866 	ppc_md.pcibios_default_alignment = pnv_pci_default_alignment;
3867 
3868 #ifdef CONFIG_PCI_IOV
3869 	ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov_resources;
3870 	ppc_md.pcibios_iov_resource_alignment = pnv_pci_iov_resource_alignment;
3871 	ppc_md.pcibios_sriov_enable = pnv_pcibios_sriov_enable;
3872 	ppc_md.pcibios_sriov_disable = pnv_pcibios_sriov_disable;
3873 #endif
3874 
3875 	pci_add_flags(PCI_REASSIGN_ALL_RSRC);
3876 
3877 	/* Reset IODA tables to a clean state */
3878 	rc = opal_pci_reset(phb_id, OPAL_RESET_PCI_IODA_TABLE, OPAL_ASSERT_RESET);
3879 	if (rc)
3880 		pr_warn("  OPAL Error %ld performing IODA table reset !\n", rc);
3881 
3882 	/*
	 * If we're running in a kdump kernel, the previous kernel never
	 * shut down PCI devices correctly, and we already got the IODA
	 * tables cleaned out. So we have to issue a PHB reset to stop all
	 * PCI transactions from the previous kernel. The ppc_pci_reset_phbs
	 * kernel parameter will force this reset too. Additionally,
	 * if the IODA reset above failed then use a bigger hammer.
	 * This can happen if we get a PHB fatal error in very early
	 * boot.
3891 	 */
3892 	if (is_kdump_kernel() || pci_reset_phbs || rc) {
3893 		pr_info("  Issue PHB reset ...\n");
3894 		pnv_eeh_phb_reset(hose, EEH_RESET_FUNDAMENTAL);
3895 		pnv_eeh_phb_reset(hose, EEH_RESET_DEACTIVATE);
3896 	}
3897 
3898 	/* Remove M64 resource if we can't configure it successfully */
3899 	if (!phb->init_m64 || phb->init_m64(phb))
3900 		hose->mem_resources[1].flags = 0;
3901 }
3902 
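/* Entry points for the IODA2 and NPU PHB flavours found in the device tree */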
3903 void __init pnv_pci_init_ioda2_phb(struct device_node *np)
3904 {
3905 	pnv_pci_init_ioda_phb(np, 0, PNV_PHB_IODA2);
3906 }
3907 
3908 void __init pnv_pci_init_npu_phb(struct device_node *np)
3909 {
3910 	pnv_pci_init_ioda_phb(np, 0, PNV_PHB_NPU_NVLINK);
3911 }
3912 
3913 void __init pnv_pci_init_npu2_opencapi_phb(struct device_node *np)
3914 {
3915 	pnv_pci_init_ioda_phb(np, 0, PNV_PHB_NPU_OCAPI);
3916 }
3917 
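/*
 * Force the extended (4KB) config space size for devices behind
 * OpenCAPI NPU PHBs.
 */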
3918 static void pnv_npu2_opencapi_cfg_size_fixup(struct pci_dev *dev)
3919 {
3920 	struct pci_controller *hose = pci_bus_to_host(dev->bus);
3921 	struct pnv_phb *phb = hose->private_data;
3922 
3923 	if (!machine_is(powernv))
3924 		return;
3925 
3926 	if (phb->type == PNV_PHB_NPU_OCAPI)
3927 		dev->cfg_size = PCI_CFG_SPACE_EXP_SIZE;
3928 }
3929 DECLARE_PCI_FIXUP_EARLY(PCI_ANY_ID, PCI_ANY_ID, pnv_npu2_opencapi_cfg_size_fixup);
3930 
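/* Probe an IODA1 IO hub and initialize each child "ibm,ioda-phb" PHB */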
3931 void __init pnv_pci_init_ioda_hub(struct device_node *np)
3932 {
3933 	struct device_node *phbn;
3934 	const __be64 *prop64;
3935 	u64 hub_id;
3936 
3937 	pr_info("Probing IODA IO-Hub %pOF\n", np);
3938 
3939 	prop64 = of_get_property(np, "ibm,opal-hubid", NULL);
3940 	if (!prop64) {
3941 		pr_err(" Missing \"ibm,opal-hubid\" property !\n");
3942 		return;
3943 	}
3944 	hub_id = be64_to_cpup(prop64);
3945 	pr_devel(" HUB-ID : 0x%016llx\n", hub_id);
3946 
3947 	/* Count child PHBs */
3948 	for_each_child_of_node(np, phbn) {
3949 		/* Look for IODA1 PHBs */
3950 		if (of_device_is_compatible(phbn, "ibm,ioda-phb"))
3951 			pnv_pci_init_ioda_phb(phbn, hub_id, PNV_PHB_IODA1);
3952 	}
3953 }
3954