xref: /openbmc/linux/arch/powerpc/kernel/eeh_driver.c (revision 5c7a35e3)
1317f06deSGavin Shan /*
2317f06deSGavin Shan  * PCI Error Recovery Driver for RPA-compliant PPC64 platform.
3317f06deSGavin Shan  * Copyright IBM Corp. 2004 2005
4317f06deSGavin Shan  * Copyright Linas Vepstas <linas@linas.org> 2004, 2005
5317f06deSGavin Shan  *
6317f06deSGavin Shan  * All rights reserved.
7317f06deSGavin Shan  *
8317f06deSGavin Shan  * This program is free software; you can redistribute it and/or modify
9317f06deSGavin Shan  * it under the terms of the GNU General Public License as published by
10317f06deSGavin Shan  * the Free Software Foundation; either version 2 of the License, or (at
11317f06deSGavin Shan  * your option) any later version.
12317f06deSGavin Shan  *
13317f06deSGavin Shan  * This program is distributed in the hope that it will be useful, but
14317f06deSGavin Shan  * WITHOUT ANY WARRANTY; without even the implied warranty of
15317f06deSGavin Shan  * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
16317f06deSGavin Shan  * NON INFRINGEMENT.  See the GNU General Public License for more
17317f06deSGavin Shan  * details.
18317f06deSGavin Shan  *
19317f06deSGavin Shan  * You should have received a copy of the GNU General Public License
20317f06deSGavin Shan  * along with this program; if not, write to the Free Software
21317f06deSGavin Shan  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22317f06deSGavin Shan  *
23317f06deSGavin Shan  * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com>
24317f06deSGavin Shan  */
25317f06deSGavin Shan #include <linux/delay.h>
26317f06deSGavin Shan #include <linux/interrupt.h>
27317f06deSGavin Shan #include <linux/irq.h>
28317f06deSGavin Shan #include <linux/module.h>
29317f06deSGavin Shan #include <linux/pci.h>
30317f06deSGavin Shan #include <asm/eeh.h>
31317f06deSGavin Shan #include <asm/eeh_event.h>
32317f06deSGavin Shan #include <asm/ppc-pci.h>
33317f06deSGavin Shan #include <asm/pci-bridge.h>
34317f06deSGavin Shan #include <asm/prom.h>
35317f06deSGavin Shan #include <asm/rtas.h>
36317f06deSGavin Shan 
37317f06deSGavin Shan /**
38317f06deSGavin Shan  * eeh_pcid_name - Retrieve name of PCI device driver
39317f06deSGavin Shan  * @pdev: PCI device
40317f06deSGavin Shan  *
41317f06deSGavin Shan  * This routine is used to retrieve the name of PCI device driver
42317f06deSGavin Shan  * if that's valid.
43317f06deSGavin Shan  */
44317f06deSGavin Shan static inline const char *eeh_pcid_name(struct pci_dev *pdev)
45317f06deSGavin Shan {
46317f06deSGavin Shan 	if (pdev && pdev->dev.driver)
47317f06deSGavin Shan 		return pdev->dev.driver->name;
48317f06deSGavin Shan 	return "";
49317f06deSGavin Shan }
50317f06deSGavin Shan 
51317f06deSGavin Shan /**
52317f06deSGavin Shan  * eeh_pcid_get - Get the PCI device driver
53317f06deSGavin Shan  * @pdev: PCI device
54317f06deSGavin Shan  *
55317f06deSGavin Shan  * The function is used to retrieve the PCI device driver for
56317f06deSGavin Shan  * the indicated PCI device. Besides, we will increase the reference
57317f06deSGavin Shan  * of the PCI device driver to prevent that being unloaded on
58317f06deSGavin Shan  * the fly. Otherwise, kernel crash would be seen.
59317f06deSGavin Shan  */
60317f06deSGavin Shan static inline struct pci_driver *eeh_pcid_get(struct pci_dev *pdev)
61317f06deSGavin Shan {
62317f06deSGavin Shan 	if (!pdev || !pdev->driver)
63317f06deSGavin Shan 		return NULL;
64317f06deSGavin Shan 
65317f06deSGavin Shan 	if (!try_module_get(pdev->driver->driver.owner))
66317f06deSGavin Shan 		return NULL;
67317f06deSGavin Shan 
68317f06deSGavin Shan 	return pdev->driver;
69317f06deSGavin Shan }
70317f06deSGavin Shan 
71317f06deSGavin Shan /**
72317f06deSGavin Shan  * eeh_pcid_put - Dereference on the PCI device driver
73317f06deSGavin Shan  * @pdev: PCI device
74317f06deSGavin Shan  *
75317f06deSGavin Shan  * The function is called to do dereference on the PCI device
76317f06deSGavin Shan  * driver of the indicated PCI device.
77317f06deSGavin Shan  */
78317f06deSGavin Shan static inline void eeh_pcid_put(struct pci_dev *pdev)
79317f06deSGavin Shan {
80317f06deSGavin Shan 	if (!pdev || !pdev->driver)
81317f06deSGavin Shan 		return;
82317f06deSGavin Shan 
83317f06deSGavin Shan 	module_put(pdev->driver->driver.owner);
84317f06deSGavin Shan }
85317f06deSGavin Shan 
#if 0
/*
 * Debug helper, compiled out: print @pdn and, recursively, the PCI
 * device nodes below it. @dent is the number of leading spaces for
 * this level; each level of children is indented three more.
 */
static void print_device_node_tree(struct pci_dn *pdn, int dent)
{
	struct device_node *child;
	int col;

	if (!pdn)
		return;

	for (col = 0; col < dent; col++)
		printk(" ");
	printk("dn=%s mode=%x \tcfg_addr=%x pe_addr=%x \tfull=%s\n",
		pdn->node->name, pdn->eeh_mode, pdn->eeh_config_addr,
		pdn->eeh_pe_config_addr, pdn->node->full_name);

	for (child = pdn->node->child; child; child = child->sibling)
		print_device_node_tree(PCI_DN(child), dent + 3);
}
#endif
107317f06deSGavin Shan 
108317f06deSGavin Shan /**
109317f06deSGavin Shan  * eeh_disable_irq - Disable interrupt for the recovering device
110317f06deSGavin Shan  * @dev: PCI device
111317f06deSGavin Shan  *
112317f06deSGavin Shan  * This routine must be called when reporting temporary or permanent
113317f06deSGavin Shan  * error to the particular PCI device to disable interrupt of that
114317f06deSGavin Shan  * device. If the device has enabled MSI or MSI-X interrupt, we needn't
115317f06deSGavin Shan  * do real work because EEH should freeze DMA transfers for those PCI
116317f06deSGavin Shan  * devices encountering EEH errors, which includes MSI or MSI-X.
117317f06deSGavin Shan  */
118317f06deSGavin Shan static void eeh_disable_irq(struct pci_dev *dev)
119317f06deSGavin Shan {
120317f06deSGavin Shan 	struct eeh_dev *edev = pci_dev_to_eeh_dev(dev);
121317f06deSGavin Shan 
122317f06deSGavin Shan 	/* Don't disable MSI and MSI-X interrupts. They are
123317f06deSGavin Shan 	 * effectively disabled by the DMA Stopped state
124317f06deSGavin Shan 	 * when an EEH error occurs.
125317f06deSGavin Shan 	 */
126317f06deSGavin Shan 	if (dev->msi_enabled || dev->msix_enabled)
127317f06deSGavin Shan 		return;
128317f06deSGavin Shan 
129317f06deSGavin Shan 	if (!irq_has_action(dev->irq))
130317f06deSGavin Shan 		return;
131317f06deSGavin Shan 
132317f06deSGavin Shan 	edev->mode |= EEH_DEV_IRQ_DISABLED;
133317f06deSGavin Shan 	disable_irq_nosync(dev->irq);
134317f06deSGavin Shan }
135317f06deSGavin Shan 
136317f06deSGavin Shan /**
137317f06deSGavin Shan  * eeh_enable_irq - Enable interrupt for the recovering device
138317f06deSGavin Shan  * @dev: PCI device
139317f06deSGavin Shan  *
140317f06deSGavin Shan  * This routine must be called to enable interrupt while failed
141317f06deSGavin Shan  * device could be resumed.
142317f06deSGavin Shan  */
143317f06deSGavin Shan static void eeh_enable_irq(struct pci_dev *dev)
144317f06deSGavin Shan {
145317f06deSGavin Shan 	struct eeh_dev *edev = pci_dev_to_eeh_dev(dev);
146317f06deSGavin Shan 
147317f06deSGavin Shan 	if ((edev->mode) & EEH_DEV_IRQ_DISABLED) {
148317f06deSGavin Shan 		edev->mode &= ~EEH_DEV_IRQ_DISABLED;
149b8a9a11bSThomas Gleixner 		/*
150b8a9a11bSThomas Gleixner 		 * FIXME !!!!!
151b8a9a11bSThomas Gleixner 		 *
152b8a9a11bSThomas Gleixner 		 * This is just ass backwards. This maze has
153b8a9a11bSThomas Gleixner 		 * unbalanced irq_enable/disable calls. So instead of
154b8a9a11bSThomas Gleixner 		 * finding the root cause it works around the warning
155b8a9a11bSThomas Gleixner 		 * in the irq_enable code by conditionally calling
156b8a9a11bSThomas Gleixner 		 * into it.
157b8a9a11bSThomas Gleixner 		 *
158b8a9a11bSThomas Gleixner 		 * That's just wrong.The warning in the core code is
159b8a9a11bSThomas Gleixner 		 * there to tell people to fix their assymetries in
160b8a9a11bSThomas Gleixner 		 * their own code, not by abusing the core information
161b8a9a11bSThomas Gleixner 		 * to avoid it.
162b8a9a11bSThomas Gleixner 		 *
163b8a9a11bSThomas Gleixner 		 * I so wish that the assymetry would be the other way
164b8a9a11bSThomas Gleixner 		 * round and a few more irq_disable calls render that
165b8a9a11bSThomas Gleixner 		 * shit unusable forever.
166b8a9a11bSThomas Gleixner 		 *
167b8a9a11bSThomas Gleixner 		 *	tglx
168b8a9a11bSThomas Gleixner 		 */
16957310c3cSThomas Gleixner 		if (irqd_irq_disabled(irq_get_irq_data(dev->irq)))
170317f06deSGavin Shan 			enable_irq(dev->irq);
171317f06deSGavin Shan 	}
17257310c3cSThomas Gleixner }
173317f06deSGavin Shan 
174d2b0f6f7SGavin Shan static bool eeh_dev_removed(struct eeh_dev *edev)
175d2b0f6f7SGavin Shan {
176d2b0f6f7SGavin Shan 	/* EEH device removed ? */
177d2b0f6f7SGavin Shan 	if (!edev || (edev->mode & EEH_DEV_REMOVED))
178d2b0f6f7SGavin Shan 		return true;
179d2b0f6f7SGavin Shan 
180d2b0f6f7SGavin Shan 	return false;
181d2b0f6f7SGavin Shan }
182d2b0f6f7SGavin Shan 
183317f06deSGavin Shan /**
184317f06deSGavin Shan  * eeh_report_error - Report pci error to each device driver
185317f06deSGavin Shan  * @data: eeh device
186317f06deSGavin Shan  * @userdata: return value
187317f06deSGavin Shan  *
188317f06deSGavin Shan  * Report an EEH error to each device driver, collect up and
189317f06deSGavin Shan  * merge the device driver responses. Cumulative response
190317f06deSGavin Shan  * passed back in "userdata".
191317f06deSGavin Shan  */
192317f06deSGavin Shan static void *eeh_report_error(void *data, void *userdata)
193317f06deSGavin Shan {
194317f06deSGavin Shan 	struct eeh_dev *edev = (struct eeh_dev *)data;
195317f06deSGavin Shan 	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
196317f06deSGavin Shan 	enum pci_ers_result rc, *res = userdata;
197317f06deSGavin Shan 	struct pci_driver *driver;
198317f06deSGavin Shan 
199d2b0f6f7SGavin Shan 	if (!dev || eeh_dev_removed(edev))
200d2b0f6f7SGavin Shan 		return NULL;
201317f06deSGavin Shan 	dev->error_state = pci_channel_io_frozen;
202317f06deSGavin Shan 
203317f06deSGavin Shan 	driver = eeh_pcid_get(dev);
204317f06deSGavin Shan 	if (!driver) return NULL;
205317f06deSGavin Shan 
206317f06deSGavin Shan 	eeh_disable_irq(dev);
207317f06deSGavin Shan 
208317f06deSGavin Shan 	if (!driver->err_handler ||
209317f06deSGavin Shan 	    !driver->err_handler->error_detected) {
210317f06deSGavin Shan 		eeh_pcid_put(dev);
211317f06deSGavin Shan 		return NULL;
212317f06deSGavin Shan 	}
213317f06deSGavin Shan 
214317f06deSGavin Shan 	rc = driver->err_handler->error_detected(dev, pci_channel_io_frozen);
215317f06deSGavin Shan 
216317f06deSGavin Shan 	/* A driver that needs a reset trumps all others */
217317f06deSGavin Shan 	if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
218317f06deSGavin Shan 	if (*res == PCI_ERS_RESULT_NONE) *res = rc;
219317f06deSGavin Shan 
220317f06deSGavin Shan 	eeh_pcid_put(dev);
221317f06deSGavin Shan 	return NULL;
222317f06deSGavin Shan }
223317f06deSGavin Shan 
224317f06deSGavin Shan /**
225317f06deSGavin Shan  * eeh_report_mmio_enabled - Tell drivers that MMIO has been enabled
226317f06deSGavin Shan  * @data: eeh device
227317f06deSGavin Shan  * @userdata: return value
228317f06deSGavin Shan  *
229317f06deSGavin Shan  * Tells each device driver that IO ports, MMIO and config space I/O
230317f06deSGavin Shan  * are now enabled. Collects up and merges the device driver responses.
231317f06deSGavin Shan  * Cumulative response passed back in "userdata".
232317f06deSGavin Shan  */
233317f06deSGavin Shan static void *eeh_report_mmio_enabled(void *data, void *userdata)
234317f06deSGavin Shan {
235317f06deSGavin Shan 	struct eeh_dev *edev = (struct eeh_dev *)data;
236317f06deSGavin Shan 	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
237317f06deSGavin Shan 	enum pci_ers_result rc, *res = userdata;
238317f06deSGavin Shan 	struct pci_driver *driver;
239317f06deSGavin Shan 
240d2b0f6f7SGavin Shan 	if (!dev || eeh_dev_removed(edev))
241d2b0f6f7SGavin Shan 		return NULL;
242d2b0f6f7SGavin Shan 
243317f06deSGavin Shan 	driver = eeh_pcid_get(dev);
244317f06deSGavin Shan 	if (!driver) return NULL;
245317f06deSGavin Shan 
246317f06deSGavin Shan 	if (!driver->err_handler ||
247f26c7a03SGavin Shan 	    !driver->err_handler->mmio_enabled ||
248f26c7a03SGavin Shan 	    (edev->mode & EEH_DEV_NO_HANDLER)) {
249317f06deSGavin Shan 		eeh_pcid_put(dev);
250317f06deSGavin Shan 		return NULL;
251317f06deSGavin Shan 	}
252317f06deSGavin Shan 
253317f06deSGavin Shan 	rc = driver->err_handler->mmio_enabled(dev);
254317f06deSGavin Shan 
255317f06deSGavin Shan 	/* A driver that needs a reset trumps all others */
256317f06deSGavin Shan 	if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
257317f06deSGavin Shan 	if (*res == PCI_ERS_RESULT_NONE) *res = rc;
258317f06deSGavin Shan 
259317f06deSGavin Shan 	eeh_pcid_put(dev);
260317f06deSGavin Shan 	return NULL;
261317f06deSGavin Shan }
262317f06deSGavin Shan 
263317f06deSGavin Shan /**
264317f06deSGavin Shan  * eeh_report_reset - Tell device that slot has been reset
265317f06deSGavin Shan  * @data: eeh device
266317f06deSGavin Shan  * @userdata: return value
267317f06deSGavin Shan  *
268317f06deSGavin Shan  * This routine must be called while EEH tries to reset particular
269317f06deSGavin Shan  * PCI device so that the associated PCI device driver could take
270317f06deSGavin Shan  * some actions, usually to save data the driver needs so that the
271317f06deSGavin Shan  * driver can work again while the device is recovered.
272317f06deSGavin Shan  */
273317f06deSGavin Shan static void *eeh_report_reset(void *data, void *userdata)
274317f06deSGavin Shan {
275317f06deSGavin Shan 	struct eeh_dev *edev = (struct eeh_dev *)data;
276317f06deSGavin Shan 	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
277317f06deSGavin Shan 	enum pci_ers_result rc, *res = userdata;
278317f06deSGavin Shan 	struct pci_driver *driver;
279317f06deSGavin Shan 
280d2b0f6f7SGavin Shan 	if (!dev || eeh_dev_removed(edev))
281d2b0f6f7SGavin Shan 		return NULL;
282317f06deSGavin Shan 	dev->error_state = pci_channel_io_normal;
283317f06deSGavin Shan 
284317f06deSGavin Shan 	driver = eeh_pcid_get(dev);
285317f06deSGavin Shan 	if (!driver) return NULL;
286317f06deSGavin Shan 
287317f06deSGavin Shan 	eeh_enable_irq(dev);
288317f06deSGavin Shan 
289317f06deSGavin Shan 	if (!driver->err_handler ||
290f26c7a03SGavin Shan 	    !driver->err_handler->slot_reset ||
291f26c7a03SGavin Shan 	    (edev->mode & EEH_DEV_NO_HANDLER)) {
292317f06deSGavin Shan 		eeh_pcid_put(dev);
293317f06deSGavin Shan 		return NULL;
294317f06deSGavin Shan 	}
295317f06deSGavin Shan 
296317f06deSGavin Shan 	rc = driver->err_handler->slot_reset(dev);
297317f06deSGavin Shan 	if ((*res == PCI_ERS_RESULT_NONE) ||
298317f06deSGavin Shan 	    (*res == PCI_ERS_RESULT_RECOVERED)) *res = rc;
299317f06deSGavin Shan 	if (*res == PCI_ERS_RESULT_DISCONNECT &&
300317f06deSGavin Shan 	     rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
301317f06deSGavin Shan 
302317f06deSGavin Shan 	eeh_pcid_put(dev);
303317f06deSGavin Shan 	return NULL;
304317f06deSGavin Shan }
305317f06deSGavin Shan 
306317f06deSGavin Shan /**
307317f06deSGavin Shan  * eeh_report_resume - Tell device to resume normal operations
308317f06deSGavin Shan  * @data: eeh device
309317f06deSGavin Shan  * @userdata: return value
310317f06deSGavin Shan  *
311317f06deSGavin Shan  * This routine must be called to notify the device driver that it
312317f06deSGavin Shan  * could resume so that the device driver can do some initialization
313317f06deSGavin Shan  * to make the recovered device work again.
314317f06deSGavin Shan  */
315317f06deSGavin Shan static void *eeh_report_resume(void *data, void *userdata)
316317f06deSGavin Shan {
317317f06deSGavin Shan 	struct eeh_dev *edev = (struct eeh_dev *)data;
318317f06deSGavin Shan 	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
319317f06deSGavin Shan 	struct pci_driver *driver;
320317f06deSGavin Shan 
321d2b0f6f7SGavin Shan 	if (!dev || eeh_dev_removed(edev))
322d2b0f6f7SGavin Shan 		return NULL;
323317f06deSGavin Shan 	dev->error_state = pci_channel_io_normal;
324317f06deSGavin Shan 
325317f06deSGavin Shan 	driver = eeh_pcid_get(dev);
326317f06deSGavin Shan 	if (!driver) return NULL;
327317f06deSGavin Shan 
328317f06deSGavin Shan 	eeh_enable_irq(dev);
329317f06deSGavin Shan 
330317f06deSGavin Shan 	if (!driver->err_handler ||
331f26c7a03SGavin Shan 	    !driver->err_handler->resume ||
332f26c7a03SGavin Shan 	    (edev->mode & EEH_DEV_NO_HANDLER)) {
333f26c7a03SGavin Shan 		edev->mode &= ~EEH_DEV_NO_HANDLER;
334317f06deSGavin Shan 		eeh_pcid_put(dev);
335317f06deSGavin Shan 		return NULL;
336317f06deSGavin Shan 	}
337317f06deSGavin Shan 
338317f06deSGavin Shan 	driver->err_handler->resume(dev);
339317f06deSGavin Shan 
340317f06deSGavin Shan 	eeh_pcid_put(dev);
341317f06deSGavin Shan 	return NULL;
342317f06deSGavin Shan }
343317f06deSGavin Shan 
344317f06deSGavin Shan /**
345317f06deSGavin Shan  * eeh_report_failure - Tell device driver that device is dead.
346317f06deSGavin Shan  * @data: eeh device
347317f06deSGavin Shan  * @userdata: return value
348317f06deSGavin Shan  *
349317f06deSGavin Shan  * This informs the device driver that the device is permanently
350317f06deSGavin Shan  * dead, and that no further recovery attempts will be made on it.
351317f06deSGavin Shan  */
352317f06deSGavin Shan static void *eeh_report_failure(void *data, void *userdata)
353317f06deSGavin Shan {
354317f06deSGavin Shan 	struct eeh_dev *edev = (struct eeh_dev *)data;
355317f06deSGavin Shan 	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
356317f06deSGavin Shan 	struct pci_driver *driver;
357317f06deSGavin Shan 
358d2b0f6f7SGavin Shan 	if (!dev || eeh_dev_removed(edev))
359d2b0f6f7SGavin Shan 		return NULL;
360317f06deSGavin Shan 	dev->error_state = pci_channel_io_perm_failure;
361317f06deSGavin Shan 
362317f06deSGavin Shan 	driver = eeh_pcid_get(dev);
363317f06deSGavin Shan 	if (!driver) return NULL;
364317f06deSGavin Shan 
365317f06deSGavin Shan 	eeh_disable_irq(dev);
366317f06deSGavin Shan 
367317f06deSGavin Shan 	if (!driver->err_handler ||
368317f06deSGavin Shan 	    !driver->err_handler->error_detected) {
369317f06deSGavin Shan 		eeh_pcid_put(dev);
370317f06deSGavin Shan 		return NULL;
371317f06deSGavin Shan 	}
372317f06deSGavin Shan 
373317f06deSGavin Shan 	driver->err_handler->error_detected(dev, pci_channel_io_perm_failure);
374317f06deSGavin Shan 
375317f06deSGavin Shan 	eeh_pcid_put(dev);
376317f06deSGavin Shan 	return NULL;
377317f06deSGavin Shan }
378317f06deSGavin Shan 
379f5c57710SGavin Shan static void *eeh_rmv_device(void *data, void *userdata)
380f5c57710SGavin Shan {
381f5c57710SGavin Shan 	struct pci_driver *driver;
382f5c57710SGavin Shan 	struct eeh_dev *edev = (struct eeh_dev *)data;
383f5c57710SGavin Shan 	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
384f5c57710SGavin Shan 	int *removed = (int *)userdata;
385f5c57710SGavin Shan 
386f5c57710SGavin Shan 	/*
387f5c57710SGavin Shan 	 * Actually, we should remove the PCI bridges as well.
388f5c57710SGavin Shan 	 * However, that's lots of complexity to do that,
389f5c57710SGavin Shan 	 * particularly some of devices under the bridge might
390f5c57710SGavin Shan 	 * support EEH. So we just care about PCI devices for
391f5c57710SGavin Shan 	 * simplicity here.
392f5c57710SGavin Shan 	 */
393f5c57710SGavin Shan 	if (!dev || (dev->hdr_type & PCI_HEADER_TYPE_BRIDGE))
394f5c57710SGavin Shan 		return NULL;
3958cc6b6cdSThadeu Lima de Souza Cascardo 
396d2b0f6f7SGavin Shan 	/*
397d2b0f6f7SGavin Shan 	 * We rely on count-based pcibios_release_device() to
398d2b0f6f7SGavin Shan 	 * detach permanently offlined PEs. Unfortunately, that's
399d2b0f6f7SGavin Shan 	 * not reliable enough. We might have the permanently
400d2b0f6f7SGavin Shan 	 * offlined PEs attached, but we needn't take care of
401d2b0f6f7SGavin Shan 	 * them and their child devices.
402d2b0f6f7SGavin Shan 	 */
403d2b0f6f7SGavin Shan 	if (eeh_dev_removed(edev))
404d2b0f6f7SGavin Shan 		return NULL;
405d2b0f6f7SGavin Shan 
406f5c57710SGavin Shan 	driver = eeh_pcid_get(dev);
4078cc6b6cdSThadeu Lima de Souza Cascardo 	if (driver) {
4088cc6b6cdSThadeu Lima de Souza Cascardo 		eeh_pcid_put(dev);
4098cc6b6cdSThadeu Lima de Souza Cascardo 		if (driver->err_handler)
410f5c57710SGavin Shan 			return NULL;
4118cc6b6cdSThadeu Lima de Souza Cascardo 	}
412f5c57710SGavin Shan 
413f5c57710SGavin Shan 	/* Remove it from PCI subsystem */
414f5c57710SGavin Shan 	pr_debug("EEH: Removing %s without EEH sensitive driver\n",
415f5c57710SGavin Shan 		 pci_name(dev));
416f5c57710SGavin Shan 	edev->bus = dev->bus;
417f5c57710SGavin Shan 	edev->mode |= EEH_DEV_DISCONNECTED;
418f5c57710SGavin Shan 	(*removed)++;
419f5c57710SGavin Shan 
4201c2042c8SRafael J. Wysocki 	pci_lock_rescan_remove();
421f5c57710SGavin Shan 	pci_stop_and_remove_bus_device(dev);
4221c2042c8SRafael J. Wysocki 	pci_unlock_rescan_remove();
423f5c57710SGavin Shan 
424f5c57710SGavin Shan 	return NULL;
425f5c57710SGavin Shan }
426f5c57710SGavin Shan 
427f5c57710SGavin Shan static void *eeh_pe_detach_dev(void *data, void *userdata)
428f5c57710SGavin Shan {
429f5c57710SGavin Shan 	struct eeh_pe *pe = (struct eeh_pe *)data;
430f5c57710SGavin Shan 	struct eeh_dev *edev, *tmp;
431f5c57710SGavin Shan 
432f5c57710SGavin Shan 	eeh_pe_for_each_dev(pe, edev, tmp) {
433f5c57710SGavin Shan 		if (!(edev->mode & EEH_DEV_DISCONNECTED))
434f5c57710SGavin Shan 			continue;
435f5c57710SGavin Shan 
436f5c57710SGavin Shan 		edev->mode &= ~(EEH_DEV_DISCONNECTED | EEH_DEV_IRQ_DISABLED);
437f5c57710SGavin Shan 		eeh_rmv_from_parent_pe(edev);
438f5c57710SGavin Shan 	}
439f5c57710SGavin Shan 
440f5c57710SGavin Shan 	return NULL;
441f5c57710SGavin Shan }
442f5c57710SGavin Shan 
44378954700SGavin Shan /*
44478954700SGavin Shan  * Explicitly clear PE's frozen state for PowerNV where
44578954700SGavin Shan  * we have frozen PE until BAR restore is completed. It's
44678954700SGavin Shan  * harmless to clear it for pSeries. To be consistent with
44778954700SGavin Shan  * PE reset (for 3 times), we try to clear the frozen state
44878954700SGavin Shan  * for 3 times as well.
44978954700SGavin Shan  */
4502c665992SGavin Shan static void *__eeh_clear_pe_frozen_state(void *data, void *flag)
45178954700SGavin Shan {
4522c665992SGavin Shan 	struct eeh_pe *pe = (struct eeh_pe *)data;
45378954700SGavin Shan 	int i, rc;
45478954700SGavin Shan 
45578954700SGavin Shan 	for (i = 0; i < 3; i++) {
45678954700SGavin Shan 		rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);
45778954700SGavin Shan 		if (rc)
45878954700SGavin Shan 			continue;
45978954700SGavin Shan 		rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA);
46078954700SGavin Shan 		if (!rc)
46178954700SGavin Shan 			break;
46278954700SGavin Shan 	}
46378954700SGavin Shan 
46478954700SGavin Shan 	/* The PE has been isolated, clear it */
4652c665992SGavin Shan 	if (rc) {
46678954700SGavin Shan 		pr_warn("%s: Can't clear frozen PHB#%x-PE#%x (%d)\n",
46778954700SGavin Shan 			__func__, pe->phb->global_number, pe->addr, rc);
4682c665992SGavin Shan 		return (void *)pe;
4692c665992SGavin Shan 	}
4702c665992SGavin Shan 
4712c665992SGavin Shan 	return NULL;
4722c665992SGavin Shan }
4732c665992SGavin Shan 
4742c665992SGavin Shan static int eeh_clear_pe_frozen_state(struct eeh_pe *pe)
4752c665992SGavin Shan {
4762c665992SGavin Shan 	void *rc;
4772c665992SGavin Shan 
4782c665992SGavin Shan 	rc = eeh_pe_traverse(pe, __eeh_clear_pe_frozen_state, NULL);
4792c665992SGavin Shan 	if (!rc)
48078954700SGavin Shan 		eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
48178954700SGavin Shan 
4822c665992SGavin Shan 	return rc ? -EIO : 0;
48378954700SGavin Shan }
48478954700SGavin Shan 
485317f06deSGavin Shan /**
486317f06deSGavin Shan  * eeh_reset_device - Perform actual reset of a pci slot
487317f06deSGavin Shan  * @pe: EEH PE
488317f06deSGavin Shan  * @bus: PCI bus corresponding to the isolcated slot
489317f06deSGavin Shan  *
490317f06deSGavin Shan  * This routine must be called to do reset on the indicated PE.
491317f06deSGavin Shan  * During the reset, udev might be invoked because those affected
492317f06deSGavin Shan  * PCI devices will be removed and then added.
493317f06deSGavin Shan  */
494317f06deSGavin Shan static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
495317f06deSGavin Shan {
496f5c57710SGavin Shan 	struct pci_bus *frozen_bus = eeh_pe_bus_get(pe);
4975a71978eSGavin Shan 	struct timeval tstamp;
498f5c57710SGavin Shan 	int cnt, rc, removed = 0;
499317f06deSGavin Shan 
500317f06deSGavin Shan 	/* pcibios will clear the counter; save the value */
501317f06deSGavin Shan 	cnt = pe->freeze_count;
5025a71978eSGavin Shan 	tstamp = pe->tstamp;
503317f06deSGavin Shan 
504317f06deSGavin Shan 	/*
505317f06deSGavin Shan 	 * We don't remove the corresponding PE instances because
506317f06deSGavin Shan 	 * we need the information afterwords. The attached EEH
507317f06deSGavin Shan 	 * devices are expected to be attached soon when calling
508317f06deSGavin Shan 	 * into pcibios_add_pci_devices().
509317f06deSGavin Shan 	 */
510807a827dSGavin Shan 	eeh_pe_state_mark(pe, EEH_PE_KEEP);
5111c2042c8SRafael J. Wysocki 	if (bus) {
5121c2042c8SRafael J. Wysocki 		pci_lock_rescan_remove();
513807a827dSGavin Shan 		pcibios_remove_pci_devices(bus);
5141c2042c8SRafael J. Wysocki 		pci_unlock_rescan_remove();
5151c2042c8SRafael J. Wysocki 	} else if (frozen_bus) {
516f5c57710SGavin Shan 		eeh_pe_dev_traverse(pe, eeh_rmv_device, &removed);
5171c2042c8SRafael J. Wysocki 	}
518317f06deSGavin Shan 
519d0914f50SGavin Shan 	/*
520d0914f50SGavin Shan 	 * Reset the pci controller. (Asserts RST#; resets config space).
521317f06deSGavin Shan 	 * Reconfigure bridges and devices. Don't try to bring the system
522317f06deSGavin Shan 	 * up if the reset failed for some reason.
523d0914f50SGavin Shan 	 *
524d0914f50SGavin Shan 	 * During the reset, it's very dangerous to have uncontrolled PCI
525d0914f50SGavin Shan 	 * config accesses. So we prefer to block them. However, controlled
526d0914f50SGavin Shan 	 * PCI config accesses initiated from EEH itself are allowed.
527317f06deSGavin Shan 	 */
528d0914f50SGavin Shan 	eeh_pe_state_mark(pe, EEH_PE_RESET);
529317f06deSGavin Shan 	rc = eeh_reset_pe(pe);
530d0914f50SGavin Shan 	if (rc) {
531d0914f50SGavin Shan 		eeh_pe_state_clear(pe, EEH_PE_RESET);
532317f06deSGavin Shan 		return rc;
533d0914f50SGavin Shan 	}
534317f06deSGavin Shan 
5351c2042c8SRafael J. Wysocki 	pci_lock_rescan_remove();
5361c2042c8SRafael J. Wysocki 
537317f06deSGavin Shan 	/* Restore PE */
538317f06deSGavin Shan 	eeh_ops->configure_bridge(pe);
539317f06deSGavin Shan 	eeh_pe_restore_bars(pe);
540d0914f50SGavin Shan 	eeh_pe_state_clear(pe, EEH_PE_RESET);
541317f06deSGavin Shan 
54278954700SGavin Shan 	/* Clear frozen state */
54378954700SGavin Shan 	rc = eeh_clear_pe_frozen_state(pe);
54478954700SGavin Shan 	if (rc)
54578954700SGavin Shan 		return rc;
54678954700SGavin Shan 
547317f06deSGavin Shan 	/* Give the system 5 seconds to finish running the user-space
548317f06deSGavin Shan 	 * hotplug shutdown scripts, e.g. ifdown for ethernet.  Yes,
549317f06deSGavin Shan 	 * this is a hack, but if we don't do this, and try to bring
550317f06deSGavin Shan 	 * the device up before the scripts have taken it down,
551317f06deSGavin Shan 	 * potentially weird things happen.
552317f06deSGavin Shan 	 */
553317f06deSGavin Shan 	if (bus) {
554f5c57710SGavin Shan 		pr_info("EEH: Sleep 5s ahead of complete hotplug\n");
555317f06deSGavin Shan 		ssleep(5);
556f5c57710SGavin Shan 
557f5c57710SGavin Shan 		/*
558f5c57710SGavin Shan 		 * The EEH device is still connected with its parent
559f5c57710SGavin Shan 		 * PE. We should disconnect it so the binding can be
560f5c57710SGavin Shan 		 * rebuilt when adding PCI devices.
561f5c57710SGavin Shan 		 */
562f5c57710SGavin Shan 		eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL);
563317f06deSGavin Shan 		pcibios_add_pci_devices(bus);
564f5c57710SGavin Shan 	} else if (frozen_bus && removed) {
565f5c57710SGavin Shan 		pr_info("EEH: Sleep 5s ahead of partial hotplug\n");
566f5c57710SGavin Shan 		ssleep(5);
567f5c57710SGavin Shan 
568f5c57710SGavin Shan 		eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL);
569f5c57710SGavin Shan 		pcibios_add_pci_devices(frozen_bus);
570317f06deSGavin Shan 	}
571f5c57710SGavin Shan 	eeh_pe_state_clear(pe, EEH_PE_KEEP);
5725a71978eSGavin Shan 
5735a71978eSGavin Shan 	pe->tstamp = tstamp;
574317f06deSGavin Shan 	pe->freeze_count = cnt;
575317f06deSGavin Shan 
5761c2042c8SRafael J. Wysocki 	pci_unlock_rescan_remove();
577317f06deSGavin Shan 	return 0;
578317f06deSGavin Shan }
579317f06deSGavin Shan 
580317f06deSGavin Shan /* The longest amount of time to wait for a pci device
581317f06deSGavin Shan  * to come back on line, in seconds.
582317f06deSGavin Shan  */
583fb48dc22SBrian King #define MAX_WAIT_FOR_RECOVERY 300
584317f06deSGavin Shan 
5858a6b1bc7SGavin Shan static void eeh_handle_normal_event(struct eeh_pe *pe)
586317f06deSGavin Shan {
587317f06deSGavin Shan 	struct pci_bus *frozen_bus;
588317f06deSGavin Shan 	int rc = 0;
589317f06deSGavin Shan 	enum pci_ers_result result = PCI_ERS_RESULT_NONE;
590317f06deSGavin Shan 
591317f06deSGavin Shan 	frozen_bus = eeh_pe_bus_get(pe);
592317f06deSGavin Shan 	if (!frozen_bus) {
593317f06deSGavin Shan 		pr_err("%s: Cannot find PCI bus for PHB#%d-PE#%x\n",
594317f06deSGavin Shan 			__func__, pe->phb->global_number, pe->addr);
595317f06deSGavin Shan 		return;
596317f06deSGavin Shan 	}
597317f06deSGavin Shan 
5985a71978eSGavin Shan 	eeh_pe_update_time_stamp(pe);
599317f06deSGavin Shan 	pe->freeze_count++;
600317f06deSGavin Shan 	if (pe->freeze_count > EEH_MAX_ALLOWED_FREEZES)
601317f06deSGavin Shan 		goto excess_failures;
602317f06deSGavin Shan 	pr_warning("EEH: This PCI device has failed %d times in the last hour\n",
603317f06deSGavin Shan 		pe->freeze_count);
604317f06deSGavin Shan 
605317f06deSGavin Shan 	/* Walk the various device drivers attached to this slot through
606317f06deSGavin Shan 	 * a reset sequence, giving each an opportunity to do what it needs
607317f06deSGavin Shan 	 * to accomplish the reset.  Each child gets a report of the
608317f06deSGavin Shan 	 * status ... if any child can't handle the reset, then the entire
609317f06deSGavin Shan 	 * slot is dlpar removed and added.
610317f06deSGavin Shan 	 */
61156ca4fdeSGavin Shan 	pr_info("EEH: Notify device drivers to shutdown\n");
612317f06deSGavin Shan 	eeh_pe_dev_traverse(pe, eeh_report_error, &result);
613317f06deSGavin Shan 
614317f06deSGavin Shan 	/* Get the current PCI slot state. This can take a long time,
615317f06deSGavin Shan 	 * sometimes over 3 seconds for certain systems.
616317f06deSGavin Shan 	 */
617317f06deSGavin Shan 	rc = eeh_ops->wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000);
618317f06deSGavin Shan 	if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) {
61956ca4fdeSGavin Shan 		pr_warning("EEH: Permanent failure\n");
620317f06deSGavin Shan 		goto hard_fail;
621317f06deSGavin Shan 	}
622317f06deSGavin Shan 
623317f06deSGavin Shan 	/* Since rtas may enable MMIO when posting the error log,
624317f06deSGavin Shan 	 * don't post the error log until after all dev drivers
625317f06deSGavin Shan 	 * have been informed.
626317f06deSGavin Shan 	 */
62756ca4fdeSGavin Shan 	pr_info("EEH: Collect temporary log\n");
628317f06deSGavin Shan 	eeh_slot_error_detail(pe, EEH_LOG_TEMP);
629317f06deSGavin Shan 
630317f06deSGavin Shan 	/* If all device drivers were EEH-unaware, then shut
631317f06deSGavin Shan 	 * down all of the device drivers, and hope they
632317f06deSGavin Shan 	 * go down willingly, without panicing the system.
633317f06deSGavin Shan 	 */
634317f06deSGavin Shan 	if (result == PCI_ERS_RESULT_NONE) {
63556ca4fdeSGavin Shan 		pr_info("EEH: Reset with hotplug activity\n");
636317f06deSGavin Shan 		rc = eeh_reset_device(pe, frozen_bus);
637317f06deSGavin Shan 		if (rc) {
63856ca4fdeSGavin Shan 			pr_warning("%s: Unable to reset, err=%d\n",
63956ca4fdeSGavin Shan 				   __func__, rc);
640317f06deSGavin Shan 			goto hard_fail;
641317f06deSGavin Shan 		}
642317f06deSGavin Shan 	}
643317f06deSGavin Shan 
644317f06deSGavin Shan 	/* If all devices reported they can proceed, then re-enable MMIO */
645317f06deSGavin Shan 	if (result == PCI_ERS_RESULT_CAN_RECOVER) {
64656ca4fdeSGavin Shan 		pr_info("EEH: Enable I/O for affected devices\n");
647317f06deSGavin Shan 		rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);
648317f06deSGavin Shan 
649317f06deSGavin Shan 		if (rc < 0)
650317f06deSGavin Shan 			goto hard_fail;
651317f06deSGavin Shan 		if (rc) {
652317f06deSGavin Shan 			result = PCI_ERS_RESULT_NEED_RESET;
653317f06deSGavin Shan 		} else {
65456ca4fdeSGavin Shan 			pr_info("EEH: Notify device drivers to resume I/O\n");
655317f06deSGavin Shan 			eeh_pe_dev_traverse(pe, eeh_report_mmio_enabled, &result);
656317f06deSGavin Shan 		}
657317f06deSGavin Shan 	}
658317f06deSGavin Shan 
659317f06deSGavin Shan 	/* If all devices reported they can proceed, then re-enable DMA */
660317f06deSGavin Shan 	if (result == PCI_ERS_RESULT_CAN_RECOVER) {
66156ca4fdeSGavin Shan 		pr_info("EEH: Enabled DMA for affected devices\n");
662317f06deSGavin Shan 		rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA);
663317f06deSGavin Shan 
664317f06deSGavin Shan 		if (rc < 0)
665317f06deSGavin Shan 			goto hard_fail;
66635845a78SGavin Shan 		if (rc) {
667317f06deSGavin Shan 			result = PCI_ERS_RESULT_NEED_RESET;
66835845a78SGavin Shan 		} else {
66935845a78SGavin Shan 			/*
67035845a78SGavin Shan 			 * We didn't do PE reset for the case. The PE
67135845a78SGavin Shan 			 * is still in frozen state. Clear it before
67235845a78SGavin Shan 			 * resuming the PE.
67335845a78SGavin Shan 			 */
67435845a78SGavin Shan 			eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
675317f06deSGavin Shan 			result = PCI_ERS_RESULT_RECOVERED;
676317f06deSGavin Shan 		}
67735845a78SGavin Shan 	}
678317f06deSGavin Shan 
679317f06deSGavin Shan 	/* If any device has a hard failure, then shut off everything. */
680317f06deSGavin Shan 	if (result == PCI_ERS_RESULT_DISCONNECT) {
68156ca4fdeSGavin Shan 		pr_warning("EEH: Device driver gave up\n");
682317f06deSGavin Shan 		goto hard_fail;
683317f06deSGavin Shan 	}
684317f06deSGavin Shan 
685317f06deSGavin Shan 	/* If any device called out for a reset, then reset the slot */
686317f06deSGavin Shan 	if (result == PCI_ERS_RESULT_NEED_RESET) {
68756ca4fdeSGavin Shan 		pr_info("EEH: Reset without hotplug activity\n");
688317f06deSGavin Shan 		rc = eeh_reset_device(pe, NULL);
689317f06deSGavin Shan 		if (rc) {
69056ca4fdeSGavin Shan 			pr_warning("%s: Cannot reset, err=%d\n",
69156ca4fdeSGavin Shan 				   __func__, rc);
692317f06deSGavin Shan 			goto hard_fail;
693317f06deSGavin Shan 		}
69456ca4fdeSGavin Shan 
69556ca4fdeSGavin Shan 		pr_info("EEH: Notify device drivers "
69656ca4fdeSGavin Shan 			"the completion of reset\n");
697317f06deSGavin Shan 		result = PCI_ERS_RESULT_NONE;
698317f06deSGavin Shan 		eeh_pe_dev_traverse(pe, eeh_report_reset, &result);
699317f06deSGavin Shan 	}
700317f06deSGavin Shan 
701317f06deSGavin Shan 	/* All devices should claim they have recovered by now. */
702317f06deSGavin Shan 	if ((result != PCI_ERS_RESULT_RECOVERED) &&
703317f06deSGavin Shan 	    (result != PCI_ERS_RESULT_NONE)) {
70456ca4fdeSGavin Shan 		pr_warning("EEH: Not recovered\n");
705317f06deSGavin Shan 		goto hard_fail;
706317f06deSGavin Shan 	}
707317f06deSGavin Shan 
708317f06deSGavin Shan 	/* Tell all device drivers that they can resume operations */
70956ca4fdeSGavin Shan 	pr_info("EEH: Notify device driver to resume\n");
710317f06deSGavin Shan 	eeh_pe_dev_traverse(pe, eeh_report_resume, NULL);
711317f06deSGavin Shan 
712317f06deSGavin Shan 	return;
713317f06deSGavin Shan 
714317f06deSGavin Shan excess_failures:
715317f06deSGavin Shan 	/*
716317f06deSGavin Shan 	 * About 90% of all real-life EEH failures in the field
717317f06deSGavin Shan 	 * are due to poorly seated PCI cards. Only 10% or so are
718317f06deSGavin Shan 	 * due to actual, failed cards.
719317f06deSGavin Shan 	 */
720317f06deSGavin Shan 	pr_err("EEH: PHB#%d-PE#%x has failed %d times in the\n"
721317f06deSGavin Shan 	       "last hour and has been permanently disabled.\n"
722317f06deSGavin Shan 	       "Please try reseating or replacing it.\n",
723317f06deSGavin Shan 		pe->phb->global_number, pe->addr,
724317f06deSGavin Shan 		pe->freeze_count);
725317f06deSGavin Shan 	goto perm_error;
726317f06deSGavin Shan 
727317f06deSGavin Shan hard_fail:
728317f06deSGavin Shan 	pr_err("EEH: Unable to recover from failure from PHB#%d-PE#%x.\n"
729317f06deSGavin Shan 	       "Please try reseating or replacing it\n",
730317f06deSGavin Shan 		pe->phb->global_number, pe->addr);
731317f06deSGavin Shan 
732317f06deSGavin Shan perm_error:
733317f06deSGavin Shan 	eeh_slot_error_detail(pe, EEH_LOG_PERM);
734317f06deSGavin Shan 
735317f06deSGavin Shan 	/* Notify all devices that they're about to go down. */
736317f06deSGavin Shan 	eeh_pe_dev_traverse(pe, eeh_report_failure, NULL);
737317f06deSGavin Shan 
738d2b0f6f7SGavin Shan 	/* Mark the PE to be removed permanently */
739d2b0f6f7SGavin Shan 	pe->freeze_count = EEH_MAX_ALLOWED_FREEZES + 1;
740d2b0f6f7SGavin Shan 
741d2b0f6f7SGavin Shan 	/*
742d2b0f6f7SGavin Shan 	 * Shut down the device drivers for good. We mark
743d2b0f6f7SGavin Shan 	 * all removed devices correctly to avoid access
744d2b0f6f7SGavin Shan 	 * the their PCI config any more.
745d2b0f6f7SGavin Shan 	 */
7461c2042c8SRafael J. Wysocki 	if (frozen_bus) {
747d2b0f6f7SGavin Shan 		eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
748d2b0f6f7SGavin Shan 
7491c2042c8SRafael J. Wysocki 		pci_lock_rescan_remove();
750317f06deSGavin Shan 		pcibios_remove_pci_devices(frozen_bus);
7511c2042c8SRafael J. Wysocki 		pci_unlock_rescan_remove();
7521c2042c8SRafael J. Wysocki 	}
753317f06deSGavin Shan }
7548a6b1bc7SGavin Shan 
7558a6b1bc7SGavin Shan static void eeh_handle_special_event(void)
7568a6b1bc7SGavin Shan {
7578a6b1bc7SGavin Shan 	struct eeh_pe *pe, *phb_pe;
7588a6b1bc7SGavin Shan 	struct pci_bus *bus;
7597e4e7867SGavin Shan 	struct pci_controller *hose;
7608a6b1bc7SGavin Shan 	unsigned long flags;
7617e4e7867SGavin Shan 	int rc;
7628a6b1bc7SGavin Shan 
7637e4e7867SGavin Shan 
7647e4e7867SGavin Shan 	do {
7658a6b1bc7SGavin Shan 		rc = eeh_ops->next_error(&pe);
7668a6b1bc7SGavin Shan 
7678a6b1bc7SGavin Shan 		switch (rc) {
7687e4e7867SGavin Shan 		case EEH_NEXT_ERR_DEAD_IOC:
7698a6b1bc7SGavin Shan 			/* Mark all PHBs in dead state */
7708a6b1bc7SGavin Shan 			eeh_serialize_lock(&flags);
7717e4e7867SGavin Shan 
7727e4e7867SGavin Shan 			/* Purge all events */
7735c7a35e3SGavin Shan 			eeh_remove_event(NULL, true);
7747e4e7867SGavin Shan 
7757e4e7867SGavin Shan 			list_for_each_entry(hose, &hose_list, list_node) {
7768a6b1bc7SGavin Shan 				phb_pe = eeh_phb_pe_get(hose);
7778a6b1bc7SGavin Shan 				if (!phb_pe) continue;
7788a6b1bc7SGavin Shan 
7799e049375SGavin Shan 				eeh_pe_state_mark(phb_pe, EEH_PE_ISOLATED);
7808a6b1bc7SGavin Shan 			}
7817e4e7867SGavin Shan 
7828a6b1bc7SGavin Shan 			eeh_serialize_unlock(flags);
7838a6b1bc7SGavin Shan 
7848a6b1bc7SGavin Shan 			break;
7857e4e7867SGavin Shan 		case EEH_NEXT_ERR_FROZEN_PE:
7867e4e7867SGavin Shan 		case EEH_NEXT_ERR_FENCED_PHB:
7877e4e7867SGavin Shan 		case EEH_NEXT_ERR_DEAD_PHB:
7888a6b1bc7SGavin Shan 			/* Mark the PE in fenced state */
7898a6b1bc7SGavin Shan 			eeh_serialize_lock(&flags);
7907e4e7867SGavin Shan 
7917e4e7867SGavin Shan 			/* Purge all events of the PHB */
7925c7a35e3SGavin Shan 			eeh_remove_event(pe, true);
7937e4e7867SGavin Shan 
7947e4e7867SGavin Shan 			if (rc == EEH_NEXT_ERR_DEAD_PHB)
7959e049375SGavin Shan 				eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
7968a6b1bc7SGavin Shan 			else
7978a6b1bc7SGavin Shan 				eeh_pe_state_mark(pe,
7988a6b1bc7SGavin Shan 					EEH_PE_ISOLATED | EEH_PE_RECOVERING);
7997e4e7867SGavin Shan 
8008a6b1bc7SGavin Shan 			eeh_serialize_unlock(flags);
8018a6b1bc7SGavin Shan 
8028a6b1bc7SGavin Shan 			break;
8037e4e7867SGavin Shan 		case EEH_NEXT_ERR_NONE:
8047e4e7867SGavin Shan 			return;
8058a6b1bc7SGavin Shan 		default:
8067e4e7867SGavin Shan 			pr_warn("%s: Invalid value %d from next_error()\n",
8078a6b1bc7SGavin Shan 				__func__, rc);
8088a6b1bc7SGavin Shan 			return;
8098a6b1bc7SGavin Shan 		}
8108a6b1bc7SGavin Shan 
8118a6b1bc7SGavin Shan 		/*
8128a6b1bc7SGavin Shan 		 * For fenced PHB and frozen PE, it's handled as normal
8138a6b1bc7SGavin Shan 		 * event. We have to remove the affected PHBs for dead
8148a6b1bc7SGavin Shan 		 * PHB and IOC
8158a6b1bc7SGavin Shan 		 */
8167e4e7867SGavin Shan 		if (rc == EEH_NEXT_ERR_FROZEN_PE ||
8177e4e7867SGavin Shan 		    rc == EEH_NEXT_ERR_FENCED_PHB) {
8188a6b1bc7SGavin Shan 			eeh_handle_normal_event(pe);
8199e049375SGavin Shan 			eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
8207e4e7867SGavin Shan 		} else {
8211c2042c8SRafael J. Wysocki 			pci_lock_rescan_remove();
8227e4e7867SGavin Shan 			list_for_each_entry(hose, &hose_list, list_node) {
8238a6b1bc7SGavin Shan 				phb_pe = eeh_phb_pe_get(hose);
8247e4e7867SGavin Shan 				if (!phb_pe ||
8259e049375SGavin Shan 				    !(phb_pe->state & EEH_PE_ISOLATED) ||
8269e049375SGavin Shan 				    (phb_pe->state & EEH_PE_RECOVERING))
8278a6b1bc7SGavin Shan 					continue;
8288a6b1bc7SGavin Shan 
8297e4e7867SGavin Shan 				/* Notify all devices to be down */
8308a6b1bc7SGavin Shan 				bus = eeh_pe_bus_get(phb_pe);
8317e4e7867SGavin Shan 				eeh_pe_dev_traverse(pe,
8327e4e7867SGavin Shan 					eeh_report_failure, NULL);
8338a6b1bc7SGavin Shan 				pcibios_remove_pci_devices(bus);
8348a6b1bc7SGavin Shan 			}
8351c2042c8SRafael J. Wysocki 			pci_unlock_rescan_remove();
8368a6b1bc7SGavin Shan 		}
8377e4e7867SGavin Shan 
8387e4e7867SGavin Shan 		/*
8397e4e7867SGavin Shan 		 * If we have detected dead IOC, we needn't proceed
8407e4e7867SGavin Shan 		 * any more since all PHBs would have been removed
8417e4e7867SGavin Shan 		 */
8427e4e7867SGavin Shan 		if (rc == EEH_NEXT_ERR_DEAD_IOC)
8437e4e7867SGavin Shan 			break;
8447e4e7867SGavin Shan 	} while (rc != EEH_NEXT_ERR_NONE);
8458a6b1bc7SGavin Shan }
8468a6b1bc7SGavin Shan 
/**
 * eeh_handle_event - Entry point for recovering from an EEH error
 * @pe: EEH PE that reported the error, or NULL when no PE is known
 *
 * A PE gets frozen when its PHB detects address or data parity errors
 * on the associated PCI slot. DMA to wild addresses (usually caused by
 * bugs in device drivers or PCI adapter firmware), #SERR, #PERR and
 * other miscellaneous PCI-related errors can trigger EEH errors too.
 *
 * Recovery unplugs the device driver (generating hotplug events to
 * userspace), issues a PCI #RST to the device, reconfigures the PCI
 * config space of all bridges & devices under the slot, and finally
 * restarts the device drivers (generating a second set of hotplug
 * events to userspace).
 *
 * With a non-NULL @pe the error is handled as a normal event on that
 * PE; with a NULL @pe the special-event handler is invoked instead.
 */
void eeh_handle_event(struct eeh_pe *pe)
{
	/* No PE supplied: let the special-event path find the error. */
	if (!pe) {
		eeh_handle_special_event();
		return;
	}

	eeh_handle_normal_event(pe);
}
871