1bbbd7f11SThomas Huth // SPDX-License-Identifier: GPL-2.0-or-later 2317f06deSGavin Shan /* 3317f06deSGavin Shan * PCI Error Recovery Driver for RPA-compliant PPC64 platform. 4317f06deSGavin Shan * Copyright IBM Corp. 2004 2005 5317f06deSGavin Shan * Copyright Linas Vepstas <linas@linas.org> 2004, 2005 6317f06deSGavin Shan * 7317f06deSGavin Shan * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com> 8317f06deSGavin Shan */ 9317f06deSGavin Shan #include <linux/delay.h> 10317f06deSGavin Shan #include <linux/interrupt.h> 11317f06deSGavin Shan #include <linux/irq.h> 12317f06deSGavin Shan #include <linux/module.h> 13317f06deSGavin Shan #include <linux/pci.h> 14b104af5aSOliver O'Halloran #include <linux/pci_hotplug.h> 15317f06deSGavin Shan #include <asm/eeh.h> 16317f06deSGavin Shan #include <asm/eeh_event.h> 17317f06deSGavin Shan #include <asm/ppc-pci.h> 18317f06deSGavin Shan #include <asm/pci-bridge.h> 19317f06deSGavin Shan #include <asm/prom.h> 20317f06deSGavin Shan #include <asm/rtas.h> 21317f06deSGavin Shan 2267086e32SWei Yang struct eeh_rmv_data { 231c5c533bSSam Bobroff struct list_head removed_vf_list; 241c5c533bSSam Bobroff int removed_dev_count; 2567086e32SWei Yang }; 2667086e32SWei Yang 2730424e38SSam Bobroff static int eeh_result_priority(enum pci_ers_result result) 2830424e38SSam Bobroff { 2930424e38SSam Bobroff switch (result) { 3030424e38SSam Bobroff case PCI_ERS_RESULT_NONE: 3130424e38SSam Bobroff return 1; 3230424e38SSam Bobroff case PCI_ERS_RESULT_NO_AER_DRIVER: 3330424e38SSam Bobroff return 2; 3430424e38SSam Bobroff case PCI_ERS_RESULT_RECOVERED: 3530424e38SSam Bobroff return 3; 3630424e38SSam Bobroff case PCI_ERS_RESULT_CAN_RECOVER: 3730424e38SSam Bobroff return 4; 3830424e38SSam Bobroff case PCI_ERS_RESULT_DISCONNECT: 3930424e38SSam Bobroff return 5; 4030424e38SSam Bobroff case PCI_ERS_RESULT_NEED_RESET: 4130424e38SSam Bobroff return 6; 4230424e38SSam Bobroff default: 4330424e38SSam Bobroff WARN_ONCE(1, "Unknown pci_ers_result value: %d\n", (int)result); 4430424e38SSam Bobroff return 0; 4530424e38SSam Bobroff } 4630424e38SSam Bobroff }; 4730424e38SSam Bobroff 48c36c5ffdSBreno Leitao static const char *pci_ers_result_name(enum pci_ers_result result) 4920b34497SSam Bobroff { 5020b34497SSam Bobroff switch (result) { 5120b34497SSam Bobroff case PCI_ERS_RESULT_NONE: 5220b34497SSam Bobroff return "none"; 5320b34497SSam Bobroff case PCI_ERS_RESULT_CAN_RECOVER: 5420b34497SSam Bobroff return "can recover"; 5520b34497SSam Bobroff case PCI_ERS_RESULT_NEED_RESET: 5620b34497SSam Bobroff return "need reset"; 5720b34497SSam Bobroff case PCI_ERS_RESULT_DISCONNECT: 5820b34497SSam Bobroff return "disconnect"; 5920b34497SSam Bobroff case PCI_ERS_RESULT_RECOVERED: 6020b34497SSam Bobroff return "recovered"; 6120b34497SSam Bobroff case PCI_ERS_RESULT_NO_AER_DRIVER: 6220b34497SSam Bobroff return "no AER driver"; 6320b34497SSam Bobroff default: 6420b34497SSam Bobroff WARN_ONCE(1, "Unknown result type: %d\n", (int)result); 6520b34497SSam Bobroff return "unknown"; 6620b34497SSam Bobroff } 6720b34497SSam Bobroff }; 6820b34497SSam Bobroff 6930424e38SSam Bobroff static enum pci_ers_result pci_ers_merge_result(enum pci_ers_result old, 7030424e38SSam Bobroff enum pci_ers_result new) 7130424e38SSam Bobroff { 7230424e38SSam Bobroff if (eeh_result_priority(new) > eeh_result_priority(old)) 7330424e38SSam Bobroff return new; 7430424e38SSam Bobroff return old; 7530424e38SSam Bobroff } 7630424e38SSam Bobroff 77e2b810d5SSam Bobroff static bool eeh_dev_removed(struct eeh_dev *edev) 78e2b810d5SSam Bobroff { 79e2b810d5SSam Bobroff return !edev || (edev->mode & EEH_DEV_REMOVED); 80e2b810d5SSam Bobroff } 81e2b810d5SSam Bobroff 82e2b810d5SSam Bobroff static bool eeh_edev_actionable(struct eeh_dev *edev) 83e2b810d5SSam Bobroff { 8438ddc011SOliver O'Halloran if (!edev->pdev) 8538ddc011SOliver O'Halloran return false; 8638ddc011SOliver O'Halloran if (edev->pdev->error_state == pci_channel_io_perm_failure) 8738ddc011SOliver O'Halloran return false; 8838ddc011SOliver O'Halloran if (eeh_dev_removed(edev)) 8938ddc011SOliver O'Halloran return false; 9038ddc011SOliver O'Halloran if (eeh_pe_passed(edev->pe)) 9138ddc011SOliver O'Halloran return false; 9238ddc011SOliver O'Halloran 9338ddc011SOliver O'Halloran return true; 94e2b810d5SSam Bobroff } 95e2b810d5SSam Bobroff 96317f06deSGavin Shan /** 97317f06deSGavin Shan * eeh_pcid_get - Get the PCI device driver 98317f06deSGavin Shan * @pdev: PCI device 99317f06deSGavin Shan * 100317f06deSGavin Shan * The function is used to retrieve the PCI device driver for 101317f06deSGavin Shan * the indicated PCI device. Besides, we will increase the reference 102317f06deSGavin Shan * of the PCI device driver to prevent that being unloaded on 103317f06deSGavin Shan * the fly. Otherwise, kernel crash would be seen. 104317f06deSGavin Shan */ 105317f06deSGavin Shan static inline struct pci_driver *eeh_pcid_get(struct pci_dev *pdev) 106317f06deSGavin Shan { 107317f06deSGavin Shan if (!pdev || !pdev->driver) 108317f06deSGavin Shan return NULL; 109317f06deSGavin Shan 110317f06deSGavin Shan if (!try_module_get(pdev->driver->driver.owner)) 111317f06deSGavin Shan return NULL; 112317f06deSGavin Shan 113317f06deSGavin Shan return pdev->driver; 114317f06deSGavin Shan } 115317f06deSGavin Shan 116317f06deSGavin Shan /** 117317f06deSGavin Shan * eeh_pcid_put - Dereference on the PCI device driver 118317f06deSGavin Shan * @pdev: PCI device 119317f06deSGavin Shan * 120317f06deSGavin Shan * The function is called to do dereference on the PCI device 121317f06deSGavin Shan * driver of the indicated PCI device. 122317f06deSGavin Shan */ 123317f06deSGavin Shan static inline void eeh_pcid_put(struct pci_dev *pdev) 124317f06deSGavin Shan { 125317f06deSGavin Shan if (!pdev || !pdev->driver) 126317f06deSGavin Shan return; 127317f06deSGavin Shan 128317f06deSGavin Shan module_put(pdev->driver->driver.owner); 129317f06deSGavin Shan } 130317f06deSGavin Shan 131317f06deSGavin Shan /** 132317f06deSGavin Shan * eeh_disable_irq - Disable interrupt for the recovering device 133317f06deSGavin Shan * @dev: PCI device 134317f06deSGavin Shan * 135317f06deSGavin Shan * This routine must be called when reporting temporary or permanent 136317f06deSGavin Shan * error to the particular PCI device to disable interrupt of that 137317f06deSGavin Shan * device. If the device has enabled MSI or MSI-X interrupt, we needn't 138317f06deSGavin Shan * do real work because EEH should freeze DMA transfers for those PCI 139317f06deSGavin Shan * devices encountering EEH errors, which includes MSI or MSI-X. 140317f06deSGavin Shan */ 141010acfa1SSam Bobroff static void eeh_disable_irq(struct eeh_dev *edev) 142317f06deSGavin Shan { 143317f06deSGavin Shan /* Don't disable MSI and MSI-X interrupts. They are 144317f06deSGavin Shan * effectively disabled by the DMA Stopped state 145317f06deSGavin Shan * when an EEH error occurs. 146317f06deSGavin Shan */ 147010acfa1SSam Bobroff if (edev->pdev->msi_enabled || edev->pdev->msix_enabled) 148317f06deSGavin Shan return; 149317f06deSGavin Shan 150010acfa1SSam Bobroff if (!irq_has_action(edev->pdev->irq)) 151317f06deSGavin Shan return; 152317f06deSGavin Shan 153317f06deSGavin Shan edev->mode |= EEH_DEV_IRQ_DISABLED; 154010acfa1SSam Bobroff disable_irq_nosync(edev->pdev->irq); 155317f06deSGavin Shan } 156317f06deSGavin Shan 157317f06deSGavin Shan /** 158317f06deSGavin Shan * eeh_enable_irq - Enable interrupt for the recovering device 159317f06deSGavin Shan * @dev: PCI device 160317f06deSGavin Shan * 161317f06deSGavin Shan * This routine must be called to enable interrupt while failed 162317f06deSGavin Shan * device could be resumed. 163317f06deSGavin Shan */ 164010acfa1SSam Bobroff static void eeh_enable_irq(struct eeh_dev *edev) 165317f06deSGavin Shan { 166317f06deSGavin Shan if ((edev->mode) & EEH_DEV_IRQ_DISABLED) { 167317f06deSGavin Shan edev->mode &= ~EEH_DEV_IRQ_DISABLED; 168b8a9a11bSThomas Gleixner /* 169b8a9a11bSThomas Gleixner * FIXME !!!!! 170b8a9a11bSThomas Gleixner * 171b8a9a11bSThomas Gleixner * This is just ass backwards. This maze has 172b8a9a11bSThomas Gleixner * unbalanced irq_enable/disable calls. So instead of 173b8a9a11bSThomas Gleixner * finding the root cause it works around the warning 174b8a9a11bSThomas Gleixner * in the irq_enable code by conditionally calling 175b8a9a11bSThomas Gleixner * into it. 176b8a9a11bSThomas Gleixner * 177b8a9a11bSThomas Gleixner * That's just wrong.The warning in the core code is 178027dfac6SMichael Ellerman * there to tell people to fix their asymmetries in 179b8a9a11bSThomas Gleixner * their own code, not by abusing the core information 180b8a9a11bSThomas Gleixner * to avoid it. 181b8a9a11bSThomas Gleixner * 182b8a9a11bSThomas Gleixner * I so wish that the assymetry would be the other way 183b8a9a11bSThomas Gleixner * round and a few more irq_disable calls render that 184b8a9a11bSThomas Gleixner * shit unusable forever. 185b8a9a11bSThomas Gleixner * 186b8a9a11bSThomas Gleixner * tglx 187b8a9a11bSThomas Gleixner */ 188010acfa1SSam Bobroff if (irqd_irq_disabled(irq_get_irq_data(edev->pdev->irq))) 189010acfa1SSam Bobroff enable_irq(edev->pdev->irq); 190317f06deSGavin Shan } 19157310c3cSThomas Gleixner } 192317f06deSGavin Shan 193cef50c67SSam Bobroff static void eeh_dev_save_state(struct eeh_dev *edev, void *userdata) 1945cfb20b9SGavin Shan { 1955cfb20b9SGavin Shan struct pci_dev *pdev; 1965cfb20b9SGavin Shan 1975cfb20b9SGavin Shan if (!edev) 198cef50c67SSam Bobroff return; 1995cfb20b9SGavin Shan 2005a0cdbfdSGavin Shan /* 2015a0cdbfdSGavin Shan * We cannot access the config space on some adapters. 2025a0cdbfdSGavin Shan * Otherwise, it will cause fenced PHB. We don't save 2035a0cdbfdSGavin Shan * the content in their config space and will restore 2045a0cdbfdSGavin Shan * from the initial config space saved when the EEH 2055a0cdbfdSGavin Shan * device is created. 2065a0cdbfdSGavin Shan */ 2075a0cdbfdSGavin Shan if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED)) 208cef50c67SSam Bobroff return; 2095a0cdbfdSGavin Shan 2105cfb20b9SGavin Shan pdev = eeh_dev_to_pci_dev(edev); 2115cfb20b9SGavin Shan if (!pdev) 212cef50c67SSam Bobroff return; 2135cfb20b9SGavin Shan 2145cfb20b9SGavin Shan pci_save_state(pdev); 2155cfb20b9SGavin Shan } 2165cfb20b9SGavin Shan 21716d79cd4SLuc Van Oostenryck static void eeh_set_channel_state(struct eeh_pe *root, pci_channel_state_t s) 21847cc8c1cSSam Bobroff { 21947cc8c1cSSam Bobroff struct eeh_pe *pe; 22047cc8c1cSSam Bobroff struct eeh_dev *edev, *tmp; 22147cc8c1cSSam Bobroff 22247cc8c1cSSam Bobroff eeh_for_each_pe(root, pe) 22347cc8c1cSSam Bobroff eeh_pe_for_each_dev(pe, edev, tmp) 22447cc8c1cSSam Bobroff if (eeh_edev_actionable(edev)) 22547cc8c1cSSam Bobroff edev->pdev->error_state = s; 22647cc8c1cSSam Bobroff } 22747cc8c1cSSam Bobroff 228010acfa1SSam Bobroff static void eeh_set_irq_state(struct eeh_pe *root, bool enable) 229010acfa1SSam Bobroff { 230010acfa1SSam Bobroff struct eeh_pe *pe; 231010acfa1SSam Bobroff struct eeh_dev *edev, *tmp; 232010acfa1SSam Bobroff 233010acfa1SSam Bobroff eeh_for_each_pe(root, pe) { 234010acfa1SSam Bobroff eeh_pe_for_each_dev(pe, edev, tmp) { 235010acfa1SSam Bobroff if (!eeh_edev_actionable(edev)) 236010acfa1SSam Bobroff continue; 237010acfa1SSam Bobroff 238010acfa1SSam Bobroff if (!eeh_pcid_get(edev->pdev)) 239010acfa1SSam Bobroff continue; 240010acfa1SSam Bobroff 241010acfa1SSam Bobroff if (enable) 242010acfa1SSam Bobroff eeh_enable_irq(edev); 243010acfa1SSam Bobroff else 244010acfa1SSam Bobroff eeh_disable_irq(edev); 245010acfa1SSam Bobroff 246010acfa1SSam Bobroff eeh_pcid_put(edev->pdev); 247010acfa1SSam Bobroff } 248010acfa1SSam Bobroff } 249010acfa1SSam Bobroff } 250010acfa1SSam Bobroff 25120b34497SSam Bobroff typedef enum pci_ers_result (*eeh_report_fn)(struct eeh_dev *, 2522e255051SSam Bobroff struct pci_dev *, 25320b34497SSam Bobroff struct pci_driver *); 25420b34497SSam Bobroff static void eeh_pe_report_edev(struct eeh_dev *edev, eeh_report_fn fn, 25520b34497SSam Bobroff enum pci_ers_result *result) 25620b34497SSam Bobroff { 2572e255051SSam Bobroff struct pci_dev *pdev; 25820b34497SSam Bobroff struct pci_driver *driver; 25920b34497SSam Bobroff enum pci_ers_result new_result; 26020b34497SSam Bobroff 2612e255051SSam Bobroff pci_lock_rescan_remove(); 2622e255051SSam Bobroff pdev = edev->pdev; 2632e255051SSam Bobroff if (pdev) 2642e255051SSam Bobroff get_device(&pdev->dev); 2652e255051SSam Bobroff pci_unlock_rescan_remove(); 2662e255051SSam Bobroff if (!pdev) { 267bcbe3730SSam Bobroff eeh_edev_info(edev, "no device"); 268bcbe3730SSam Bobroff return; 269bcbe3730SSam Bobroff } 2702e255051SSam Bobroff device_lock(&pdev->dev); 27120b34497SSam Bobroff if (eeh_edev_actionable(edev)) { 2722e255051SSam Bobroff driver = eeh_pcid_get(pdev); 27320b34497SSam Bobroff 27420b34497SSam Bobroff if (!driver) 27520b34497SSam Bobroff eeh_edev_info(edev, "no driver"); 27620b34497SSam Bobroff else if (!driver->err_handler) 27720b34497SSam Bobroff eeh_edev_info(edev, "driver not EEH aware"); 27820b34497SSam Bobroff else if (edev->mode & EEH_DEV_NO_HANDLER) 27920b34497SSam Bobroff eeh_edev_info(edev, "driver bound too late"); 28020b34497SSam Bobroff else { 2812e255051SSam Bobroff new_result = fn(edev, pdev, driver); 28220b34497SSam Bobroff eeh_edev_info(edev, "%s driver reports: '%s'", 28320b34497SSam Bobroff driver->name, 28420b34497SSam Bobroff pci_ers_result_name(new_result)); 28520b34497SSam Bobroff if (result) 28620b34497SSam Bobroff *result = pci_ers_merge_result(*result, 28720b34497SSam Bobroff new_result); 28820b34497SSam Bobroff } 28920b34497SSam Bobroff if (driver) 2902e255051SSam Bobroff eeh_pcid_put(pdev); 29120b34497SSam Bobroff } else { 2922e255051SSam Bobroff eeh_edev_info(edev, "not actionable (%d,%d,%d)", !!pdev, 29320b34497SSam Bobroff !eeh_dev_removed(edev), !eeh_pe_passed(edev->pe)); 29420b34497SSam Bobroff } 2952e255051SSam Bobroff device_unlock(&pdev->dev); 2962e255051SSam Bobroff if (edev->pdev != pdev) 2972e255051SSam Bobroff eeh_edev_warn(edev, "Device changed during processing!\n"); 2982e255051SSam Bobroff put_device(&pdev->dev); 29920b34497SSam Bobroff } 30020b34497SSam Bobroff 30120b34497SSam Bobroff static void eeh_pe_report(const char *name, struct eeh_pe *root, 30220b34497SSam Bobroff eeh_report_fn fn, enum pci_ers_result *result) 30320b34497SSam Bobroff { 30420b34497SSam Bobroff struct eeh_pe *pe; 30520b34497SSam Bobroff struct eeh_dev *edev, *tmp; 30620b34497SSam Bobroff 30720b34497SSam Bobroff pr_info("EEH: Beginning: '%s'\n", name); 30820b34497SSam Bobroff eeh_for_each_pe(root, pe) eeh_pe_for_each_dev(pe, edev, tmp) 30920b34497SSam Bobroff eeh_pe_report_edev(edev, fn, result); 31020b34497SSam Bobroff if (result) 31120b34497SSam Bobroff pr_info("EEH: Finished:'%s' with aggregate recovery state:'%s'\n", 31220b34497SSam Bobroff name, pci_ers_result_name(*result)); 31320b34497SSam Bobroff else 31420b34497SSam Bobroff pr_info("EEH: Finished:'%s'", name); 31520b34497SSam Bobroff } 31620b34497SSam Bobroff 317317f06deSGavin Shan /** 318317f06deSGavin Shan * eeh_report_error - Report pci error to each device driver 31920b34497SSam Bobroff * @edev: eeh device 32020b34497SSam Bobroff * @driver: device's PCI driver 321317f06deSGavin Shan * 32220b34497SSam Bobroff * Report an EEH error to each device driver. 323317f06deSGavin Shan */ 32420b34497SSam Bobroff static enum pci_ers_result eeh_report_error(struct eeh_dev *edev, 3252e255051SSam Bobroff struct pci_dev *pdev, 32620b34497SSam Bobroff struct pci_driver *driver) 327317f06deSGavin Shan { 32820b34497SSam Bobroff enum pci_ers_result rc; 329317f06deSGavin Shan 33020b34497SSam Bobroff if (!driver->err_handler->error_detected) 33120b34497SSam Bobroff return PCI_ERS_RESULT_NONE; 332f0295e04SMichael Neuling 33320b34497SSam Bobroff eeh_edev_info(edev, "Invoking %s->error_detected(IO frozen)", 33420b34497SSam Bobroff driver->name); 3352e255051SSam Bobroff rc = driver->err_handler->error_detected(pdev, pci_channel_io_frozen); 336317f06deSGavin Shan 33767086e32SWei Yang edev->in_error = true; 3382e255051SSam Bobroff pci_uevent_ers(pdev, PCI_ERS_RESULT_NONE); 33920b34497SSam Bobroff return rc; 340317f06deSGavin Shan } 341317f06deSGavin Shan 342317f06deSGavin Shan /** 343317f06deSGavin Shan * eeh_report_mmio_enabled - Tell drivers that MMIO has been enabled 34420b34497SSam Bobroff * @edev: eeh device 34520b34497SSam Bobroff * @driver: device's PCI driver 346317f06deSGavin Shan * 347317f06deSGavin Shan * Tells each device driver that IO ports, MMIO and config space I/O 34820b34497SSam Bobroff * are now enabled. 349317f06deSGavin Shan */ 35020b34497SSam Bobroff static enum pci_ers_result eeh_report_mmio_enabled(struct eeh_dev *edev, 3512e255051SSam Bobroff struct pci_dev *pdev, 35220b34497SSam Bobroff struct pci_driver *driver) 353317f06deSGavin Shan { 35420b34497SSam Bobroff if (!driver->err_handler->mmio_enabled) 35520b34497SSam Bobroff return PCI_ERS_RESULT_NONE; 35620b34497SSam Bobroff eeh_edev_info(edev, "Invoking %s->mmio_enabled()", driver->name); 3572e255051SSam Bobroff return driver->err_handler->mmio_enabled(pdev); 358317f06deSGavin Shan } 359317f06deSGavin Shan 360317f06deSGavin Shan /** 361317f06deSGavin Shan * eeh_report_reset - Tell device that slot has been reset 36220b34497SSam Bobroff * @edev: eeh device 36320b34497SSam Bobroff * @driver: device's PCI driver 364317f06deSGavin Shan * 365317f06deSGavin Shan * This routine must be called while EEH tries to reset particular 366317f06deSGavin Shan * PCI device so that the associated PCI device driver could take 367317f06deSGavin Shan * some actions, usually to save data the driver needs so that the 368317f06deSGavin Shan * driver can work again while the device is recovered. 369317f06deSGavin Shan */ 37020b34497SSam Bobroff static enum pci_ers_result eeh_report_reset(struct eeh_dev *edev, 3712e255051SSam Bobroff struct pci_dev *pdev, 37220b34497SSam Bobroff struct pci_driver *driver) 373317f06deSGavin Shan { 37420b34497SSam Bobroff if (!driver->err_handler->slot_reset || !edev->in_error) 37520b34497SSam Bobroff return PCI_ERS_RESULT_NONE; 37620b34497SSam Bobroff eeh_edev_info(edev, "Invoking %s->slot_reset()", driver->name); 3772e255051SSam Bobroff return driver->err_handler->slot_reset(pdev); 378317f06deSGavin Shan } 379317f06deSGavin Shan 380cef50c67SSam Bobroff static void eeh_dev_restore_state(struct eeh_dev *edev, void *userdata) 3815cfb20b9SGavin Shan { 3825cfb20b9SGavin Shan struct pci_dev *pdev; 3835cfb20b9SGavin Shan 3845cfb20b9SGavin Shan if (!edev) 385cef50c67SSam Bobroff return; 3865cfb20b9SGavin Shan 3875a0cdbfdSGavin Shan /* 3885a0cdbfdSGavin Shan * The content in the config space isn't saved because 3895a0cdbfdSGavin Shan * the blocked config space on some adapters. We have 3905a0cdbfdSGavin Shan * to restore the initial saved config space when the 3915a0cdbfdSGavin Shan * EEH device is created. 3925a0cdbfdSGavin Shan */ 3935a0cdbfdSGavin Shan if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED)) { 39480e65b00SSam Bobroff if (list_is_last(&edev->entry, &edev->pe->edevs)) 3955a0cdbfdSGavin Shan eeh_pe_restore_bars(edev->pe); 3965a0cdbfdSGavin Shan 397cef50c67SSam Bobroff return; 3985a0cdbfdSGavin Shan } 3995a0cdbfdSGavin Shan 4005cfb20b9SGavin Shan pdev = eeh_dev_to_pci_dev(edev); 4015cfb20b9SGavin Shan if (!pdev) 402cef50c67SSam Bobroff return; 4035cfb20b9SGavin Shan 4045cfb20b9SGavin Shan pci_restore_state(pdev); 4055cfb20b9SGavin Shan } 4065cfb20b9SGavin Shan 407317f06deSGavin Shan /** 408317f06deSGavin Shan * eeh_report_resume - Tell device to resume normal operations 40920b34497SSam Bobroff * @edev: eeh device 41020b34497SSam Bobroff * @driver: device's PCI driver 411317f06deSGavin Shan * 412317f06deSGavin Shan * This routine must be called to notify the device driver that it 413317f06deSGavin Shan * could resume so that the device driver can do some initialization 414317f06deSGavin Shan * to make the recovered device work again. 415317f06deSGavin Shan */ 41620b34497SSam Bobroff static enum pci_ers_result eeh_report_resume(struct eeh_dev *edev, 4172e255051SSam Bobroff struct pci_dev *pdev, 41820b34497SSam Bobroff struct pci_driver *driver) 419317f06deSGavin Shan { 42020b34497SSam Bobroff if (!driver->err_handler->resume || !edev->in_error) 42120b34497SSam Bobroff return PCI_ERS_RESULT_NONE; 422317f06deSGavin Shan 42320b34497SSam Bobroff eeh_edev_info(edev, "Invoking %s->resume()", driver->name); 4242e255051SSam Bobroff driver->err_handler->resume(pdev); 425f0295e04SMichael Neuling 42620b34497SSam Bobroff pci_uevent_ers(edev->pdev, PCI_ERS_RESULT_RECOVERED); 427856e1eb9SBryant G. Ly #ifdef CONFIG_PCI_IOV 428521ca5a9SJuan J. Alvarez if (eeh_ops->notify_resume && eeh_dev_to_pdn(edev)) 429856e1eb9SBryant G. Ly eeh_ops->notify_resume(eeh_dev_to_pdn(edev)); 430856e1eb9SBryant G. Ly #endif 43120b34497SSam Bobroff return PCI_ERS_RESULT_NONE; 432317f06deSGavin Shan } 433317f06deSGavin Shan 434317f06deSGavin Shan /** 435317f06deSGavin Shan * eeh_report_failure - Tell device driver that device is dead. 43620b34497SSam Bobroff * @edev: eeh device 43720b34497SSam Bobroff * @driver: device's PCI driver 438317f06deSGavin Shan * 439317f06deSGavin Shan * This informs the device driver that the device is permanently 440317f06deSGavin Shan * dead, and that no further recovery attempts will be made on it. 441317f06deSGavin Shan */ 44220b34497SSam Bobroff static enum pci_ers_result eeh_report_failure(struct eeh_dev *edev, 4432e255051SSam Bobroff struct pci_dev *pdev, 44420b34497SSam Bobroff struct pci_driver *driver) 445317f06deSGavin Shan { 44620b34497SSam Bobroff enum pci_ers_result rc; 447317f06deSGavin Shan 44820b34497SSam Bobroff if (!driver->err_handler->error_detected) 44920b34497SSam Bobroff return PCI_ERS_RESULT_NONE; 450f0295e04SMichael Neuling 45120b34497SSam Bobroff eeh_edev_info(edev, "Invoking %s->error_detected(permanent failure)", 45220b34497SSam Bobroff driver->name); 4532e255051SSam Bobroff rc = driver->err_handler->error_detected(pdev, 45420b34497SSam Bobroff pci_channel_io_perm_failure); 455317f06deSGavin Shan 4562e255051SSam Bobroff pci_uevent_ers(pdev, PCI_ERS_RESULT_DISCONNECT); 45720b34497SSam Bobroff return rc; 458317f06deSGavin Shan } 459317f06deSGavin Shan 460bf773df9SSam Bobroff static void *eeh_add_virt_device(struct eeh_dev *edev) 46167086e32SWei Yang { 46267086e32SWei Yang struct pci_driver *driver; 46367086e32SWei Yang struct pci_dev *dev = eeh_dev_to_pci_dev(edev); 46467086e32SWei Yang 46567086e32SWei Yang if (!(edev->physfn)) { 4661ff8f36fSSam Bobroff eeh_edev_warn(edev, "Not for VF\n"); 46767086e32SWei Yang return NULL; 46867086e32SWei Yang } 46967086e32SWei Yang 47067086e32SWei Yang driver = eeh_pcid_get(dev); 47167086e32SWei Yang if (driver) { 47246d4be41SSam Bobroff if (driver->err_handler) { 47367086e32SWei Yang eeh_pcid_put(dev); 47467086e32SWei Yang return NULL; 47567086e32SWei Yang } 47646d4be41SSam Bobroff eeh_pcid_put(dev); 47746d4be41SSam Bobroff } 47867086e32SWei Yang 479988fc3baSBryant G. Ly #ifdef CONFIG_PCI_IOV 4801ff8f36fSSam Bobroff pci_iov_add_virtfn(edev->physfn, eeh_dev_to_pdn(edev)->vf_index); 48167086e32SWei Yang #endif 48267086e32SWei Yang return NULL; 48367086e32SWei Yang } 48467086e32SWei Yang 485cef50c67SSam Bobroff static void eeh_rmv_device(struct eeh_dev *edev, void *userdata) 486f5c57710SGavin Shan { 487f5c57710SGavin Shan struct pci_driver *driver; 488f5c57710SGavin Shan struct pci_dev *dev = eeh_dev_to_pci_dev(edev); 48967086e32SWei Yang struct eeh_rmv_data *rmv_data = (struct eeh_rmv_data *)userdata; 490f5c57710SGavin Shan 491f5c57710SGavin Shan /* 492f5c57710SGavin Shan * Actually, we should remove the PCI bridges as well. 493f5c57710SGavin Shan * However, that's lots of complexity to do that, 494f5c57710SGavin Shan * particularly some of devices under the bridge might 495f5c57710SGavin Shan * support EEH. So we just care about PCI devices for 496f5c57710SGavin Shan * simplicity here. 497f5c57710SGavin Shan */ 4981ef52073SSam Bobroff if (!eeh_edev_actionable(edev) || 4991ef52073SSam Bobroff (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) 500cef50c67SSam Bobroff return; 501d2b0f6f7SGavin Shan 5021c5c533bSSam Bobroff if (rmv_data) { 503f5c57710SGavin Shan driver = eeh_pcid_get(dev); 5048cc6b6cdSThadeu Lima de Souza Cascardo if (driver) { 50546d4be41SSam Bobroff if (driver->err_handler && 506f2da4ccfSGavin Shan driver->err_handler->error_detected && 50746d4be41SSam Bobroff driver->err_handler->slot_reset) { 50846d4be41SSam Bobroff eeh_pcid_put(dev); 509cef50c67SSam Bobroff return; 5108cc6b6cdSThadeu Lima de Souza Cascardo } 51146d4be41SSam Bobroff eeh_pcid_put(dev); 51246d4be41SSam Bobroff } 51346d4be41SSam Bobroff } 514f5c57710SGavin Shan 515f5c57710SGavin Shan /* Remove it from PCI subsystem */ 5161ef52073SSam Bobroff pr_info("EEH: Removing %s without EEH sensitive driver\n", 517f5c57710SGavin Shan pci_name(dev)); 518f5c57710SGavin Shan edev->mode |= EEH_DEV_DISCONNECTED; 5191c5c533bSSam Bobroff if (rmv_data) 5201c5c533bSSam Bobroff rmv_data->removed_dev_count++; 521f5c57710SGavin Shan 52267086e32SWei Yang if (edev->physfn) { 523988fc3baSBryant G. Ly #ifdef CONFIG_PCI_IOV 52467086e32SWei Yang struct pci_dn *pdn = eeh_dev_to_pdn(edev); 52567086e32SWei Yang 526753f6124SJan H. Schönherr pci_iov_remove_virtfn(edev->physfn, pdn->vf_index); 52767086e32SWei Yang edev->pdev = NULL; 52867086e32SWei Yang #endif 52967086e32SWei Yang if (rmv_data) 5301c5c533bSSam Bobroff list_add(&edev->rmv_entry, &rmv_data->removed_vf_list); 53167086e32SWei Yang } else { 5321c2042c8SRafael J. Wysocki pci_lock_rescan_remove(); 533f5c57710SGavin Shan pci_stop_and_remove_bus_device(dev); 5341c2042c8SRafael J. Wysocki pci_unlock_rescan_remove(); 53567086e32SWei Yang } 536f5c57710SGavin Shan } 537f5c57710SGavin Shan 538d6c4932fSSam Bobroff static void *eeh_pe_detach_dev(struct eeh_pe *pe, void *userdata) 539f5c57710SGavin Shan { 540f5c57710SGavin Shan struct eeh_dev *edev, *tmp; 541f5c57710SGavin Shan 542f5c57710SGavin Shan eeh_pe_for_each_dev(pe, edev, tmp) { 543f5c57710SGavin Shan if (!(edev->mode & EEH_DEV_DISCONNECTED)) 544f5c57710SGavin Shan continue; 545f5c57710SGavin Shan 546f5c57710SGavin Shan edev->mode &= ~(EEH_DEV_DISCONNECTED | EEH_DEV_IRQ_DISABLED); 547f5c57710SGavin Shan eeh_rmv_from_parent_pe(edev); 548f5c57710SGavin Shan } 549f5c57710SGavin Shan 550f5c57710SGavin Shan return NULL; 551f5c57710SGavin Shan } 552f5c57710SGavin Shan 55378954700SGavin Shan /* 55478954700SGavin Shan * Explicitly clear PE's frozen state for PowerNV where 55578954700SGavin Shan * we have frozen PE until BAR restore is completed. It's 55678954700SGavin Shan * harmless to clear it for pSeries. To be consistent with 55778954700SGavin Shan * PE reset (for 3 times), we try to clear the frozen state 55878954700SGavin Shan * for 3 times as well. 55978954700SGavin Shan */ 5604d8e325dSSam Bobroff static int eeh_clear_pe_frozen_state(struct eeh_pe *root, bool include_passed) 56178954700SGavin Shan { 5623376cb91SSam Bobroff struct eeh_pe *pe; 5633376cb91SSam Bobroff int i; 56478954700SGavin Shan 5653376cb91SSam Bobroff eeh_for_each_pe(root, pe) { 5664d8e325dSSam Bobroff if (include_passed || !eeh_pe_passed(pe)) { 5673376cb91SSam Bobroff for (i = 0; i < 3; i++) 568188fdea6SSam Bobroff if (!eeh_unfreeze_pe(pe)) 5693376cb91SSam Bobroff break; 5703376cb91SSam Bobroff if (i >= 3) 5713376cb91SSam Bobroff return -EIO; 5722c665992SGavin Shan } 5734d8e325dSSam Bobroff } 5744d8e325dSSam Bobroff eeh_pe_state_clear(root, EEH_PE_ISOLATED, include_passed); 5753376cb91SSam Bobroff return 0; 57678954700SGavin Shan } 57778954700SGavin Shan 5785cfb20b9SGavin Shan int eeh_pe_reset_and_recover(struct eeh_pe *pe) 5795cfb20b9SGavin Shan { 5802efc771fSGavin Shan int ret; 5815cfb20b9SGavin Shan 5825cfb20b9SGavin Shan /* Bail if the PE is being recovered */ 5835cfb20b9SGavin Shan if (pe->state & EEH_PE_RECOVERING) 5845cfb20b9SGavin Shan return 0; 5855cfb20b9SGavin Shan 5865cfb20b9SGavin Shan /* Put the PE into recovery mode */ 5875cfb20b9SGavin Shan eeh_pe_state_mark(pe, EEH_PE_RECOVERING); 5885cfb20b9SGavin Shan 5895cfb20b9SGavin Shan /* Save states */ 5905cfb20b9SGavin Shan eeh_pe_dev_traverse(pe, eeh_dev_save_state, NULL); 5915cfb20b9SGavin Shan 5925cfb20b9SGavin Shan /* Issue reset */ 5931ef52073SSam Bobroff ret = eeh_pe_reset_full(pe, true); 5945cfb20b9SGavin Shan if (ret) { 5959ed5ca66SSam Bobroff eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); 5965cfb20b9SGavin Shan return ret; 5975cfb20b9SGavin Shan } 5985cfb20b9SGavin Shan 5995cfb20b9SGavin Shan /* Unfreeze the PE */ 6004d8e325dSSam Bobroff ret = eeh_clear_pe_frozen_state(pe, true); 6015cfb20b9SGavin Shan if (ret) { 6029ed5ca66SSam Bobroff eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); 6035cfb20b9SGavin Shan return ret; 6045cfb20b9SGavin Shan } 6055cfb20b9SGavin Shan 6065cfb20b9SGavin Shan /* Restore device state */ 6075cfb20b9SGavin Shan eeh_pe_dev_traverse(pe, eeh_dev_restore_state, NULL); 6085cfb20b9SGavin Shan 6095cfb20b9SGavin Shan /* Clear recovery mode */ 6109ed5ca66SSam Bobroff eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); 6115cfb20b9SGavin Shan 6125cfb20b9SGavin Shan return 0; 6135cfb20b9SGavin Shan } 6145cfb20b9SGavin Shan 615317f06deSGavin Shan /** 616317f06deSGavin Shan * eeh_reset_device - Perform actual reset of a pci slot 6175fd13460SSam Bobroff * @driver_eeh_aware: Does the device's driver provide EEH support? 618317f06deSGavin Shan * @pe: EEH PE 619317f06deSGavin Shan * @bus: PCI bus corresponding to the isolcated slot 6205fd13460SSam Bobroff * @rmv_data: Optional, list to record removed devices 621317f06deSGavin Shan * 622317f06deSGavin Shan * This routine must be called to do reset on the indicated PE. 623317f06deSGavin Shan * During the reset, udev might be invoked because those affected 624317f06deSGavin Shan * PCI devices will be removed and then added. 625317f06deSGavin Shan */ 62667086e32SWei Yang static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, 6275fd13460SSam Bobroff struct eeh_rmv_data *rmv_data, 6285fd13460SSam Bobroff bool driver_eeh_aware) 629317f06deSGavin Shan { 630edfd17ffSArnd Bergmann time64_t tstamp; 63167086e32SWei Yang int cnt, rc; 63267086e32SWei Yang struct eeh_dev *edev; 6331ef52073SSam Bobroff struct eeh_pe *tmp_pe; 6341ef52073SSam Bobroff bool any_passed = false; 6351ef52073SSam Bobroff 6361ef52073SSam Bobroff eeh_for_each_pe(pe, tmp_pe) 6371ef52073SSam Bobroff any_passed |= eeh_pe_passed(tmp_pe); 638317f06deSGavin Shan 639317f06deSGavin Shan /* pcibios will clear the counter; save the value */ 640317f06deSGavin Shan cnt = pe->freeze_count; 6415a71978eSGavin Shan tstamp = pe->tstamp; 642317f06deSGavin Shan 643317f06deSGavin Shan /* 644317f06deSGavin Shan * We don't remove the corresponding PE instances because 645317f06deSGavin Shan * we need the information afterwords. The attached EEH 646317f06deSGavin Shan * devices are expected to be attached soon when calling 647bd251b89SGavin Shan * into pci_hp_add_devices(). 648317f06deSGavin Shan */ 649807a827dSGavin Shan eeh_pe_state_mark(pe, EEH_PE_KEEP); 6501ef52073SSam Bobroff if (any_passed || driver_eeh_aware || (pe->type & EEH_PE_VF)) { 65154048cf8SSam Bobroff eeh_pe_dev_traverse(pe, eeh_rmv_device, rmv_data); 65267086e32SWei Yang } else { 6531c2042c8SRafael J. Wysocki pci_lock_rescan_remove(); 654bd251b89SGavin Shan pci_hp_remove_devices(bus); 6551c2042c8SRafael J. Wysocki pci_unlock_rescan_remove(); 65667086e32SWei Yang } 657317f06deSGavin Shan 658d0914f50SGavin Shan /* 659d0914f50SGavin Shan * Reset the pci controller. (Asserts RST#; resets config space). 660317f06deSGavin Shan * Reconfigure bridges and devices. Don't try to bring the system 661317f06deSGavin Shan * up if the reset failed for some reason. 662d0914f50SGavin Shan * 663d0914f50SGavin Shan * During the reset, it's very dangerous to have uncontrolled PCI 664d0914f50SGavin Shan * config accesses. So we prefer to block them. However, controlled 665d0914f50SGavin Shan * PCI config accesses initiated from EEH itself are allowed. 666317f06deSGavin Shan */ 6671ef52073SSam Bobroff rc = eeh_pe_reset_full(pe, false); 66828bf36f9SGavin Shan if (rc) 669317f06deSGavin Shan return rc; 670317f06deSGavin Shan 6711c2042c8SRafael J. Wysocki pci_lock_rescan_remove(); 6721c2042c8SRafael J. Wysocki 673317f06deSGavin Shan /* Restore PE */ 674317f06deSGavin Shan eeh_ops->configure_bridge(pe); 675317f06deSGavin Shan eeh_pe_restore_bars(pe); 676317f06deSGavin Shan 677dc9c41bdSAndrew Donnellan /* Clear frozen state */ 6781ef52073SSam Bobroff rc = eeh_clear_pe_frozen_state(pe, false); 679409bf7f8SAndrew Donnellan if (rc) { 680409bf7f8SAndrew Donnellan pci_unlock_rescan_remove(); 68178954700SGavin Shan return rc; 682409bf7f8SAndrew Donnellan } 68378954700SGavin Shan 684317f06deSGavin Shan /* Give the system 5 seconds to finish running the user-space 685317f06deSGavin Shan * hotplug shutdown scripts, e.g. ifdown for ethernet. Yes, 686317f06deSGavin Shan * this is a hack, but if we don't do this, and try to bring 687317f06deSGavin Shan * the device up before the scripts have taken it down, 688317f06deSGavin Shan * potentially weird things happen. 689317f06deSGavin Shan */ 6901c5c533bSSam Bobroff if (!driver_eeh_aware || rmv_data->removed_dev_count) { 69154048cf8SSam Bobroff pr_info("EEH: Sleep 5s ahead of %s hotplug\n", 69254048cf8SSam Bobroff (driver_eeh_aware ? "partial" : "complete")); 693317f06deSGavin Shan ssleep(5); 694f5c57710SGavin Shan 695f5c57710SGavin Shan /* 696f5c57710SGavin Shan * The EEH device is still connected with its parent 697f5c57710SGavin Shan * PE. We should disconnect it so the binding can be 698f5c57710SGavin Shan * rebuilt when adding PCI devices. 699f5c57710SGavin Shan */ 70080e65b00SSam Bobroff edev = list_first_entry(&pe->edevs, struct eeh_dev, entry); 701f5c57710SGavin Shan eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL); 702a3aa256bSGavin Shan if (pe->type & EEH_PE_VF) { 703bf773df9SSam Bobroff eeh_add_virt_device(edev); 704a3aa256bSGavin Shan } else { 70554048cf8SSam Bobroff if (!driver_eeh_aware) 7069ed5ca66SSam Bobroff eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true); 707bd251b89SGavin Shan pci_hp_add_devices(bus); 708a3aa256bSGavin Shan } 709317f06deSGavin Shan } 7109ed5ca66SSam Bobroff eeh_pe_state_clear(pe, EEH_PE_KEEP, true); 7115a71978eSGavin Shan 7125a71978eSGavin Shan pe->tstamp = tstamp; 713317f06deSGavin Shan pe->freeze_count = cnt; 714317f06deSGavin Shan 7151c2042c8SRafael J. Wysocki pci_unlock_rescan_remove(); 716317f06deSGavin Shan return 0; 717317f06deSGavin Shan } 718317f06deSGavin Shan 719317f06deSGavin Shan /* The longest amount of time to wait for a pci device 720317f06deSGavin Shan * to come back on line, in seconds. 721317f06deSGavin Shan */ 722fb48dc22SBrian King #define MAX_WAIT_FOR_RECOVERY 300 723317f06deSGavin Shan 724799abe28SOliver O'Halloran 725799abe28SOliver O'Halloran /* Walks the PE tree after processing an event to remove any stale PEs. 726799abe28SOliver O'Halloran * 727799abe28SOliver O'Halloran * NB: This needs to be recursive to ensure the leaf PEs get removed 728799abe28SOliver O'Halloran * before their parents do. Although this is possible to do recursively 729799abe28SOliver O'Halloran * we don't since this is easier to read and we need to garantee 730799abe28SOliver O'Halloran * the leaf nodes will be handled first. 731799abe28SOliver O'Halloran */ 732799abe28SOliver O'Halloran static void eeh_pe_cleanup(struct eeh_pe *pe) 733799abe28SOliver O'Halloran { 734799abe28SOliver O'Halloran struct eeh_pe *child_pe, *tmp; 735799abe28SOliver O'Halloran 736799abe28SOliver O'Halloran list_for_each_entry_safe(child_pe, tmp, &pe->child_list, child) 737799abe28SOliver O'Halloran eeh_pe_cleanup(child_pe); 738799abe28SOliver O'Halloran 739799abe28SOliver O'Halloran if (pe->state & EEH_PE_KEEP) 740799abe28SOliver O'Halloran return; 741799abe28SOliver O'Halloran 742799abe28SOliver O'Halloran if (!(pe->state & EEH_PE_INVALID)) 743799abe28SOliver O'Halloran return; 744799abe28SOliver O'Halloran 745799abe28SOliver O'Halloran if (list_empty(&pe->edevs) && list_empty(&pe->child_list)) { 746799abe28SOliver O'Halloran list_del(&pe->child); 747799abe28SOliver O'Halloran kfree(pe); 748799abe28SOliver O'Halloran } 749799abe28SOliver O'Halloran } 750799abe28SOliver O'Halloran 751c0b64978SRussell Currey /** 752b104af5aSOliver O'Halloran * eeh_check_slot_presence - Check if a device is still present in a slot 753b104af5aSOliver O'Halloran * @pdev: pci_dev to check 754b104af5aSOliver O'Halloran * 755b104af5aSOliver O'Halloran * This function may return a false positive if we can't determine the slot's 756b104af5aSOliver O'Halloran * presence state. This might happen for for PCIe slots if the PE containing 757b104af5aSOliver O'Halloran * the upstream bridge is also frozen, or the bridge is part of the same PE 758b104af5aSOliver O'Halloran * as the device. 759b104af5aSOliver O'Halloran * 760b104af5aSOliver O'Halloran * This shouldn't happen often, but you might see it if you hotplug a PCIe 761b104af5aSOliver O'Halloran * switch. 762b104af5aSOliver O'Halloran */ 763b104af5aSOliver O'Halloran static bool eeh_slot_presence_check(struct pci_dev *pdev) 764b104af5aSOliver O'Halloran { 765b104af5aSOliver O'Halloran const struct hotplug_slot_ops *ops; 766b104af5aSOliver O'Halloran struct pci_slot *slot; 767b104af5aSOliver O'Halloran u8 state; 768b104af5aSOliver O'Halloran int rc; 769b104af5aSOliver O'Halloran 770b104af5aSOliver O'Halloran if (!pdev) 771b104af5aSOliver O'Halloran return false; 772b104af5aSOliver O'Halloran 773b104af5aSOliver O'Halloran if (pdev->error_state == pci_channel_io_perm_failure) 774b104af5aSOliver O'Halloran return false; 775b104af5aSOliver O'Halloran 776b104af5aSOliver O'Halloran slot = pdev->slot; 777b104af5aSOliver O'Halloran if (!slot || !slot->hotplug) 778b104af5aSOliver O'Halloran return true; 779b104af5aSOliver O'Halloran 780b104af5aSOliver O'Halloran ops = slot->hotplug->ops; 781b104af5aSOliver O'Halloran if (!ops || !ops->get_adapter_status) 782b104af5aSOliver O'Halloran return true; 783b104af5aSOliver O'Halloran 784aeff27c1SOliver O'Halloran /* set the attention indicator while we've got the slot ops */ 785aeff27c1SOliver O'Halloran if (ops->set_attention_status) 786aeff27c1SOliver O'Halloran ops->set_attention_status(slot->hotplug, 1); 787aeff27c1SOliver O'Halloran 788b104af5aSOliver O'Halloran rc = ops->get_adapter_status(slot->hotplug, &state); 789b104af5aSOliver O'Halloran if (rc) 790b104af5aSOliver O'Halloran return true; 791b104af5aSOliver O'Halloran 792b104af5aSOliver O'Halloran return !!state; 793b104af5aSOliver O'Halloran } 794b104af5aSOliver O'Halloran 795aeff27c1SOliver O'Halloran static void eeh_clear_slot_attention(struct pci_dev *pdev) 796aeff27c1SOliver O'Halloran { 797aeff27c1SOliver O'Halloran const struct hotplug_slot_ops *ops; 798aeff27c1SOliver O'Halloran struct pci_slot *slot; 799aeff27c1SOliver O'Halloran 800aeff27c1SOliver O'Halloran if (!pdev) 801aeff27c1SOliver O'Halloran return; 802aeff27c1SOliver O'Halloran 803aeff27c1SOliver O'Halloran if (pdev->error_state == pci_channel_io_perm_failure) 804aeff27c1SOliver O'Halloran return; 805aeff27c1SOliver O'Halloran 806aeff27c1SOliver O'Halloran slot = pdev->slot; 807aeff27c1SOliver O'Halloran if (!slot || !slot->hotplug) 808aeff27c1SOliver O'Halloran return; 809aeff27c1SOliver O'Halloran 810aeff27c1SOliver O'Halloran ops = slot->hotplug->ops; 811aeff27c1SOliver O'Halloran if (!ops || !ops->set_attention_status) 812aeff27c1SOliver O'Halloran return; 813aeff27c1SOliver O'Halloran 814aeff27c1SOliver O'Halloran ops->set_attention_status(slot->hotplug, 0); 815aeff27c1SOliver O'Halloran } 816aeff27c1SOliver O'Halloran 817b104af5aSOliver O'Halloran /** 818c0b64978SRussell Currey * eeh_handle_normal_event - Handle EEH events on a specific PE 81937fd8125SSam Bobroff * @pe: EEH PE - which should not be used after we return, as it may 82037fd8125SSam Bobroff * have been invalidated. 821c0b64978SRussell Currey * 822c0b64978SRussell Currey * Attempts to recover the given PE. If recovery fails or the PE has failed 823c0b64978SRussell Currey * too many times, remove the PE. 824c0b64978SRussell Currey * 82568701780SSam Bobroff * While PHB detects address or data parity errors on particular PCI 82668701780SSam Bobroff * slot, the associated PE will be frozen. Besides, DMA's occurring 82768701780SSam Bobroff * to wild addresses (which usually happen due to bugs in device 82868701780SSam Bobroff * drivers or in PCI adapter firmware) can cause EEH error. #SERR, 82968701780SSam Bobroff * #PERR or other misc PCI-related errors also can trigger EEH errors. 83068701780SSam Bobroff * 83168701780SSam Bobroff * Recovery process consists of unplugging the device driver (which 83268701780SSam Bobroff * generated hotplug events to userspace), then issuing a PCI #RST to 83368701780SSam Bobroff * the device, then reconfiguring the PCI config space for all bridges 83468701780SSam Bobroff * & devices under this slot, and then finally restarting the device 83568701780SSam Bobroff * drivers (which cause a second set of hotplug events to go out to 83668701780SSam Bobroff * userspace). 837c0b64978SRussell Currey */ 83837fd8125SSam Bobroff void eeh_handle_normal_event(struct eeh_pe *pe) 839317f06deSGavin Shan { 840cd95f804SSam Bobroff struct pci_bus *bus; 84167086e32SWei Yang struct eeh_dev *edev, *tmp; 842665012c5SSam Bobroff struct eeh_pe *tmp_pe; 843317f06deSGavin Shan int rc = 0; 844317f06deSGavin Shan enum pci_ers_result result = PCI_ERS_RESULT_NONE; 8451c5c533bSSam Bobroff struct eeh_rmv_data rmv_data = 8461c5c533bSSam Bobroff {LIST_HEAD_INIT(rmv_data.removed_vf_list), 0}; 847b104af5aSOliver O'Halloran int devices = 0; 848317f06deSGavin Shan 849cd95f804SSam Bobroff bus = eeh_pe_bus_get(pe); 850cd95f804SSam Bobroff if (!bus) { 8511f52f176SRussell Currey pr_err("%s: Cannot find PCI bus for PHB#%x-PE#%x\n", 852317f06deSGavin Shan __func__, pe->phb->global_number, pe->addr); 85337fd8125SSam Bobroff return; 854317f06deSGavin Shan } 855317f06deSGavin Shan 856b104af5aSOliver O'Halloran /* 857b104af5aSOliver O'Halloran * When devices are hot-removed we might get an EEH due to 858b104af5aSOliver O'Halloran * a driver attempting to touch the MMIO space of a removed 859b104af5aSOliver O'Halloran * device. In this case we don't have a device to recover 860b104af5aSOliver O'Halloran * so suppress the event if we can't find any present devices. 861b104af5aSOliver O'Halloran * 862b104af5aSOliver O'Halloran * The hotplug driver should take care of tearing down the 863b104af5aSOliver O'Halloran * device itself. 864b104af5aSOliver O'Halloran */ 865b104af5aSOliver O'Halloran eeh_for_each_pe(pe, tmp_pe) 866b104af5aSOliver O'Halloran eeh_pe_for_each_dev(tmp_pe, edev, tmp) 867b104af5aSOliver O'Halloran if (eeh_slot_presence_check(edev->pdev)) 868b104af5aSOliver O'Halloran devices++; 869b104af5aSOliver O'Halloran 87025baf3d8SOliver O'Halloran if (!devices) { 87125baf3d8SOliver O'Halloran pr_debug("EEH: Frozen PHB#%x-PE#%x is empty!\n", 87225baf3d8SOliver O'Halloran pe->phb->global_number, pe->addr); 873b104af5aSOliver O'Halloran goto out; /* nothing to recover */ 87425baf3d8SOliver O'Halloran } 87525baf3d8SOliver O'Halloran 87625baf3d8SOliver O'Halloran /* Log the event */ 87725baf3d8SOliver O'Halloran if (pe->type & EEH_PE_PHB) { 878de84ffc3SSam Bobroff pr_err("EEH: Recovering PHB#%x, location: %s\n", 87925baf3d8SOliver O'Halloran pe->phb->global_number, eeh_pe_loc_get(pe)); 88025baf3d8SOliver O'Halloran } else { 88125baf3d8SOliver O'Halloran struct eeh_pe *phb_pe = eeh_phb_pe_get(pe->phb); 88225baf3d8SOliver O'Halloran 883de84ffc3SSam Bobroff pr_err("EEH: Recovering PHB#%x-PE#%x\n", 88425baf3d8SOliver O'Halloran pe->phb->global_number, pe->addr); 88525baf3d8SOliver O'Halloran pr_err("EEH: PE location: %s, PHB location: %s\n", 88625baf3d8SOliver O'Halloran eeh_pe_loc_get(pe), eeh_pe_loc_get(phb_pe)); 88725baf3d8SOliver O'Halloran } 88825baf3d8SOliver O'Halloran 8891b7f3b6cSMichael Ellerman #ifdef CONFIG_STACKTRACE 89025baf3d8SOliver O'Halloran /* 89125baf3d8SOliver O'Halloran * Print the saved stack trace now that we've verified there's 89225baf3d8SOliver O'Halloran * something to recover. 89325baf3d8SOliver O'Halloran */ 89425baf3d8SOliver O'Halloran if (pe->trace_entries) { 89525baf3d8SOliver O'Halloran void **ptrs = (void **) pe->stack_trace; 89625baf3d8SOliver O'Halloran int i; 89725baf3d8SOliver O'Halloran 89825baf3d8SOliver O'Halloran pr_err("EEH: Frozen PHB#%x-PE#%x detected\n", 89925baf3d8SOliver O'Halloran pe->phb->global_number, pe->addr); 90025baf3d8SOliver O'Halloran 90125baf3d8SOliver O'Halloran /* FIXME: Use the same format as dump_stack() */ 90225baf3d8SOliver O'Halloran pr_err("EEH: Call Trace:\n"); 90325baf3d8SOliver O'Halloran for (i = 0; i < pe->trace_entries; i++) 90425baf3d8SOliver O'Halloran pr_err("EEH: [%pK] %pS\n", ptrs[i], ptrs[i]); 90525baf3d8SOliver O'Halloran 90625baf3d8SOliver O'Halloran pe->trace_entries = 0; 90725baf3d8SOliver O'Halloran } 9081b7f3b6cSMichael Ellerman #endif /* CONFIG_STACKTRACE */ 909b104af5aSOliver O'Halloran 9105a71978eSGavin Shan eeh_pe_update_time_stamp(pe); 911317f06deSGavin Shan pe->freeze_count++; 912c0b64978SRussell Currey if (pe->freeze_count > eeh_max_freezes) { 913796b9f5bSSam Bobroff pr_err("EEH: PHB#%x-PE#%x has failed %d times in the last hour and has been permanently disabled.\n", 914c0b64978SRussell Currey pe->phb->global_number, pe->addr, 915c0b64978SRussell Currey pe->freeze_count); 916b90484ecSSam Bobroff result = PCI_ERS_RESULT_DISCONNECT; 917c0b64978SRussell Currey } 918317f06deSGavin Shan 919aa06e3d6SSam Bobroff eeh_for_each_pe(pe, tmp_pe) 920aa06e3d6SSam Bobroff eeh_pe_for_each_dev(tmp_pe, edev, tmp) 921aa06e3d6SSam Bobroff edev->mode &= ~EEH_DEV_NO_HANDLER; 922aa06e3d6SSam Bobroff 923317f06deSGavin Shan /* Walk the various device drivers attached to this slot through 924317f06deSGavin Shan * a reset sequence, giving each an opportunity to do what it needs 925317f06deSGavin Shan * to accomplish the reset. Each child gets a report of the 926317f06deSGavin Shan * status ... if any child can't handle the reset, then the entire 927317f06deSGavin Shan * slot is dlpar removed and added. 9288234fcedSGavin Shan * 9298234fcedSGavin Shan * When the PHB is fenced, we have to issue a reset to recover from 9308234fcedSGavin Shan * the error. Override the result if necessary to have partially 9318234fcedSGavin Shan * hotplug for this case. 932317f06deSGavin Shan */ 933b90484ecSSam Bobroff if (result != PCI_ERS_RESULT_DISCONNECT) { 934b90484ecSSam Bobroff pr_warn("EEH: This PCI device has failed %d times in the last hour and will be permanently disabled after %d failures.\n", 935b90484ecSSam Bobroff pe->freeze_count, eeh_max_freezes); 93656ca4fdeSGavin Shan pr_info("EEH: Notify device drivers to shutdown\n"); 93747cc8c1cSSam Bobroff eeh_set_channel_state(pe, pci_channel_io_frozen); 938010acfa1SSam Bobroff eeh_set_irq_state(pe, false); 939b90484ecSSam Bobroff eeh_pe_report("error_detected(IO frozen)", pe, 940b90484ecSSam Bobroff eeh_report_error, &result); 9418234fcedSGavin Shan if ((pe->type & EEH_PE_PHB) && 9428234fcedSGavin Shan result != PCI_ERS_RESULT_NONE && 9438234fcedSGavin Shan result != PCI_ERS_RESULT_NEED_RESET) 9448234fcedSGavin Shan result = PCI_ERS_RESULT_NEED_RESET; 945b90484ecSSam Bobroff } 946317f06deSGavin Shan 947317f06deSGavin Shan /* Get the current PCI slot state. This can take a long time, 9482ac3990cSWei Yang * sometimes over 300 seconds for certain systems. 949317f06deSGavin Shan */ 950b90484ecSSam Bobroff if (result != PCI_ERS_RESULT_DISCONNECT) { 951fef7f905SSam Bobroff rc = eeh_wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000); 952317f06deSGavin Shan if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) { 9530dae2743SGavin Shan pr_warn("EEH: Permanent failure\n"); 954b90484ecSSam Bobroff result = PCI_ERS_RESULT_DISCONNECT; 955b90484ecSSam Bobroff } 956317f06deSGavin Shan } 957317f06deSGavin Shan 958317f06deSGavin Shan /* Since rtas may enable MMIO when posting the error log, 959317f06deSGavin Shan * don't post the error log until after all dev drivers 960317f06deSGavin Shan * have been informed. 961317f06deSGavin Shan */ 962b90484ecSSam Bobroff if (result != PCI_ERS_RESULT_DISCONNECT) { 96356ca4fdeSGavin Shan pr_info("EEH: Collect temporary log\n"); 964317f06deSGavin Shan eeh_slot_error_detail(pe, EEH_LOG_TEMP); 965b90484ecSSam Bobroff } 966317f06deSGavin Shan 967317f06deSGavin Shan /* If all device drivers were EEH-unaware, then shut 968317f06deSGavin Shan * down all of the device drivers, and hope they 969317f06deSGavin Shan * go down willingly, without panicing the system. 970317f06deSGavin Shan */ 971317f06deSGavin Shan if (result == PCI_ERS_RESULT_NONE) { 97256ca4fdeSGavin Shan pr_info("EEH: Reset with hotplug activity\n"); 9735fd13460SSam Bobroff rc = eeh_reset_device(pe, bus, NULL, false); 974317f06deSGavin Shan if (rc) { 9750dae2743SGavin Shan pr_warn("%s: Unable to reset, err=%d\n", 97656ca4fdeSGavin Shan __func__, rc); 977b90484ecSSam Bobroff result = PCI_ERS_RESULT_DISCONNECT; 978317f06deSGavin Shan } 979317f06deSGavin Shan } 980317f06deSGavin Shan 981317f06deSGavin Shan /* If all devices reported they can proceed, then re-enable MMIO */ 982317f06deSGavin Shan if (result == PCI_ERS_RESULT_CAN_RECOVER) { 98356ca4fdeSGavin Shan pr_info("EEH: Enable I/O for affected devices\n"); 984317f06deSGavin Shan rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO); 985317f06deSGavin Shan 986b90484ecSSam Bobroff if (rc < 0) { 987b90484ecSSam Bobroff result = PCI_ERS_RESULT_DISCONNECT; 988b90484ecSSam Bobroff } else if (rc) { 989317f06deSGavin Shan result = PCI_ERS_RESULT_NEED_RESET; 990317f06deSGavin Shan } else { 99156ca4fdeSGavin Shan pr_info("EEH: Notify device drivers to resume I/O\n"); 99220b34497SSam Bobroff eeh_pe_report("mmio_enabled", pe, 99320b34497SSam Bobroff eeh_report_mmio_enabled, &result); 994317f06deSGavin Shan } 995317f06deSGavin Shan } 996317f06deSGavin Shan 997317f06deSGavin Shan /* If all devices reported they can proceed, then re-enable DMA */ 998317f06deSGavin Shan if (result == PCI_ERS_RESULT_CAN_RECOVER) { 99956ca4fdeSGavin Shan pr_info("EEH: Enabled DMA for affected devices\n"); 1000317f06deSGavin Shan rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA); 1001317f06deSGavin Shan 1002b90484ecSSam Bobroff if (rc < 0) { 1003b90484ecSSam Bobroff result = PCI_ERS_RESULT_DISCONNECT; 1004b90484ecSSam Bobroff } else if (rc) { 1005317f06deSGavin Shan result = PCI_ERS_RESULT_NEED_RESET; 100635845a78SGavin Shan } else { 100735845a78SGavin Shan /* 100835845a78SGavin Shan * We didn't do PE reset for the case. The PE 100935845a78SGavin Shan * is still in frozen state. Clear it before 101035845a78SGavin Shan * resuming the PE. 101135845a78SGavin Shan */ 10129ed5ca66SSam Bobroff eeh_pe_state_clear(pe, EEH_PE_ISOLATED, true); 1013317f06deSGavin Shan result = PCI_ERS_RESULT_RECOVERED; 1014317f06deSGavin Shan } 101535845a78SGavin Shan } 1016317f06deSGavin Shan 1017317f06deSGavin Shan /* If any device called out for a reset, then reset the slot */ 1018317f06deSGavin Shan if (result == PCI_ERS_RESULT_NEED_RESET) { 101956ca4fdeSGavin Shan pr_info("EEH: Reset without hotplug activity\n"); 10205fd13460SSam Bobroff rc = eeh_reset_device(pe, bus, &rmv_data, true); 1021317f06deSGavin Shan if (rc) { 10220dae2743SGavin Shan pr_warn("%s: Cannot reset, err=%d\n", 102356ca4fdeSGavin Shan __func__, rc); 1024b90484ecSSam Bobroff result = PCI_ERS_RESULT_DISCONNECT; 1025b90484ecSSam Bobroff } else { 1026317f06deSGavin Shan result = PCI_ERS_RESULT_NONE; 102747cc8c1cSSam Bobroff eeh_set_channel_state(pe, pci_channel_io_normal); 1028010acfa1SSam Bobroff eeh_set_irq_state(pe, true); 1029b90484ecSSam Bobroff eeh_pe_report("slot_reset", pe, eeh_report_reset, 1030b90484ecSSam Bobroff &result); 1031b90484ecSSam Bobroff } 1032317f06deSGavin Shan } 1033317f06deSGavin Shan 1034b90484ecSSam Bobroff if ((result == PCI_ERS_RESULT_RECOVERED) || 1035b90484ecSSam Bobroff (result == PCI_ERS_RESULT_NONE)) { 103667086e32SWei Yang /* 1037b90484ecSSam Bobroff * For those hot removed VFs, we should add back them after PF 1038b90484ecSSam Bobroff * get recovered properly. 103967086e32SWei Yang */ 10401c5c533bSSam Bobroff list_for_each_entry_safe(edev, tmp, &rmv_data.removed_vf_list, 10411c5c533bSSam Bobroff rmv_entry) { 1042bf773df9SSam Bobroff eeh_add_virt_device(edev); 104380e65b00SSam Bobroff list_del(&edev->rmv_entry); 104467086e32SWei Yang } 104567086e32SWei Yang 1046317f06deSGavin Shan /* Tell all device drivers that they can resume operations */ 104756ca4fdeSGavin Shan pr_info("EEH: Notify device driver to resume\n"); 104847cc8c1cSSam Bobroff eeh_set_channel_state(pe, pci_channel_io_normal); 1049010acfa1SSam Bobroff eeh_set_irq_state(pe, true); 105020b34497SSam Bobroff eeh_pe_report("resume", pe, eeh_report_resume, NULL); 105120b34497SSam Bobroff eeh_for_each_pe(pe, tmp_pe) { 105220b34497SSam Bobroff eeh_pe_for_each_dev(tmp_pe, edev, tmp) { 1053665012c5SSam Bobroff edev->mode &= ~EEH_DEV_NO_HANDLER; 105420b34497SSam Bobroff edev->in_error = false; 105520b34497SSam Bobroff } 105620b34497SSam Bobroff } 1057665012c5SSam Bobroff 1058796b9f5bSSam Bobroff pr_info("EEH: Recovery successful.\n"); 1059b90484ecSSam Bobroff } else { 1060317f06deSGavin Shan /* 1061317f06deSGavin Shan * About 90% of all real-life EEH failures in the field 1062317f06deSGavin Shan * are due to poorly seated PCI cards. Only 10% or so are 1063317f06deSGavin Shan * due to actual, failed cards. 1064317f06deSGavin Shan */ 10651f52f176SRussell Currey pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n" 1066317f06deSGavin Shan "Please try reseating or replacing it\n", 1067317f06deSGavin Shan pe->phb->global_number, pe->addr); 1068317f06deSGavin Shan 1069317f06deSGavin Shan eeh_slot_error_detail(pe, EEH_LOG_PERM); 1070317f06deSGavin Shan 1071317f06deSGavin Shan /* Notify all devices that they're about to go down. */ 107247cc8c1cSSam Bobroff eeh_set_channel_state(pe, pci_channel_io_perm_failure); 1073010acfa1SSam Bobroff eeh_set_irq_state(pe, false); 107420b34497SSam Bobroff eeh_pe_report("error_detected(permanent failure)", pe, 107520b34497SSam Bobroff eeh_report_failure, NULL); 1076317f06deSGavin Shan 1077d2b0f6f7SGavin Shan /* Mark the PE to be removed permanently */ 1078432227e9SGavin Shan eeh_pe_state_mark(pe, EEH_PE_REMOVED); 1079d2b0f6f7SGavin Shan 1080d2b0f6f7SGavin Shan /* 1081d2b0f6f7SGavin Shan * Shut down the device drivers for good. We mark 1082d2b0f6f7SGavin Shan * all removed devices correctly to avoid access 1083d2b0f6f7SGavin Shan * the their PCI config any more. 1084d2b0f6f7SGavin Shan */ 108567086e32SWei Yang if (pe->type & EEH_PE_VF) { 108667086e32SWei Yang eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL); 108767086e32SWei Yang eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); 108867086e32SWei Yang } else { 10899ed5ca66SSam Bobroff eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true); 1090d2b0f6f7SGavin Shan eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); 1091d2b0f6f7SGavin Shan 10921c2042c8SRafael J. Wysocki pci_lock_rescan_remove(); 1093cd95f804SSam Bobroff pci_hp_remove_devices(bus); 10941c2042c8SRafael J. Wysocki pci_unlock_rescan_remove(); 1095daeba295SRussell Currey /* The passed PE should no longer be used */ 109637fd8125SSam Bobroff return; 10971c2042c8SRafael J. Wysocki } 1098b90484ecSSam Bobroff } 1099799abe28SOliver O'Halloran 1100b104af5aSOliver O'Halloran out: 1101799abe28SOliver O'Halloran /* 1102799abe28SOliver O'Halloran * Clean up any PEs without devices. While marked as EEH_PE_RECOVERYING 1103799abe28SOliver O'Halloran * we don't want to modify the PE tree structure so we do it here. 1104799abe28SOliver O'Halloran */ 1105799abe28SOliver O'Halloran eeh_pe_cleanup(pe); 1106aeff27c1SOliver O'Halloran 1107aeff27c1SOliver O'Halloran /* clear the slot attention LED for all recovered devices */ 1108aeff27c1SOliver O'Halloran eeh_for_each_pe(pe, tmp_pe) 1109aeff27c1SOliver O'Halloran eeh_pe_for_each_dev(tmp_pe, edev, tmp) 1110aeff27c1SOliver O'Halloran eeh_clear_slot_attention(edev->pdev); 1111aeff27c1SOliver O'Halloran 11129ed5ca66SSam Bobroff eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); 111367086e32SWei Yang } 11148a6b1bc7SGavin Shan 1115c0b64978SRussell Currey /** 1116c0b64978SRussell Currey * eeh_handle_special_event - Handle EEH events without a specific failing PE 1117c0b64978SRussell Currey * 1118c0b64978SRussell Currey * Called when an EEH event is detected but can't be narrowed down to a 1119c0b64978SRussell Currey * specific PE. Iterates through possible failures and handles them as 1120c0b64978SRussell Currey * necessary. 1121c0b64978SRussell Currey */ 112268701780SSam Bobroff void eeh_handle_special_event(void) 11238a6b1bc7SGavin Shan { 1124aa06e3d6SSam Bobroff struct eeh_pe *pe, *phb_pe, *tmp_pe; 1125aa06e3d6SSam Bobroff struct eeh_dev *edev, *tmp_edev; 11268a6b1bc7SGavin Shan struct pci_bus *bus; 11277e4e7867SGavin Shan struct pci_controller *hose; 11288a6b1bc7SGavin Shan unsigned long flags; 11297e4e7867SGavin Shan int rc; 11308a6b1bc7SGavin Shan 11317e4e7867SGavin Shan 11327e4e7867SGavin Shan do { 11338a6b1bc7SGavin Shan rc = eeh_ops->next_error(&pe); 11348a6b1bc7SGavin Shan 11358a6b1bc7SGavin Shan switch (rc) { 11367e4e7867SGavin Shan case EEH_NEXT_ERR_DEAD_IOC: 11378a6b1bc7SGavin Shan /* Mark all PHBs in dead state */ 11388a6b1bc7SGavin Shan eeh_serialize_lock(&flags); 11397e4e7867SGavin Shan 11407e4e7867SGavin Shan /* Purge all events */ 11415c7a35e3SGavin Shan eeh_remove_event(NULL, true); 11427e4e7867SGavin Shan 11437e4e7867SGavin Shan list_for_each_entry(hose, &hose_list, list_node) { 11448a6b1bc7SGavin Shan phb_pe = eeh_phb_pe_get(hose); 11458a6b1bc7SGavin Shan if (!phb_pe) continue; 11468a6b1bc7SGavin Shan 1147e762bb89SSam Bobroff eeh_pe_mark_isolated(phb_pe); 11488a6b1bc7SGavin Shan } 11497e4e7867SGavin Shan 11508a6b1bc7SGavin Shan eeh_serialize_unlock(flags); 11518a6b1bc7SGavin Shan 11528a6b1bc7SGavin Shan break; 11537e4e7867SGavin Shan case EEH_NEXT_ERR_FROZEN_PE: 11547e4e7867SGavin Shan case EEH_NEXT_ERR_FENCED_PHB: 11557e4e7867SGavin Shan case EEH_NEXT_ERR_DEAD_PHB: 11568a6b1bc7SGavin Shan /* Mark the PE in fenced state */ 11578a6b1bc7SGavin Shan eeh_serialize_lock(&flags); 11587e4e7867SGavin Shan 11597e4e7867SGavin Shan /* Purge all events of the PHB */ 11605c7a35e3SGavin Shan eeh_remove_event(pe, true); 11617e4e7867SGavin Shan 1162e762bb89SSam Bobroff if (rc != EEH_NEXT_ERR_DEAD_PHB) 1163e762bb89SSam Bobroff eeh_pe_state_mark(pe, EEH_PE_RECOVERING); 1164e762bb89SSam Bobroff eeh_pe_mark_isolated(pe); 11657e4e7867SGavin Shan 11668a6b1bc7SGavin Shan eeh_serialize_unlock(flags); 11678a6b1bc7SGavin Shan 11688a6b1bc7SGavin Shan break; 11697e4e7867SGavin Shan case EEH_NEXT_ERR_NONE: 11707e4e7867SGavin Shan return; 11718a6b1bc7SGavin Shan default: 11727e4e7867SGavin Shan pr_warn("%s: Invalid value %d from next_error()\n", 11738a6b1bc7SGavin Shan __func__, rc); 11748a6b1bc7SGavin Shan return; 11758a6b1bc7SGavin Shan } 11768a6b1bc7SGavin Shan 11778a6b1bc7SGavin Shan /* 11788a6b1bc7SGavin Shan * For fenced PHB and frozen PE, it's handled as normal 11798a6b1bc7SGavin Shan * event. We have to remove the affected PHBs for dead 11808a6b1bc7SGavin Shan * PHB and IOC 11818a6b1bc7SGavin Shan */ 11827e4e7867SGavin Shan if (rc == EEH_NEXT_ERR_FROZEN_PE || 11837e4e7867SGavin Shan rc == EEH_NEXT_ERR_FENCED_PHB) { 1184799abe28SOliver O'Halloran eeh_pe_state_mark(pe, EEH_PE_RECOVERING); 118537fd8125SSam Bobroff eeh_handle_normal_event(pe); 11867e4e7867SGavin Shan } else { 1187aa06e3d6SSam Bobroff eeh_for_each_pe(pe, tmp_pe) 1188aa06e3d6SSam Bobroff eeh_pe_for_each_dev(tmp_pe, edev, tmp_edev) 1189aa06e3d6SSam Bobroff edev->mode &= ~EEH_DEV_NO_HANDLER; 1190aa06e3d6SSam Bobroff 11917e4e7867SGavin Shan /* Notify all devices to be down */ 11929ed5ca66SSam Bobroff eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true); 119347cc8c1cSSam Bobroff eeh_set_channel_state(pe, pci_channel_io_perm_failure); 119420b34497SSam Bobroff eeh_pe_report( 119520b34497SSam Bobroff "error_detected(permanent failure)", pe, 1196af2e3a00SRussell Currey eeh_report_failure, NULL); 1197d4f194edSSam Bobroff 1198d4f194edSSam Bobroff pci_lock_rescan_remove(); 1199d4f194edSSam Bobroff list_for_each_entry(hose, &hose_list, list_node) { 1200d4f194edSSam Bobroff phb_pe = eeh_phb_pe_get(hose); 1201d4f194edSSam Bobroff if (!phb_pe || 1202d4f194edSSam Bobroff !(phb_pe->state & EEH_PE_ISOLATED) || 1203d4f194edSSam Bobroff (phb_pe->state & EEH_PE_RECOVERING)) 1204d4f194edSSam Bobroff continue; 1205d4f194edSSam Bobroff 12068a6b1bc7SGavin Shan bus = eeh_pe_bus_get(phb_pe); 120704fec21cSRussell Currey if (!bus) { 120804fec21cSRussell Currey pr_err("%s: Cannot find PCI bus for " 12091f52f176SRussell Currey "PHB#%x-PE#%x\n", 121004fec21cSRussell Currey __func__, 121104fec21cSRussell Currey pe->phb->global_number, 121204fec21cSRussell Currey pe->addr); 121304fec21cSRussell Currey break; 121404fec21cSRussell Currey } 1215bd251b89SGavin Shan pci_hp_remove_devices(bus); 12168a6b1bc7SGavin Shan } 12171c2042c8SRafael J. Wysocki pci_unlock_rescan_remove(); 12188a6b1bc7SGavin Shan } 12197e4e7867SGavin Shan 12207e4e7867SGavin Shan /* 12217e4e7867SGavin Shan * If we have detected dead IOC, we needn't proceed 12227e4e7867SGavin Shan * any more since all PHBs would have been removed 12237e4e7867SGavin Shan */ 12247e4e7867SGavin Shan if (rc == EEH_NEXT_ERR_DEAD_IOC) 12257e4e7867SGavin Shan break; 12267e4e7867SGavin Shan } while (rc != EEH_NEXT_ERR_NONE); 12278a6b1bc7SGavin Shan } 1228