1317f06deSGavin Shan /* 2317f06deSGavin Shan * PCI Error Recovery Driver for RPA-compliant PPC64 platform. 3317f06deSGavin Shan * Copyright IBM Corp. 2004 2005 4317f06deSGavin Shan * Copyright Linas Vepstas <linas@linas.org> 2004, 2005 5317f06deSGavin Shan * 6317f06deSGavin Shan * All rights reserved. 7317f06deSGavin Shan * 8317f06deSGavin Shan * This program is free software; you can redistribute it and/or modify 9317f06deSGavin Shan * it under the terms of the GNU General Public License as published by 10317f06deSGavin Shan * the Free Software Foundation; either version 2 of the License, or (at 11317f06deSGavin Shan * your option) any later version. 12317f06deSGavin Shan * 13317f06deSGavin Shan * This program is distributed in the hope that it will be useful, but 14317f06deSGavin Shan * WITHOUT ANY WARRANTY; without even the implied warranty of 15317f06deSGavin Shan * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or 16317f06deSGavin Shan * NON INFRINGEMENT. See the GNU General Public License for more 17317f06deSGavin Shan * details. 18317f06deSGavin Shan * 19317f06deSGavin Shan * You should have received a copy of the GNU General Public License 20317f06deSGavin Shan * along with this program; if not, write to the Free Software 21317f06deSGavin Shan * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 22317f06deSGavin Shan * 23317f06deSGavin Shan * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com> 24317f06deSGavin Shan */ 25317f06deSGavin Shan #include <linux/delay.h> 26317f06deSGavin Shan #include <linux/interrupt.h> 27317f06deSGavin Shan #include <linux/irq.h> 28317f06deSGavin Shan #include <linux/module.h> 29317f06deSGavin Shan #include <linux/pci.h> 30317f06deSGavin Shan #include <asm/eeh.h> 31317f06deSGavin Shan #include <asm/eeh_event.h> 32317f06deSGavin Shan #include <asm/ppc-pci.h> 33317f06deSGavin Shan #include <asm/pci-bridge.h> 34317f06deSGavin Shan #include <asm/prom.h> 35317f06deSGavin Shan #include <asm/rtas.h> 36317f06deSGavin Shan 3767086e32SWei Yang struct eeh_rmv_data { 381c5c533bSSam Bobroff struct list_head removed_vf_list; 391c5c533bSSam Bobroff int removed_dev_count; 4067086e32SWei Yang }; 4167086e32SWei Yang 4230424e38SSam Bobroff static int eeh_result_priority(enum pci_ers_result result) 4330424e38SSam Bobroff { 4430424e38SSam Bobroff switch (result) { 4530424e38SSam Bobroff case PCI_ERS_RESULT_NONE: 4630424e38SSam Bobroff return 1; 4730424e38SSam Bobroff case PCI_ERS_RESULT_NO_AER_DRIVER: 4830424e38SSam Bobroff return 2; 4930424e38SSam Bobroff case PCI_ERS_RESULT_RECOVERED: 5030424e38SSam Bobroff return 3; 5130424e38SSam Bobroff case PCI_ERS_RESULT_CAN_RECOVER: 5230424e38SSam Bobroff return 4; 5330424e38SSam Bobroff case PCI_ERS_RESULT_DISCONNECT: 5430424e38SSam Bobroff return 5; 5530424e38SSam Bobroff case PCI_ERS_RESULT_NEED_RESET: 5630424e38SSam Bobroff return 6; 5730424e38SSam Bobroff default: 5830424e38SSam Bobroff WARN_ONCE(1, "Unknown pci_ers_result value: %d\n", (int)result); 5930424e38SSam Bobroff return 0; 6030424e38SSam Bobroff } 6130424e38SSam Bobroff }; 6230424e38SSam Bobroff 63c36c5ffdSBreno Leitao static const char *pci_ers_result_name(enum pci_ers_result result) 6420b34497SSam Bobroff { 6520b34497SSam Bobroff switch (result) { 6620b34497SSam Bobroff case PCI_ERS_RESULT_NONE: 6720b34497SSam Bobroff return "none"; 6820b34497SSam Bobroff case PCI_ERS_RESULT_CAN_RECOVER: 6920b34497SSam Bobroff return "can recover"; 7020b34497SSam Bobroff case PCI_ERS_RESULT_NEED_RESET: 7120b34497SSam Bobroff return "need reset"; 7220b34497SSam Bobroff case PCI_ERS_RESULT_DISCONNECT: 7320b34497SSam Bobroff return "disconnect"; 7420b34497SSam Bobroff case PCI_ERS_RESULT_RECOVERED: 7520b34497SSam Bobroff return "recovered"; 7620b34497SSam Bobroff case PCI_ERS_RESULT_NO_AER_DRIVER: 7720b34497SSam Bobroff return "no AER driver"; 7820b34497SSam Bobroff default: 7920b34497SSam Bobroff WARN_ONCE(1, "Unknown result type: %d\n", (int)result); 8020b34497SSam Bobroff return "unknown"; 8120b34497SSam Bobroff } 8220b34497SSam Bobroff }; 8320b34497SSam Bobroff 8420b34497SSam Bobroff static __printf(2, 3) void eeh_edev_info(const struct eeh_dev *edev, 8520b34497SSam Bobroff const char *fmt, ...) 8620b34497SSam Bobroff { 8720b34497SSam Bobroff struct va_format vaf; 8820b34497SSam Bobroff va_list args; 8920b34497SSam Bobroff 9020b34497SSam Bobroff va_start(args, fmt); 9120b34497SSam Bobroff 9220b34497SSam Bobroff vaf.fmt = fmt; 9320b34497SSam Bobroff vaf.va = &args; 9420b34497SSam Bobroff 9520b34497SSam Bobroff printk(KERN_INFO "EEH: PE#%x (PCI %s): %pV\n", edev->pe_config_addr, 9620b34497SSam Bobroff edev->pdev ? dev_name(&edev->pdev->dev) : "none", &vaf); 9720b34497SSam Bobroff 9820b34497SSam Bobroff va_end(args); 9920b34497SSam Bobroff } 10020b34497SSam Bobroff 10130424e38SSam Bobroff static enum pci_ers_result pci_ers_merge_result(enum pci_ers_result old, 10230424e38SSam Bobroff enum pci_ers_result new) 10330424e38SSam Bobroff { 10430424e38SSam Bobroff if (eeh_result_priority(new) > eeh_result_priority(old)) 10530424e38SSam Bobroff return new; 10630424e38SSam Bobroff return old; 10730424e38SSam Bobroff } 10830424e38SSam Bobroff 109e2b810d5SSam Bobroff static bool eeh_dev_removed(struct eeh_dev *edev) 110e2b810d5SSam Bobroff { 111e2b810d5SSam Bobroff return !edev || (edev->mode & EEH_DEV_REMOVED); 112e2b810d5SSam Bobroff } 113e2b810d5SSam Bobroff 114e2b810d5SSam Bobroff static bool eeh_edev_actionable(struct eeh_dev *edev) 115e2b810d5SSam Bobroff { 116e2b810d5SSam Bobroff return (edev->pdev && !eeh_dev_removed(edev) && 117e2b810d5SSam Bobroff !eeh_pe_passed(edev->pe)); 118e2b810d5SSam Bobroff } 119e2b810d5SSam Bobroff 120317f06deSGavin Shan /** 121317f06deSGavin Shan * eeh_pcid_get - Get the PCI device driver 122317f06deSGavin Shan * @pdev: PCI device 123317f06deSGavin Shan * 124317f06deSGavin Shan * The function is used to retrieve the PCI device driver for 125317f06deSGavin Shan * the indicated PCI device. Besides, we will increase the reference 126317f06deSGavin Shan * of the PCI device driver to prevent that being unloaded on 127317f06deSGavin Shan * the fly. Otherwise, kernel crash would be seen. 128317f06deSGavin Shan */ 129317f06deSGavin Shan static inline struct pci_driver *eeh_pcid_get(struct pci_dev *pdev) 130317f06deSGavin Shan { 131317f06deSGavin Shan if (!pdev || !pdev->driver) 132317f06deSGavin Shan return NULL; 133317f06deSGavin Shan 134317f06deSGavin Shan if (!try_module_get(pdev->driver->driver.owner)) 135317f06deSGavin Shan return NULL; 136317f06deSGavin Shan 137317f06deSGavin Shan return pdev->driver; 138317f06deSGavin Shan } 139317f06deSGavin Shan 140317f06deSGavin Shan /** 141317f06deSGavin Shan * eeh_pcid_put - Dereference on the PCI device driver 142317f06deSGavin Shan * @pdev: PCI device 143317f06deSGavin Shan * 144317f06deSGavin Shan * The function is called to do dereference on the PCI device 145317f06deSGavin Shan * driver of the indicated PCI device. 146317f06deSGavin Shan */ 147317f06deSGavin Shan static inline void eeh_pcid_put(struct pci_dev *pdev) 148317f06deSGavin Shan { 149317f06deSGavin Shan if (!pdev || !pdev->driver) 150317f06deSGavin Shan return; 151317f06deSGavin Shan 152317f06deSGavin Shan module_put(pdev->driver->driver.owner); 153317f06deSGavin Shan } 154317f06deSGavin Shan 155317f06deSGavin Shan /** 156317f06deSGavin Shan * eeh_disable_irq - Disable interrupt for the recovering device 157317f06deSGavin Shan * @dev: PCI device 158317f06deSGavin Shan * 159317f06deSGavin Shan * This routine must be called when reporting temporary or permanent 160317f06deSGavin Shan * error to the particular PCI device to disable interrupt of that 161317f06deSGavin Shan * device. If the device has enabled MSI or MSI-X interrupt, we needn't 162317f06deSGavin Shan * do real work because EEH should freeze DMA transfers for those PCI 163317f06deSGavin Shan * devices encountering EEH errors, which includes MSI or MSI-X. 164317f06deSGavin Shan */ 165010acfa1SSam Bobroff static void eeh_disable_irq(struct eeh_dev *edev) 166317f06deSGavin Shan { 167317f06deSGavin Shan /* Don't disable MSI and MSI-X interrupts. They are 168317f06deSGavin Shan * effectively disabled by the DMA Stopped state 169317f06deSGavin Shan * when an EEH error occurs. 170317f06deSGavin Shan */ 171010acfa1SSam Bobroff if (edev->pdev->msi_enabled || edev->pdev->msix_enabled) 172317f06deSGavin Shan return; 173317f06deSGavin Shan 174010acfa1SSam Bobroff if (!irq_has_action(edev->pdev->irq)) 175317f06deSGavin Shan return; 176317f06deSGavin Shan 177317f06deSGavin Shan edev->mode |= EEH_DEV_IRQ_DISABLED; 178010acfa1SSam Bobroff disable_irq_nosync(edev->pdev->irq); 179317f06deSGavin Shan } 180317f06deSGavin Shan 181317f06deSGavin Shan /** 182317f06deSGavin Shan * eeh_enable_irq - Enable interrupt for the recovering device 183317f06deSGavin Shan * @dev: PCI device 184317f06deSGavin Shan * 185317f06deSGavin Shan * This routine must be called to enable interrupt while failed 186317f06deSGavin Shan * device could be resumed. 187317f06deSGavin Shan */ 188010acfa1SSam Bobroff static void eeh_enable_irq(struct eeh_dev *edev) 189317f06deSGavin Shan { 190317f06deSGavin Shan if ((edev->mode) & EEH_DEV_IRQ_DISABLED) { 191317f06deSGavin Shan edev->mode &= ~EEH_DEV_IRQ_DISABLED; 192b8a9a11bSThomas Gleixner /* 193b8a9a11bSThomas Gleixner * FIXME !!!!! 194b8a9a11bSThomas Gleixner * 195b8a9a11bSThomas Gleixner * This is just ass backwards. This maze has 196b8a9a11bSThomas Gleixner * unbalanced irq_enable/disable calls. So instead of 197b8a9a11bSThomas Gleixner * finding the root cause it works around the warning 198b8a9a11bSThomas Gleixner * in the irq_enable code by conditionally calling 199b8a9a11bSThomas Gleixner * into it. 200b8a9a11bSThomas Gleixner * 201b8a9a11bSThomas Gleixner * That's just wrong.The warning in the core code is 202027dfac6SMichael Ellerman * there to tell people to fix their asymmetries in 203b8a9a11bSThomas Gleixner * their own code, not by abusing the core information 204b8a9a11bSThomas Gleixner * to avoid it. 205b8a9a11bSThomas Gleixner * 206b8a9a11bSThomas Gleixner * I so wish that the assymetry would be the other way 207b8a9a11bSThomas Gleixner * round and a few more irq_disable calls render that 208b8a9a11bSThomas Gleixner * shit unusable forever. 209b8a9a11bSThomas Gleixner * 210b8a9a11bSThomas Gleixner * tglx 211b8a9a11bSThomas Gleixner */ 212010acfa1SSam Bobroff if (irqd_irq_disabled(irq_get_irq_data(edev->pdev->irq))) 213010acfa1SSam Bobroff enable_irq(edev->pdev->irq); 214317f06deSGavin Shan } 21557310c3cSThomas Gleixner } 216317f06deSGavin Shan 217d6c4932fSSam Bobroff static void *eeh_dev_save_state(struct eeh_dev *edev, void *userdata) 2185cfb20b9SGavin Shan { 2195cfb20b9SGavin Shan struct pci_dev *pdev; 2205cfb20b9SGavin Shan 2215cfb20b9SGavin Shan if (!edev) 2225cfb20b9SGavin Shan return NULL; 2235cfb20b9SGavin Shan 2245a0cdbfdSGavin Shan /* 2255a0cdbfdSGavin Shan * We cannot access the config space on some adapters. 2265a0cdbfdSGavin Shan * Otherwise, it will cause fenced PHB. We don't save 2275a0cdbfdSGavin Shan * the content in their config space and will restore 2285a0cdbfdSGavin Shan * from the initial config space saved when the EEH 2295a0cdbfdSGavin Shan * device is created. 2305a0cdbfdSGavin Shan */ 2315a0cdbfdSGavin Shan if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED)) 2325a0cdbfdSGavin Shan return NULL; 2335a0cdbfdSGavin Shan 2345cfb20b9SGavin Shan pdev = eeh_dev_to_pci_dev(edev); 2355cfb20b9SGavin Shan if (!pdev) 2365cfb20b9SGavin Shan return NULL; 2375cfb20b9SGavin Shan 2385cfb20b9SGavin Shan pci_save_state(pdev); 2395cfb20b9SGavin Shan return NULL; 2405cfb20b9SGavin Shan } 2415cfb20b9SGavin Shan 24247cc8c1cSSam Bobroff static void eeh_set_channel_state(struct eeh_pe *root, enum pci_channel_state s) 24347cc8c1cSSam Bobroff { 24447cc8c1cSSam Bobroff struct eeh_pe *pe; 24547cc8c1cSSam Bobroff struct eeh_dev *edev, *tmp; 24647cc8c1cSSam Bobroff 24747cc8c1cSSam Bobroff eeh_for_each_pe(root, pe) 24847cc8c1cSSam Bobroff eeh_pe_for_each_dev(pe, edev, tmp) 24947cc8c1cSSam Bobroff if (eeh_edev_actionable(edev)) 25047cc8c1cSSam Bobroff edev->pdev->error_state = s; 25147cc8c1cSSam Bobroff } 25247cc8c1cSSam Bobroff 253010acfa1SSam Bobroff static void eeh_set_irq_state(struct eeh_pe *root, bool enable) 254010acfa1SSam Bobroff { 255010acfa1SSam Bobroff struct eeh_pe *pe; 256010acfa1SSam Bobroff struct eeh_dev *edev, *tmp; 257010acfa1SSam Bobroff 258010acfa1SSam Bobroff eeh_for_each_pe(root, pe) { 259010acfa1SSam Bobroff eeh_pe_for_each_dev(pe, edev, tmp) { 260010acfa1SSam Bobroff if (!eeh_edev_actionable(edev)) 261010acfa1SSam Bobroff continue; 262010acfa1SSam Bobroff 263010acfa1SSam Bobroff if (!eeh_pcid_get(edev->pdev)) 264010acfa1SSam Bobroff continue; 265010acfa1SSam Bobroff 266010acfa1SSam Bobroff if (enable) 267010acfa1SSam Bobroff eeh_enable_irq(edev); 268010acfa1SSam Bobroff else 269010acfa1SSam Bobroff eeh_disable_irq(edev); 270010acfa1SSam Bobroff 271010acfa1SSam Bobroff eeh_pcid_put(edev->pdev); 272010acfa1SSam Bobroff } 273010acfa1SSam Bobroff } 274010acfa1SSam Bobroff } 275010acfa1SSam Bobroff 27620b34497SSam Bobroff typedef enum pci_ers_result (*eeh_report_fn)(struct eeh_dev *, 27720b34497SSam Bobroff struct pci_driver *); 27820b34497SSam Bobroff static void eeh_pe_report_edev(struct eeh_dev *edev, eeh_report_fn fn, 27920b34497SSam Bobroff enum pci_ers_result *result) 28020b34497SSam Bobroff { 28120b34497SSam Bobroff struct pci_driver *driver; 28220b34497SSam Bobroff enum pci_ers_result new_result; 28320b34497SSam Bobroff 284bcbe3730SSam Bobroff if (!edev->pdev) { 285bcbe3730SSam Bobroff eeh_edev_info(edev, "no device"); 286bcbe3730SSam Bobroff return; 287bcbe3730SSam Bobroff } 28820b34497SSam Bobroff device_lock(&edev->pdev->dev); 28920b34497SSam Bobroff if (eeh_edev_actionable(edev)) { 29020b34497SSam Bobroff driver = eeh_pcid_get(edev->pdev); 29120b34497SSam Bobroff 29220b34497SSam Bobroff if (!driver) 29320b34497SSam Bobroff eeh_edev_info(edev, "no driver"); 29420b34497SSam Bobroff else if (!driver->err_handler) 29520b34497SSam Bobroff eeh_edev_info(edev, "driver not EEH aware"); 29620b34497SSam Bobroff else if (edev->mode & EEH_DEV_NO_HANDLER) 29720b34497SSam Bobroff eeh_edev_info(edev, "driver bound too late"); 29820b34497SSam Bobroff else { 29920b34497SSam Bobroff new_result = fn(edev, driver); 30020b34497SSam Bobroff eeh_edev_info(edev, "%s driver reports: '%s'", 30120b34497SSam Bobroff driver->name, 30220b34497SSam Bobroff pci_ers_result_name(new_result)); 30320b34497SSam Bobroff if (result) 30420b34497SSam Bobroff *result = pci_ers_merge_result(*result, 30520b34497SSam Bobroff new_result); 30620b34497SSam Bobroff } 30720b34497SSam Bobroff if (driver) 30820b34497SSam Bobroff eeh_pcid_put(edev->pdev); 30920b34497SSam Bobroff } else { 31020b34497SSam Bobroff eeh_edev_info(edev, "not actionable (%d,%d,%d)", !!edev->pdev, 31120b34497SSam Bobroff !eeh_dev_removed(edev), !eeh_pe_passed(edev->pe)); 31220b34497SSam Bobroff } 31320b34497SSam Bobroff device_unlock(&edev->pdev->dev); 31420b34497SSam Bobroff } 31520b34497SSam Bobroff 31620b34497SSam Bobroff static void eeh_pe_report(const char *name, struct eeh_pe *root, 31720b34497SSam Bobroff eeh_report_fn fn, enum pci_ers_result *result) 31820b34497SSam Bobroff { 31920b34497SSam Bobroff struct eeh_pe *pe; 32020b34497SSam Bobroff struct eeh_dev *edev, *tmp; 32120b34497SSam Bobroff 32220b34497SSam Bobroff pr_info("EEH: Beginning: '%s'\n", name); 32320b34497SSam Bobroff eeh_for_each_pe(root, pe) eeh_pe_for_each_dev(pe, edev, tmp) 32420b34497SSam Bobroff eeh_pe_report_edev(edev, fn, result); 32520b34497SSam Bobroff if (result) 32620b34497SSam Bobroff pr_info("EEH: Finished:'%s' with aggregate recovery state:'%s'\n", 32720b34497SSam Bobroff name, pci_ers_result_name(*result)); 32820b34497SSam Bobroff else 32920b34497SSam Bobroff pr_info("EEH: Finished:'%s'", name); 33020b34497SSam Bobroff } 33120b34497SSam Bobroff 332317f06deSGavin Shan /** 333317f06deSGavin Shan * eeh_report_error - Report pci error to each device driver 33420b34497SSam Bobroff * @edev: eeh device 33520b34497SSam Bobroff * @driver: device's PCI driver 336317f06deSGavin Shan * 33720b34497SSam Bobroff * Report an EEH error to each device driver. 338317f06deSGavin Shan */ 33920b34497SSam Bobroff static enum pci_ers_result eeh_report_error(struct eeh_dev *edev, 34020b34497SSam Bobroff struct pci_driver *driver) 341317f06deSGavin Shan { 34220b34497SSam Bobroff enum pci_ers_result rc; 34320b34497SSam Bobroff struct pci_dev *dev = edev->pdev; 344317f06deSGavin Shan 34520b34497SSam Bobroff if (!driver->err_handler->error_detected) 34620b34497SSam Bobroff return PCI_ERS_RESULT_NONE; 347f0295e04SMichael Neuling 34820b34497SSam Bobroff eeh_edev_info(edev, "Invoking %s->error_detected(IO frozen)", 34920b34497SSam Bobroff driver->name); 350317f06deSGavin Shan rc = driver->err_handler->error_detected(dev, pci_channel_io_frozen); 351317f06deSGavin Shan 35267086e32SWei Yang edev->in_error = true; 353856e1eb9SBryant G. Ly pci_uevent_ers(dev, PCI_ERS_RESULT_NONE); 35420b34497SSam Bobroff return rc; 355317f06deSGavin Shan } 356317f06deSGavin Shan 357317f06deSGavin Shan /** 358317f06deSGavin Shan * eeh_report_mmio_enabled - Tell drivers that MMIO has been enabled 35920b34497SSam Bobroff * @edev: eeh device 36020b34497SSam Bobroff * @driver: device's PCI driver 361317f06deSGavin Shan * 362317f06deSGavin Shan * Tells each device driver that IO ports, MMIO and config space I/O 36320b34497SSam Bobroff * are now enabled. 364317f06deSGavin Shan */ 36520b34497SSam Bobroff static enum pci_ers_result eeh_report_mmio_enabled(struct eeh_dev *edev, 36620b34497SSam Bobroff struct pci_driver *driver) 367317f06deSGavin Shan { 36820b34497SSam Bobroff if (!driver->err_handler->mmio_enabled) 36920b34497SSam Bobroff return PCI_ERS_RESULT_NONE; 37020b34497SSam Bobroff eeh_edev_info(edev, "Invoking %s->mmio_enabled()", driver->name); 37120b34497SSam Bobroff return driver->err_handler->mmio_enabled(edev->pdev); 372317f06deSGavin Shan } 373317f06deSGavin Shan 374317f06deSGavin Shan /** 375317f06deSGavin Shan * eeh_report_reset - Tell device that slot has been reset 37620b34497SSam Bobroff * @edev: eeh device 37720b34497SSam Bobroff * @driver: device's PCI driver 378317f06deSGavin Shan * 379317f06deSGavin Shan * This routine must be called while EEH tries to reset particular 380317f06deSGavin Shan * PCI device so that the associated PCI device driver could take 381317f06deSGavin Shan * some actions, usually to save data the driver needs so that the 382317f06deSGavin Shan * driver can work again while the device is recovered. 383317f06deSGavin Shan */ 38420b34497SSam Bobroff static enum pci_ers_result eeh_report_reset(struct eeh_dev *edev, 38520b34497SSam Bobroff struct pci_driver *driver) 386317f06deSGavin Shan { 38720b34497SSam Bobroff if (!driver->err_handler->slot_reset || !edev->in_error) 38820b34497SSam Bobroff return PCI_ERS_RESULT_NONE; 38920b34497SSam Bobroff eeh_edev_info(edev, "Invoking %s->slot_reset()", driver->name); 39020b34497SSam Bobroff return driver->err_handler->slot_reset(edev->pdev); 391317f06deSGavin Shan } 392317f06deSGavin Shan 393d6c4932fSSam Bobroff static void *eeh_dev_restore_state(struct eeh_dev *edev, void *userdata) 3945cfb20b9SGavin Shan { 3955cfb20b9SGavin Shan struct pci_dev *pdev; 3965cfb20b9SGavin Shan 3975cfb20b9SGavin Shan if (!edev) 3985cfb20b9SGavin Shan return NULL; 3995cfb20b9SGavin Shan 4005a0cdbfdSGavin Shan /* 4015a0cdbfdSGavin Shan * The content in the config space isn't saved because 4025a0cdbfdSGavin Shan * the blocked config space on some adapters. We have 4035a0cdbfdSGavin Shan * to restore the initial saved config space when the 4045a0cdbfdSGavin Shan * EEH device is created. 4055a0cdbfdSGavin Shan */ 4065a0cdbfdSGavin Shan if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED)) { 40780e65b00SSam Bobroff if (list_is_last(&edev->entry, &edev->pe->edevs)) 4085a0cdbfdSGavin Shan eeh_pe_restore_bars(edev->pe); 4095a0cdbfdSGavin Shan 4105a0cdbfdSGavin Shan return NULL; 4115a0cdbfdSGavin Shan } 4125a0cdbfdSGavin Shan 4135cfb20b9SGavin Shan pdev = eeh_dev_to_pci_dev(edev); 4145cfb20b9SGavin Shan if (!pdev) 4155cfb20b9SGavin Shan return NULL; 4165cfb20b9SGavin Shan 4175cfb20b9SGavin Shan pci_restore_state(pdev); 4185cfb20b9SGavin Shan return NULL; 4195cfb20b9SGavin Shan } 4205cfb20b9SGavin Shan 421317f06deSGavin Shan /** 422317f06deSGavin Shan * eeh_report_resume - Tell device to resume normal operations 42320b34497SSam Bobroff * @edev: eeh device 42420b34497SSam Bobroff * @driver: device's PCI driver 425317f06deSGavin Shan * 426317f06deSGavin Shan * This routine must be called to notify the device driver that it 427317f06deSGavin Shan * could resume so that the device driver can do some initialization 428317f06deSGavin Shan * to make the recovered device work again. 429317f06deSGavin Shan */ 43020b34497SSam Bobroff static enum pci_ers_result eeh_report_resume(struct eeh_dev *edev, 43120b34497SSam Bobroff struct pci_driver *driver) 432317f06deSGavin Shan { 43320b34497SSam Bobroff if (!driver->err_handler->resume || !edev->in_error) 43420b34497SSam Bobroff return PCI_ERS_RESULT_NONE; 435317f06deSGavin Shan 43620b34497SSam Bobroff eeh_edev_info(edev, "Invoking %s->resume()", driver->name); 43720b34497SSam Bobroff driver->err_handler->resume(edev->pdev); 438f0295e04SMichael Neuling 43920b34497SSam Bobroff pci_uevent_ers(edev->pdev, PCI_ERS_RESULT_RECOVERED); 440856e1eb9SBryant G. Ly #ifdef CONFIG_PCI_IOV 441521ca5a9SJuan J. Alvarez if (eeh_ops->notify_resume && eeh_dev_to_pdn(edev)) 442856e1eb9SBryant G. Ly eeh_ops->notify_resume(eeh_dev_to_pdn(edev)); 443856e1eb9SBryant G. Ly #endif 44420b34497SSam Bobroff return PCI_ERS_RESULT_NONE; 445317f06deSGavin Shan } 446317f06deSGavin Shan 447317f06deSGavin Shan /** 448317f06deSGavin Shan * eeh_report_failure - Tell device driver that device is dead. 44920b34497SSam Bobroff * @edev: eeh device 45020b34497SSam Bobroff * @driver: device's PCI driver 451317f06deSGavin Shan * 452317f06deSGavin Shan * This informs the device driver that the device is permanently 453317f06deSGavin Shan * dead, and that no further recovery attempts will be made on it. 454317f06deSGavin Shan */ 45520b34497SSam Bobroff static enum pci_ers_result eeh_report_failure(struct eeh_dev *edev, 45620b34497SSam Bobroff struct pci_driver *driver) 457317f06deSGavin Shan { 45820b34497SSam Bobroff enum pci_ers_result rc; 459317f06deSGavin Shan 46020b34497SSam Bobroff if (!driver->err_handler->error_detected) 46120b34497SSam Bobroff return PCI_ERS_RESULT_NONE; 462f0295e04SMichael Neuling 46320b34497SSam Bobroff eeh_edev_info(edev, "Invoking %s->error_detected(permanent failure)", 46420b34497SSam Bobroff driver->name); 46520b34497SSam Bobroff rc = driver->err_handler->error_detected(edev->pdev, 46620b34497SSam Bobroff pci_channel_io_perm_failure); 467317f06deSGavin Shan 46820b34497SSam Bobroff pci_uevent_ers(edev->pdev, PCI_ERS_RESULT_DISCONNECT); 46920b34497SSam Bobroff return rc; 470317f06deSGavin Shan } 471317f06deSGavin Shan 472bf773df9SSam Bobroff static void *eeh_add_virt_device(struct eeh_dev *edev) 47367086e32SWei Yang { 47467086e32SWei Yang struct pci_driver *driver; 47567086e32SWei Yang struct pci_dev *dev = eeh_dev_to_pci_dev(edev); 47667086e32SWei Yang struct pci_dn *pdn = eeh_dev_to_pdn(edev); 47767086e32SWei Yang 47867086e32SWei Yang if (!(edev->physfn)) { 47967086e32SWei Yang pr_warn("%s: EEH dev %04x:%02x:%02x.%01x not for VF\n", 48069672bd7SAlexey Kardashevskiy __func__, pdn->phb->global_number, pdn->busno, 48167086e32SWei Yang PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn)); 48267086e32SWei Yang return NULL; 48367086e32SWei Yang } 48467086e32SWei Yang 48567086e32SWei Yang driver = eeh_pcid_get(dev); 48667086e32SWei Yang if (driver) { 48746d4be41SSam Bobroff if (driver->err_handler) { 48867086e32SWei Yang eeh_pcid_put(dev); 48967086e32SWei Yang return NULL; 49067086e32SWei Yang } 49146d4be41SSam Bobroff eeh_pcid_put(dev); 49246d4be41SSam Bobroff } 49367086e32SWei Yang 494988fc3baSBryant G. Ly #ifdef CONFIG_PCI_IOV 495753f6124SJan H. Schönherr pci_iov_add_virtfn(edev->physfn, pdn->vf_index); 49667086e32SWei Yang #endif 49767086e32SWei Yang return NULL; 49867086e32SWei Yang } 49967086e32SWei Yang 500d6c4932fSSam Bobroff static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata) 501f5c57710SGavin Shan { 502f5c57710SGavin Shan struct pci_driver *driver; 503f5c57710SGavin Shan struct pci_dev *dev = eeh_dev_to_pci_dev(edev); 50467086e32SWei Yang struct eeh_rmv_data *rmv_data = (struct eeh_rmv_data *)userdata; 505f5c57710SGavin Shan 506f5c57710SGavin Shan /* 507f5c57710SGavin Shan * Actually, we should remove the PCI bridges as well. 508f5c57710SGavin Shan * However, that's lots of complexity to do that, 509f5c57710SGavin Shan * particularly some of devices under the bridge might 510f5c57710SGavin Shan * support EEH. So we just care about PCI devices for 511f5c57710SGavin Shan * simplicity here. 512f5c57710SGavin Shan */ 51393de6901SBjorn Helgaas if (!dev || (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) 514f5c57710SGavin Shan return NULL; 5158cc6b6cdSThadeu Lima de Souza Cascardo 516d2b0f6f7SGavin Shan /* 517d2b0f6f7SGavin Shan * We rely on count-based pcibios_release_device() to 518d2b0f6f7SGavin Shan * detach permanently offlined PEs. Unfortunately, that's 519d2b0f6f7SGavin Shan * not reliable enough. We might have the permanently 520d2b0f6f7SGavin Shan * offlined PEs attached, but we needn't take care of 521d2b0f6f7SGavin Shan * them and their child devices. 522d2b0f6f7SGavin Shan */ 523d2b0f6f7SGavin Shan if (eeh_dev_removed(edev)) 524d2b0f6f7SGavin Shan return NULL; 525d2b0f6f7SGavin Shan 5261c5c533bSSam Bobroff if (rmv_data) { 52746d4be41SSam Bobroff if (eeh_pe_passed(edev->pe)) 52846d4be41SSam Bobroff return NULL; 529f5c57710SGavin Shan driver = eeh_pcid_get(dev); 5308cc6b6cdSThadeu Lima de Souza Cascardo if (driver) { 53146d4be41SSam Bobroff if (driver->err_handler && 532f2da4ccfSGavin Shan driver->err_handler->error_detected && 53346d4be41SSam Bobroff driver->err_handler->slot_reset) { 53446d4be41SSam Bobroff eeh_pcid_put(dev); 535f5c57710SGavin Shan return NULL; 5368cc6b6cdSThadeu Lima de Souza Cascardo } 53746d4be41SSam Bobroff eeh_pcid_put(dev); 53846d4be41SSam Bobroff } 53946d4be41SSam Bobroff } 540f5c57710SGavin Shan 541f5c57710SGavin Shan /* Remove it from PCI subsystem */ 542f5c57710SGavin Shan pr_debug("EEH: Removing %s without EEH sensitive driver\n", 543f5c57710SGavin Shan pci_name(dev)); 544f5c57710SGavin Shan edev->mode |= EEH_DEV_DISCONNECTED; 5451c5c533bSSam Bobroff if (rmv_data) 5461c5c533bSSam Bobroff rmv_data->removed_dev_count++; 547f5c57710SGavin Shan 54867086e32SWei Yang if (edev->physfn) { 549988fc3baSBryant G. Ly #ifdef CONFIG_PCI_IOV 55067086e32SWei Yang struct pci_dn *pdn = eeh_dev_to_pdn(edev); 55167086e32SWei Yang 552753f6124SJan H. Schönherr pci_iov_remove_virtfn(edev->physfn, pdn->vf_index); 55367086e32SWei Yang edev->pdev = NULL; 55467086e32SWei Yang 55567086e32SWei Yang /* 55667086e32SWei Yang * We have to set the VF PE number to invalid one, which is 55767086e32SWei Yang * required to plug the VF successfully. 55867086e32SWei Yang */ 55967086e32SWei Yang pdn->pe_number = IODA_INVALID_PE; 56067086e32SWei Yang #endif 56167086e32SWei Yang if (rmv_data) 5621c5c533bSSam Bobroff list_add(&edev->rmv_entry, &rmv_data->removed_vf_list); 56367086e32SWei Yang } else { 5641c2042c8SRafael J. Wysocki pci_lock_rescan_remove(); 565f5c57710SGavin Shan pci_stop_and_remove_bus_device(dev); 5661c2042c8SRafael J. Wysocki pci_unlock_rescan_remove(); 56767086e32SWei Yang } 568f5c57710SGavin Shan 569f5c57710SGavin Shan return NULL; 570f5c57710SGavin Shan } 571f5c57710SGavin Shan 572d6c4932fSSam Bobroff static void *eeh_pe_detach_dev(struct eeh_pe *pe, void *userdata) 573f5c57710SGavin Shan { 574f5c57710SGavin Shan struct eeh_dev *edev, *tmp; 575f5c57710SGavin Shan 576f5c57710SGavin Shan eeh_pe_for_each_dev(pe, edev, tmp) { 577f5c57710SGavin Shan if (!(edev->mode & EEH_DEV_DISCONNECTED)) 578f5c57710SGavin Shan continue; 579f5c57710SGavin Shan 580f5c57710SGavin Shan edev->mode &= ~(EEH_DEV_DISCONNECTED | EEH_DEV_IRQ_DISABLED); 581f5c57710SGavin Shan eeh_rmv_from_parent_pe(edev); 582f5c57710SGavin Shan } 583f5c57710SGavin Shan 584f5c57710SGavin Shan return NULL; 585f5c57710SGavin Shan } 586f5c57710SGavin Shan 58778954700SGavin Shan /* 58878954700SGavin Shan * Explicitly clear PE's frozen state for PowerNV where 58978954700SGavin Shan * we have frozen PE until BAR restore is completed. It's 59078954700SGavin Shan * harmless to clear it for pSeries. To be consistent with 59178954700SGavin Shan * PE reset (for 3 times), we try to clear the frozen state 59278954700SGavin Shan * for 3 times as well. 59378954700SGavin Shan */ 594d6c4932fSSam Bobroff static void *__eeh_clear_pe_frozen_state(struct eeh_pe *pe, void *flag) 59578954700SGavin Shan { 596f05fea5bSGavin Shan bool clear_sw_state = *(bool *)flag; 597c9dd0143SGavin Shan int i, rc = 1; 59878954700SGavin Shan 599c9dd0143SGavin Shan for (i = 0; rc && i < 3; i++) 6005cfb20b9SGavin Shan rc = eeh_unfreeze_pe(pe, clear_sw_state); 60178954700SGavin Shan 602c9dd0143SGavin Shan /* Stop immediately on any errors */ 6032c665992SGavin Shan if (rc) { 604c9dd0143SGavin Shan pr_warn("%s: Failure %d unfreezing PHB#%x-PE#%x\n", 605c9dd0143SGavin Shan __func__, rc, pe->phb->global_number, pe->addr); 6062c665992SGavin Shan return (void *)pe; 6072c665992SGavin Shan } 6082c665992SGavin Shan 6092c665992SGavin Shan return NULL; 6102c665992SGavin Shan } 6112c665992SGavin Shan 6125cfb20b9SGavin Shan static int eeh_clear_pe_frozen_state(struct eeh_pe *pe, 6135cfb20b9SGavin Shan bool clear_sw_state) 6142c665992SGavin Shan { 6152c665992SGavin Shan void *rc; 6162c665992SGavin Shan 6175cfb20b9SGavin Shan rc = eeh_pe_traverse(pe, __eeh_clear_pe_frozen_state, &clear_sw_state); 6182c665992SGavin Shan if (!rc) 61978954700SGavin Shan eeh_pe_state_clear(pe, EEH_PE_ISOLATED); 62078954700SGavin Shan 6212c665992SGavin Shan return rc ? -EIO : 0; 62278954700SGavin Shan } 62378954700SGavin Shan 6245cfb20b9SGavin Shan int eeh_pe_reset_and_recover(struct eeh_pe *pe) 6255cfb20b9SGavin Shan { 6262efc771fSGavin Shan int ret; 6275cfb20b9SGavin Shan 6285cfb20b9SGavin Shan /* Bail if the PE is being recovered */ 6295cfb20b9SGavin Shan if (pe->state & EEH_PE_RECOVERING) 6305cfb20b9SGavin Shan return 0; 6315cfb20b9SGavin Shan 6325cfb20b9SGavin Shan /* Put the PE into recovery mode */ 6335cfb20b9SGavin Shan eeh_pe_state_mark(pe, EEH_PE_RECOVERING); 6345cfb20b9SGavin Shan 6355cfb20b9SGavin Shan /* Save states */ 6365cfb20b9SGavin Shan eeh_pe_dev_traverse(pe, eeh_dev_save_state, NULL); 6375cfb20b9SGavin Shan 6385cfb20b9SGavin Shan /* Issue reset */ 6396654c936SRussell Currey ret = eeh_pe_reset_full(pe); 6405cfb20b9SGavin Shan if (ret) { 64128bf36f9SGavin Shan eeh_pe_state_clear(pe, EEH_PE_RECOVERING); 6425cfb20b9SGavin Shan return ret; 6435cfb20b9SGavin Shan } 6445cfb20b9SGavin Shan 6455cfb20b9SGavin Shan /* Unfreeze the PE */ 6465cfb20b9SGavin Shan ret = eeh_clear_pe_frozen_state(pe, true); 6475cfb20b9SGavin Shan if (ret) { 6485cfb20b9SGavin Shan eeh_pe_state_clear(pe, EEH_PE_RECOVERING); 6495cfb20b9SGavin Shan return ret; 6505cfb20b9SGavin Shan } 6515cfb20b9SGavin Shan 6525cfb20b9SGavin Shan /* Restore device state */ 6535cfb20b9SGavin Shan eeh_pe_dev_traverse(pe, eeh_dev_restore_state, NULL); 6545cfb20b9SGavin Shan 6555cfb20b9SGavin Shan /* Clear recovery mode */ 6565cfb20b9SGavin Shan eeh_pe_state_clear(pe, EEH_PE_RECOVERING); 6575cfb20b9SGavin Shan 6585cfb20b9SGavin Shan return 0; 6595cfb20b9SGavin Shan } 6605cfb20b9SGavin Shan 661317f06deSGavin Shan /** 662317f06deSGavin Shan * eeh_reset_device - Perform actual reset of a pci slot 6635fd13460SSam Bobroff * @driver_eeh_aware: Does the device's driver provide EEH support? 664317f06deSGavin Shan * @pe: EEH PE 665317f06deSGavin Shan * @bus: PCI bus corresponding to the isolcated slot 6665fd13460SSam Bobroff * @rmv_data: Optional, list to record removed devices 667317f06deSGavin Shan * 668317f06deSGavin Shan * This routine must be called to do reset on the indicated PE. 669317f06deSGavin Shan * During the reset, udev might be invoked because those affected 670317f06deSGavin Shan * PCI devices will be removed and then added. 671317f06deSGavin Shan */ 67267086e32SWei Yang static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, 6735fd13460SSam Bobroff struct eeh_rmv_data *rmv_data, 6745fd13460SSam Bobroff bool driver_eeh_aware) 675317f06deSGavin Shan { 676edfd17ffSArnd Bergmann time64_t tstamp; 67767086e32SWei Yang int cnt, rc; 67867086e32SWei Yang struct eeh_dev *edev; 679317f06deSGavin Shan 680317f06deSGavin Shan /* pcibios will clear the counter; save the value */ 681317f06deSGavin Shan cnt = pe->freeze_count; 6825a71978eSGavin Shan tstamp = pe->tstamp; 683317f06deSGavin Shan 684317f06deSGavin Shan /* 685317f06deSGavin Shan * We don't remove the corresponding PE instances because 686317f06deSGavin Shan * we need the information afterwords. The attached EEH 687317f06deSGavin Shan * devices are expected to be attached soon when calling 688bd251b89SGavin Shan * into pci_hp_add_devices(). 689317f06deSGavin Shan */ 690807a827dSGavin Shan eeh_pe_state_mark(pe, EEH_PE_KEEP); 69154048cf8SSam Bobroff if (driver_eeh_aware || (pe->type & EEH_PE_VF)) { 69254048cf8SSam Bobroff eeh_pe_dev_traverse(pe, eeh_rmv_device, rmv_data); 69367086e32SWei Yang } else { 6941c2042c8SRafael J. Wysocki pci_lock_rescan_remove(); 695bd251b89SGavin Shan pci_hp_remove_devices(bus); 6961c2042c8SRafael J. Wysocki pci_unlock_rescan_remove(); 69767086e32SWei Yang } 698317f06deSGavin Shan 699d0914f50SGavin Shan /* 700d0914f50SGavin Shan * Reset the pci controller. (Asserts RST#; resets config space). 701317f06deSGavin Shan * Reconfigure bridges and devices. Don't try to bring the system 702317f06deSGavin Shan * up if the reset failed for some reason. 703d0914f50SGavin Shan * 704d0914f50SGavin Shan * During the reset, it's very dangerous to have uncontrolled PCI 705d0914f50SGavin Shan * config accesses. So we prefer to block them. However, controlled 706d0914f50SGavin Shan * PCI config accesses initiated from EEH itself are allowed. 707317f06deSGavin Shan */ 7086654c936SRussell Currey rc = eeh_pe_reset_full(pe); 70928bf36f9SGavin Shan if (rc) 710317f06deSGavin Shan return rc; 711317f06deSGavin Shan 7121c2042c8SRafael J. Wysocki pci_lock_rescan_remove(); 7131c2042c8SRafael J. Wysocki 714317f06deSGavin Shan /* Restore PE */ 715317f06deSGavin Shan eeh_ops->configure_bridge(pe); 716317f06deSGavin Shan eeh_pe_restore_bars(pe); 717317f06deSGavin Shan 718dc9c41bdSAndrew Donnellan /* Clear frozen state */ 7195cfb20b9SGavin Shan rc = eeh_clear_pe_frozen_state(pe, false); 720409bf7f8SAndrew Donnellan if (rc) { 721409bf7f8SAndrew Donnellan pci_unlock_rescan_remove(); 72278954700SGavin Shan return rc; 723409bf7f8SAndrew Donnellan } 72478954700SGavin Shan 725317f06deSGavin Shan /* Give the system 5 seconds to finish running the user-space 726317f06deSGavin Shan * hotplug shutdown scripts, e.g. ifdown for ethernet. Yes, 727317f06deSGavin Shan * this is a hack, but if we don't do this, and try to bring 728317f06deSGavin Shan * the device up before the scripts have taken it down, 729317f06deSGavin Shan * potentially weird things happen. 730317f06deSGavin Shan */ 7311c5c533bSSam Bobroff if (!driver_eeh_aware || rmv_data->removed_dev_count) { 73254048cf8SSam Bobroff pr_info("EEH: Sleep 5s ahead of %s hotplug\n", 73354048cf8SSam Bobroff (driver_eeh_aware ? "partial" : "complete")); 734317f06deSGavin Shan ssleep(5); 735f5c57710SGavin Shan 736f5c57710SGavin Shan /* 737f5c57710SGavin Shan * The EEH device is still connected with its parent 738f5c57710SGavin Shan * PE. We should disconnect it so the binding can be 739f5c57710SGavin Shan * rebuilt when adding PCI devices. 740f5c57710SGavin Shan */ 74180e65b00SSam Bobroff edev = list_first_entry(&pe->edevs, struct eeh_dev, entry); 742f5c57710SGavin Shan eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL); 743a3aa256bSGavin Shan if (pe->type & EEH_PE_VF) { 744bf773df9SSam Bobroff eeh_add_virt_device(edev); 745a3aa256bSGavin Shan } else { 74654048cf8SSam Bobroff if (!driver_eeh_aware) 747a3aa256bSGavin Shan eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); 748bd251b89SGavin Shan pci_hp_add_devices(bus); 749a3aa256bSGavin Shan } 750317f06deSGavin Shan } 751f5c57710SGavin Shan eeh_pe_state_clear(pe, EEH_PE_KEEP); 7525a71978eSGavin Shan 7535a71978eSGavin Shan pe->tstamp = tstamp; 754317f06deSGavin Shan pe->freeze_count = cnt; 755317f06deSGavin Shan 7561c2042c8SRafael J. Wysocki pci_unlock_rescan_remove(); 757317f06deSGavin Shan return 0; 758317f06deSGavin Shan } 759317f06deSGavin Shan 760317f06deSGavin Shan /* The longest amount of time to wait for a pci device 761317f06deSGavin Shan * to come back on line, in seconds. 762317f06deSGavin Shan */ 763fb48dc22SBrian King #define MAX_WAIT_FOR_RECOVERY 300 764317f06deSGavin Shan 765c0b64978SRussell Currey /** 766c0b64978SRussell Currey * eeh_handle_normal_event - Handle EEH events on a specific PE 76737fd8125SSam Bobroff * @pe: EEH PE - which should not be used after we return, as it may 76837fd8125SSam Bobroff * have been invalidated. 769c0b64978SRussell Currey * 770c0b64978SRussell Currey * Attempts to recover the given PE. If recovery fails or the PE has failed 771c0b64978SRussell Currey * too many times, remove the PE. 772c0b64978SRussell Currey * 77368701780SSam Bobroff * While PHB detects address or data parity errors on particular PCI 77468701780SSam Bobroff * slot, the associated PE will be frozen. Besides, DMA's occurring 77568701780SSam Bobroff * to wild addresses (which usually happen due to bugs in device 77668701780SSam Bobroff * drivers or in PCI adapter firmware) can cause EEH error. #SERR, 77768701780SSam Bobroff * #PERR or other misc PCI-related errors also can trigger EEH errors. 77868701780SSam Bobroff * 77968701780SSam Bobroff * Recovery process consists of unplugging the device driver (which 78068701780SSam Bobroff * generated hotplug events to userspace), then issuing a PCI #RST to 78168701780SSam Bobroff * the device, then reconfiguring the PCI config space for all bridges 78268701780SSam Bobroff * & devices under this slot, and then finally restarting the device 78368701780SSam Bobroff * drivers (which cause a second set of hotplug events to go out to 78468701780SSam Bobroff * userspace). 785c0b64978SRussell Currey */ 78637fd8125SSam Bobroff void eeh_handle_normal_event(struct eeh_pe *pe) 787317f06deSGavin Shan { 788cd95f804SSam Bobroff struct pci_bus *bus; 78967086e32SWei Yang struct eeh_dev *edev, *tmp; 790665012c5SSam Bobroff struct eeh_pe *tmp_pe; 791317f06deSGavin Shan int rc = 0; 792317f06deSGavin Shan enum pci_ers_result result = PCI_ERS_RESULT_NONE; 7931c5c533bSSam Bobroff struct eeh_rmv_data rmv_data = 7941c5c533bSSam Bobroff {LIST_HEAD_INIT(rmv_data.removed_vf_list), 0}; 795317f06deSGavin Shan 796cd95f804SSam Bobroff bus = eeh_pe_bus_get(pe); 797cd95f804SSam Bobroff if (!bus) { 7981f52f176SRussell Currey pr_err("%s: Cannot find PCI bus for PHB#%x-PE#%x\n", 799317f06deSGavin Shan __func__, pe->phb->global_number, pe->addr); 80037fd8125SSam Bobroff return; 801317f06deSGavin Shan } 802317f06deSGavin Shan 80337fd8125SSam Bobroff eeh_pe_state_mark(pe, EEH_PE_RECOVERING); 80437fd8125SSam Bobroff 8055a71978eSGavin Shan eeh_pe_update_time_stamp(pe); 806317f06deSGavin Shan pe->freeze_count++; 807c0b64978SRussell Currey if (pe->freeze_count > eeh_max_freezes) { 808796b9f5bSSam Bobroff pr_err("EEH: PHB#%x-PE#%x has failed %d times in the last hour and has been permanently disabled.\n", 809c0b64978SRussell Currey pe->phb->global_number, pe->addr, 810c0b64978SRussell Currey pe->freeze_count); 811b90484ecSSam Bobroff result = PCI_ERS_RESULT_DISCONNECT; 812c0b64978SRussell Currey } 813317f06deSGavin Shan 814317f06deSGavin Shan /* Walk the various device drivers attached to this slot through 815317f06deSGavin Shan * a reset sequence, giving each an opportunity to do what it needs 816317f06deSGavin Shan * to accomplish the reset. Each child gets a report of the 817317f06deSGavin Shan * status ... if any child can't handle the reset, then the entire 818317f06deSGavin Shan * slot is dlpar removed and added. 8198234fcedSGavin Shan * 8208234fcedSGavin Shan * When the PHB is fenced, we have to issue a reset to recover from 8218234fcedSGavin Shan * the error. Override the result if necessary to have partially 8228234fcedSGavin Shan * hotplug for this case. 823317f06deSGavin Shan */ 824b90484ecSSam Bobroff if (result != PCI_ERS_RESULT_DISCONNECT) { 825b90484ecSSam Bobroff pr_warn("EEH: This PCI device has failed %d times in the last hour and will be permanently disabled after %d failures.\n", 826b90484ecSSam Bobroff pe->freeze_count, eeh_max_freezes); 82756ca4fdeSGavin Shan pr_info("EEH: Notify device drivers to shutdown\n"); 82847cc8c1cSSam Bobroff eeh_set_channel_state(pe, pci_channel_io_frozen); 829010acfa1SSam Bobroff eeh_set_irq_state(pe, false); 830b90484ecSSam Bobroff eeh_pe_report("error_detected(IO frozen)", pe, 831b90484ecSSam Bobroff eeh_report_error, &result); 8328234fcedSGavin Shan if ((pe->type & EEH_PE_PHB) && 8338234fcedSGavin Shan result != PCI_ERS_RESULT_NONE && 8348234fcedSGavin Shan result != PCI_ERS_RESULT_NEED_RESET) 8358234fcedSGavin Shan result = PCI_ERS_RESULT_NEED_RESET; 836b90484ecSSam Bobroff } 837317f06deSGavin Shan 838317f06deSGavin Shan /* Get the current PCI slot state. This can take a long time, 8392ac3990cSWei Yang * sometimes over 300 seconds for certain systems. 840317f06deSGavin Shan */ 841b90484ecSSam Bobroff if (result != PCI_ERS_RESULT_DISCONNECT) { 842fef7f905SSam Bobroff rc = eeh_wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000); 843317f06deSGavin Shan if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) { 8440dae2743SGavin Shan pr_warn("EEH: Permanent failure\n"); 845b90484ecSSam Bobroff result = PCI_ERS_RESULT_DISCONNECT; 846b90484ecSSam Bobroff } 847317f06deSGavin Shan } 848317f06deSGavin Shan 849317f06deSGavin Shan /* Since rtas may enable MMIO when posting the error log, 850317f06deSGavin Shan * don't post the error log until after all dev drivers 851317f06deSGavin Shan * have been informed. 852317f06deSGavin Shan */ 853b90484ecSSam Bobroff if (result != PCI_ERS_RESULT_DISCONNECT) { 85456ca4fdeSGavin Shan pr_info("EEH: Collect temporary log\n"); 855317f06deSGavin Shan eeh_slot_error_detail(pe, EEH_LOG_TEMP); 856b90484ecSSam Bobroff } 857317f06deSGavin Shan 858317f06deSGavin Shan /* If all device drivers were EEH-unaware, then shut 859317f06deSGavin Shan * down all of the device drivers, and hope they 860317f06deSGavin Shan * go down willingly, without panicing the system. 861317f06deSGavin Shan */ 862317f06deSGavin Shan if (result == PCI_ERS_RESULT_NONE) { 86356ca4fdeSGavin Shan pr_info("EEH: Reset with hotplug activity\n"); 8645fd13460SSam Bobroff rc = eeh_reset_device(pe, bus, NULL, false); 865317f06deSGavin Shan if (rc) { 8660dae2743SGavin Shan pr_warn("%s: Unable to reset, err=%d\n", 86756ca4fdeSGavin Shan __func__, rc); 868b90484ecSSam Bobroff result = PCI_ERS_RESULT_DISCONNECT; 869317f06deSGavin Shan } 870317f06deSGavin Shan } 871317f06deSGavin Shan 872317f06deSGavin Shan /* If all devices reported they can proceed, then re-enable MMIO */ 873317f06deSGavin Shan if (result == PCI_ERS_RESULT_CAN_RECOVER) { 87456ca4fdeSGavin Shan pr_info("EEH: Enable I/O for affected devices\n"); 875317f06deSGavin Shan rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO); 876317f06deSGavin Shan 877b90484ecSSam Bobroff if (rc < 0) { 878b90484ecSSam Bobroff result = PCI_ERS_RESULT_DISCONNECT; 879b90484ecSSam Bobroff } else if (rc) { 880317f06deSGavin Shan result = PCI_ERS_RESULT_NEED_RESET; 881317f06deSGavin Shan } else { 88256ca4fdeSGavin Shan pr_info("EEH: Notify device drivers to resume I/O\n"); 88320b34497SSam Bobroff eeh_pe_report("mmio_enabled", pe, 88420b34497SSam Bobroff eeh_report_mmio_enabled, &result); 885317f06deSGavin Shan } 886317f06deSGavin Shan } 887317f06deSGavin Shan 888317f06deSGavin Shan /* If all devices reported they can proceed, then re-enable DMA */ 889317f06deSGavin Shan if (result == PCI_ERS_RESULT_CAN_RECOVER) { 89056ca4fdeSGavin Shan pr_info("EEH: Enabled DMA for affected devices\n"); 891317f06deSGavin Shan rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA); 892317f06deSGavin Shan 893b90484ecSSam Bobroff if (rc < 0) { 894b90484ecSSam Bobroff result = PCI_ERS_RESULT_DISCONNECT; 895b90484ecSSam Bobroff } else if (rc) { 896317f06deSGavin Shan result = PCI_ERS_RESULT_NEED_RESET; 89735845a78SGavin Shan } else { 89835845a78SGavin Shan /* 89935845a78SGavin Shan * We didn't do PE reset for the case. The PE 90035845a78SGavin Shan * is still in frozen state. Clear it before 90135845a78SGavin Shan * resuming the PE. 90235845a78SGavin Shan */ 90335845a78SGavin Shan eeh_pe_state_clear(pe, EEH_PE_ISOLATED); 904317f06deSGavin Shan result = PCI_ERS_RESULT_RECOVERED; 905317f06deSGavin Shan } 90635845a78SGavin Shan } 907317f06deSGavin Shan 908317f06deSGavin Shan /* If any device called out for a reset, then reset the slot */ 909317f06deSGavin Shan if (result == PCI_ERS_RESULT_NEED_RESET) { 91056ca4fdeSGavin Shan pr_info("EEH: Reset without hotplug activity\n"); 9115fd13460SSam Bobroff rc = eeh_reset_device(pe, bus, &rmv_data, true); 912317f06deSGavin Shan if (rc) { 9130dae2743SGavin Shan pr_warn("%s: Cannot reset, err=%d\n", 91456ca4fdeSGavin Shan __func__, rc); 915b90484ecSSam Bobroff result = PCI_ERS_RESULT_DISCONNECT; 916b90484ecSSam Bobroff } else { 917317f06deSGavin Shan result = PCI_ERS_RESULT_NONE; 91847cc8c1cSSam Bobroff eeh_set_channel_state(pe, pci_channel_io_normal); 919010acfa1SSam Bobroff eeh_set_irq_state(pe, true); 920b90484ecSSam Bobroff eeh_pe_report("slot_reset", pe, eeh_report_reset, 921b90484ecSSam Bobroff &result); 922b90484ecSSam Bobroff } 923317f06deSGavin Shan } 924317f06deSGavin Shan 925b90484ecSSam Bobroff if ((result == PCI_ERS_RESULT_RECOVERED) || 926b90484ecSSam Bobroff (result == PCI_ERS_RESULT_NONE)) { 92767086e32SWei Yang /* 928b90484ecSSam Bobroff * For those hot removed VFs, we should add back them after PF 929b90484ecSSam Bobroff * get recovered properly. 93067086e32SWei Yang */ 9311c5c533bSSam Bobroff list_for_each_entry_safe(edev, tmp, &rmv_data.removed_vf_list, 9321c5c533bSSam Bobroff rmv_entry) { 933bf773df9SSam Bobroff eeh_add_virt_device(edev); 93480e65b00SSam Bobroff list_del(&edev->rmv_entry); 93567086e32SWei Yang } 93667086e32SWei Yang 937317f06deSGavin Shan /* Tell all device drivers that they can resume operations */ 93856ca4fdeSGavin Shan pr_info("EEH: Notify device driver to resume\n"); 93947cc8c1cSSam Bobroff eeh_set_channel_state(pe, pci_channel_io_normal); 940010acfa1SSam Bobroff eeh_set_irq_state(pe, true); 94120b34497SSam Bobroff eeh_pe_report("resume", pe, eeh_report_resume, NULL); 94220b34497SSam Bobroff eeh_for_each_pe(pe, tmp_pe) { 94320b34497SSam Bobroff eeh_pe_for_each_dev(tmp_pe, edev, tmp) { 944665012c5SSam Bobroff edev->mode &= ~EEH_DEV_NO_HANDLER; 94520b34497SSam Bobroff edev->in_error = false; 94620b34497SSam Bobroff } 94720b34497SSam Bobroff } 948665012c5SSam Bobroff 949796b9f5bSSam Bobroff pr_info("EEH: Recovery successful.\n"); 950b90484ecSSam Bobroff } else { 951317f06deSGavin Shan /* 952317f06deSGavin Shan * About 90% of all real-life EEH failures in the field 953317f06deSGavin Shan * are due to poorly seated PCI cards. Only 10% or so are 954317f06deSGavin Shan * due to actual, failed cards. 955317f06deSGavin Shan */ 9561f52f176SRussell Currey pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n" 957317f06deSGavin Shan "Please try reseating or replacing it\n", 958317f06deSGavin Shan pe->phb->global_number, pe->addr); 959317f06deSGavin Shan 960317f06deSGavin Shan eeh_slot_error_detail(pe, EEH_LOG_PERM); 961317f06deSGavin Shan 962317f06deSGavin Shan /* Notify all devices that they're about to go down. */ 96347cc8c1cSSam Bobroff eeh_set_channel_state(pe, pci_channel_io_perm_failure); 964010acfa1SSam Bobroff eeh_set_irq_state(pe, false); 96520b34497SSam Bobroff eeh_pe_report("error_detected(permanent failure)", pe, 96620b34497SSam Bobroff eeh_report_failure, NULL); 967317f06deSGavin Shan 968d2b0f6f7SGavin Shan /* Mark the PE to be removed permanently */ 969432227e9SGavin Shan eeh_pe_state_mark(pe, EEH_PE_REMOVED); 970d2b0f6f7SGavin Shan 971d2b0f6f7SGavin Shan /* 972d2b0f6f7SGavin Shan * Shut down the device drivers for good. We mark 973d2b0f6f7SGavin Shan * all removed devices correctly to avoid access 974d2b0f6f7SGavin Shan * the their PCI config any more. 975d2b0f6f7SGavin Shan */ 97667086e32SWei Yang if (pe->type & EEH_PE_VF) { 97767086e32SWei Yang eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL); 97867086e32SWei Yang eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); 97967086e32SWei Yang } else { 98005ba75f8SGavin Shan eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); 981d2b0f6f7SGavin Shan eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); 982d2b0f6f7SGavin Shan 9831c2042c8SRafael J. Wysocki pci_lock_rescan_remove(); 984cd95f804SSam Bobroff pci_hp_remove_devices(bus); 9851c2042c8SRafael J. Wysocki pci_unlock_rescan_remove(); 986daeba295SRussell Currey /* The passed PE should no longer be used */ 98737fd8125SSam Bobroff return; 9881c2042c8SRafael J. Wysocki } 989b90484ecSSam Bobroff } 99037fd8125SSam Bobroff eeh_pe_state_clear(pe, EEH_PE_RECOVERING); 99167086e32SWei Yang } 9928a6b1bc7SGavin Shan 993c0b64978SRussell Currey /** 994c0b64978SRussell Currey * eeh_handle_special_event - Handle EEH events without a specific failing PE 995c0b64978SRussell Currey * 996c0b64978SRussell Currey * Called when an EEH event is detected but can't be narrowed down to a 997c0b64978SRussell Currey * specific PE. Iterates through possible failures and handles them as 998c0b64978SRussell Currey * necessary. 999c0b64978SRussell Currey */ 100068701780SSam Bobroff void eeh_handle_special_event(void) 10018a6b1bc7SGavin Shan { 10028a6b1bc7SGavin Shan struct eeh_pe *pe, *phb_pe; 10038a6b1bc7SGavin Shan struct pci_bus *bus; 10047e4e7867SGavin Shan struct pci_controller *hose; 10058a6b1bc7SGavin Shan unsigned long flags; 10067e4e7867SGavin Shan int rc; 10078a6b1bc7SGavin Shan 10087e4e7867SGavin Shan 10097e4e7867SGavin Shan do { 10108a6b1bc7SGavin Shan rc = eeh_ops->next_error(&pe); 10118a6b1bc7SGavin Shan 10128a6b1bc7SGavin Shan switch (rc) { 10137e4e7867SGavin Shan case EEH_NEXT_ERR_DEAD_IOC: 10148a6b1bc7SGavin Shan /* Mark all PHBs in dead state */ 10158a6b1bc7SGavin Shan eeh_serialize_lock(&flags); 10167e4e7867SGavin Shan 10177e4e7867SGavin Shan /* Purge all events */ 10185c7a35e3SGavin Shan eeh_remove_event(NULL, true); 10197e4e7867SGavin Shan 10207e4e7867SGavin Shan list_for_each_entry(hose, &hose_list, list_node) { 10218a6b1bc7SGavin Shan phb_pe = eeh_phb_pe_get(hose); 10228a6b1bc7SGavin Shan if (!phb_pe) continue; 10238a6b1bc7SGavin Shan 1024e762bb89SSam Bobroff eeh_pe_mark_isolated(phb_pe); 10258a6b1bc7SGavin Shan } 10267e4e7867SGavin Shan 10278a6b1bc7SGavin Shan eeh_serialize_unlock(flags); 10288a6b1bc7SGavin Shan 10298a6b1bc7SGavin Shan break; 10307e4e7867SGavin Shan case EEH_NEXT_ERR_FROZEN_PE: 10317e4e7867SGavin Shan case EEH_NEXT_ERR_FENCED_PHB: 10327e4e7867SGavin Shan case EEH_NEXT_ERR_DEAD_PHB: 10338a6b1bc7SGavin Shan /* Mark the PE in fenced state */ 10348a6b1bc7SGavin Shan eeh_serialize_lock(&flags); 10357e4e7867SGavin Shan 10367e4e7867SGavin Shan /* Purge all events of the PHB */ 10375c7a35e3SGavin Shan eeh_remove_event(pe, true); 10387e4e7867SGavin Shan 1039e762bb89SSam Bobroff if (rc != EEH_NEXT_ERR_DEAD_PHB) 1040e762bb89SSam Bobroff eeh_pe_state_mark(pe, EEH_PE_RECOVERING); 1041e762bb89SSam Bobroff eeh_pe_mark_isolated(pe); 10427e4e7867SGavin Shan 10438a6b1bc7SGavin Shan eeh_serialize_unlock(flags); 10448a6b1bc7SGavin Shan 10458a6b1bc7SGavin Shan break; 10467e4e7867SGavin Shan case EEH_NEXT_ERR_NONE: 10477e4e7867SGavin Shan return; 10488a6b1bc7SGavin Shan default: 10497e4e7867SGavin Shan pr_warn("%s: Invalid value %d from next_error()\n", 10508a6b1bc7SGavin Shan __func__, rc); 10518a6b1bc7SGavin Shan return; 10528a6b1bc7SGavin Shan } 10538a6b1bc7SGavin Shan 10548a6b1bc7SGavin Shan /* 10558a6b1bc7SGavin Shan * For fenced PHB and frozen PE, it's handled as normal 10568a6b1bc7SGavin Shan * event. We have to remove the affected PHBs for dead 10578a6b1bc7SGavin Shan * PHB and IOC 10588a6b1bc7SGavin Shan */ 10597e4e7867SGavin Shan if (rc == EEH_NEXT_ERR_FROZEN_PE || 10607e4e7867SGavin Shan rc == EEH_NEXT_ERR_FENCED_PHB) { 106137fd8125SSam Bobroff eeh_handle_normal_event(pe); 10627e4e7867SGavin Shan } else { 10631c2042c8SRafael J. Wysocki pci_lock_rescan_remove(); 10647e4e7867SGavin Shan list_for_each_entry(hose, &hose_list, list_node) { 10658a6b1bc7SGavin Shan phb_pe = eeh_phb_pe_get(hose); 10667e4e7867SGavin Shan if (!phb_pe || 10679e049375SGavin Shan !(phb_pe->state & EEH_PE_ISOLATED) || 10689e049375SGavin Shan (phb_pe->state & EEH_PE_RECOVERING)) 10698a6b1bc7SGavin Shan continue; 10708a6b1bc7SGavin Shan 10717e4e7867SGavin Shan /* Notify all devices to be down */ 107205ba75f8SGavin Shan eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); 107347cc8c1cSSam Bobroff eeh_set_channel_state(pe, pci_channel_io_perm_failure); 107420b34497SSam Bobroff eeh_pe_report( 107520b34497SSam Bobroff "error_detected(permanent failure)", pe, 1076af2e3a00SRussell Currey eeh_report_failure, NULL); 10778a6b1bc7SGavin Shan bus = eeh_pe_bus_get(phb_pe); 107804fec21cSRussell Currey if (!bus) { 107904fec21cSRussell Currey pr_err("%s: Cannot find PCI bus for " 10801f52f176SRussell Currey "PHB#%x-PE#%x\n", 108104fec21cSRussell Currey __func__, 108204fec21cSRussell Currey pe->phb->global_number, 108304fec21cSRussell Currey pe->addr); 108404fec21cSRussell Currey break; 108504fec21cSRussell Currey } 1086bd251b89SGavin Shan pci_hp_remove_devices(bus); 10878a6b1bc7SGavin Shan } 10881c2042c8SRafael J. Wysocki pci_unlock_rescan_remove(); 10898a6b1bc7SGavin Shan } 10907e4e7867SGavin Shan 10917e4e7867SGavin Shan /* 10927e4e7867SGavin Shan * If we have detected dead IOC, we needn't proceed 10937e4e7867SGavin Shan * any more since all PHBs would have been removed 10947e4e7867SGavin Shan */ 10957e4e7867SGavin Shan if (rc == EEH_NEXT_ERR_DEAD_IOC) 10967e4e7867SGavin Shan break; 10977e4e7867SGavin Shan } while (rc != EEH_NEXT_ERR_NONE); 10988a6b1bc7SGavin Shan } 1099