1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * This file implements the error recovery as a core part of PCIe error 4 * reporting. When a PCIe error is delivered, an error message will be 5 * collected and printed to console, then, an error recovery procedure 6 * will be executed by following the PCI error recovery rules. 7 * 8 * Copyright (C) 2006 Intel Corp. 9 * Tom Long Nguyen (tom.l.nguyen@intel.com) 10 * Zhang Yanmin (yanmin.zhang@intel.com) 11 */ 12 13 #include <linux/pci.h> 14 #include <linux/module.h> 15 #include <linux/pci.h> 16 #include <linux/kernel.h> 17 #include <linux/errno.h> 18 #include <linux/aer.h> 19 #include "portdrv.h" 20 #include "../pci.h" 21 22 struct aer_broadcast_data { 23 enum pci_channel_state state; 24 enum pci_ers_result result; 25 }; 26 27 static pci_ers_result_t merge_result(enum pci_ers_result orig, 28 enum pci_ers_result new) 29 { 30 if (new == PCI_ERS_RESULT_NO_AER_DRIVER) 31 return PCI_ERS_RESULT_NO_AER_DRIVER; 32 33 if (new == PCI_ERS_RESULT_NONE) 34 return orig; 35 36 switch (orig) { 37 case PCI_ERS_RESULT_CAN_RECOVER: 38 case PCI_ERS_RESULT_RECOVERED: 39 orig = new; 40 break; 41 case PCI_ERS_RESULT_DISCONNECT: 42 if (new == PCI_ERS_RESULT_NEED_RESET) 43 orig = PCI_ERS_RESULT_NEED_RESET; 44 break; 45 default: 46 break; 47 } 48 49 return orig; 50 } 51 52 static int report_error_detected(struct pci_dev *dev, void *data) 53 { 54 pci_ers_result_t vote; 55 const struct pci_error_handlers *err_handler; 56 struct aer_broadcast_data *result_data; 57 58 result_data = (struct aer_broadcast_data *) data; 59 60 device_lock(&dev->dev); 61 dev->error_state = result_data->state; 62 63 if (!dev->driver || 64 !dev->driver->err_handler || 65 !dev->driver->err_handler->error_detected) { 66 if (result_data->state == pci_channel_io_frozen && 67 dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) { 68 /* 69 * In case of fatal recovery, if one of down- 70 * stream device has no driver. We might be 71 * unable to recover because a later insmod 72 * of a driver for this device is unaware of 73 * its hw state. 74 */ 75 pci_printk(KERN_DEBUG, dev, "device has %s\n", 76 dev->driver ? 77 "no AER-aware driver" : "no driver"); 78 } 79 80 /* 81 * If there's any device in the subtree that does not 82 * have an error_detected callback, returning 83 * PCI_ERS_RESULT_NO_AER_DRIVER prevents calling of 84 * the subsequent mmio_enabled/slot_reset/resume 85 * callbacks of "any" device in the subtree. All the 86 * devices in the subtree are left in the error state 87 * without recovery. 88 */ 89 90 if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) 91 vote = PCI_ERS_RESULT_NO_AER_DRIVER; 92 else 93 vote = PCI_ERS_RESULT_NONE; 94 } else { 95 err_handler = dev->driver->err_handler; 96 vote = err_handler->error_detected(dev, result_data->state); 97 pci_uevent_ers(dev, PCI_ERS_RESULT_NONE); 98 } 99 100 result_data->result = merge_result(result_data->result, vote); 101 device_unlock(&dev->dev); 102 return 0; 103 } 104 105 static int report_mmio_enabled(struct pci_dev *dev, void *data) 106 { 107 pci_ers_result_t vote; 108 const struct pci_error_handlers *err_handler; 109 struct aer_broadcast_data *result_data; 110 111 result_data = (struct aer_broadcast_data *) data; 112 113 device_lock(&dev->dev); 114 if (!dev->driver || 115 !dev->driver->err_handler || 116 !dev->driver->err_handler->mmio_enabled) 117 goto out; 118 119 err_handler = dev->driver->err_handler; 120 vote = err_handler->mmio_enabled(dev); 121 result_data->result = merge_result(result_data->result, vote); 122 out: 123 device_unlock(&dev->dev); 124 return 0; 125 } 126 127 static int report_slot_reset(struct pci_dev *dev, void *data) 128 { 129 pci_ers_result_t vote; 130 const struct pci_error_handlers *err_handler; 131 struct aer_broadcast_data *result_data; 132 133 result_data = (struct aer_broadcast_data *) data; 134 135 device_lock(&dev->dev); 136 if (!dev->driver || 137 !dev->driver->err_handler || 138 !dev->driver->err_handler->slot_reset) 139 goto out; 140 141 err_handler = dev->driver->err_handler; 142 vote = err_handler->slot_reset(dev); 143 result_data->result = merge_result(result_data->result, vote); 144 out: 145 device_unlock(&dev->dev); 146 return 0; 147 } 148 149 static int report_resume(struct pci_dev *dev, void *data) 150 { 151 const struct pci_error_handlers *err_handler; 152 153 device_lock(&dev->dev); 154 dev->error_state = pci_channel_io_normal; 155 156 if (!dev->driver || 157 !dev->driver->err_handler || 158 !dev->driver->err_handler->resume) 159 goto out; 160 161 err_handler = dev->driver->err_handler; 162 err_handler->resume(dev); 163 pci_uevent_ers(dev, PCI_ERS_RESULT_RECOVERED); 164 out: 165 device_unlock(&dev->dev); 166 return 0; 167 } 168 169 /** 170 * default_reset_link - default reset function 171 * @dev: pointer to pci_dev data structure 172 * 173 * Invoked when performing link reset on a Downstream Port or a 174 * Root Port with no aer driver. 175 */ 176 static pci_ers_result_t default_reset_link(struct pci_dev *dev) 177 { 178 pci_reset_bridge_secondary_bus(dev); 179 pci_printk(KERN_DEBUG, dev, "downstream link has been reset\n"); 180 return PCI_ERS_RESULT_RECOVERED; 181 } 182 183 static pci_ers_result_t reset_link(struct pci_dev *dev, u32 service) 184 { 185 struct pci_dev *udev; 186 pci_ers_result_t status; 187 struct pcie_port_service_driver *driver = NULL; 188 189 if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) { 190 /* Reset this port for all subordinates */ 191 udev = dev; 192 } else { 193 /* Reset the upstream component (likely downstream port) */ 194 udev = dev->bus->self; 195 } 196 197 /* Use the aer driver of the component firstly */ 198 driver = pcie_port_find_service(udev, service); 199 200 if (driver && driver->reset_link) { 201 status = driver->reset_link(udev); 202 } else if (udev->has_secondary_link) { 203 status = default_reset_link(udev); 204 } else { 205 pci_printk(KERN_DEBUG, dev, "no link-reset support at upstream device %s\n", 206 pci_name(udev)); 207 return PCI_ERS_RESULT_DISCONNECT; 208 } 209 210 if (status != PCI_ERS_RESULT_RECOVERED) { 211 pci_printk(KERN_DEBUG, dev, "link reset at upstream device %s failed\n", 212 pci_name(udev)); 213 return PCI_ERS_RESULT_DISCONNECT; 214 } 215 216 return status; 217 } 218 219 /** 220 * broadcast_error_message - handle message broadcast to downstream drivers 221 * @dev: pointer to from where in a hierarchy message is broadcasted down 222 * @state: error state 223 * @error_mesg: message to print 224 * @cb: callback to be broadcasted 225 * 226 * Invoked during error recovery process. Once being invoked, the content 227 * of error severity will be broadcasted to all downstream drivers in a 228 * hierarchy in question. 229 */ 230 static pci_ers_result_t broadcast_error_message(struct pci_dev *dev, 231 enum pci_channel_state state, 232 char *error_mesg, 233 int (*cb)(struct pci_dev *, void *)) 234 { 235 struct aer_broadcast_data result_data; 236 237 pci_printk(KERN_DEBUG, dev, "broadcast %s message\n", error_mesg); 238 result_data.state = state; 239 if (cb == report_error_detected) 240 result_data.result = PCI_ERS_RESULT_CAN_RECOVER; 241 else 242 result_data.result = PCI_ERS_RESULT_RECOVERED; 243 244 if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) { 245 /* 246 * If the error is reported by a bridge, we think this error 247 * is related to the downstream link of the bridge, so we 248 * do error recovery on all subordinates of the bridge instead 249 * of the bridge and clear the error status of the bridge. 250 */ 251 if (cb == report_error_detected) 252 dev->error_state = state; 253 pci_walk_bus(dev->subordinate, cb, &result_data); 254 if (cb == report_resume) { 255 pci_cleanup_aer_uncorrect_error_status(dev); 256 dev->error_state = pci_channel_io_normal; 257 } 258 } else { 259 /* 260 * If the error is reported by an end point, we think this 261 * error is related to the upstream link of the end point. 262 */ 263 if (state == pci_channel_io_normal) 264 /* 265 * the error is non fatal so the bus is ok, just invoke 266 * the callback for the function that logged the error. 267 */ 268 cb(dev, &result_data); 269 else 270 pci_walk_bus(dev->bus, cb, &result_data); 271 } 272 273 return result_data.result; 274 } 275 276 /** 277 * pcie_do_fatal_recovery - handle fatal error recovery process 278 * @dev: pointer to a pci_dev data structure of agent detecting an error 279 * 280 * Invoked when an error is fatal. Once being invoked, removes the devices 281 * beneath this AER agent, followed by reset link e.g. secondary bus reset 282 * followed by re-enumeration of devices. 283 */ 284 void pcie_do_fatal_recovery(struct pci_dev *dev, u32 service) 285 { 286 struct pci_dev *udev; 287 struct pci_bus *parent; 288 struct pci_dev *pdev, *temp; 289 pci_ers_result_t result; 290 291 if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) 292 udev = dev; 293 else 294 udev = dev->bus->self; 295 296 parent = udev->subordinate; 297 pci_lock_rescan_remove(); 298 list_for_each_entry_safe_reverse(pdev, temp, &parent->devices, 299 bus_list) { 300 pci_dev_get(pdev); 301 pci_dev_set_disconnected(pdev, NULL); 302 if (pci_has_subordinate(pdev)) 303 pci_walk_bus(pdev->subordinate, 304 pci_dev_set_disconnected, NULL); 305 pci_stop_and_remove_bus_device(pdev); 306 pci_dev_put(pdev); 307 } 308 309 result = reset_link(udev, service); 310 311 if ((service == PCIE_PORT_SERVICE_AER) && 312 (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) { 313 /* 314 * If the error is reported by a bridge, we think this error 315 * is related to the downstream link of the bridge, so we 316 * do error recovery on all subordinates of the bridge instead 317 * of the bridge and clear the error status of the bridge. 318 */ 319 pci_cleanup_aer_uncorrect_error_status(dev); 320 } 321 322 if (result == PCI_ERS_RESULT_RECOVERED) { 323 if (pcie_wait_for_link(udev, true)) 324 pci_rescan_bus(udev->bus); 325 pci_info(dev, "Device recovery from fatal error successful\n"); 326 } else { 327 pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT); 328 pci_info(dev, "Device recovery from fatal error failed\n"); 329 } 330 331 pci_unlock_rescan_remove(); 332 } 333 334 /** 335 * pcie_do_nonfatal_recovery - handle nonfatal error recovery process 336 * @dev: pointer to a pci_dev data structure of agent detecting an error 337 * 338 * Invoked when an error is nonfatal/fatal. Once being invoked, broadcast 339 * error detected message to all downstream drivers within a hierarchy in 340 * question and return the returned code. 341 */ 342 void pcie_do_nonfatal_recovery(struct pci_dev *dev) 343 { 344 pci_ers_result_t status; 345 enum pci_channel_state state; 346 347 state = pci_channel_io_normal; 348 349 status = broadcast_error_message(dev, 350 state, 351 "error_detected", 352 report_error_detected); 353 354 if (status == PCI_ERS_RESULT_CAN_RECOVER) 355 status = broadcast_error_message(dev, 356 state, 357 "mmio_enabled", 358 report_mmio_enabled); 359 360 if (status == PCI_ERS_RESULT_NEED_RESET) { 361 /* 362 * TODO: Should call platform-specific 363 * functions to reset slot before calling 364 * drivers' slot_reset callbacks? 365 */ 366 status = broadcast_error_message(dev, 367 state, 368 "slot_reset", 369 report_slot_reset); 370 } 371 372 if (status != PCI_ERS_RESULT_RECOVERED) 373 goto failed; 374 375 broadcast_error_message(dev, 376 state, 377 "resume", 378 report_resume); 379 380 pci_info(dev, "AER: Device recovery successful\n"); 381 return; 382 383 failed: 384 pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT); 385 386 /* TODO: Should kernel panic here? */ 387 pci_info(dev, "AER: Device recovery failed\n"); 388 } 389