1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * This file implements the error recovery as a core part of PCIe error 4 * reporting. When a PCIe error is delivered, an error message will be 5 * collected and printed to console, then, an error recovery procedure 6 * will be executed by following the PCI error recovery rules. 7 * 8 * Copyright (C) 2006 Intel Corp. 9 * Tom Long Nguyen (tom.l.nguyen@intel.com) 10 * Zhang Yanmin (yanmin.zhang@intel.com) 11 */ 12 13 #include <linux/pci.h> 14 #include <linux/module.h> 15 #include <linux/pci.h> 16 #include <linux/kernel.h> 17 #include <linux/errno.h> 18 #include <linux/aer.h> 19 #include "portdrv.h" 20 #include "../pci.h" 21 22 struct aer_broadcast_data { 23 enum pci_channel_state state; 24 enum pci_ers_result result; 25 }; 26 27 static pci_ers_result_t merge_result(enum pci_ers_result orig, 28 enum pci_ers_result new) 29 { 30 if (new == PCI_ERS_RESULT_NO_AER_DRIVER) 31 return PCI_ERS_RESULT_NO_AER_DRIVER; 32 33 if (new == PCI_ERS_RESULT_NONE) 34 return orig; 35 36 switch (orig) { 37 case PCI_ERS_RESULT_CAN_RECOVER: 38 case PCI_ERS_RESULT_RECOVERED: 39 orig = new; 40 break; 41 case PCI_ERS_RESULT_DISCONNECT: 42 if (new == PCI_ERS_RESULT_NEED_RESET) 43 orig = PCI_ERS_RESULT_NEED_RESET; 44 break; 45 default: 46 break; 47 } 48 49 return orig; 50 } 51 52 static int report_error_detected(struct pci_dev *dev, void *data) 53 { 54 pci_ers_result_t vote; 55 const struct pci_error_handlers *err_handler; 56 struct aer_broadcast_data *result_data; 57 58 result_data = (struct aer_broadcast_data *) data; 59 60 device_lock(&dev->dev); 61 dev->error_state = result_data->state; 62 63 if (!dev->driver || 64 !dev->driver->err_handler || 65 !dev->driver->err_handler->error_detected) { 66 if (result_data->state == pci_channel_io_frozen && 67 dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) { 68 /* 69 * In case of fatal recovery, if one of down- 70 * stream device has no driver. We might be 71 * unable to recover because a later insmod 72 * of a driver for this device is unaware of 73 * its hw state. 74 */ 75 pci_printk(KERN_DEBUG, dev, "device has %s\n", 76 dev->driver ? 77 "no AER-aware driver" : "no driver"); 78 } 79 80 /* 81 * If there's any device in the subtree that does not 82 * have an error_detected callback, returning 83 * PCI_ERS_RESULT_NO_AER_DRIVER prevents calling of 84 * the subsequent mmio_enabled/slot_reset/resume 85 * callbacks of "any" device in the subtree. All the 86 * devices in the subtree are left in the error state 87 * without recovery. 88 */ 89 90 if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) 91 vote = PCI_ERS_RESULT_NO_AER_DRIVER; 92 else 93 vote = PCI_ERS_RESULT_NONE; 94 } else { 95 err_handler = dev->driver->err_handler; 96 vote = err_handler->error_detected(dev, result_data->state); 97 pci_uevent_ers(dev, PCI_ERS_RESULT_NONE); 98 } 99 100 result_data->result = merge_result(result_data->result, vote); 101 device_unlock(&dev->dev); 102 return 0; 103 } 104 105 static int report_mmio_enabled(struct pci_dev *dev, void *data) 106 { 107 pci_ers_result_t vote; 108 const struct pci_error_handlers *err_handler; 109 struct aer_broadcast_data *result_data; 110 111 result_data = (struct aer_broadcast_data *) data; 112 113 device_lock(&dev->dev); 114 if (!dev->driver || 115 !dev->driver->err_handler || 116 !dev->driver->err_handler->mmio_enabled) 117 goto out; 118 119 err_handler = dev->driver->err_handler; 120 vote = err_handler->mmio_enabled(dev); 121 result_data->result = merge_result(result_data->result, vote); 122 out: 123 device_unlock(&dev->dev); 124 return 0; 125 } 126 127 static int report_slot_reset(struct pci_dev *dev, void *data) 128 { 129 pci_ers_result_t vote; 130 const struct pci_error_handlers *err_handler; 131 struct aer_broadcast_data *result_data; 132 133 result_data = (struct aer_broadcast_data *) data; 134 135 device_lock(&dev->dev); 136 if (!dev->driver || 137 !dev->driver->err_handler || 138 !dev->driver->err_handler->slot_reset) 139 goto out; 140 141 err_handler = dev->driver->err_handler; 142 vote = err_handler->slot_reset(dev); 143 result_data->result = merge_result(result_data->result, vote); 144 out: 145 device_unlock(&dev->dev); 146 return 0; 147 } 148 149 static int report_resume(struct pci_dev *dev, void *data) 150 { 151 const struct pci_error_handlers *err_handler; 152 153 device_lock(&dev->dev); 154 dev->error_state = pci_channel_io_normal; 155 156 if (!dev->driver || 157 !dev->driver->err_handler || 158 !dev->driver->err_handler->resume) 159 goto out; 160 161 err_handler = dev->driver->err_handler; 162 err_handler->resume(dev); 163 pci_uevent_ers(dev, PCI_ERS_RESULT_RECOVERED); 164 out: 165 device_unlock(&dev->dev); 166 return 0; 167 } 168 169 /** 170 * default_reset_link - default reset function 171 * @dev: pointer to pci_dev data structure 172 * 173 * Invoked when performing link reset on a Downstream Port or a 174 * Root Port with no aer driver. 175 */ 176 static pci_ers_result_t default_reset_link(struct pci_dev *dev) 177 { 178 int rc; 179 180 rc = pci_bridge_secondary_bus_reset(dev); 181 pci_printk(KERN_DEBUG, dev, "downstream link has been reset\n"); 182 return rc ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 183 } 184 185 static pci_ers_result_t reset_link(struct pci_dev *dev, u32 service) 186 { 187 struct pci_dev *udev; 188 pci_ers_result_t status; 189 struct pcie_port_service_driver *driver = NULL; 190 191 if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) { 192 /* Reset this port for all subordinates */ 193 udev = dev; 194 } else { 195 /* Reset the upstream component (likely downstream port) */ 196 udev = dev->bus->self; 197 } 198 199 /* Use the aer driver of the component firstly */ 200 driver = pcie_port_find_service(udev, service); 201 202 if (driver && driver->reset_link) { 203 status = driver->reset_link(udev); 204 } else if (udev->has_secondary_link) { 205 status = default_reset_link(udev); 206 } else { 207 pci_printk(KERN_DEBUG, dev, "no link-reset support at upstream device %s\n", 208 pci_name(udev)); 209 return PCI_ERS_RESULT_DISCONNECT; 210 } 211 212 if (status != PCI_ERS_RESULT_RECOVERED) { 213 pci_printk(KERN_DEBUG, dev, "link reset at upstream device %s failed\n", 214 pci_name(udev)); 215 return PCI_ERS_RESULT_DISCONNECT; 216 } 217 218 return status; 219 } 220 221 /** 222 * broadcast_error_message - handle message broadcast to downstream drivers 223 * @dev: pointer to from where in a hierarchy message is broadcasted down 224 * @state: error state 225 * @error_mesg: message to print 226 * @cb: callback to be broadcasted 227 * 228 * Invoked during error recovery process. Once being invoked, the content 229 * of error severity will be broadcasted to all downstream drivers in a 230 * hierarchy in question. 231 */ 232 static pci_ers_result_t broadcast_error_message(struct pci_dev *dev, 233 enum pci_channel_state state, 234 char *error_mesg, 235 int (*cb)(struct pci_dev *, void *)) 236 { 237 struct aer_broadcast_data result_data; 238 239 pci_printk(KERN_DEBUG, dev, "broadcast %s message\n", error_mesg); 240 result_data.state = state; 241 if (cb == report_error_detected) 242 result_data.result = PCI_ERS_RESULT_CAN_RECOVER; 243 else 244 result_data.result = PCI_ERS_RESULT_RECOVERED; 245 246 if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) { 247 /* 248 * If the error is reported by a bridge, we think this error 249 * is related to the downstream link of the bridge, so we 250 * do error recovery on all subordinates of the bridge instead 251 * of the bridge and clear the error status of the bridge. 252 */ 253 if (cb == report_error_detected) 254 dev->error_state = state; 255 pci_walk_bus(dev->subordinate, cb, &result_data); 256 if (cb == report_resume) { 257 pci_aer_clear_device_status(dev); 258 pci_cleanup_aer_uncorrect_error_status(dev); 259 dev->error_state = pci_channel_io_normal; 260 } 261 } else { 262 /* 263 * If the error is reported by an end point, we think this 264 * error is related to the upstream link of the end point. 265 * The error is non fatal so the bus is ok; just invoke 266 * the callback for the function that logged the error. 267 */ 268 cb(dev, &result_data); 269 } 270 271 return result_data.result; 272 } 273 274 /** 275 * pcie_do_fatal_recovery - handle fatal error recovery process 276 * @dev: pointer to a pci_dev data structure of agent detecting an error 277 * 278 * Invoked when an error is fatal. Once being invoked, removes the devices 279 * beneath this AER agent, followed by reset link e.g. secondary bus reset 280 * followed by re-enumeration of devices. 281 */ 282 void pcie_do_fatal_recovery(struct pci_dev *dev, u32 service) 283 { 284 struct pci_dev *udev; 285 struct pci_bus *parent; 286 struct pci_dev *pdev, *temp; 287 pci_ers_result_t result; 288 289 if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) 290 udev = dev; 291 else 292 udev = dev->bus->self; 293 294 parent = udev->subordinate; 295 pci_lock_rescan_remove(); 296 pci_dev_get(dev); 297 list_for_each_entry_safe_reverse(pdev, temp, &parent->devices, 298 bus_list) { 299 pci_dev_get(pdev); 300 pci_dev_set_disconnected(pdev, NULL); 301 if (pci_has_subordinate(pdev)) 302 pci_walk_bus(pdev->subordinate, 303 pci_dev_set_disconnected, NULL); 304 pci_stop_and_remove_bus_device(pdev); 305 pci_dev_put(pdev); 306 } 307 308 result = reset_link(udev, service); 309 310 if ((service == PCIE_PORT_SERVICE_AER) && 311 (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) { 312 /* 313 * If the error is reported by a bridge, we think this error 314 * is related to the downstream link of the bridge, so we 315 * do error recovery on all subordinates of the bridge instead 316 * of the bridge and clear the error status of the bridge. 317 */ 318 pci_aer_clear_fatal_status(dev); 319 pci_aer_clear_device_status(dev); 320 } 321 322 if (result == PCI_ERS_RESULT_RECOVERED) { 323 if (pcie_wait_for_link(udev, true)) 324 pci_rescan_bus(udev->bus); 325 pci_info(dev, "Device recovery from fatal error successful\n"); 326 } else { 327 pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT); 328 pci_info(dev, "Device recovery from fatal error failed\n"); 329 } 330 331 pci_dev_put(dev); 332 pci_unlock_rescan_remove(); 333 } 334 335 /** 336 * pcie_do_nonfatal_recovery - handle nonfatal error recovery process 337 * @dev: pointer to a pci_dev data structure of agent detecting an error 338 * 339 * Invoked when an error is nonfatal/fatal. Once being invoked, broadcast 340 * error detected message to all downstream drivers within a hierarchy in 341 * question and return the returned code. 342 */ 343 void pcie_do_nonfatal_recovery(struct pci_dev *dev) 344 { 345 pci_ers_result_t status; 346 enum pci_channel_state state; 347 348 state = pci_channel_io_normal; 349 350 status = broadcast_error_message(dev, 351 state, 352 "error_detected", 353 report_error_detected); 354 355 if (status == PCI_ERS_RESULT_CAN_RECOVER) 356 status = broadcast_error_message(dev, 357 state, 358 "mmio_enabled", 359 report_mmio_enabled); 360 361 if (status == PCI_ERS_RESULT_NEED_RESET) { 362 /* 363 * TODO: Should call platform-specific 364 * functions to reset slot before calling 365 * drivers' slot_reset callbacks? 366 */ 367 status = broadcast_error_message(dev, 368 state, 369 "slot_reset", 370 report_slot_reset); 371 } 372 373 if (status != PCI_ERS_RESULT_RECOVERED) 374 goto failed; 375 376 broadcast_error_message(dev, 377 state, 378 "resume", 379 report_resume); 380 381 pci_info(dev, "AER: Device recovery successful\n"); 382 return; 383 384 failed: 385 pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT); 386 387 /* TODO: Should kernel panic here? */ 388 pci_info(dev, "AER: Device recovery failed\n"); 389 } 390