1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * This file implements the error recovery as a core part of PCIe error 4 * reporting. When a PCIe error is delivered, an error message will be 5 * collected and printed to console, then, an error recovery procedure 6 * will be executed by following the PCI error recovery rules. 7 * 8 * Copyright (C) 2006 Intel Corp. 9 * Tom Long Nguyen (tom.l.nguyen@intel.com) 10 * Zhang Yanmin (yanmin.zhang@intel.com) 11 */ 12 13 #include <linux/pci.h> 14 #include <linux/module.h> 15 #include <linux/pci.h> 16 #include <linux/kernel.h> 17 #include <linux/errno.h> 18 #include <linux/aer.h> 19 #include "portdrv.h" 20 #include "../pci.h" 21 22 struct aer_broadcast_data { 23 enum pci_channel_state state; 24 enum pci_ers_result result; 25 }; 26 27 static pci_ers_result_t merge_result(enum pci_ers_result orig, 28 enum pci_ers_result new) 29 { 30 if (new == PCI_ERS_RESULT_NO_AER_DRIVER) 31 return PCI_ERS_RESULT_NO_AER_DRIVER; 32 33 if (new == PCI_ERS_RESULT_NONE) 34 return orig; 35 36 switch (orig) { 37 case PCI_ERS_RESULT_CAN_RECOVER: 38 case PCI_ERS_RESULT_RECOVERED: 39 orig = new; 40 break; 41 case PCI_ERS_RESULT_DISCONNECT: 42 if (new == PCI_ERS_RESULT_NEED_RESET) 43 orig = PCI_ERS_RESULT_NEED_RESET; 44 break; 45 default: 46 break; 47 } 48 49 return orig; 50 } 51 52 static int report_error_detected(struct pci_dev *dev, void *data) 53 { 54 pci_ers_result_t vote; 55 const struct pci_error_handlers *err_handler; 56 struct aer_broadcast_data *result_data; 57 58 result_data = (struct aer_broadcast_data *) data; 59 60 device_lock(&dev->dev); 61 dev->error_state = result_data->state; 62 63 if (!dev->driver || 64 !dev->driver->err_handler || 65 !dev->driver->err_handler->error_detected) { 66 if (result_data->state == pci_channel_io_frozen && 67 dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) { 68 /* 69 * In case of fatal recovery, if one of down- 70 * stream device has no driver. We might be 71 * unable to recover because a later insmod 72 * of a driver for this device is unaware of 73 * its hw state. 74 */ 75 pci_printk(KERN_DEBUG, dev, "device has %s\n", 76 dev->driver ? 77 "no AER-aware driver" : "no driver"); 78 } 79 80 /* 81 * If there's any device in the subtree that does not 82 * have an error_detected callback, returning 83 * PCI_ERS_RESULT_NO_AER_DRIVER prevents calling of 84 * the subsequent mmio_enabled/slot_reset/resume 85 * callbacks of "any" device in the subtree. All the 86 * devices in the subtree are left in the error state 87 * without recovery. 88 */ 89 90 if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) 91 vote = PCI_ERS_RESULT_NO_AER_DRIVER; 92 else 93 vote = PCI_ERS_RESULT_NONE; 94 } else { 95 err_handler = dev->driver->err_handler; 96 vote = err_handler->error_detected(dev, result_data->state); 97 pci_uevent_ers(dev, PCI_ERS_RESULT_NONE); 98 } 99 100 result_data->result = merge_result(result_data->result, vote); 101 device_unlock(&dev->dev); 102 return 0; 103 } 104 105 static int report_mmio_enabled(struct pci_dev *dev, void *data) 106 { 107 pci_ers_result_t vote; 108 const struct pci_error_handlers *err_handler; 109 struct aer_broadcast_data *result_data; 110 111 result_data = (struct aer_broadcast_data *) data; 112 113 device_lock(&dev->dev); 114 if (!dev->driver || 115 !dev->driver->err_handler || 116 !dev->driver->err_handler->mmio_enabled) 117 goto out; 118 119 err_handler = dev->driver->err_handler; 120 vote = err_handler->mmio_enabled(dev); 121 result_data->result = merge_result(result_data->result, vote); 122 out: 123 device_unlock(&dev->dev); 124 return 0; 125 } 126 127 static int report_slot_reset(struct pci_dev *dev, void *data) 128 { 129 pci_ers_result_t vote; 130 const struct pci_error_handlers *err_handler; 131 struct aer_broadcast_data *result_data; 132 133 result_data = (struct aer_broadcast_data *) data; 134 135 device_lock(&dev->dev); 136 if (!dev->driver || 137 !dev->driver->err_handler || 138 !dev->driver->err_handler->slot_reset) 139 goto out; 140 141 err_handler = dev->driver->err_handler; 142 vote = err_handler->slot_reset(dev); 143 result_data->result = merge_result(result_data->result, vote); 144 out: 145 device_unlock(&dev->dev); 146 return 0; 147 } 148 149 static int report_resume(struct pci_dev *dev, void *data) 150 { 151 const struct pci_error_handlers *err_handler; 152 153 device_lock(&dev->dev); 154 dev->error_state = pci_channel_io_normal; 155 156 if (!dev->driver || 157 !dev->driver->err_handler || 158 !dev->driver->err_handler->resume) 159 goto out; 160 161 err_handler = dev->driver->err_handler; 162 err_handler->resume(dev); 163 pci_uevent_ers(dev, PCI_ERS_RESULT_RECOVERED); 164 out: 165 device_unlock(&dev->dev); 166 return 0; 167 } 168 169 /** 170 * default_reset_link - default reset function 171 * @dev: pointer to pci_dev data structure 172 * 173 * Invoked when performing link reset on a Downstream Port or a 174 * Root Port with no aer driver. 175 */ 176 static pci_ers_result_t default_reset_link(struct pci_dev *dev) 177 { 178 pci_reset_bridge_secondary_bus(dev); 179 pci_printk(KERN_DEBUG, dev, "downstream link has been reset\n"); 180 return PCI_ERS_RESULT_RECOVERED; 181 } 182 183 static pci_ers_result_t reset_link(struct pci_dev *dev, u32 service) 184 { 185 struct pci_dev *udev; 186 pci_ers_result_t status; 187 struct pcie_port_service_driver *driver = NULL; 188 189 if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) { 190 /* Reset this port for all subordinates */ 191 udev = dev; 192 } else { 193 /* Reset the upstream component (likely downstream port) */ 194 udev = dev->bus->self; 195 } 196 197 /* Use the aer driver of the component firstly */ 198 driver = pcie_port_find_service(udev, service); 199 200 if (driver && driver->reset_link) { 201 status = driver->reset_link(udev); 202 } else if (udev->has_secondary_link) { 203 status = default_reset_link(udev); 204 } else { 205 pci_printk(KERN_DEBUG, dev, "no link-reset support at upstream device %s\n", 206 pci_name(udev)); 207 return PCI_ERS_RESULT_DISCONNECT; 208 } 209 210 if (status != PCI_ERS_RESULT_RECOVERED) { 211 pci_printk(KERN_DEBUG, dev, "link reset at upstream device %s failed\n", 212 pci_name(udev)); 213 return PCI_ERS_RESULT_DISCONNECT; 214 } 215 216 return status; 217 } 218 219 /** 220 * broadcast_error_message - handle message broadcast to downstream drivers 221 * @dev: pointer to from where in a hierarchy message is broadcasted down 222 * @state: error state 223 * @error_mesg: message to print 224 * @cb: callback to be broadcasted 225 * 226 * Invoked during error recovery process. Once being invoked, the content 227 * of error severity will be broadcasted to all downstream drivers in a 228 * hierarchy in question. 229 */ 230 static pci_ers_result_t broadcast_error_message(struct pci_dev *dev, 231 enum pci_channel_state state, 232 char *error_mesg, 233 int (*cb)(struct pci_dev *, void *)) 234 { 235 struct aer_broadcast_data result_data; 236 237 pci_printk(KERN_DEBUG, dev, "broadcast %s message\n", error_mesg); 238 result_data.state = state; 239 if (cb == report_error_detected) 240 result_data.result = PCI_ERS_RESULT_CAN_RECOVER; 241 else 242 result_data.result = PCI_ERS_RESULT_RECOVERED; 243 244 if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) { 245 /* 246 * If the error is reported by a bridge, we think this error 247 * is related to the downstream link of the bridge, so we 248 * do error recovery on all subordinates of the bridge instead 249 * of the bridge and clear the error status of the bridge. 250 */ 251 if (cb == report_error_detected) 252 dev->error_state = state; 253 pci_walk_bus(dev->subordinate, cb, &result_data); 254 if (cb == report_resume) { 255 pci_cleanup_aer_uncorrect_error_status(dev); 256 dev->error_state = pci_channel_io_normal; 257 } 258 } else { 259 /* 260 * If the error is reported by an end point, we think this 261 * error is related to the upstream link of the end point. 262 */ 263 if (state == pci_channel_io_normal) 264 /* 265 * the error is non fatal so the bus is ok, just invoke 266 * the callback for the function that logged the error. 267 */ 268 cb(dev, &result_data); 269 else 270 pci_walk_bus(dev->bus, cb, &result_data); 271 } 272 273 return result_data.result; 274 } 275 276 /** 277 * pcie_do_fatal_recovery - handle fatal error recovery process 278 * @dev: pointer to a pci_dev data structure of agent detecting an error 279 * 280 * Invoked when an error is fatal. Once being invoked, removes the devices 281 * beneath this AER agent, followed by reset link e.g. secondary bus reset 282 * followed by re-enumeration of devices. 283 */ 284 void pcie_do_fatal_recovery(struct pci_dev *dev, u32 service) 285 { 286 struct pci_dev *udev; 287 struct pci_bus *parent; 288 struct pci_dev *pdev, *temp; 289 pci_ers_result_t result; 290 291 if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) 292 udev = dev; 293 else 294 udev = dev->bus->self; 295 296 parent = udev->subordinate; 297 pci_lock_rescan_remove(); 298 pci_dev_get(dev); 299 list_for_each_entry_safe_reverse(pdev, temp, &parent->devices, 300 bus_list) { 301 pci_dev_get(pdev); 302 pci_dev_set_disconnected(pdev, NULL); 303 if (pci_has_subordinate(pdev)) 304 pci_walk_bus(pdev->subordinate, 305 pci_dev_set_disconnected, NULL); 306 pci_stop_and_remove_bus_device(pdev); 307 pci_dev_put(pdev); 308 } 309 310 result = reset_link(udev, service); 311 312 if ((service == PCIE_PORT_SERVICE_AER) && 313 (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) { 314 /* 315 * If the error is reported by a bridge, we think this error 316 * is related to the downstream link of the bridge, so we 317 * do error recovery on all subordinates of the bridge instead 318 * of the bridge and clear the error status of the bridge. 319 */ 320 pci_cleanup_aer_uncorrect_error_status(dev); 321 } 322 323 if (result == PCI_ERS_RESULT_RECOVERED) { 324 if (pcie_wait_for_link(udev, true)) 325 pci_rescan_bus(udev->bus); 326 pci_info(dev, "Device recovery from fatal error successful\n"); 327 } else { 328 pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT); 329 pci_info(dev, "Device recovery from fatal error failed\n"); 330 } 331 332 pci_dev_put(dev); 333 pci_unlock_rescan_remove(); 334 } 335 336 /** 337 * pcie_do_nonfatal_recovery - handle nonfatal error recovery process 338 * @dev: pointer to a pci_dev data structure of agent detecting an error 339 * 340 * Invoked when an error is nonfatal/fatal. Once being invoked, broadcast 341 * error detected message to all downstream drivers within a hierarchy in 342 * question and return the returned code. 343 */ 344 void pcie_do_nonfatal_recovery(struct pci_dev *dev) 345 { 346 pci_ers_result_t status; 347 enum pci_channel_state state; 348 349 state = pci_channel_io_normal; 350 351 status = broadcast_error_message(dev, 352 state, 353 "error_detected", 354 report_error_detected); 355 356 if (status == PCI_ERS_RESULT_CAN_RECOVER) 357 status = broadcast_error_message(dev, 358 state, 359 "mmio_enabled", 360 report_mmio_enabled); 361 362 if (status == PCI_ERS_RESULT_NEED_RESET) { 363 /* 364 * TODO: Should call platform-specific 365 * functions to reset slot before calling 366 * drivers' slot_reset callbacks? 367 */ 368 status = broadcast_error_message(dev, 369 state, 370 "slot_reset", 371 report_slot_reset); 372 } 373 374 if (status != PCI_ERS_RESULT_RECOVERED) 375 goto failed; 376 377 broadcast_error_message(dev, 378 state, 379 "resume", 380 report_resume); 381 382 pci_info(dev, "AER: Device recovery successful\n"); 383 return; 384 385 failed: 386 pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT); 387 388 /* TODO: Should kernel panic here? */ 389 pci_info(dev, "AER: Device recovery failed\n"); 390 } 391