xref: /openbmc/linux/drivers/pci/pcie/err.c (revision 5b394b2d)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * This file implements the error recovery as a core part of PCIe error
4  * reporting. When a PCIe error is delivered, an error message will be
5  * collected and printed to console, then, an error recovery procedure
6  * will be executed by following the PCI error recovery rules.
7  *
8  * Copyright (C) 2006 Intel Corp.
9  *	Tom Long Nguyen (tom.l.nguyen@intel.com)
10  *	Zhang Yanmin (yanmin.zhang@intel.com)
11  */
12 
13 #include <linux/pci.h>
14 #include <linux/module.h>
15 #include <linux/pci.h>
16 #include <linux/kernel.h>
17 #include <linux/errno.h>
18 #include <linux/aer.h>
19 #include "portdrv.h"
20 #include "../pci.h"
21 
22 struct aer_broadcast_data {
23 	enum pci_channel_state state;
24 	enum pci_ers_result result;
25 };
26 
27 static pci_ers_result_t merge_result(enum pci_ers_result orig,
28 				  enum pci_ers_result new)
29 {
30 	if (new == PCI_ERS_RESULT_NO_AER_DRIVER)
31 		return PCI_ERS_RESULT_NO_AER_DRIVER;
32 
33 	if (new == PCI_ERS_RESULT_NONE)
34 		return orig;
35 
36 	switch (orig) {
37 	case PCI_ERS_RESULT_CAN_RECOVER:
38 	case PCI_ERS_RESULT_RECOVERED:
39 		orig = new;
40 		break;
41 	case PCI_ERS_RESULT_DISCONNECT:
42 		if (new == PCI_ERS_RESULT_NEED_RESET)
43 			orig = PCI_ERS_RESULT_NEED_RESET;
44 		break;
45 	default:
46 		break;
47 	}
48 
49 	return orig;
50 }
51 
52 static int report_error_detected(struct pci_dev *dev, void *data)
53 {
54 	pci_ers_result_t vote;
55 	const struct pci_error_handlers *err_handler;
56 	struct aer_broadcast_data *result_data;
57 
58 	result_data = (struct aer_broadcast_data *) data;
59 
60 	device_lock(&dev->dev);
61 	dev->error_state = result_data->state;
62 
63 	if (!dev->driver ||
64 		!dev->driver->err_handler ||
65 		!dev->driver->err_handler->error_detected) {
66 		if (result_data->state == pci_channel_io_frozen &&
67 			dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
68 			/*
69 			 * In case of fatal recovery, if one of down-
70 			 * stream device has no driver. We might be
71 			 * unable to recover because a later insmod
72 			 * of a driver for this device is unaware of
73 			 * its hw state.
74 			 */
75 			pci_printk(KERN_DEBUG, dev, "device has %s\n",
76 				   dev->driver ?
77 				   "no AER-aware driver" : "no driver");
78 		}
79 
80 		/*
81 		 * If there's any device in the subtree that does not
82 		 * have an error_detected callback, returning
83 		 * PCI_ERS_RESULT_NO_AER_DRIVER prevents calling of
84 		 * the subsequent mmio_enabled/slot_reset/resume
85 		 * callbacks of "any" device in the subtree. All the
86 		 * devices in the subtree are left in the error state
87 		 * without recovery.
88 		 */
89 
90 		if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE)
91 			vote = PCI_ERS_RESULT_NO_AER_DRIVER;
92 		else
93 			vote = PCI_ERS_RESULT_NONE;
94 	} else {
95 		err_handler = dev->driver->err_handler;
96 		vote = err_handler->error_detected(dev, result_data->state);
97 		pci_uevent_ers(dev, PCI_ERS_RESULT_NONE);
98 	}
99 
100 	result_data->result = merge_result(result_data->result, vote);
101 	device_unlock(&dev->dev);
102 	return 0;
103 }
104 
105 static int report_mmio_enabled(struct pci_dev *dev, void *data)
106 {
107 	pci_ers_result_t vote;
108 	const struct pci_error_handlers *err_handler;
109 	struct aer_broadcast_data *result_data;
110 
111 	result_data = (struct aer_broadcast_data *) data;
112 
113 	device_lock(&dev->dev);
114 	if (!dev->driver ||
115 		!dev->driver->err_handler ||
116 		!dev->driver->err_handler->mmio_enabled)
117 		goto out;
118 
119 	err_handler = dev->driver->err_handler;
120 	vote = err_handler->mmio_enabled(dev);
121 	result_data->result = merge_result(result_data->result, vote);
122 out:
123 	device_unlock(&dev->dev);
124 	return 0;
125 }
126 
127 static int report_slot_reset(struct pci_dev *dev, void *data)
128 {
129 	pci_ers_result_t vote;
130 	const struct pci_error_handlers *err_handler;
131 	struct aer_broadcast_data *result_data;
132 
133 	result_data = (struct aer_broadcast_data *) data;
134 
135 	device_lock(&dev->dev);
136 	if (!dev->driver ||
137 		!dev->driver->err_handler ||
138 		!dev->driver->err_handler->slot_reset)
139 		goto out;
140 
141 	err_handler = dev->driver->err_handler;
142 	vote = err_handler->slot_reset(dev);
143 	result_data->result = merge_result(result_data->result, vote);
144 out:
145 	device_unlock(&dev->dev);
146 	return 0;
147 }
148 
149 static int report_resume(struct pci_dev *dev, void *data)
150 {
151 	const struct pci_error_handlers *err_handler;
152 
153 	device_lock(&dev->dev);
154 	dev->error_state = pci_channel_io_normal;
155 
156 	if (!dev->driver ||
157 		!dev->driver->err_handler ||
158 		!dev->driver->err_handler->resume)
159 		goto out;
160 
161 	err_handler = dev->driver->err_handler;
162 	err_handler->resume(dev);
163 	pci_uevent_ers(dev, PCI_ERS_RESULT_RECOVERED);
164 out:
165 	device_unlock(&dev->dev);
166 	return 0;
167 }
168 
169 /**
170  * default_reset_link - default reset function
171  * @dev: pointer to pci_dev data structure
172  *
173  * Invoked when performing link reset on a Downstream Port or a
174  * Root Port with no aer driver.
175  */
176 static pci_ers_result_t default_reset_link(struct pci_dev *dev)
177 {
178 	int rc;
179 
180 	rc = pci_bridge_secondary_bus_reset(dev);
181 	pci_printk(KERN_DEBUG, dev, "downstream link has been reset\n");
182 	return rc ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
183 }
184 
185 static pci_ers_result_t reset_link(struct pci_dev *dev, u32 service)
186 {
187 	struct pci_dev *udev;
188 	pci_ers_result_t status;
189 	struct pcie_port_service_driver *driver = NULL;
190 
191 	if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
192 		/* Reset this port for all subordinates */
193 		udev = dev;
194 	} else {
195 		/* Reset the upstream component (likely downstream port) */
196 		udev = dev->bus->self;
197 	}
198 
199 	/* Use the aer driver of the component firstly */
200 	driver = pcie_port_find_service(udev, service);
201 
202 	if (driver && driver->reset_link) {
203 		status = driver->reset_link(udev);
204 	} else if (udev->has_secondary_link) {
205 		status = default_reset_link(udev);
206 	} else {
207 		pci_printk(KERN_DEBUG, dev, "no link-reset support at upstream device %s\n",
208 			pci_name(udev));
209 		return PCI_ERS_RESULT_DISCONNECT;
210 	}
211 
212 	if (status != PCI_ERS_RESULT_RECOVERED) {
213 		pci_printk(KERN_DEBUG, dev, "link reset at upstream device %s failed\n",
214 			pci_name(udev));
215 		return PCI_ERS_RESULT_DISCONNECT;
216 	}
217 
218 	return status;
219 }
220 
221 /**
222  * broadcast_error_message - handle message broadcast to downstream drivers
223  * @dev: pointer to from where in a hierarchy message is broadcasted down
224  * @state: error state
225  * @error_mesg: message to print
226  * @cb: callback to be broadcasted
227  *
228  * Invoked during error recovery process. Once being invoked, the content
229  * of error severity will be broadcasted to all downstream drivers in a
230  * hierarchy in question.
231  */
232 static pci_ers_result_t broadcast_error_message(struct pci_dev *dev,
233 	enum pci_channel_state state,
234 	char *error_mesg,
235 	int (*cb)(struct pci_dev *, void *))
236 {
237 	struct aer_broadcast_data result_data;
238 
239 	pci_printk(KERN_DEBUG, dev, "broadcast %s message\n", error_mesg);
240 	result_data.state = state;
241 	if (cb == report_error_detected)
242 		result_data.result = PCI_ERS_RESULT_CAN_RECOVER;
243 	else
244 		result_data.result = PCI_ERS_RESULT_RECOVERED;
245 
246 	if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
247 		/*
248 		 * If the error is reported by a bridge, we think this error
249 		 * is related to the downstream link of the bridge, so we
250 		 * do error recovery on all subordinates of the bridge instead
251 		 * of the bridge and clear the error status of the bridge.
252 		 */
253 		if (cb == report_error_detected)
254 			dev->error_state = state;
255 		pci_walk_bus(dev->subordinate, cb, &result_data);
256 		if (cb == report_resume) {
257 			pci_aer_clear_device_status(dev);
258 			pci_cleanup_aer_uncorrect_error_status(dev);
259 			dev->error_state = pci_channel_io_normal;
260 		}
261 	} else {
262 		/*
263 		 * If the error is reported by an end point, we think this
264 		 * error is related to the upstream link of the end point.
265 		 * The error is non fatal so the bus is ok; just invoke
266 		 * the callback for the function that logged the error.
267 		 */
268 		cb(dev, &result_data);
269 	}
270 
271 	return result_data.result;
272 }
273 
274 /**
275  * pcie_do_fatal_recovery - handle fatal error recovery process
276  * @dev: pointer to a pci_dev data structure of agent detecting an error
277  *
278  * Invoked when an error is fatal. Once being invoked, removes the devices
279  * beneath this AER agent, followed by reset link e.g. secondary bus reset
280  * followed by re-enumeration of devices.
281  */
282 void pcie_do_fatal_recovery(struct pci_dev *dev, u32 service)
283 {
284 	struct pci_dev *udev;
285 	struct pci_bus *parent;
286 	struct pci_dev *pdev, *temp;
287 	pci_ers_result_t result;
288 
289 	if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)
290 		udev = dev;
291 	else
292 		udev = dev->bus->self;
293 
294 	parent = udev->subordinate;
295 	pci_lock_rescan_remove();
296 	pci_dev_get(dev);
297 	list_for_each_entry_safe_reverse(pdev, temp, &parent->devices,
298 					 bus_list) {
299 		pci_dev_get(pdev);
300 		pci_dev_set_disconnected(pdev, NULL);
301 		if (pci_has_subordinate(pdev))
302 			pci_walk_bus(pdev->subordinate,
303 				     pci_dev_set_disconnected, NULL);
304 		pci_stop_and_remove_bus_device(pdev);
305 		pci_dev_put(pdev);
306 	}
307 
308 	result = reset_link(udev, service);
309 
310 	if ((service == PCIE_PORT_SERVICE_AER) &&
311 	    (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) {
312 		/*
313 		 * If the error is reported by a bridge, we think this error
314 		 * is related to the downstream link of the bridge, so we
315 		 * do error recovery on all subordinates of the bridge instead
316 		 * of the bridge and clear the error status of the bridge.
317 		 */
318 		pci_aer_clear_fatal_status(dev);
319 		pci_aer_clear_device_status(dev);
320 	}
321 
322 	if (result == PCI_ERS_RESULT_RECOVERED) {
323 		if (pcie_wait_for_link(udev, true))
324 			pci_rescan_bus(udev->bus);
325 		pci_info(dev, "Device recovery from fatal error successful\n");
326 	} else {
327 		pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
328 		pci_info(dev, "Device recovery from fatal error failed\n");
329 	}
330 
331 	pci_dev_put(dev);
332 	pci_unlock_rescan_remove();
333 }
334 
335 /**
336  * pcie_do_nonfatal_recovery - handle nonfatal error recovery process
337  * @dev: pointer to a pci_dev data structure of agent detecting an error
338  *
339  * Invoked when an error is nonfatal/fatal. Once being invoked, broadcast
340  * error detected message to all downstream drivers within a hierarchy in
341  * question and return the returned code.
342  */
343 void pcie_do_nonfatal_recovery(struct pci_dev *dev)
344 {
345 	pci_ers_result_t status;
346 	enum pci_channel_state state;
347 
348 	state = pci_channel_io_normal;
349 
350 	status = broadcast_error_message(dev,
351 			state,
352 			"error_detected",
353 			report_error_detected);
354 
355 	if (status == PCI_ERS_RESULT_CAN_RECOVER)
356 		status = broadcast_error_message(dev,
357 				state,
358 				"mmio_enabled",
359 				report_mmio_enabled);
360 
361 	if (status == PCI_ERS_RESULT_NEED_RESET) {
362 		/*
363 		 * TODO: Should call platform-specific
364 		 * functions to reset slot before calling
365 		 * drivers' slot_reset callbacks?
366 		 */
367 		status = broadcast_error_message(dev,
368 				state,
369 				"slot_reset",
370 				report_slot_reset);
371 	}
372 
373 	if (status != PCI_ERS_RESULT_RECOVERED)
374 		goto failed;
375 
376 	broadcast_error_message(dev,
377 				state,
378 				"resume",
379 				report_resume);
380 
381 	pci_info(dev, "AER: Device recovery successful\n");
382 	return;
383 
384 failed:
385 	pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
386 
387 	/* TODO: Should kernel panic here? */
388 	pci_info(dev, "AER: Device recovery failed\n");
389 }
390