xref: /openbmc/linux/arch/powerpc/platforms/powernv/opal-hmi.c (revision efdbd7345f8836f7495f3ac6ee237d86cb3bb6b0)
1 /*
2  * OPAL hypervisor Maintenance interrupt handling support in PowerNV.
3  *
4  * This program is free software; you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation; either version 2 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; If not, see <http://www.gnu.org/licenses/>.
16  *
17  * Copyright 2014 IBM Corporation
18  * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
19  */
20 
21 #undef DEBUG
22 
23 #include <linux/kernel.h>
24 #include <linux/init.h>
25 #include <linux/of.h>
26 #include <linux/mm.h>
27 #include <linux/slab.h>
28 
29 #include <asm/opal.h>
30 #include <asm/cputable.h>
31 #include <asm/machdep.h>
32 
33 static int opal_hmi_handler_nb_init;
34 struct OpalHmiEvtNode {
35 	struct list_head list;
36 	struct OpalHMIEvent hmi_evt;
37 };
38 
39 struct xstop_reason {
40 	uint32_t xstop_reason;
41 	const char *unit_failed;
42 	const char *description;
43 };
44 
45 static LIST_HEAD(opal_hmi_evt_list);
46 static DEFINE_SPINLOCK(opal_hmi_evt_lock);
47 
48 static void print_core_checkstop_reason(const char *level,
49 					struct OpalHMIEvent *hmi_evt)
50 {
51 	int i;
52 	static const struct xstop_reason xstop_reason[] = {
53 		{ CORE_CHECKSTOP_IFU_REGFILE, "IFU",
54 				"RegFile core check stop" },
55 		{ CORE_CHECKSTOP_IFU_LOGIC, "IFU", "Logic core check stop" },
56 		{ CORE_CHECKSTOP_PC_DURING_RECOV, "PC",
57 				"Core checkstop during recovery" },
58 		{ CORE_CHECKSTOP_ISU_REGFILE, "ISU",
59 				"RegFile core check stop (mapper error)" },
60 		{ CORE_CHECKSTOP_ISU_LOGIC, "ISU", "Logic core check stop" },
61 		{ CORE_CHECKSTOP_FXU_LOGIC, "FXU", "Logic core check stop" },
62 		{ CORE_CHECKSTOP_VSU_LOGIC, "VSU", "Logic core check stop" },
63 		{ CORE_CHECKSTOP_PC_RECOV_IN_MAINT_MODE, "PC",
64 				"Recovery in maintenance mode" },
65 		{ CORE_CHECKSTOP_LSU_REGFILE, "LSU",
66 				"RegFile core check stop" },
67 		{ CORE_CHECKSTOP_PC_FWD_PROGRESS, "PC",
68 				"Forward Progress Error" },
69 		{ CORE_CHECKSTOP_LSU_LOGIC, "LSU", "Logic core check stop" },
70 		{ CORE_CHECKSTOP_PC_LOGIC, "PC", "Logic core check stop" },
71 		{ CORE_CHECKSTOP_PC_HYP_RESOURCE, "PC",
72 				"Hypervisor Resource error - core check stop" },
73 		{ CORE_CHECKSTOP_PC_HANG_RECOV_FAILED, "PC",
74 				"Hang Recovery Failed (core check stop)" },
75 		{ CORE_CHECKSTOP_PC_AMBI_HANG_DETECTED, "PC",
76 				"Ambiguous Hang Detected (unknown source)" },
77 		{ CORE_CHECKSTOP_PC_DEBUG_TRIG_ERR_INJ, "PC",
78 				"Debug Trigger Error inject" },
79 		{ CORE_CHECKSTOP_PC_SPRD_HYP_ERR_INJ, "PC",
80 				"Hypervisor check stop via SPRC/SPRD" },
81 	};
82 
83 	/* Validity check */
84 	if (!hmi_evt->u.xstop_error.xstop_reason) {
85 		printk("%s	Unknown Core check stop.\n", level);
86 		return;
87 	}
88 
89 	printk("%s	CPU PIR: %08x\n", level,
90 			be32_to_cpu(hmi_evt->u.xstop_error.u.pir));
91 	for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)
92 		if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &
93 					xstop_reason[i].xstop_reason)
94 			printk("%s	[Unit: %-3s] %s\n", level,
95 					xstop_reason[i].unit_failed,
96 					xstop_reason[i].description);
97 }
98 
99 static void print_nx_checkstop_reason(const char *level,
100 					struct OpalHMIEvent *hmi_evt)
101 {
102 	int i;
103 	static const struct xstop_reason xstop_reason[] = {
104 		{ NX_CHECKSTOP_SHM_INVAL_STATE_ERR, "DMA & Engine",
105 					"SHM invalid state error" },
106 		{ NX_CHECKSTOP_DMA_INVAL_STATE_ERR_1, "DMA & Engine",
107 					"DMA invalid state error bit 15" },
108 		{ NX_CHECKSTOP_DMA_INVAL_STATE_ERR_2, "DMA & Engine",
109 					"DMA invalid state error bit 16" },
110 		{ NX_CHECKSTOP_DMA_CH0_INVAL_STATE_ERR, "DMA & Engine",
111 					"Channel 0 invalid state error" },
112 		{ NX_CHECKSTOP_DMA_CH1_INVAL_STATE_ERR, "DMA & Engine",
113 					"Channel 1 invalid state error" },
114 		{ NX_CHECKSTOP_DMA_CH2_INVAL_STATE_ERR, "DMA & Engine",
115 					"Channel 2 invalid state error" },
116 		{ NX_CHECKSTOP_DMA_CH3_INVAL_STATE_ERR, "DMA & Engine",
117 					"Channel 3 invalid state error" },
118 		{ NX_CHECKSTOP_DMA_CH4_INVAL_STATE_ERR, "DMA & Engine",
119 					"Channel 4 invalid state error" },
120 		{ NX_CHECKSTOP_DMA_CH5_INVAL_STATE_ERR, "DMA & Engine",
121 					"Channel 5 invalid state error" },
122 		{ NX_CHECKSTOP_DMA_CH6_INVAL_STATE_ERR, "DMA & Engine",
123 					"Channel 6 invalid state error" },
124 		{ NX_CHECKSTOP_DMA_CH7_INVAL_STATE_ERR, "DMA & Engine",
125 					"Channel 7 invalid state error" },
126 		{ NX_CHECKSTOP_DMA_CRB_UE, "DMA & Engine",
127 					"UE error on CRB(CSB address, CCB)" },
128 		{ NX_CHECKSTOP_DMA_CRB_SUE, "DMA & Engine",
129 					"SUE error on CRB(CSB address, CCB)" },
130 		{ NX_CHECKSTOP_PBI_ISN_UE, "PowerBus Interface",
131 		"CRB Kill ISN received while holding ISN with UE error" },
132 	};
133 
134 	/* Validity check */
135 	if (!hmi_evt->u.xstop_error.xstop_reason) {
136 		printk("%s	Unknown NX check stop.\n", level);
137 		return;
138 	}
139 
140 	printk("%s	NX checkstop on CHIP ID: %x\n", level,
141 			be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id));
142 	for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)
143 		if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &
144 					xstop_reason[i].xstop_reason)
145 			printk("%s	[Unit: %-3s] %s\n", level,
146 					xstop_reason[i].unit_failed,
147 					xstop_reason[i].description);
148 }
149 
150 static void print_checkstop_reason(const char *level,
151 					struct OpalHMIEvent *hmi_evt)
152 {
153 	switch (hmi_evt->u.xstop_error.xstop_type) {
154 	case CHECKSTOP_TYPE_CORE:
155 		print_core_checkstop_reason(level, hmi_evt);
156 		break;
157 	case CHECKSTOP_TYPE_NX:
158 		print_nx_checkstop_reason(level, hmi_evt);
159 		break;
160 	case CHECKSTOP_TYPE_UNKNOWN:
161 		printk("%s	Unknown Malfunction Alert.\n", level);
162 		break;
163 	}
164 }
165 
166 static void print_hmi_event_info(struct OpalHMIEvent *hmi_evt)
167 {
168 	const char *level, *sevstr, *error_info;
169 	static const char *hmi_error_types[] = {
170 		"Malfunction Alert",
171 		"Processor Recovery done",
172 		"Processor recovery occurred again",
173 		"Processor recovery occurred for masked error",
174 		"Timer facility experienced an error",
175 		"TFMR SPR is corrupted",
176 		"UPS (Uniterrupted Power System) Overflow indication",
177 		"An XSCOM operation failure",
178 		"An XSCOM operation completed",
179 		"SCOM has set a reserved FIR bit to cause recovery",
180 		"Debug trigger has set a reserved FIR bit to cause recovery",
181 		"A hypervisor resource error occurred"
182 	};
183 
184 	/* Print things out */
185 	if (hmi_evt->version < OpalHMIEvt_V1) {
186 		pr_err("HMI Interrupt, Unknown event version %d !\n",
187 			hmi_evt->version);
188 		return;
189 	}
190 	switch (hmi_evt->severity) {
191 	case OpalHMI_SEV_NO_ERROR:
192 		level = KERN_INFO;
193 		sevstr = "Harmless";
194 		break;
195 	case OpalHMI_SEV_WARNING:
196 		level = KERN_WARNING;
197 		sevstr = "";
198 		break;
199 	case OpalHMI_SEV_ERROR_SYNC:
200 		level = KERN_ERR;
201 		sevstr = "Severe";
202 		break;
203 	case OpalHMI_SEV_FATAL:
204 	default:
205 		level = KERN_ERR;
206 		sevstr = "Fatal";
207 		break;
208 	}
209 
210 	printk("%s%s Hypervisor Maintenance interrupt [%s]\n",
211 		level, sevstr,
212 		hmi_evt->disposition == OpalHMI_DISPOSITION_RECOVERED ?
213 		"Recovered" : "Not recovered");
214 	error_info = hmi_evt->type < ARRAY_SIZE(hmi_error_types) ?
215 			hmi_error_types[hmi_evt->type]
216 			: "Unknown";
217 	printk("%s Error detail: %s\n", level, error_info);
218 	printk("%s	HMER: %016llx\n", level, be64_to_cpu(hmi_evt->hmer));
219 	if ((hmi_evt->type == OpalHMI_ERROR_TFAC) ||
220 		(hmi_evt->type == OpalHMI_ERROR_TFMR_PARITY))
221 		printk("%s	TFMR: %016llx\n", level,
222 						be64_to_cpu(hmi_evt->tfmr));
223 
224 	if (hmi_evt->version < OpalHMIEvt_V2)
225 		return;
226 
227 	/* OpalHMIEvt_V2 and above provides reason for malfunction alert. */
228 	if (hmi_evt->type == OpalHMI_ERROR_MALFUNC_ALERT)
229 		print_checkstop_reason(level, hmi_evt);
230 }
231 
232 static void hmi_event_handler(struct work_struct *work)
233 {
234 	unsigned long flags;
235 	struct OpalHMIEvent *hmi_evt;
236 	struct OpalHmiEvtNode *msg_node;
237 	uint8_t disposition;
238 	struct opal_msg msg;
239 	int unrecoverable = 0;
240 
241 	spin_lock_irqsave(&opal_hmi_evt_lock, flags);
242 	while (!list_empty(&opal_hmi_evt_list)) {
243 		msg_node = list_entry(opal_hmi_evt_list.next,
244 					   struct OpalHmiEvtNode, list);
245 		list_del(&msg_node->list);
246 		spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
247 
248 		hmi_evt = (struct OpalHMIEvent *) &msg_node->hmi_evt;
249 		print_hmi_event_info(hmi_evt);
250 		disposition = hmi_evt->disposition;
251 		kfree(msg_node);
252 
253 		/*
254 		 * Check if HMI event has been recovered or not. If not
255 		 * then kernel can't continue, we need to panic.
256 		 * But before we do that, display all the HMI event
257 		 * available on the list and set unrecoverable flag to 1.
258 		 */
259 		if (disposition != OpalHMI_DISPOSITION_RECOVERED)
260 			unrecoverable = 1;
261 
262 		spin_lock_irqsave(&opal_hmi_evt_lock, flags);
263 	}
264 	spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
265 
266 	if (unrecoverable) {
267 		int ret;
268 
269 		/* Pull all HMI events from OPAL before we panic. */
270 		while (opal_get_msg(__pa(&msg), sizeof(msg)) == OPAL_SUCCESS) {
271 			u32 type;
272 
273 			type = be32_to_cpu(msg.msg_type);
274 
275 			/* skip if not HMI event */
276 			if (type != OPAL_MSG_HMI_EVT)
277 				continue;
278 
279 			/* HMI event info starts from param[0] */
280 			hmi_evt = (struct OpalHMIEvent *)&msg.params[0];
281 			print_hmi_event_info(hmi_evt);
282 		}
283 
284 		/*
285 		 * Unrecoverable HMI exception. We need to inform BMC/OCC
286 		 * about this error so that it can collect relevant data
287 		 * for error analysis before rebooting.
288 		 */
289 		ret = opal_cec_reboot2(OPAL_REBOOT_PLATFORM_ERROR,
290 			"Unrecoverable HMI exception");
291 		if (ret == OPAL_UNSUPPORTED) {
292 			pr_emerg("Reboot type %d not supported\n",
293 						OPAL_REBOOT_PLATFORM_ERROR);
294 		}
295 
296 		/*
297 		 * Fall through and panic if opal_cec_reboot2() returns
298 		 * OPAL_UNSUPPORTED.
299 		 */
300 		panic("Unrecoverable HMI exception");
301 	}
302 }
303 
304 static DECLARE_WORK(hmi_event_work, hmi_event_handler);
305 /*
306  * opal_handle_hmi_event - notifier handler that queues up HMI events
307  * to be preocessed later.
308  */
309 static int opal_handle_hmi_event(struct notifier_block *nb,
310 			  unsigned long msg_type, void *msg)
311 {
312 	unsigned long flags;
313 	struct OpalHMIEvent *hmi_evt;
314 	struct opal_msg *hmi_msg = msg;
315 	struct OpalHmiEvtNode *msg_node;
316 
317 	/* Sanity Checks */
318 	if (msg_type != OPAL_MSG_HMI_EVT)
319 		return 0;
320 
321 	/* HMI event info starts from param[0] */
322 	hmi_evt = (struct OpalHMIEvent *)&hmi_msg->params[0];
323 
324 	/* Delay the logging of HMI events to workqueue. */
325 	msg_node = kzalloc(sizeof(*msg_node), GFP_ATOMIC);
326 	if (!msg_node) {
327 		pr_err("HMI: out of memory, Opal message event not handled\n");
328 		return -ENOMEM;
329 	}
330 	memcpy(&msg_node->hmi_evt, hmi_evt, sizeof(struct OpalHMIEvent));
331 
332 	spin_lock_irqsave(&opal_hmi_evt_lock, flags);
333 	list_add(&msg_node->list, &opal_hmi_evt_list);
334 	spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
335 
336 	schedule_work(&hmi_event_work);
337 	return 0;
338 }
339 
340 static struct notifier_block opal_hmi_handler_nb = {
341 	.notifier_call	= opal_handle_hmi_event,
342 	.next		= NULL,
343 	.priority	= 0,
344 };
345 
346 int __init opal_hmi_handler_init(void)
347 {
348 	int ret;
349 
350 	if (!opal_hmi_handler_nb_init) {
351 		ret = opal_message_notifier_register(
352 				OPAL_MSG_HMI_EVT, &opal_hmi_handler_nb);
353 		if (ret) {
354 			pr_err("%s: Can't register OPAL event notifier (%d)\n",
355 			       __func__, ret);
356 			return ret;
357 		}
358 		opal_hmi_handler_nb_init = 1;
359 	}
360 	return 0;
361 }
362