1 /*
 * OPAL hypervisor Maintenance interrupt handling support in PowerNV.
3  *
4  * This program is free software; you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation; either version 2 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; If not, see <http://www.gnu.org/licenses/>.
16  *
17  * Copyright 2014 IBM Corporation
18  * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
19  */
20 
21 #undef DEBUG
22 
23 #include <linux/kernel.h>
24 #include <linux/init.h>
25 #include <linux/of.h>
26 #include <linux/mm.h>
27 #include <linux/slab.h>
28 
29 #include <asm/opal.h>
30 #include <asm/cputable.h>
31 #include <asm/machdep.h>
32 
/* Non-zero once opal_hmi_handler_init() has registered the HMI notifier. */
static int opal_hmi_handler_nb_init;
/* List node holding a private copy of one HMI event awaiting logging. */
struct OpalHmiEvtNode {
	struct list_head list;		/* link in opal_hmi_evt_list */
	struct OpalHMIEvent hmi_evt;	/* copy of the event from the OPAL message */
};
38 
/* One entry of a checkstop decode table. */
struct xstop_reason {
	uint32_t xstop_reason;		/* bit mask tested against the event's reason word */
	const char *unit_failed;	/* name printed in the "[Unit: ...]" field */
	const char *description;	/* text printed after the unit name */
};
44 
/*
 * Pending HMI events: filled by opal_handle_hmi_event() and drained by
 * hmi_event_handler().
 */
static LIST_HEAD(opal_hmi_evt_list);
/* Taken around every access to opal_hmi_evt_list in this file. */
static DEFINE_SPINLOCK(opal_hmi_evt_lock);
47 
48 static void print_core_checkstop_reason(const char *level,
49 					struct OpalHMIEvent *hmi_evt)
50 {
51 	int i;
52 	static const struct xstop_reason xstop_reason[] = {
53 		{ CORE_CHECKSTOP_IFU_REGFILE, "IFU",
54 				"RegFile core check stop" },
55 		{ CORE_CHECKSTOP_IFU_LOGIC, "IFU", "Logic core check stop" },
56 		{ CORE_CHECKSTOP_PC_DURING_RECOV, "PC",
57 				"Core checkstop during recovery" },
58 		{ CORE_CHECKSTOP_ISU_REGFILE, "ISU",
59 				"RegFile core check stop (mapper error)" },
60 		{ CORE_CHECKSTOP_ISU_LOGIC, "ISU", "Logic core check stop" },
61 		{ CORE_CHECKSTOP_FXU_LOGIC, "FXU", "Logic core check stop" },
62 		{ CORE_CHECKSTOP_VSU_LOGIC, "VSU", "Logic core check stop" },
63 		{ CORE_CHECKSTOP_PC_RECOV_IN_MAINT_MODE, "PC",
64 				"Recovery in maintenance mode" },
65 		{ CORE_CHECKSTOP_LSU_REGFILE, "LSU",
66 				"RegFile core check stop" },
67 		{ CORE_CHECKSTOP_PC_FWD_PROGRESS, "PC",
68 				"Forward Progress Error" },
69 		{ CORE_CHECKSTOP_LSU_LOGIC, "LSU", "Logic core check stop" },
70 		{ CORE_CHECKSTOP_PC_LOGIC, "PC", "Logic core check stop" },
71 		{ CORE_CHECKSTOP_PC_HYP_RESOURCE, "PC",
72 				"Hypervisor Resource error - core check stop" },
73 		{ CORE_CHECKSTOP_PC_HANG_RECOV_FAILED, "PC",
74 				"Hang Recovery Failed (core check stop)" },
75 		{ CORE_CHECKSTOP_PC_AMBI_HANG_DETECTED, "PC",
76 				"Ambiguous Hang Detected (unknown source)" },
77 		{ CORE_CHECKSTOP_PC_DEBUG_TRIG_ERR_INJ, "PC",
78 				"Debug Trigger Error inject" },
79 		{ CORE_CHECKSTOP_PC_SPRD_HYP_ERR_INJ, "PC",
80 				"Hypervisor check stop via SPRC/SPRD" },
81 	};
82 
83 	/* Validity check */
84 	if (!hmi_evt->u.xstop_error.xstop_reason) {
85 		printk("%s	Unknown Core check stop.\n", level);
86 		return;
87 	}
88 
89 	printk("%s	CPU PIR: %08x\n", level,
90 			be32_to_cpu(hmi_evt->u.xstop_error.u.pir));
91 	for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)
92 		if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &
93 					xstop_reason[i].xstop_reason)
94 			printk("%s	[Unit: %-3s] %s\n", level,
95 					xstop_reason[i].unit_failed,
96 					xstop_reason[i].description);
97 }
98 
99 static void print_nx_checkstop_reason(const char *level,
100 					struct OpalHMIEvent *hmi_evt)
101 {
102 	int i;
103 	static const struct xstop_reason xstop_reason[] = {
104 		{ NX_CHECKSTOP_SHM_INVAL_STATE_ERR, "DMA & Engine",
105 					"SHM invalid state error" },
106 		{ NX_CHECKSTOP_DMA_INVAL_STATE_ERR_1, "DMA & Engine",
107 					"DMA invalid state error bit 15" },
108 		{ NX_CHECKSTOP_DMA_INVAL_STATE_ERR_2, "DMA & Engine",
109 					"DMA invalid state error bit 16" },
110 		{ NX_CHECKSTOP_DMA_CH0_INVAL_STATE_ERR, "DMA & Engine",
111 					"Channel 0 invalid state error" },
112 		{ NX_CHECKSTOP_DMA_CH1_INVAL_STATE_ERR, "DMA & Engine",
113 					"Channel 1 invalid state error" },
114 		{ NX_CHECKSTOP_DMA_CH2_INVAL_STATE_ERR, "DMA & Engine",
115 					"Channel 2 invalid state error" },
116 		{ NX_CHECKSTOP_DMA_CH3_INVAL_STATE_ERR, "DMA & Engine",
117 					"Channel 3 invalid state error" },
118 		{ NX_CHECKSTOP_DMA_CH4_INVAL_STATE_ERR, "DMA & Engine",
119 					"Channel 4 invalid state error" },
120 		{ NX_CHECKSTOP_DMA_CH5_INVAL_STATE_ERR, "DMA & Engine",
121 					"Channel 5 invalid state error" },
122 		{ NX_CHECKSTOP_DMA_CH6_INVAL_STATE_ERR, "DMA & Engine",
123 					"Channel 6 invalid state error" },
124 		{ NX_CHECKSTOP_DMA_CH7_INVAL_STATE_ERR, "DMA & Engine",
125 					"Channel 7 invalid state error" },
126 		{ NX_CHECKSTOP_DMA_CRB_UE, "DMA & Engine",
127 					"UE error on CRB(CSB address, CCB)" },
128 		{ NX_CHECKSTOP_DMA_CRB_SUE, "DMA & Engine",
129 					"SUE error on CRB(CSB address, CCB)" },
130 		{ NX_CHECKSTOP_PBI_ISN_UE, "PowerBus Interface",
131 		"CRB Kill ISN received while holding ISN with UE error" },
132 	};
133 
134 	/* Validity check */
135 	if (!hmi_evt->u.xstop_error.xstop_reason) {
136 		printk("%s	Unknown NX check stop.\n", level);
137 		return;
138 	}
139 
140 	printk("%s	NX checkstop on CHIP ID: %x\n", level,
141 			be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id));
142 	for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)
143 		if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &
144 					xstop_reason[i].xstop_reason)
145 			printk("%s	[Unit: %-3s] %s\n", level,
146 					xstop_reason[i].unit_failed,
147 					xstop_reason[i].description);
148 }
149 
150 static void print_checkstop_reason(const char *level,
151 					struct OpalHMIEvent *hmi_evt)
152 {
153 	uint8_t type = hmi_evt->u.xstop_error.xstop_type;
154 	switch (type) {
155 	case CHECKSTOP_TYPE_CORE:
156 		print_core_checkstop_reason(level, hmi_evt);
157 		break;
158 	case CHECKSTOP_TYPE_NX:
159 		print_nx_checkstop_reason(level, hmi_evt);
160 		break;
161 	default:
162 		printk("%s	Unknown Malfunction Alert of type %d\n",
163 		       level, type);
164 		break;
165 	}
166 }
167 
168 static void print_hmi_event_info(struct OpalHMIEvent *hmi_evt)
169 {
170 	const char *level, *sevstr, *error_info;
171 	static const char *hmi_error_types[] = {
172 		"Malfunction Alert",
173 		"Processor Recovery done",
174 		"Processor recovery occurred again",
175 		"Processor recovery occurred for masked error",
176 		"Timer facility experienced an error",
177 		"TFMR SPR is corrupted",
178 		"UPS (Uniterrupted Power System) Overflow indication",
179 		"An XSCOM operation failure",
180 		"An XSCOM operation completed",
181 		"SCOM has set a reserved FIR bit to cause recovery",
182 		"Debug trigger has set a reserved FIR bit to cause recovery",
183 		"A hypervisor resource error occurred",
184 		"CAPP recovery process is in progress",
185 	};
186 
187 	/* Print things out */
188 	if (hmi_evt->version < OpalHMIEvt_V1) {
189 		pr_err("HMI Interrupt, Unknown event version %d !\n",
190 			hmi_evt->version);
191 		return;
192 	}
193 	switch (hmi_evt->severity) {
194 	case OpalHMI_SEV_NO_ERROR:
195 		level = KERN_INFO;
196 		sevstr = "Harmless";
197 		break;
198 	case OpalHMI_SEV_WARNING:
199 		level = KERN_WARNING;
200 		sevstr = "";
201 		break;
202 	case OpalHMI_SEV_ERROR_SYNC:
203 		level = KERN_ERR;
204 		sevstr = "Severe";
205 		break;
206 	case OpalHMI_SEV_FATAL:
207 	default:
208 		level = KERN_ERR;
209 		sevstr = "Fatal";
210 		break;
211 	}
212 
213 	printk("%s%s Hypervisor Maintenance interrupt [%s]\n",
214 		level, sevstr,
215 		hmi_evt->disposition == OpalHMI_DISPOSITION_RECOVERED ?
216 		"Recovered" : "Not recovered");
217 	error_info = hmi_evt->type < ARRAY_SIZE(hmi_error_types) ?
218 			hmi_error_types[hmi_evt->type]
219 			: "Unknown";
220 	printk("%s Error detail: %s\n", level, error_info);
221 	printk("%s	HMER: %016llx\n", level, be64_to_cpu(hmi_evt->hmer));
222 	if ((hmi_evt->type == OpalHMI_ERROR_TFAC) ||
223 		(hmi_evt->type == OpalHMI_ERROR_TFMR_PARITY))
224 		printk("%s	TFMR: %016llx\n", level,
225 						be64_to_cpu(hmi_evt->tfmr));
226 
227 	if (hmi_evt->version < OpalHMIEvt_V2)
228 		return;
229 
230 	/* OpalHMIEvt_V2 and above provides reason for malfunction alert. */
231 	if (hmi_evt->type == OpalHMI_ERROR_MALFUNC_ALERT)
232 		print_checkstop_reason(level, hmi_evt);
233 }
234 
235 static void hmi_event_handler(struct work_struct *work)
236 {
237 	unsigned long flags;
238 	struct OpalHMIEvent *hmi_evt;
239 	struct OpalHmiEvtNode *msg_node;
240 	uint8_t disposition;
241 	struct opal_msg msg;
242 	int unrecoverable = 0;
243 
244 	spin_lock_irqsave(&opal_hmi_evt_lock, flags);
245 	while (!list_empty(&opal_hmi_evt_list)) {
246 		msg_node = list_entry(opal_hmi_evt_list.next,
247 					   struct OpalHmiEvtNode, list);
248 		list_del(&msg_node->list);
249 		spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
250 
251 		hmi_evt = (struct OpalHMIEvent *) &msg_node->hmi_evt;
252 		print_hmi_event_info(hmi_evt);
253 		disposition = hmi_evt->disposition;
254 		kfree(msg_node);
255 
256 		/*
257 		 * Check if HMI event has been recovered or not. If not
258 		 * then kernel can't continue, we need to panic.
259 		 * But before we do that, display all the HMI event
260 		 * available on the list and set unrecoverable flag to 1.
261 		 */
262 		if (disposition != OpalHMI_DISPOSITION_RECOVERED)
263 			unrecoverable = 1;
264 
265 		spin_lock_irqsave(&opal_hmi_evt_lock, flags);
266 	}
267 	spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
268 
269 	if (unrecoverable) {
270 		int ret;
271 
272 		/* Pull all HMI events from OPAL before we panic. */
273 		while (opal_get_msg(__pa(&msg), sizeof(msg)) == OPAL_SUCCESS) {
274 			u32 type;
275 
276 			type = be32_to_cpu(msg.msg_type);
277 
278 			/* skip if not HMI event */
279 			if (type != OPAL_MSG_HMI_EVT)
280 				continue;
281 
282 			/* HMI event info starts from param[0] */
283 			hmi_evt = (struct OpalHMIEvent *)&msg.params[0];
284 			print_hmi_event_info(hmi_evt);
285 		}
286 
287 		/*
288 		 * Unrecoverable HMI exception. We need to inform BMC/OCC
289 		 * about this error so that it can collect relevant data
290 		 * for error analysis before rebooting.
291 		 */
292 		ret = opal_cec_reboot2(OPAL_REBOOT_PLATFORM_ERROR,
293 			"Unrecoverable HMI exception");
294 		if (ret == OPAL_UNSUPPORTED) {
295 			pr_emerg("Reboot type %d not supported\n",
296 						OPAL_REBOOT_PLATFORM_ERROR);
297 		}
298 
299 		/*
300 		 * Fall through and panic if opal_cec_reboot2() returns
301 		 * OPAL_UNSUPPORTED.
302 		 */
303 		panic("Unrecoverable HMI exception");
304 	}
305 }
306 
/* Work item whose handler, hmi_event_handler(), logs the queued HMI events. */
static DECLARE_WORK(hmi_event_work, hmi_event_handler);
308 /*
309  * opal_handle_hmi_event - notifier handler that queues up HMI events
310  * to be preocessed later.
311  */
312 static int opal_handle_hmi_event(struct notifier_block *nb,
313 			  unsigned long msg_type, void *msg)
314 {
315 	unsigned long flags;
316 	struct OpalHMIEvent *hmi_evt;
317 	struct opal_msg *hmi_msg = msg;
318 	struct OpalHmiEvtNode *msg_node;
319 
320 	/* Sanity Checks */
321 	if (msg_type != OPAL_MSG_HMI_EVT)
322 		return 0;
323 
324 	/* HMI event info starts from param[0] */
325 	hmi_evt = (struct OpalHMIEvent *)&hmi_msg->params[0];
326 
327 	/* Delay the logging of HMI events to workqueue. */
328 	msg_node = kzalloc(sizeof(*msg_node), GFP_ATOMIC);
329 	if (!msg_node) {
330 		pr_err("HMI: out of memory, Opal message event not handled\n");
331 		return -ENOMEM;
332 	}
333 	memcpy(&msg_node->hmi_evt, hmi_evt, sizeof(struct OpalHMIEvent));
334 
335 	spin_lock_irqsave(&opal_hmi_evt_lock, flags);
336 	list_add(&msg_node->list, &opal_hmi_evt_list);
337 	spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
338 
339 	schedule_work(&hmi_event_work);
340 	return 0;
341 }
342 
343 static struct notifier_block opal_hmi_handler_nb = {
344 	.notifier_call	= opal_handle_hmi_event,
345 	.next		= NULL,
346 	.priority	= 0,
347 };
348 
349 int __init opal_hmi_handler_init(void)
350 {
351 	int ret;
352 
353 	if (!opal_hmi_handler_nb_init) {
354 		ret = opal_message_notifier_register(
355 				OPAL_MSG_HMI_EVT, &opal_hmi_handler_nb);
356 		if (ret) {
357 			pr_err("%s: Can't register OPAL event notifier (%d)\n",
358 			       __func__, ret);
359 			return ret;
360 		}
361 		opal_hmi_handler_nb_init = 1;
362 	}
363 	return 0;
364 }
365