xref: /openbmc/linux/arch/powerpc/kernel/mce.c (revision e5f586c763a079349398e2b0c7c271386193ac34)
1 /*
2  * Machine check exception handling.
3  *
4  * This program is free software; you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation; either version 2 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write to the Free Software
16  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17  *
18  * Copyright 2013 IBM Corporation
19  * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
20  */
21 
22 #undef DEBUG
23 #define pr_fmt(fmt) "mce: " fmt
24 
25 #include <linux/types.h>
26 #include <linux/ptrace.h>
27 #include <linux/percpu.h>
28 #include <linux/export.h>
29 #include <linux/irq_work.h>
30 #include <asm/mce.h>
31 
/*
 * Per-cpu log of machine check events.
 * save_mce_event() increments mce_nest_count and fills the mce_event[] slot
 * at (count - 1); get_mce_event() with release set decrements it again.
 */
static DEFINE_PER_CPU(int, mce_nest_count);
static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event);

/*
 * Queue for delayed MCE events.
 * machine_check_queue_event() appends to mce_event_queue[] and bumps
 * mce_queue_count; the irq_work callback below drains it.
 */
static DEFINE_PER_CPU(int, mce_queue_count);
static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue);

/* irq_work item whose callback processes the queued MCE events. */
static void machine_check_process_queued_event(struct irq_work *work);
static struct irq_work mce_event_process_work = {
        .func = machine_check_process_queued_event,
};
43 
44 static void mce_set_error_info(struct machine_check_event *mce,
45 			       struct mce_error_info *mce_err)
46 {
47 	mce->error_type = mce_err->error_type;
48 	switch (mce_err->error_type) {
49 	case MCE_ERROR_TYPE_UE:
50 		mce->u.ue_error.ue_error_type = mce_err->u.ue_error_type;
51 		break;
52 	case MCE_ERROR_TYPE_SLB:
53 		mce->u.slb_error.slb_error_type = mce_err->u.slb_error_type;
54 		break;
55 	case MCE_ERROR_TYPE_ERAT:
56 		mce->u.erat_error.erat_error_type = mce_err->u.erat_error_type;
57 		break;
58 	case MCE_ERROR_TYPE_TLB:
59 		mce->u.tlb_error.tlb_error_type = mce_err->u.tlb_error_type;
60 		break;
61 	case MCE_ERROR_TYPE_USER:
62 		mce->u.user_error.user_error_type = mce_err->u.user_error_type;
63 		break;
64 	case MCE_ERROR_TYPE_RA:
65 		mce->u.ra_error.ra_error_type = mce_err->u.ra_error_type;
66 		break;
67 	case MCE_ERROR_TYPE_LINK:
68 		mce->u.link_error.link_error_type = mce_err->u.link_error_type;
69 		break;
70 	case MCE_ERROR_TYPE_UNKNOWN:
71 	default:
72 		break;
73 	}
74 }
75 
76 /*
77  * Decode and save high level MCE information into per cpu buffer which
78  * is an array of machine_check_event structure.
79  */
80 void save_mce_event(struct pt_regs *regs, long handled,
81 		    struct mce_error_info *mce_err,
82 		    uint64_t nip, uint64_t addr)
83 {
84 	int index = __this_cpu_inc_return(mce_nest_count) - 1;
85 	struct machine_check_event *mce = this_cpu_ptr(&mce_event[index]);
86 
87 	/*
88 	 * Return if we don't have enough space to log mce event.
89 	 * mce_nest_count may go beyond MAX_MC_EVT but that's ok,
90 	 * the check below will stop buffer overrun.
91 	 */
92 	if (index >= MAX_MC_EVT)
93 		return;
94 
95 	/* Populate generic machine check info */
96 	mce->version = MCE_V1;
97 	mce->srr0 = nip;
98 	mce->srr1 = regs->msr;
99 	mce->gpr3 = regs->gpr[3];
100 	mce->in_use = 1;
101 
102 	/* Mark it recovered if we have handled it and MSR(RI=1). */
103 	if (handled && (regs->msr & MSR_RI))
104 		mce->disposition = MCE_DISPOSITION_RECOVERED;
105 	else
106 		mce->disposition = MCE_DISPOSITION_NOT_RECOVERED;
107 
108 	mce->initiator = mce_err->initiator;
109 	mce->severity = mce_err->severity;
110 
111 	/*
112 	 * Populate the mce error_type and type-specific error_type.
113 	 */
114 	mce_set_error_info(mce, mce_err);
115 
116 	if (!addr)
117 		return;
118 
119 	if (mce->error_type == MCE_ERROR_TYPE_TLB) {
120 		mce->u.tlb_error.effective_address_provided = true;
121 		mce->u.tlb_error.effective_address = addr;
122 	} else if (mce->error_type == MCE_ERROR_TYPE_SLB) {
123 		mce->u.slb_error.effective_address_provided = true;
124 		mce->u.slb_error.effective_address = addr;
125 	} else if (mce->error_type == MCE_ERROR_TYPE_ERAT) {
126 		mce->u.erat_error.effective_address_provided = true;
127 		mce->u.erat_error.effective_address = addr;
128 	} else if (mce->error_type == MCE_ERROR_TYPE_USER) {
129 		mce->u.user_error.effective_address_provided = true;
130 		mce->u.user_error.effective_address = addr;
131 	} else if (mce->error_type == MCE_ERROR_TYPE_RA) {
132 		mce->u.ra_error.effective_address_provided = true;
133 		mce->u.ra_error.effective_address = addr;
134 	} else if (mce->error_type == MCE_ERROR_TYPE_LINK) {
135 		mce->u.link_error.effective_address_provided = true;
136 		mce->u.link_error.effective_address = addr;
137 	} else if (mce->error_type == MCE_ERROR_TYPE_UE) {
138 		mce->u.ue_error.effective_address_provided = true;
139 		mce->u.ue_error.effective_address = addr;
140 	}
141 	return;
142 }
143 
144 /*
145  * get_mce_event:
146  *	mce	Pointer to machine_check_event structure to be filled.
147  *	release Flag to indicate whether to free the event slot or not.
148  *		0 <= do not release the mce event. Caller will invoke
149  *		     release_mce_event() once event has been consumed.
150  *		1 <= release the slot.
151  *
152  *	return	1 = success
153  *		0 = failure
154  *
155  * get_mce_event() will be called by platform specific machine check
156  * handle routine and in KVM.
157  * When we call get_mce_event(), we are still in interrupt context and
158  * preemption will not be scheduled until ret_from_expect() routine
159  * is called.
160  */
161 int get_mce_event(struct machine_check_event *mce, bool release)
162 {
163 	int index = __this_cpu_read(mce_nest_count) - 1;
164 	struct machine_check_event *mc_evt;
165 	int ret = 0;
166 
167 	/* Sanity check */
168 	if (index < 0)
169 		return ret;
170 
171 	/* Check if we have MCE info to process. */
172 	if (index < MAX_MC_EVT) {
173 		mc_evt = this_cpu_ptr(&mce_event[index]);
174 		/* Copy the event structure and release the original */
175 		if (mce)
176 			*mce = *mc_evt;
177 		if (release)
178 			mc_evt->in_use = 0;
179 		ret = 1;
180 	}
181 	/* Decrement the count to free the slot. */
182 	if (release)
183 		__this_cpu_dec(mce_nest_count);
184 
185 	return ret;
186 }
187 
/*
 * Free the most recently logged MCE event slot on this cpu without
 * copying the event out. Whether a slot was actually present is not
 * reported to the caller.
 */
void release_mce_event(void)
{
	(void)get_mce_event(NULL, true);
}
192 
193 /*
194  * Queue up the MCE event which then can be handled later.
195  */
196 void machine_check_queue_event(void)
197 {
198 	int index;
199 	struct machine_check_event evt;
200 
201 	if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
202 		return;
203 
204 	index = __this_cpu_inc_return(mce_queue_count) - 1;
205 	/* If queue is full, just return for now. */
206 	if (index >= MAX_MC_EVT) {
207 		__this_cpu_dec(mce_queue_count);
208 		return;
209 	}
210 	memcpy(this_cpu_ptr(&mce_event_queue[index]), &evt, sizeof(evt));
211 
212 	/* Queue irq work to process this event later. */
213 	irq_work_queue(&mce_event_process_work);
214 }
215 
216 /*
217  * process pending MCE event from the mce event queue. This function will be
218  * called during syscall exit.
219  */
220 static void machine_check_process_queued_event(struct irq_work *work)
221 {
222 	int index;
223 
224 	/*
225 	 * For now just print it to console.
226 	 * TODO: log this error event to FSP or nvram.
227 	 */
228 	while (__this_cpu_read(mce_queue_count) > 0) {
229 		index = __this_cpu_read(mce_queue_count) - 1;
230 		machine_check_print_event_info(
231 				this_cpu_ptr(&mce_event_queue[index]));
232 		__this_cpu_dec(mce_queue_count);
233 	}
234 }
235 
236 void machine_check_print_event_info(struct machine_check_event *evt)
237 {
238 	const char *level, *sevstr, *subtype;
239 	static const char *mc_ue_types[] = {
240 		"Indeterminate",
241 		"Instruction fetch",
242 		"Page table walk ifetch",
243 		"Load/Store",
244 		"Page table walk Load/Store",
245 	};
246 	static const char *mc_slb_types[] = {
247 		"Indeterminate",
248 		"Parity",
249 		"Multihit",
250 	};
251 	static const char *mc_erat_types[] = {
252 		"Indeterminate",
253 		"Parity",
254 		"Multihit",
255 	};
256 	static const char *mc_tlb_types[] = {
257 		"Indeterminate",
258 		"Parity",
259 		"Multihit",
260 	};
261 	static const char *mc_user_types[] = {
262 		"Indeterminate",
263 		"tlbie(l) invalid",
264 	};
265 	static const char *mc_ra_types[] = {
266 		"Indeterminate",
267 		"Instruction fetch (bad)",
268 		"Page table walk ifetch (bad)",
269 		"Page table walk ifetch (foreign)",
270 		"Load (bad)",
271 		"Store (bad)",
272 		"Page table walk Load/Store (bad)",
273 		"Page table walk Load/Store (foreign)",
274 		"Load/Store (foreign)",
275 	};
276 	static const char *mc_link_types[] = {
277 		"Indeterminate",
278 		"Instruction fetch (timeout)",
279 		"Page table walk ifetch (timeout)",
280 		"Load (timeout)",
281 		"Store (timeout)",
282 		"Page table walk Load/Store (timeout)",
283 	};
284 
285 	/* Print things out */
286 	if (evt->version != MCE_V1) {
287 		pr_err("Machine Check Exception, Unknown event version %d !\n",
288 		       evt->version);
289 		return;
290 	}
291 	switch (evt->severity) {
292 	case MCE_SEV_NO_ERROR:
293 		level = KERN_INFO;
294 		sevstr = "Harmless";
295 		break;
296 	case MCE_SEV_WARNING:
297 		level = KERN_WARNING;
298 		sevstr = "";
299 		break;
300 	case MCE_SEV_ERROR_SYNC:
301 		level = KERN_ERR;
302 		sevstr = "Severe";
303 		break;
304 	case MCE_SEV_FATAL:
305 	default:
306 		level = KERN_ERR;
307 		sevstr = "Fatal";
308 		break;
309 	}
310 
311 	printk("%s%s Machine check interrupt [%s]\n", level, sevstr,
312 	       evt->disposition == MCE_DISPOSITION_RECOVERED ?
313 	       "Recovered" : "[Not recovered");
314 	printk("%s  Initiator: %s\n", level,
315 	       evt->initiator == MCE_INITIATOR_CPU ? "CPU" : "Unknown");
316 	switch (evt->error_type) {
317 	case MCE_ERROR_TYPE_UE:
318 		subtype = evt->u.ue_error.ue_error_type <
319 			ARRAY_SIZE(mc_ue_types) ?
320 			mc_ue_types[evt->u.ue_error.ue_error_type]
321 			: "Unknown";
322 		printk("%s  Error type: UE [%s]\n", level, subtype);
323 		if (evt->u.ue_error.effective_address_provided)
324 			printk("%s    Effective address: %016llx\n",
325 			       level, evt->u.ue_error.effective_address);
326 		if (evt->u.ue_error.physical_address_provided)
327 			printk("%s      Physical address: %016llx\n",
328 			       level, evt->u.ue_error.physical_address);
329 		break;
330 	case MCE_ERROR_TYPE_SLB:
331 		subtype = evt->u.slb_error.slb_error_type <
332 			ARRAY_SIZE(mc_slb_types) ?
333 			mc_slb_types[evt->u.slb_error.slb_error_type]
334 			: "Unknown";
335 		printk("%s  Error type: SLB [%s]\n", level, subtype);
336 		if (evt->u.slb_error.effective_address_provided)
337 			printk("%s    Effective address: %016llx\n",
338 			       level, evt->u.slb_error.effective_address);
339 		break;
340 	case MCE_ERROR_TYPE_ERAT:
341 		subtype = evt->u.erat_error.erat_error_type <
342 			ARRAY_SIZE(mc_erat_types) ?
343 			mc_erat_types[evt->u.erat_error.erat_error_type]
344 			: "Unknown";
345 		printk("%s  Error type: ERAT [%s]\n", level, subtype);
346 		if (evt->u.erat_error.effective_address_provided)
347 			printk("%s    Effective address: %016llx\n",
348 			       level, evt->u.erat_error.effective_address);
349 		break;
350 	case MCE_ERROR_TYPE_TLB:
351 		subtype = evt->u.tlb_error.tlb_error_type <
352 			ARRAY_SIZE(mc_tlb_types) ?
353 			mc_tlb_types[evt->u.tlb_error.tlb_error_type]
354 			: "Unknown";
355 		printk("%s  Error type: TLB [%s]\n", level, subtype);
356 		if (evt->u.tlb_error.effective_address_provided)
357 			printk("%s    Effective address: %016llx\n",
358 			       level, evt->u.tlb_error.effective_address);
359 		break;
360 	case MCE_ERROR_TYPE_USER:
361 		subtype = evt->u.user_error.user_error_type <
362 			ARRAY_SIZE(mc_user_types) ?
363 			mc_user_types[evt->u.user_error.user_error_type]
364 			: "Unknown";
365 		printk("%s  Error type: User [%s]\n", level, subtype);
366 		if (evt->u.user_error.effective_address_provided)
367 			printk("%s    Effective address: %016llx\n",
368 			       level, evt->u.user_error.effective_address);
369 		break;
370 	case MCE_ERROR_TYPE_RA:
371 		subtype = evt->u.ra_error.ra_error_type <
372 			ARRAY_SIZE(mc_ra_types) ?
373 			mc_ra_types[evt->u.ra_error.ra_error_type]
374 			: "Unknown";
375 		printk("%s  Error type: Real address [%s]\n", level, subtype);
376 		if (evt->u.ra_error.effective_address_provided)
377 			printk("%s    Effective address: %016llx\n",
378 			       level, evt->u.ra_error.effective_address);
379 		break;
380 	case MCE_ERROR_TYPE_LINK:
381 		subtype = evt->u.link_error.link_error_type <
382 			ARRAY_SIZE(mc_link_types) ?
383 			mc_link_types[evt->u.link_error.link_error_type]
384 			: "Unknown";
385 		printk("%s  Error type: Link [%s]\n", level, subtype);
386 		if (evt->u.link_error.effective_address_provided)
387 			printk("%s    Effective address: %016llx\n",
388 			       level, evt->u.link_error.effective_address);
389 		break;
390 	default:
391 	case MCE_ERROR_TYPE_UNKNOWN:
392 		printk("%s  Error type: Unknown\n", level);
393 		break;
394 	}
395 }
396 
397 uint64_t get_mce_fault_addr(struct machine_check_event *evt)
398 {
399 	switch (evt->error_type) {
400 	case MCE_ERROR_TYPE_UE:
401 		if (evt->u.ue_error.effective_address_provided)
402 			return evt->u.ue_error.effective_address;
403 		break;
404 	case MCE_ERROR_TYPE_SLB:
405 		if (evt->u.slb_error.effective_address_provided)
406 			return evt->u.slb_error.effective_address;
407 		break;
408 	case MCE_ERROR_TYPE_ERAT:
409 		if (evt->u.erat_error.effective_address_provided)
410 			return evt->u.erat_error.effective_address;
411 		break;
412 	case MCE_ERROR_TYPE_TLB:
413 		if (evt->u.tlb_error.effective_address_provided)
414 			return evt->u.tlb_error.effective_address;
415 		break;
416 	case MCE_ERROR_TYPE_USER:
417 		if (evt->u.user_error.effective_address_provided)
418 			return evt->u.user_error.effective_address;
419 		break;
420 	case MCE_ERROR_TYPE_RA:
421 		if (evt->u.ra_error.effective_address_provided)
422 			return evt->u.ra_error.effective_address;
423 		break;
424 	case MCE_ERROR_TYPE_LINK:
425 		if (evt->u.link_error.effective_address_provided)
426 			return evt->u.link_error.effective_address;
427 		break;
428 	default:
429 	case MCE_ERROR_TYPE_UNKNOWN:
430 		break;
431 	}
432 	return 0;
433 }
434 EXPORT_SYMBOL(get_mce_fault_addr);
435