xref: /openbmc/linux/arch/powerpc/kernel/mce.c (revision efe4a1ac)
/*
 * Machine check exception handling.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright 2013 IBM Corporation
 * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
 */

#undef DEBUG
#define pr_fmt(fmt) "mce: " fmt

#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/percpu.h>
#include <linux/export.h>
#include <linux/irq_work.h>
#include <asm/mce.h>

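/*
 * Per-CPU buffer of in-flight machine check events. mce_nest_count tracks
 * how many events are currently outstanding (nested) on this CPU.
 */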
static DEFINE_PER_CPU(int, mce_nest_count);
static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event);

/* Queue for delayed MCE events. */
static DEFINE_PER_CPU(int, mce_queue_count);
static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue);

static void machine_check_process_queued_event(struct irq_work *work);
static struct irq_work mce_event_process_work = {
	.func = machine_check_process_queued_event,
};

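/*
 * Copy the type-specific error details from mce_error_info into the
 * matching member of the machine_check_event union.
 */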
static void mce_set_error_info(struct machine_check_event *mce,
			       struct mce_error_info *mce_err)
{
	mce->error_type = mce_err->error_type;
	switch (mce_err->error_type) {
	case MCE_ERROR_TYPE_UE:
		mce->u.ue_error.ue_error_type = mce_err->u.ue_error_type;
		break;
	case MCE_ERROR_TYPE_SLB:
		mce->u.slb_error.slb_error_type = mce_err->u.slb_error_type;
		break;
	case MCE_ERROR_TYPE_ERAT:
		mce->u.erat_error.erat_error_type = mce_err->u.erat_error_type;
		break;
	case MCE_ERROR_TYPE_TLB:
		mce->u.tlb_error.tlb_error_type = mce_err->u.tlb_error_type;
		break;
	case MCE_ERROR_TYPE_USER:
		mce->u.user_error.user_error_type = mce_err->u.user_error_type;
		break;
	case MCE_ERROR_TYPE_RA:
		mce->u.ra_error.ra_error_type = mce_err->u.ra_error_type;
		break;
	case MCE_ERROR_TYPE_LINK:
		mce->u.link_error.link_error_type = mce_err->u.link_error_type;
		break;
	case MCE_ERROR_TYPE_UNKNOWN:
	default:
		break;
	}
}

/*
 * Decode and save high level MCE information into the per-CPU buffer,
 * which is an array of machine_check_event structures.
 */
void save_mce_event(struct pt_regs *regs, long handled,
		    struct mce_error_info *mce_err,
		    uint64_t nip, uint64_t addr)
{
	int index = __this_cpu_inc_return(mce_nest_count) - 1;
	struct machine_check_event *mce = this_cpu_ptr(&mce_event[index]);

	/*
	 * Return if we don't have enough space to log mce event.
	 * mce_nest_count may go beyond MAX_MC_EVT but that's ok,
	 * the check below will stop buffer overrun.
	 */
	if (index >= MAX_MC_EVT)
		return;

	/* Populate generic machine check info */
	mce->version = MCE_V1;
	mce->srr0 = nip;
	mce->srr1 = regs->msr;
	mce->gpr3 = regs->gpr[3];
	mce->in_use = 1;

	/* Mark it recovered if we have handled it and MSR(RI=1). */
	if (handled && (regs->msr & MSR_RI))
		mce->disposition = MCE_DISPOSITION_RECOVERED;
	else
		mce->disposition = MCE_DISPOSITION_NOT_RECOVERED;

	mce->initiator = mce_err->initiator;
	mce->severity = mce_err->severity;

	/*
	 * Populate the mce error_type and type-specific error_type.
	 */
	mce_set_error_info(mce, mce_err);

	if (!addr)
		return;

	if (mce->error_type == MCE_ERROR_TYPE_TLB) {
		mce->u.tlb_error.effective_address_provided = true;
		mce->u.tlb_error.effective_address = addr;
	} else if (mce->error_type == MCE_ERROR_TYPE_SLB) {
		mce->u.slb_error.effective_address_provided = true;
		mce->u.slb_error.effective_address = addr;
	} else if (mce->error_type == MCE_ERROR_TYPE_ERAT) {
		mce->u.erat_error.effective_address_provided = true;
		mce->u.erat_error.effective_address = addr;
	} else if (mce->error_type == MCE_ERROR_TYPE_USER) {
		mce->u.user_error.effective_address_provided = true;
		mce->u.user_error.effective_address = addr;
	} else if (mce->error_type == MCE_ERROR_TYPE_RA) {
		mce->u.ra_error.effective_address_provided = true;
		mce->u.ra_error.effective_address = addr;
	} else if (mce->error_type == MCE_ERROR_TYPE_LINK) {
		mce->u.link_error.effective_address_provided = true;
		mce->u.link_error.effective_address = addr;
	} else if (mce->error_type == MCE_ERROR_TYPE_UE) {
		mce->u.ue_error.effective_address_provided = true;
		mce->u.ue_error.effective_address = addr;
	}
	return;
}

/*
 * get_mce_event:
 *	mce	Pointer to machine_check_event structure to be filled.
 *	release Flag to indicate whether to free the event slot or not.
 *		0 <= do not release the mce event. Caller will invoke
 *		     release_mce_event() once the event has been consumed.
 *		1 <= release the slot.
 *
 *	return	1 = success
 *		0 = failure
 *
 * get_mce_event() will be called by platform-specific machine check
 * handler routines and by KVM.
 * When we call get_mce_event(), we are still in interrupt context and
 * preemption will not occur until the ret_from_except() routine
 * is called.
 */
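/*
 * Illustrative usage sketch (not taken verbatim from any caller): a
 * platform handler can either consume and release the event in one call,
 * or peek at it first and release it later via release_mce_event().
 *
 *	struct machine_check_event evt;
 *
 *	if (get_mce_event(&evt, MCE_EVENT_RELEASE)) {
 *		// evt now holds a copy; the per-CPU slot is free again.
 *	}
 */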
int get_mce_event(struct machine_check_event *mce, bool release)
{
	int index = __this_cpu_read(mce_nest_count) - 1;
	struct machine_check_event *mc_evt;
	int ret = 0;

	/* Sanity check */
	if (index < 0)
		return ret;

	/* Check if we have MCE info to process. */
	if (index < MAX_MC_EVT) {
		mc_evt = this_cpu_ptr(&mce_event[index]);
		/* Copy the event structure and release the original */
		if (mce)
			*mce = *mc_evt;
		if (release)
			mc_evt->in_use = 0;
		ret = 1;
	}
	/* Decrement the count to free the slot. */
	if (release)
		__this_cpu_dec(mce_nest_count);

	return ret;
}

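/* Drop the most recent MCE event on this CPU without copying it out. */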
void release_mce_event(void)
{
	get_mce_event(NULL, true);
}

/*
 * Queue up the MCE event which then can be handled later.
 */
void machine_check_queue_event(void)
{
	int index;
	struct machine_check_event evt;

	if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
		return;

	index = __this_cpu_inc_return(mce_queue_count) - 1;
	/* If queue is full, just return for now. */
	if (index >= MAX_MC_EVT) {
		__this_cpu_dec(mce_queue_count);
		return;
	}
	memcpy(this_cpu_ptr(&mce_event_queue[index]), &evt, sizeof(evt));

	/* Queue irq work to process this event later. */
	irq_work_queue(&mce_event_process_work);
}

/*
 * Process pending MCE events from the MCE event queue. This runs later
 * via irq_work, queued from machine_check_queue_event().
 */
static void machine_check_process_queued_event(struct irq_work *work)
{
	int index;

	add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);

	/*
	 * For now just print it to console.
	 * TODO: log this error event to FSP or nvram.
	 */
	while (__this_cpu_read(mce_queue_count) > 0) {
		index = __this_cpu_read(mce_queue_count) - 1;
		machine_check_print_event_info(
				this_cpu_ptr(&mce_event_queue[index]), false);
		__this_cpu_dec(mce_queue_count);
	}
}

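/*
 * Pretty-print a single machine check event to the console, at a log
 * level derived from its severity.
 */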
void machine_check_print_event_info(struct machine_check_event *evt,
				    bool user_mode)
{
	const char *level, *sevstr, *subtype;
	static const char *mc_ue_types[] = {
		"Indeterminate",
		"Instruction fetch",
		"Page table walk ifetch",
		"Load/Store",
		"Page table walk Load/Store",
	};
	static const char *mc_slb_types[] = {
		"Indeterminate",
		"Parity",
		"Multihit",
	};
	static const char *mc_erat_types[] = {
		"Indeterminate",
		"Parity",
		"Multihit",
	};
	static const char *mc_tlb_types[] = {
		"Indeterminate",
		"Parity",
		"Multihit",
	};
	static const char *mc_user_types[] = {
		"Indeterminate",
		"tlbie(l) invalid",
	};
	static const char *mc_ra_types[] = {
		"Indeterminate",
		"Instruction fetch (bad)",
		"Page table walk ifetch (bad)",
		"Page table walk ifetch (foreign)",
		"Load (bad)",
		"Store (bad)",
		"Page table walk Load/Store (bad)",
		"Page table walk Load/Store (foreign)",
		"Load/Store (foreign)",
	};
	static const char *mc_link_types[] = {
		"Indeterminate",
		"Instruction fetch (timeout)",
		"Page table walk ifetch (timeout)",
		"Load (timeout)",
		"Store (timeout)",
		"Page table walk Load/Store (timeout)",
	};

	/* Print things out */
	if (evt->version != MCE_V1) {
		pr_err("Machine Check Exception, Unknown event version %d !\n",
		       evt->version);
		return;
	}
	switch (evt->severity) {
	case MCE_SEV_NO_ERROR:
		level = KERN_INFO;
		sevstr = "Harmless";
		break;
	case MCE_SEV_WARNING:
		level = KERN_WARNING;
		sevstr = "";
		break;
	case MCE_SEV_ERROR_SYNC:
		level = KERN_ERR;
		sevstr = "Severe";
		break;
	case MCE_SEV_FATAL:
	default:
		level = KERN_ERR;
		sevstr = "Fatal";
		break;
	}

	printk("%s%s Machine check interrupt [%s]\n", level, sevstr,
	       evt->disposition == MCE_DISPOSITION_RECOVERED ?
	       "Recovered" : "Not recovered");

	if (user_mode) {
		printk("%s  NIP: [%016llx] PID: %d Comm: %s\n", level,
			evt->srr0, current->pid, current->comm);
	} else {
		printk("%s  NIP [%016llx]: %pS\n", level, evt->srr0,
		       (void *)evt->srr0);
	}

	printk("%s  Initiator: %s\n", level,
	       evt->initiator == MCE_INITIATOR_CPU ? "CPU" : "Unknown");
	switch (evt->error_type) {
	case MCE_ERROR_TYPE_UE:
		subtype = evt->u.ue_error.ue_error_type <
			ARRAY_SIZE(mc_ue_types) ?
			mc_ue_types[evt->u.ue_error.ue_error_type]
			: "Unknown";
		printk("%s  Error type: UE [%s]\n", level, subtype);
		if (evt->u.ue_error.effective_address_provided)
			printk("%s    Effective address: %016llx\n",
			       level, evt->u.ue_error.effective_address);
		if (evt->u.ue_error.physical_address_provided)
			printk("%s      Physical address: %016llx\n",
			       level, evt->u.ue_error.physical_address);
		break;
	case MCE_ERROR_TYPE_SLB:
		subtype = evt->u.slb_error.slb_error_type <
			ARRAY_SIZE(mc_slb_types) ?
			mc_slb_types[evt->u.slb_error.slb_error_type]
			: "Unknown";
		printk("%s  Error type: SLB [%s]\n", level, subtype);
		if (evt->u.slb_error.effective_address_provided)
			printk("%s    Effective address: %016llx\n",
			       level, evt->u.slb_error.effective_address);
		break;
	case MCE_ERROR_TYPE_ERAT:
		subtype = evt->u.erat_error.erat_error_type <
			ARRAY_SIZE(mc_erat_types) ?
			mc_erat_types[evt->u.erat_error.erat_error_type]
			: "Unknown";
		printk("%s  Error type: ERAT [%s]\n", level, subtype);
		if (evt->u.erat_error.effective_address_provided)
			printk("%s    Effective address: %016llx\n",
			       level, evt->u.erat_error.effective_address);
		break;
	case MCE_ERROR_TYPE_TLB:
		subtype = evt->u.tlb_error.tlb_error_type <
			ARRAY_SIZE(mc_tlb_types) ?
			mc_tlb_types[evt->u.tlb_error.tlb_error_type]
			: "Unknown";
		printk("%s  Error type: TLB [%s]\n", level, subtype);
		if (evt->u.tlb_error.effective_address_provided)
			printk("%s    Effective address: %016llx\n",
			       level, evt->u.tlb_error.effective_address);
		break;
	case MCE_ERROR_TYPE_USER:
		subtype = evt->u.user_error.user_error_type <
			ARRAY_SIZE(mc_user_types) ?
			mc_user_types[evt->u.user_error.user_error_type]
			: "Unknown";
		printk("%s  Error type: User [%s]\n", level, subtype);
		if (evt->u.user_error.effective_address_provided)
			printk("%s    Effective address: %016llx\n",
			       level, evt->u.user_error.effective_address);
		break;
	case MCE_ERROR_TYPE_RA:
		subtype = evt->u.ra_error.ra_error_type <
			ARRAY_SIZE(mc_ra_types) ?
			mc_ra_types[evt->u.ra_error.ra_error_type]
			: "Unknown";
		printk("%s  Error type: Real address [%s]\n", level, subtype);
		if (evt->u.ra_error.effective_address_provided)
			printk("%s    Effective address: %016llx\n",
			       level, evt->u.ra_error.effective_address);
		break;
	case MCE_ERROR_TYPE_LINK:
		subtype = evt->u.link_error.link_error_type <
			ARRAY_SIZE(mc_link_types) ?
			mc_link_types[evt->u.link_error.link_error_type]
			: "Unknown";
		printk("%s  Error type: Link [%s]\n", level, subtype);
		if (evt->u.link_error.effective_address_provided)
			printk("%s    Effective address: %016llx\n",
			       level, evt->u.link_error.effective_address);
		break;
	default:
	case MCE_ERROR_TYPE_UNKNOWN:
		printk("%s  Error type: Unknown\n", level);
		break;
	}
}

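/*
 * Return the effective address recorded for this event, or 0 if the
 * event did not provide one.
 */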
uint64_t get_mce_fault_addr(struct machine_check_event *evt)
{
	switch (evt->error_type) {
	case MCE_ERROR_TYPE_UE:
		if (evt->u.ue_error.effective_address_provided)
			return evt->u.ue_error.effective_address;
		break;
	case MCE_ERROR_TYPE_SLB:
		if (evt->u.slb_error.effective_address_provided)
			return evt->u.slb_error.effective_address;
		break;
	case MCE_ERROR_TYPE_ERAT:
		if (evt->u.erat_error.effective_address_provided)
			return evt->u.erat_error.effective_address;
		break;
	case MCE_ERROR_TYPE_TLB:
		if (evt->u.tlb_error.effective_address_provided)
			return evt->u.tlb_error.effective_address;
		break;
	case MCE_ERROR_TYPE_USER:
		if (evt->u.user_error.effective_address_provided)
			return evt->u.user_error.effective_address;
		break;
	case MCE_ERROR_TYPE_RA:
		if (evt->u.ra_error.effective_address_provided)
			return evt->u.ra_error.effective_address;
		break;
	case MCE_ERROR_TYPE_LINK:
		if (evt->u.link_error.effective_address_provided)
			return evt->u.link_error.effective_address;
		break;
	default:
	case MCE_ERROR_TYPE_UNKNOWN:
		break;
	}
	return 0;
}
EXPORT_SYMBOL(get_mce_fault_addr);