xref: /openbmc/linux/arch/powerpc/kernel/mce.c (revision 5d0e4d78)
1 /*
2  * Machine check exception handling.
3  *
4  * This program is free software; you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation; either version 2 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write to the Free Software
16  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17  *
18  * Copyright 2013 IBM Corporation
19  * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
20  */
21 
22 #undef DEBUG
23 #define pr_fmt(fmt) "mce: " fmt
24 
25 #include <linux/types.h>
26 #include <linux/ptrace.h>
27 #include <linux/percpu.h>
28 #include <linux/export.h>
29 #include <linux/irq_work.h>
30 #include <asm/mce.h>
31 
/*
 * Per-CPU buffer of machine check events that have been recorded but not
 * yet consumed. mce_nest_count is bumped by save_mce_event() and dropped
 * again by get_mce_event() when the caller asks for the slot to be released.
 */
static DEFINE_PER_CPU(int, mce_nest_count);
static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event);

/*
 * Queue for delayed MCE events: filled by machine_check_queue_event() and
 * drained by machine_check_process_queued_event().
 */
static DEFINE_PER_CPU(int, mce_queue_count);
static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue);

/* irq_work whose callback processes the queued events at a later point. */
static void machine_check_process_queued_event(struct irq_work *work);
static struct irq_work mce_event_process_work = {
        .func = machine_check_process_queued_event,
};
43 
44 static void mce_set_error_info(struct machine_check_event *mce,
45 			       struct mce_error_info *mce_err)
46 {
47 	mce->error_type = mce_err->error_type;
48 	switch (mce_err->error_type) {
49 	case MCE_ERROR_TYPE_UE:
50 		mce->u.ue_error.ue_error_type = mce_err->u.ue_error_type;
51 		break;
52 	case MCE_ERROR_TYPE_SLB:
53 		mce->u.slb_error.slb_error_type = mce_err->u.slb_error_type;
54 		break;
55 	case MCE_ERROR_TYPE_ERAT:
56 		mce->u.erat_error.erat_error_type = mce_err->u.erat_error_type;
57 		break;
58 	case MCE_ERROR_TYPE_TLB:
59 		mce->u.tlb_error.tlb_error_type = mce_err->u.tlb_error_type;
60 		break;
61 	case MCE_ERROR_TYPE_USER:
62 		mce->u.user_error.user_error_type = mce_err->u.user_error_type;
63 		break;
64 	case MCE_ERROR_TYPE_RA:
65 		mce->u.ra_error.ra_error_type = mce_err->u.ra_error_type;
66 		break;
67 	case MCE_ERROR_TYPE_LINK:
68 		mce->u.link_error.link_error_type = mce_err->u.link_error_type;
69 		break;
70 	case MCE_ERROR_TYPE_UNKNOWN:
71 	default:
72 		break;
73 	}
74 }
75 
76 /*
77  * Decode and save high level MCE information into per cpu buffer which
78  * is an array of machine_check_event structure.
79  */
80 void save_mce_event(struct pt_regs *regs, long handled,
81 		    struct mce_error_info *mce_err,
82 		    uint64_t nip, uint64_t addr)
83 {
84 	int index = __this_cpu_inc_return(mce_nest_count) - 1;
85 	struct machine_check_event *mce = this_cpu_ptr(&mce_event[index]);
86 
87 	/*
88 	 * Return if we don't have enough space to log mce event.
89 	 * mce_nest_count may go beyond MAX_MC_EVT but that's ok,
90 	 * the check below will stop buffer overrun.
91 	 */
92 	if (index >= MAX_MC_EVT)
93 		return;
94 
95 	/* Populate generic machine check info */
96 	mce->version = MCE_V1;
97 	mce->srr0 = nip;
98 	mce->srr1 = regs->msr;
99 	mce->gpr3 = regs->gpr[3];
100 	mce->in_use = 1;
101 
102 	/* Mark it recovered if we have handled it and MSR(RI=1). */
103 	if (handled && (regs->msr & MSR_RI))
104 		mce->disposition = MCE_DISPOSITION_RECOVERED;
105 	else
106 		mce->disposition = MCE_DISPOSITION_NOT_RECOVERED;
107 
108 	mce->initiator = mce_err->initiator;
109 	mce->severity = mce_err->severity;
110 
111 	/*
112 	 * Populate the mce error_type and type-specific error_type.
113 	 */
114 	mce_set_error_info(mce, mce_err);
115 
116 	if (!addr)
117 		return;
118 
119 	if (mce->error_type == MCE_ERROR_TYPE_TLB) {
120 		mce->u.tlb_error.effective_address_provided = true;
121 		mce->u.tlb_error.effective_address = addr;
122 	} else if (mce->error_type == MCE_ERROR_TYPE_SLB) {
123 		mce->u.slb_error.effective_address_provided = true;
124 		mce->u.slb_error.effective_address = addr;
125 	} else if (mce->error_type == MCE_ERROR_TYPE_ERAT) {
126 		mce->u.erat_error.effective_address_provided = true;
127 		mce->u.erat_error.effective_address = addr;
128 	} else if (mce->error_type == MCE_ERROR_TYPE_USER) {
129 		mce->u.user_error.effective_address_provided = true;
130 		mce->u.user_error.effective_address = addr;
131 	} else if (mce->error_type == MCE_ERROR_TYPE_RA) {
132 		mce->u.ra_error.effective_address_provided = true;
133 		mce->u.ra_error.effective_address = addr;
134 	} else if (mce->error_type == MCE_ERROR_TYPE_LINK) {
135 		mce->u.link_error.effective_address_provided = true;
136 		mce->u.link_error.effective_address = addr;
137 	} else if (mce->error_type == MCE_ERROR_TYPE_UE) {
138 		mce->u.ue_error.effective_address_provided = true;
139 		mce->u.ue_error.effective_address = addr;
140 	}
141 	return;
142 }
143 
144 /*
145  * get_mce_event:
146  *	mce	Pointer to machine_check_event structure to be filled.
147  *	release Flag to indicate whether to free the event slot or not.
148  *		0 <= do not release the mce event. Caller will invoke
149  *		     release_mce_event() once event has been consumed.
150  *		1 <= release the slot.
151  *
152  *	return	1 = success
153  *		0 = failure
154  *
155  * get_mce_event() will be called by platform specific machine check
156  * handle routine and in KVM.
157  * When we call get_mce_event(), we are still in interrupt context and
158  * preemption will not be scheduled until ret_from_expect() routine
159  * is called.
160  */
161 int get_mce_event(struct machine_check_event *mce, bool release)
162 {
163 	int index = __this_cpu_read(mce_nest_count) - 1;
164 	struct machine_check_event *mc_evt;
165 	int ret = 0;
166 
167 	/* Sanity check */
168 	if (index < 0)
169 		return ret;
170 
171 	/* Check if we have MCE info to process. */
172 	if (index < MAX_MC_EVT) {
173 		mc_evt = this_cpu_ptr(&mce_event[index]);
174 		/* Copy the event structure and release the original */
175 		if (mce)
176 			*mce = *mc_evt;
177 		if (release)
178 			mc_evt->in_use = 0;
179 		ret = 1;
180 	}
181 	/* Decrement the count to free the slot. */
182 	if (release)
183 		__this_cpu_dec(mce_nest_count);
184 
185 	return ret;
186 }
187 
/*
 * Free the most recently saved MCE event slot on this cpu without copying
 * the event out. Intended for callers that fetched the event earlier with
 * get_mce_event(..., false).
 */
void release_mce_event(void)
{
	(void)get_mce_event(NULL, true);
}
192 
193 /*
194  * Queue up the MCE event which then can be handled later.
195  */
196 void machine_check_queue_event(void)
197 {
198 	int index;
199 	struct machine_check_event evt;
200 
201 	if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
202 		return;
203 
204 	index = __this_cpu_inc_return(mce_queue_count) - 1;
205 	/* If queue is full, just return for now. */
206 	if (index >= MAX_MC_EVT) {
207 		__this_cpu_dec(mce_queue_count);
208 		return;
209 	}
210 	memcpy(this_cpu_ptr(&mce_event_queue[index]), &evt, sizeof(evt));
211 
212 	/* Queue irq work to process this event later. */
213 	irq_work_queue(&mce_event_process_work);
214 }
215 
216 /*
217  * process pending MCE event from the mce event queue. This function will be
218  * called during syscall exit.
219  */
220 static void machine_check_process_queued_event(struct irq_work *work)
221 {
222 	int index;
223 
224 	add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
225 
226 	/*
227 	 * For now just print it to console.
228 	 * TODO: log this error event to FSP or nvram.
229 	 */
230 	while (__this_cpu_read(mce_queue_count) > 0) {
231 		index = __this_cpu_read(mce_queue_count) - 1;
232 		machine_check_print_event_info(
233 				this_cpu_ptr(&mce_event_queue[index]), false);
234 		__this_cpu_dec(mce_queue_count);
235 	}
236 }
237 
238 void machine_check_print_event_info(struct machine_check_event *evt,
239 				    bool user_mode)
240 {
241 	const char *level, *sevstr, *subtype;
242 	static const char *mc_ue_types[] = {
243 		"Indeterminate",
244 		"Instruction fetch",
245 		"Page table walk ifetch",
246 		"Load/Store",
247 		"Page table walk Load/Store",
248 	};
249 	static const char *mc_slb_types[] = {
250 		"Indeterminate",
251 		"Parity",
252 		"Multihit",
253 	};
254 	static const char *mc_erat_types[] = {
255 		"Indeterminate",
256 		"Parity",
257 		"Multihit",
258 	};
259 	static const char *mc_tlb_types[] = {
260 		"Indeterminate",
261 		"Parity",
262 		"Multihit",
263 	};
264 	static const char *mc_user_types[] = {
265 		"Indeterminate",
266 		"tlbie(l) invalid",
267 	};
268 	static const char *mc_ra_types[] = {
269 		"Indeterminate",
270 		"Instruction fetch (bad)",
271 		"Instruction fetch (foreign)",
272 		"Page table walk ifetch (bad)",
273 		"Page table walk ifetch (foreign)",
274 		"Load (bad)",
275 		"Store (bad)",
276 		"Page table walk Load/Store (bad)",
277 		"Page table walk Load/Store (foreign)",
278 		"Load/Store (foreign)",
279 	};
280 	static const char *mc_link_types[] = {
281 		"Indeterminate",
282 		"Instruction fetch (timeout)",
283 		"Page table walk ifetch (timeout)",
284 		"Load (timeout)",
285 		"Store (timeout)",
286 		"Page table walk Load/Store (timeout)",
287 	};
288 
289 	/* Print things out */
290 	if (evt->version != MCE_V1) {
291 		pr_err("Machine Check Exception, Unknown event version %d !\n",
292 		       evt->version);
293 		return;
294 	}
295 	switch (evt->severity) {
296 	case MCE_SEV_NO_ERROR:
297 		level = KERN_INFO;
298 		sevstr = "Harmless";
299 		break;
300 	case MCE_SEV_WARNING:
301 		level = KERN_WARNING;
302 		sevstr = "";
303 		break;
304 	case MCE_SEV_ERROR_SYNC:
305 		level = KERN_ERR;
306 		sevstr = "Severe";
307 		break;
308 	case MCE_SEV_FATAL:
309 	default:
310 		level = KERN_ERR;
311 		sevstr = "Fatal";
312 		break;
313 	}
314 
315 	printk("%s%s Machine check interrupt [%s]\n", level, sevstr,
316 	       evt->disposition == MCE_DISPOSITION_RECOVERED ?
317 	       "Recovered" : "Not recovered");
318 
319 	if (user_mode) {
320 		printk("%s  NIP: [%016llx] PID: %d Comm: %s\n", level,
321 			evt->srr0, current->pid, current->comm);
322 	} else {
323 		printk("%s  NIP [%016llx]: %pS\n", level, evt->srr0,
324 		       (void *)evt->srr0);
325 	}
326 
327 	printk("%s  Initiator: %s\n", level,
328 	       evt->initiator == MCE_INITIATOR_CPU ? "CPU" : "Unknown");
329 	switch (evt->error_type) {
330 	case MCE_ERROR_TYPE_UE:
331 		subtype = evt->u.ue_error.ue_error_type <
332 			ARRAY_SIZE(mc_ue_types) ?
333 			mc_ue_types[evt->u.ue_error.ue_error_type]
334 			: "Unknown";
335 		printk("%s  Error type: UE [%s]\n", level, subtype);
336 		if (evt->u.ue_error.effective_address_provided)
337 			printk("%s    Effective address: %016llx\n",
338 			       level, evt->u.ue_error.effective_address);
339 		if (evt->u.ue_error.physical_address_provided)
340 			printk("%s      Physical address: %016llx\n",
341 			       level, evt->u.ue_error.physical_address);
342 		break;
343 	case MCE_ERROR_TYPE_SLB:
344 		subtype = evt->u.slb_error.slb_error_type <
345 			ARRAY_SIZE(mc_slb_types) ?
346 			mc_slb_types[evt->u.slb_error.slb_error_type]
347 			: "Unknown";
348 		printk("%s  Error type: SLB [%s]\n", level, subtype);
349 		if (evt->u.slb_error.effective_address_provided)
350 			printk("%s    Effective address: %016llx\n",
351 			       level, evt->u.slb_error.effective_address);
352 		break;
353 	case MCE_ERROR_TYPE_ERAT:
354 		subtype = evt->u.erat_error.erat_error_type <
355 			ARRAY_SIZE(mc_erat_types) ?
356 			mc_erat_types[evt->u.erat_error.erat_error_type]
357 			: "Unknown";
358 		printk("%s  Error type: ERAT [%s]\n", level, subtype);
359 		if (evt->u.erat_error.effective_address_provided)
360 			printk("%s    Effective address: %016llx\n",
361 			       level, evt->u.erat_error.effective_address);
362 		break;
363 	case MCE_ERROR_TYPE_TLB:
364 		subtype = evt->u.tlb_error.tlb_error_type <
365 			ARRAY_SIZE(mc_tlb_types) ?
366 			mc_tlb_types[evt->u.tlb_error.tlb_error_type]
367 			: "Unknown";
368 		printk("%s  Error type: TLB [%s]\n", level, subtype);
369 		if (evt->u.tlb_error.effective_address_provided)
370 			printk("%s    Effective address: %016llx\n",
371 			       level, evt->u.tlb_error.effective_address);
372 		break;
373 	case MCE_ERROR_TYPE_USER:
374 		subtype = evt->u.user_error.user_error_type <
375 			ARRAY_SIZE(mc_user_types) ?
376 			mc_user_types[evt->u.user_error.user_error_type]
377 			: "Unknown";
378 		printk("%s  Error type: User [%s]\n", level, subtype);
379 		if (evt->u.user_error.effective_address_provided)
380 			printk("%s    Effective address: %016llx\n",
381 			       level, evt->u.user_error.effective_address);
382 		break;
383 	case MCE_ERROR_TYPE_RA:
384 		subtype = evt->u.ra_error.ra_error_type <
385 			ARRAY_SIZE(mc_ra_types) ?
386 			mc_ra_types[evt->u.ra_error.ra_error_type]
387 			: "Unknown";
388 		printk("%s  Error type: Real address [%s]\n", level, subtype);
389 		if (evt->u.ra_error.effective_address_provided)
390 			printk("%s    Effective address: %016llx\n",
391 			       level, evt->u.ra_error.effective_address);
392 		break;
393 	case MCE_ERROR_TYPE_LINK:
394 		subtype = evt->u.link_error.link_error_type <
395 			ARRAY_SIZE(mc_link_types) ?
396 			mc_link_types[evt->u.link_error.link_error_type]
397 			: "Unknown";
398 		printk("%s  Error type: Link [%s]\n", level, subtype);
399 		if (evt->u.link_error.effective_address_provided)
400 			printk("%s    Effective address: %016llx\n",
401 			       level, evt->u.link_error.effective_address);
402 		break;
403 	default:
404 	case MCE_ERROR_TYPE_UNKNOWN:
405 		printk("%s  Error type: Unknown\n", level);
406 		break;
407 	}
408 }
409 EXPORT_SYMBOL_GPL(machine_check_print_event_info);
410 
411 uint64_t get_mce_fault_addr(struct machine_check_event *evt)
412 {
413 	switch (evt->error_type) {
414 	case MCE_ERROR_TYPE_UE:
415 		if (evt->u.ue_error.effective_address_provided)
416 			return evt->u.ue_error.effective_address;
417 		break;
418 	case MCE_ERROR_TYPE_SLB:
419 		if (evt->u.slb_error.effective_address_provided)
420 			return evt->u.slb_error.effective_address;
421 		break;
422 	case MCE_ERROR_TYPE_ERAT:
423 		if (evt->u.erat_error.effective_address_provided)
424 			return evt->u.erat_error.effective_address;
425 		break;
426 	case MCE_ERROR_TYPE_TLB:
427 		if (evt->u.tlb_error.effective_address_provided)
428 			return evt->u.tlb_error.effective_address;
429 		break;
430 	case MCE_ERROR_TYPE_USER:
431 		if (evt->u.user_error.effective_address_provided)
432 			return evt->u.user_error.effective_address;
433 		break;
434 	case MCE_ERROR_TYPE_RA:
435 		if (evt->u.ra_error.effective_address_provided)
436 			return evt->u.ra_error.effective_address;
437 		break;
438 	case MCE_ERROR_TYPE_LINK:
439 		if (evt->u.link_error.effective_address_provided)
440 			return evt->u.link_error.effective_address;
441 		break;
442 	default:
443 	case MCE_ERROR_TYPE_UNKNOWN:
444 		break;
445 	}
446 	return 0;
447 }
448 EXPORT_SYMBOL(get_mce_fault_addr);
449