xref: /openbmc/linux/drivers/edac/mce_amd.c (revision 28efb0046512e8a13ed9f9bdf0d68d10bbfbe9cf)
1 #include <linux/module.h>
2 #include <linux/slab.h>
3 
4 #include <asm/cpu.h>
5 
6 #include "mce_amd.h"
7 
8 static struct amd_decoder_ops *fam_ops;
9 
10 static u8 xec_mask	 = 0xf;
11 
12 static bool report_gart_errors;
13 static void (*decode_dram_ecc)(int node_id, struct mce *m);
14 
15 void amd_report_gart_errors(bool v)
16 {
17 	report_gart_errors = v;
18 }
19 EXPORT_SYMBOL_GPL(amd_report_gart_errors);
20 
21 void amd_register_ecc_decoder(void (*f)(int, struct mce *))
22 {
23 	decode_dram_ecc = f;
24 }
25 EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
26 
27 void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
28 {
29 	if (decode_dram_ecc) {
30 		WARN_ON(decode_dram_ecc != f);
31 
32 		decode_dram_ecc = NULL;
33 	}
34 }
35 EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
36 
/*
 * String representations of the individual fields of the MCA error code,
 * see F3x48 or MSR0000_0411.
 */

/* transaction type */
static const char * const tt_msgs[] = {
	[0] = "INSN",
	[1] = "DATA",
	[2] = "GEN",
	[3] = "RESV",
};

/* cache level */
static const char * const ll_msgs[] = {
	[0] = "RESV",
	[1] = "L1",
	[2] = "L2",
	[3] = "L3/GEN",
};

/* memory transaction type */
static const char * const rrrr_msgs[] = {
	[0] = "GEN",
	[1] = "RD",
	[2] = "WR",
	[3] = "DRD",
	[4] = "DWR",
	[5] = "IRD",
	[6] = "PRF",
	[7] = "EV",
	[8] = "SNP",
};

/* participating processor; exported for use outside this file */
const char * const pp_msgs[] = {
	[0] = "SRC",
	[1] = "RES",
	[2] = "OBS",
	[3] = "GEN",
};
EXPORT_SYMBOL_GPL(pp_msgs);

/* request timeout */
static const char * const to_msgs[] = {
	[0] = "no timeout",
	[1] = "timed out",
};

/* memory or i/o */
static const char * const ii_msgs[] = {
	[0] = "MEM",
	[1] = "RESV",
	[2] = "IO",
	[3] = "GEN",
};

/* internal error type */
static const char * const uu_msgs[] = {
	[0] = "RESV",
	[1] = "RESV",
	[2] = "HWA",
	[3] = "RESV",
};
65 
/*
 * F15h MC1 error descriptions.  The table is packed, so the index is not
 * the extended error code itself: f15h_mc1_mce() uses xec 0x0-0xa directly,
 * xec - 2 for 0xd and xec - 4 for 0x10-0x15.
 */
static const char * const f15h_mc1_mce_desc[] = {
	[0]  = "UC during a demand linefill from L2",		/* xec = 0x0 */
	[1]  = "Parity error during data load from IC",
	[2]  = "Parity error for IC valid bit",
	[3]  = "Main tag parity error",
	[4]  = "Parity error in prediction queue",
	[5]  = "PFB data/address parity error",
	[6]  = "Parity error in the branch status reg",
	[7]  = "PFB promotion address error",
	[8]  = "Tag error during probe/victimization",
	[9]  = "Parity error for IC probe tag valid bit",
	[10] = "PFB non-cacheable bit parity error",		/* xec = 0xa */
	[11] = "PFB valid bit parity error",			/* xec = 0xd */
	[12] = "Microcode Patch Buffer",			/* xec = 0x10 */
	[13] = "uop queue",					/* xec = 0x11 */
	[14] = "insn buffer",
	[15] = "predecode buffer",
	[16] = "fetch address FIFO",
	[17] = "dispatch uop queue",				/* xec = 0x15 */
};
86 
/*
 * F15h MC2 memory error descriptions.  f15h_mc2_mce() indexes this packed
 * table with xec - 0x4 for xec 0x4-0xc and xec - 0x7 for xec 0x10-0x14.
 */
static const char * const f15h_mc2_mce_desc[] = {
	[0]  = "Fill ECC error on data fills",			/* xec = 0x4 */
	[1]  = "Fill parity error on insn fills",
	[2]  = "Prefetcher request FIFO parity error",
	[3]  = "PRQ address parity error",
	[4]  = "PRQ data parity error",
	[5]  = "WCC Tag ECC error",
	[6]  = "WCC Data ECC error",
	[7]  = "WCB Data parity error",
	[8]  = "VB Data ECC or parity error",			/* xec = 0xc */
	[9]  = "L2 Tag ECC error",				/* xec = 0x10 */
	[10] = "Hard L2 Tag ECC error",
	[11] = "Multiple hits on L2 tag",
	[12] = "XAB parity error",
	[13] = "PRB address parity error",			/* xec = 0x14 */
};
103 
/*
 * MC4 (northbridge) error descriptions.  decode_mc4_mce() indexes this
 * table with xec for xec 0x0-0xe and with xec - 13 for xec 0x1c-0x1f.
 */
static const char * const mc4_mce_desc[] = {
	[0]  = "DRAM ECC error detected on the NB",		/* xec = 0x0 */
	[1]  = "CRC error detected on HT link",
	[2]  = "Link-defined sync error packets detected on HT link",
	[3]  = "HT Master abort",
	[4]  = "HT Target abort",
	[5]  = "Invalid GART PTE entry during GART table walk",
	[6]  = "Unsupported atomic RMW received from an IO link",
	[7]  = "Watchdog timeout due to lack of progress",
	[8]  = "DRAM ECC error detected on the NB",
	[9]  = "SVM DMA Exclusion Vector error",
	[10] = "HT data error detected on link",
	[11] = "Protocol error (link, L3, probe filter)",
	[12] = "NB internal arrays parity error",
	[13] = "DRAM addr/ctl signals parity error",
	[14] = "IO link transmission error",			/* xec = 0xe */
	[15] = "L3 data cache ECC error",			/* xec = 0x1c */
	[16] = "L3 cache tag error",
	[17] = "L3 LRU parity bits error",
	[18] = "ECC Error in the Probe Filter directory",	/* xec = 0x1f */
};
125 
/* MC5 error descriptions; decode_mc5_mce() indexes this table with xec. */
static const char * const mc5_mce_desc[] = {
	[0x0] = "CPU Watchdog timer expire",
	[0x1] = "Wakeup array dest tag",
	[0x2] = "AG payload array",
	[0x3] = "EX payload array",
	[0x4] = "IDRF array",
	[0x5] = "Retire dispatch queue",
	[0x6] = "Mapper checkpoint array",
	[0x7] = "Physical register file EX0 port",
	[0x8] = "Physical register file EX1 port",
	[0x9] = "Physical register file AG0 port",
	[0xa] = "Physical register file AG1 port",
	[0xb] = "Flag register file",
	[0xc] = "DE error occurred",
	[0xd] = "Retire status queue",
};
142 
/* MC6 error descriptions; decode_mc6_mce() indexes this table with xec. */
static const char * const mc6_mce_desc[] = {
	[0x0] = "Hardware Assertion",
	[0x1] = "Free List",
	[0x2] = "Physical Register File",
	[0x3] = "Retire Queue",
	[0x4] = "Scheduler table",
	[0x5] = "Status Register File",
};
151 
/*
 * Scalable MCA error strings.  Each table belongs to one SMCA bank type and
 * is indexed by the extended error code (see decode_smca_error()).
 */
static const char * const smca_ls_mce_desc[] = {
	[0x00] = "Load queue parity",
	[0x01] = "Store queue parity",
	[0x02] = "Miss address buffer payload parity",
	[0x03] = "L1 TLB parity",
	[0x04] = "Reserved",
	[0x05] = "DC tag error type 6",
	[0x06] = "DC tag error type 1",
	[0x07] = "Internal error type 1",
	[0x08] = "Internal error type 2",
	[0x09] = "Sys Read data error thread 0",
	[0x0a] = "Sys read data error thread 1",
	[0x0b] = "DC tag error type 2",
	[0x0c] = "DC data error type 1 (poison consumption)",
	[0x0d] = "DC data error type 2",
	[0x0e] = "DC data error type 3",
	[0x0f] = "DC tag error type 4",
	[0x10] = "L2 TLB parity",
	[0x11] = "PDC parity error",
	[0x12] = "DC tag error type 3",
	[0x13] = "DC tag error type 5",
	[0x14] = "L2 fill data error",
};

static const char * const smca_if_mce_desc[] = {
	[0x0] = "microtag probe port parity error",
	[0x1] = "IC microtag or full tag multi-hit error",
	[0x2] = "IC full tag parity",
	[0x3] = "IC data array parity",
	[0x4] = "Decoupling queue phys addr parity error",
	[0x5] = "L0 ITLB parity error",
	[0x6] = "L1 ITLB parity error",
	[0x7] = "L2 ITLB parity error",
	[0x8] = "BPQ snoop parity on Thread 0",
	[0x9] = "BPQ snoop parity on Thread 1",
	[0xa] = "L1 BTB multi-match error",
	[0xb] = "L2 BTB multi-match error",
	[0xc] = "L2 Cache Response Poison error",
	[0xd] = "System Read Data error",
};

static const char * const smca_l2_mce_desc[] = {
	[0x0] = "L2M tag multi-way-hit error",
	[0x1] = "L2M tag ECC error",
	[0x2] = "L2M data ECC error",
	[0x3] = "HW assert",
};

static const char * const smca_de_mce_desc[] = {
	[0x0] = "uop cache tag parity error",
	[0x1] = "uop cache data parity error",
	[0x2] = "Insn buffer parity error",
	[0x3] = "uop queue parity error",
	[0x4] = "Insn dispatch queue parity error",
	[0x5] = "Fetch address FIFO parity",
	[0x6] = "Patch RAM data parity",
	[0x7] = "Patch RAM sequencer parity",
	[0x8] = "uop buffer parity",
};

static const char * const smca_ex_mce_desc[] = {
	[0x0] = "Watchdog timeout error",
	[0x1] = "Phy register file parity",
	[0x2] = "Flag register file parity",
	[0x3] = "Immediate displacement register file parity",
	[0x4] = "Address generator payload parity",
	[0x5] = "EX payload parity",
	[0x6] = "Checkpoint queue parity",
	[0x7] = "Retire dispatch queue parity",
	[0x8] = "Retire status queue parity error",
	[0x9] = "Scheduling queue parity error",
	[0xa] = "Branch buffer queue parity error",
};

static const char * const smca_fp_mce_desc[] = {
	[0x0] = "Physical register file parity",
	[0x1] = "Freelist parity error",
	[0x2] = "Schedule queue parity",
	[0x3] = "NSQ parity error",
	[0x4] = "Retire queue parity",
	[0x5] = "Status register file parity",
	[0x6] = "Hardware assertion",
};

static const char * const smca_l3_mce_desc[] = {
	[0x0] = "Shadow tag macro ECC error",
	[0x1] = "Shadow tag macro multi-way-hit error",
	[0x2] = "L3M tag ECC error",
	[0x3] = "L3M tag multi-way-hit error",
	[0x4] = "L3M data ECC error",
	[0x5] = "XI parity, L3 fill done channel error",
	[0x6] = "L3 victim queue parity",
	[0x7] = "L3 HW assert",
};

static const char * const smca_cs_mce_desc[] = {
	[0x0] = "Illegal request from transport layer",
	[0x1] = "Address violation",
	[0x2] = "Security violation",
	[0x3] = "Illegal response from transport layer",
	[0x4] = "Unexpected response",
	[0x5] = "Parity error on incoming request or probe response data",
	[0x6] = "Parity error on incoming read response data",
	[0x7] = "Atomic request parity",
	[0x8] = "ECC error on probe filter access",
};

static const char * const smca_pie_mce_desc[] = {
	[0x0] = "HW assert",
	[0x1] = "Internal PIE register security violation",
	[0x2] = "Error on GMI link",
	[0x3] = "Poison data written to internal PIE register",
};

static const char * const smca_umc_mce_desc[] = {
	[0x0] = "DRAM ECC error",
	[0x1] = "Data poison error on DRAM",
	[0x2] = "SDP parity error",
	[0x3] = "Advanced peripheral bus error",
	[0x4] = "Command/address parity error",
	[0x5] = "Write data CRC error",
};

static const char * const smca_pb_mce_desc[] = {
	[0x0] = "Parameter Block RAM ECC error",
};

static const char * const smca_psp_mce_desc[] = {
	[0x0] = "PSP RAM ECC or parity error",
};

static const char * const smca_smu_mce_desc[] = {
	[0x0] = "SMU RAM ECC or parity error",
};
287 
288 struct smca_mce_desc {
289 	const char * const *descs;
290 	unsigned int num_descs;
291 };
292 
293 static struct smca_mce_desc smca_mce_descs[] = {
294 	[SMCA_LS]	= { smca_ls_mce_desc,	ARRAY_SIZE(smca_ls_mce_desc)	},
295 	[SMCA_IF]	= { smca_if_mce_desc,	ARRAY_SIZE(smca_if_mce_desc)	},
296 	[SMCA_L2_CACHE]	= { smca_l2_mce_desc,	ARRAY_SIZE(smca_l2_mce_desc)	},
297 	[SMCA_DE]	= { smca_de_mce_desc,	ARRAY_SIZE(smca_de_mce_desc)	},
298 	[SMCA_EX]	= { smca_ex_mce_desc,	ARRAY_SIZE(smca_ex_mce_desc)	},
299 	[SMCA_FP]	= { smca_fp_mce_desc,	ARRAY_SIZE(smca_fp_mce_desc)	},
300 	[SMCA_L3_CACHE]	= { smca_l3_mce_desc,	ARRAY_SIZE(smca_l3_mce_desc)	},
301 	[SMCA_CS]	= { smca_cs_mce_desc,	ARRAY_SIZE(smca_cs_mce_desc)	},
302 	[SMCA_PIE]	= { smca_pie_mce_desc,	ARRAY_SIZE(smca_pie_mce_desc)	},
303 	[SMCA_UMC]	= { smca_umc_mce_desc,	ARRAY_SIZE(smca_umc_mce_desc)	},
304 	[SMCA_PB]	= { smca_pb_mce_desc,	ARRAY_SIZE(smca_pb_mce_desc)	},
305 	[SMCA_PSP]	= { smca_psp_mce_desc,	ARRAY_SIZE(smca_psp_mce_desc)	},
306 	[SMCA_SMU]	= { smca_smu_mce_desc,	ARRAY_SIZE(smca_smu_mce_desc)	},
307 };
308 
309 static bool f12h_mc0_mce(u16 ec, u8 xec)
310 {
311 	bool ret = false;
312 
313 	if (MEM_ERROR(ec)) {
314 		u8 ll = LL(ec);
315 		ret = true;
316 
317 		if (ll == LL_L2)
318 			pr_cont("during L1 linefill from L2.\n");
319 		else if (ll == LL_L1)
320 			pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
321 		else
322 			ret = false;
323 	}
324 	return ret;
325 }
326 
327 static bool f10h_mc0_mce(u16 ec, u8 xec)
328 {
329 	if (R4(ec) == R4_GEN && LL(ec) == LL_L1) {
330 		pr_cont("during data scrub.\n");
331 		return true;
332 	}
333 	return f12h_mc0_mce(ec, xec);
334 }
335 
336 static bool k8_mc0_mce(u16 ec, u8 xec)
337 {
338 	if (BUS_ERROR(ec)) {
339 		pr_cont("during system linefill.\n");
340 		return true;
341 	}
342 
343 	return f10h_mc0_mce(ec, xec);
344 }
345 
346 static bool cat_mc0_mce(u16 ec, u8 xec)
347 {
348 	u8 r4	 = R4(ec);
349 	bool ret = true;
350 
351 	if (MEM_ERROR(ec)) {
352 
353 		if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
354 			return false;
355 
356 		switch (r4) {
357 		case R4_DRD:
358 		case R4_DWR:
359 			pr_cont("Data/Tag parity error due to %s.\n",
360 				(r4 == R4_DRD ? "load/hw prf" : "store"));
361 			break;
362 		case R4_EVICT:
363 			pr_cont("Copyback parity error on a tag miss.\n");
364 			break;
365 		case R4_SNOOP:
366 			pr_cont("Tag parity error during snoop.\n");
367 			break;
368 		default:
369 			ret = false;
370 		}
371 	} else if (BUS_ERROR(ec)) {
372 
373 		if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
374 			return false;
375 
376 		pr_cont("System read data error on a ");
377 
378 		switch (r4) {
379 		case R4_RD:
380 			pr_cont("TLB reload.\n");
381 			break;
382 		case R4_DWR:
383 			pr_cont("store.\n");
384 			break;
385 		case R4_DRD:
386 			pr_cont("load.\n");
387 			break;
388 		default:
389 			ret = false;
390 		}
391 	} else {
392 		ret = false;
393 	}
394 
395 	return ret;
396 }
397 
398 static bool f15h_mc0_mce(u16 ec, u8 xec)
399 {
400 	bool ret = true;
401 
402 	if (MEM_ERROR(ec)) {
403 
404 		switch (xec) {
405 		case 0x0:
406 			pr_cont("Data Array access error.\n");
407 			break;
408 
409 		case 0x1:
410 			pr_cont("UC error during a linefill from L2/NB.\n");
411 			break;
412 
413 		case 0x2:
414 		case 0x11:
415 			pr_cont("STQ access error.\n");
416 			break;
417 
418 		case 0x3:
419 			pr_cont("SCB access error.\n");
420 			break;
421 
422 		case 0x10:
423 			pr_cont("Tag error.\n");
424 			break;
425 
426 		case 0x12:
427 			pr_cont("LDQ access error.\n");
428 			break;
429 
430 		default:
431 			ret = false;
432 		}
433 	} else if (BUS_ERROR(ec)) {
434 
435 		if (!xec)
436 			pr_cont("System Read Data Error.\n");
437 		else
438 			pr_cont(" Internal error condition type %d.\n", xec);
439 	} else if (INT_ERROR(ec)) {
440 		if (xec <= 0x1f)
441 			pr_cont("Hardware Assert.\n");
442 		else
443 			ret = false;
444 
445 	} else
446 		ret = false;
447 
448 	return ret;
449 }
450 
451 static void decode_mc0_mce(struct mce *m)
452 {
453 	u16 ec = EC(m->status);
454 	u8 xec = XEC(m->status, xec_mask);
455 
456 	pr_emerg(HW_ERR "MC0 Error: ");
457 
458 	/* TLB error signatures are the same across families */
459 	if (TLB_ERROR(ec)) {
460 		if (TT(ec) == TT_DATA) {
461 			pr_cont("%s TLB %s.\n", LL_MSG(ec),
462 				((xec == 2) ? "locked miss"
463 					    : (xec ? "multimatch" : "parity")));
464 			return;
465 		}
466 	} else if (fam_ops->mc0_mce(ec, xec))
467 		;
468 	else
469 		pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n");
470 }
471 
472 static bool k8_mc1_mce(u16 ec, u8 xec)
473 {
474 	u8 ll	 = LL(ec);
475 	bool ret = true;
476 
477 	if (!MEM_ERROR(ec))
478 		return false;
479 
480 	if (ll == 0x2)
481 		pr_cont("during a linefill from L2.\n");
482 	else if (ll == 0x1) {
483 		switch (R4(ec)) {
484 		case R4_IRD:
485 			pr_cont("Parity error during data load.\n");
486 			break;
487 
488 		case R4_EVICT:
489 			pr_cont("Copyback Parity/Victim error.\n");
490 			break;
491 
492 		case R4_SNOOP:
493 			pr_cont("Tag Snoop error.\n");
494 			break;
495 
496 		default:
497 			ret = false;
498 			break;
499 		}
500 	} else
501 		ret = false;
502 
503 	return ret;
504 }
505 
506 static bool cat_mc1_mce(u16 ec, u8 xec)
507 {
508 	u8 r4    = R4(ec);
509 	bool ret = true;
510 
511 	if (!MEM_ERROR(ec))
512 		return false;
513 
514 	if (TT(ec) != TT_INSTR)
515 		return false;
516 
517 	if (r4 == R4_IRD)
518 		pr_cont("Data/tag array parity error for a tag hit.\n");
519 	else if (r4 == R4_SNOOP)
520 		pr_cont("Tag error during snoop/victimization.\n");
521 	else if (xec == 0x0)
522 		pr_cont("Tag parity error from victim castout.\n");
523 	else if (xec == 0x2)
524 		pr_cont("Microcode patch RAM parity error.\n");
525 	else
526 		ret = false;
527 
528 	return ret;
529 }
530 
531 static bool f15h_mc1_mce(u16 ec, u8 xec)
532 {
533 	bool ret = true;
534 
535 	if (!MEM_ERROR(ec))
536 		return false;
537 
538 	switch (xec) {
539 	case 0x0 ... 0xa:
540 		pr_cont("%s.\n", f15h_mc1_mce_desc[xec]);
541 		break;
542 
543 	case 0xd:
544 		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]);
545 		break;
546 
547 	case 0x10:
548 		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]);
549 		break;
550 
551 	case 0x11 ... 0x15:
552 		pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]);
553 		break;
554 
555 	default:
556 		ret = false;
557 	}
558 	return ret;
559 }
560 
561 static void decode_mc1_mce(struct mce *m)
562 {
563 	u16 ec = EC(m->status);
564 	u8 xec = XEC(m->status, xec_mask);
565 
566 	pr_emerg(HW_ERR "MC1 Error: ");
567 
568 	if (TLB_ERROR(ec))
569 		pr_cont("%s TLB %s.\n", LL_MSG(ec),
570 			(xec ? "multimatch" : "parity error"));
571 	else if (BUS_ERROR(ec)) {
572 		bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));
573 
574 		pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
575 	} else if (INT_ERROR(ec)) {
576 		if (xec <= 0x3f)
577 			pr_cont("Hardware Assert.\n");
578 		else
579 			goto wrong_mc1_mce;
580 	} else if (fam_ops->mc1_mce(ec, xec))
581 		;
582 	else
583 		goto wrong_mc1_mce;
584 
585 	return;
586 
587 wrong_mc1_mce:
588 	pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n");
589 }
590 
591 static bool k8_mc2_mce(u16 ec, u8 xec)
592 {
593 	bool ret = true;
594 
595 	if (xec == 0x1)
596 		pr_cont(" in the write data buffers.\n");
597 	else if (xec == 0x3)
598 		pr_cont(" in the victim data buffers.\n");
599 	else if (xec == 0x2 && MEM_ERROR(ec))
600 		pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
601 	else if (xec == 0x0) {
602 		if (TLB_ERROR(ec))
603 			pr_cont("%s error in a Page Descriptor Cache or Guest TLB.\n",
604 				TT_MSG(ec));
605 		else if (BUS_ERROR(ec))
606 			pr_cont(": %s/ECC error in data read from NB: %s.\n",
607 				R4_MSG(ec), PP_MSG(ec));
608 		else if (MEM_ERROR(ec)) {
609 			u8 r4 = R4(ec);
610 
611 			if (r4 >= 0x7)
612 				pr_cont(": %s error during data copyback.\n",
613 					R4_MSG(ec));
614 			else if (r4 <= 0x1)
615 				pr_cont(": %s parity/ECC error during data "
616 					"access from L2.\n", R4_MSG(ec));
617 			else
618 				ret = false;
619 		} else
620 			ret = false;
621 	} else
622 		ret = false;
623 
624 	return ret;
625 }
626 
627 static bool f15h_mc2_mce(u16 ec, u8 xec)
628 {
629 	bool ret = true;
630 
631 	if (TLB_ERROR(ec)) {
632 		if (xec == 0x0)
633 			pr_cont("Data parity TLB read error.\n");
634 		else if (xec == 0x1)
635 			pr_cont("Poison data provided for TLB fill.\n");
636 		else
637 			ret = false;
638 	} else if (BUS_ERROR(ec)) {
639 		if (xec > 2)
640 			ret = false;
641 
642 		pr_cont("Error during attempted NB data read.\n");
643 	} else if (MEM_ERROR(ec)) {
644 		switch (xec) {
645 		case 0x4 ... 0xc:
646 			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]);
647 			break;
648 
649 		case 0x10 ... 0x14:
650 			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]);
651 			break;
652 
653 		default:
654 			ret = false;
655 		}
656 	} else if (INT_ERROR(ec)) {
657 		if (xec <= 0x3f)
658 			pr_cont("Hardware Assert.\n");
659 		else
660 			ret = false;
661 	}
662 
663 	return ret;
664 }
665 
666 static bool f16h_mc2_mce(u16 ec, u8 xec)
667 {
668 	u8 r4 = R4(ec);
669 
670 	if (!MEM_ERROR(ec))
671 		return false;
672 
673 	switch (xec) {
674 	case 0x04 ... 0x05:
675 		pr_cont("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O');
676 		break;
677 
678 	case 0x09 ... 0x0b:
679 	case 0x0d ... 0x0f:
680 		pr_cont("ECC error in L2 tag (%s).\n",
681 			((r4 == R4_GEN)   ? "BankReq" :
682 			((r4 == R4_SNOOP) ? "Prb"     : "Fill")));
683 		break;
684 
685 	case 0x10 ... 0x19:
686 	case 0x1b:
687 		pr_cont("ECC error in L2 data array (%s).\n",
688 			(((r4 == R4_RD) && !(xec & 0x3)) ? "Hit"  :
689 			((r4 == R4_GEN)   ? "Attr" :
690 			((r4 == R4_EVICT) ? "Vict" : "Fill"))));
691 		break;
692 
693 	case 0x1c ... 0x1d:
694 	case 0x1f:
695 		pr_cont("Parity error in L2 attribute bits (%s).\n",
696 			((r4 == R4_RD)  ? "Hit"  :
697 			((r4 == R4_GEN) ? "Attr" : "Fill")));
698 		break;
699 
700 	default:
701 		return false;
702 	}
703 
704 	return true;
705 }
706 
707 static void decode_mc2_mce(struct mce *m)
708 {
709 	u16 ec = EC(m->status);
710 	u8 xec = XEC(m->status, xec_mask);
711 
712 	pr_emerg(HW_ERR "MC2 Error: ");
713 
714 	if (!fam_ops->mc2_mce(ec, xec))
715 		pr_cont(HW_ERR "Corrupted MC2 MCE info?\n");
716 }
717 
718 static void decode_mc3_mce(struct mce *m)
719 {
720 	u16 ec = EC(m->status);
721 	u8 xec = XEC(m->status, xec_mask);
722 
723 	if (boot_cpu_data.x86 >= 0x14) {
724 		pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family,"
725 			 " please report on LKML.\n");
726 		return;
727 	}
728 
729 	pr_emerg(HW_ERR "MC3 Error");
730 
731 	if (xec == 0x0) {
732 		u8 r4 = R4(ec);
733 
734 		if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
735 			goto wrong_mc3_mce;
736 
737 		pr_cont(" during %s.\n", R4_MSG(ec));
738 	} else
739 		goto wrong_mc3_mce;
740 
741 	return;
742 
743  wrong_mc3_mce:
744 	pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n");
745 }
746 
747 static void decode_mc4_mce(struct mce *m)
748 {
749 	unsigned int fam = x86_family(m->cpuid);
750 	int node_id = amd_get_nb_id(m->extcpu);
751 	u16 ec = EC(m->status);
752 	u8 xec = XEC(m->status, 0x1f);
753 	u8 offset = 0;
754 
755 	pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id);
756 
757 	switch (xec) {
758 	case 0x0 ... 0xe:
759 
760 		/* special handling for DRAM ECCs */
761 		if (xec == 0x0 || xec == 0x8) {
762 			/* no ECCs on F11h */
763 			if (fam == 0x11)
764 				goto wrong_mc4_mce;
765 
766 			pr_cont("%s.\n", mc4_mce_desc[xec]);
767 
768 			if (decode_dram_ecc)
769 				decode_dram_ecc(node_id, m);
770 			return;
771 		}
772 		break;
773 
774 	case 0xf:
775 		if (TLB_ERROR(ec))
776 			pr_cont("GART Table Walk data error.\n");
777 		else if (BUS_ERROR(ec))
778 			pr_cont("DMA Exclusion Vector Table Walk error.\n");
779 		else
780 			goto wrong_mc4_mce;
781 		return;
782 
783 	case 0x19:
784 		if (fam == 0x15 || fam == 0x16)
785 			pr_cont("Compute Unit Data Error.\n");
786 		else
787 			goto wrong_mc4_mce;
788 		return;
789 
790 	case 0x1c ... 0x1f:
791 		offset = 13;
792 		break;
793 
794 	default:
795 		goto wrong_mc4_mce;
796 	}
797 
798 	pr_cont("%s.\n", mc4_mce_desc[xec - offset]);
799 	return;
800 
801  wrong_mc4_mce:
802 	pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n");
803 }
804 
805 static void decode_mc5_mce(struct mce *m)
806 {
807 	unsigned int fam = x86_family(m->cpuid);
808 	u16 ec = EC(m->status);
809 	u8 xec = XEC(m->status, xec_mask);
810 
811 	if (fam == 0xf || fam == 0x11)
812 		goto wrong_mc5_mce;
813 
814 	pr_emerg(HW_ERR "MC5 Error: ");
815 
816 	if (INT_ERROR(ec)) {
817 		if (xec <= 0x1f) {
818 			pr_cont("Hardware Assert.\n");
819 			return;
820 		} else
821 			goto wrong_mc5_mce;
822 	}
823 
824 	if (xec == 0x0 || xec == 0xc)
825 		pr_cont("%s.\n", mc5_mce_desc[xec]);
826 	else if (xec <= 0xd)
827 		pr_cont("%s parity error.\n", mc5_mce_desc[xec]);
828 	else
829 		goto wrong_mc5_mce;
830 
831 	return;
832 
833  wrong_mc5_mce:
834 	pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n");
835 }
836 
837 static void decode_mc6_mce(struct mce *m)
838 {
839 	u8 xec = XEC(m->status, xec_mask);
840 
841 	pr_emerg(HW_ERR "MC6 Error: ");
842 
843 	if (xec > 0x5)
844 		goto wrong_mc6_mce;
845 
846 	pr_cont("%s parity error.\n", mc6_mce_desc[xec]);
847 	return;
848 
849  wrong_mc6_mce:
850 	pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
851 }
852 
853 /* Decode errors according to Scalable MCA specification */
854 static void decode_smca_error(struct mce *m)
855 {
856 	struct smca_hwid *hwid;
857 	unsigned int bank_type;
858 	const char *ip_name;
859 	u8 xec = XEC(m->status, xec_mask);
860 
861 	if (m->bank >= ARRAY_SIZE(smca_banks))
862 		return;
863 
864 	if (x86_family(m->cpuid) >= 0x17 && m->bank == 4)
865 		pr_emerg(HW_ERR "Bank 4 is reserved on Fam17h.\n");
866 
867 	hwid = smca_banks[m->bank].hwid;
868 	if (!hwid)
869 		return;
870 
871 	bank_type = hwid->bank_type;
872 	ip_name = smca_get_long_name(bank_type);
873 
874 	pr_emerg(HW_ERR "%s Extended Error Code: %d\n", ip_name, xec);
875 
876 	/* Only print the decode of valid error codes */
877 	if (xec < smca_mce_descs[bank_type].num_descs &&
878 			(hwid->xec_bitmap & BIT_ULL(xec))) {
879 		pr_emerg(HW_ERR "%s Error: ", ip_name);
880 		pr_cont("%s.\n", smca_mce_descs[bank_type].descs[xec]);
881 	}
882 
883 	if (bank_type == SMCA_UMC && xec == 0 && decode_dram_ecc)
884 		decode_dram_ecc(cpu_to_node(m->extcpu), m);
885 }
886 
887 static inline void amd_decode_err_code(u16 ec)
888 {
889 	if (INT_ERROR(ec)) {
890 		pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec));
891 		return;
892 	}
893 
894 	pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));
895 
896 	if (BUS_ERROR(ec))
897 		pr_cont(", mem/io: %s", II_MSG(ec));
898 	else
899 		pr_cont(", tx: %s", TT_MSG(ec));
900 
901 	if (MEM_ERROR(ec) || BUS_ERROR(ec)) {
902 		pr_cont(", mem-tx: %s", R4_MSG(ec));
903 
904 		if (BUS_ERROR(ec))
905 			pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));
906 	}
907 
908 	pr_cont("\n");
909 }
910 
911 /*
912  * Filter out unwanted MCE signatures here.
913  */
914 static bool amd_filter_mce(struct mce *m)
915 {
916 	/*
917 	 * NB GART TLB error reporting is disabled by default.
918 	 */
919 	if (m->bank == 4 && XEC(m->status, 0x1f) == 0x5 && !report_gart_errors)
920 		return true;
921 
922 	return false;
923 }
924 
925 static const char *decode_error_status(struct mce *m)
926 {
927 	if (m->status & MCI_STATUS_UC) {
928 		if (m->status & MCI_STATUS_PCC)
929 			return "System Fatal error.";
930 		if (m->mcgstatus & MCG_STATUS_RIPV)
931 			return "Uncorrected, software restartable error.";
932 		return "Uncorrected, software containable error.";
933 	}
934 
935 	if (m->status & MCI_STATUS_DEFERRED)
936 		return "Deferred error, no action required.";
937 
938 	return "Corrected error, no action required.";
939 }
940 
941 static int
942 amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
943 {
944 	struct mce *m = (struct mce *)data;
945 	unsigned int fam = x86_family(m->cpuid);
946 	int ecc;
947 
948 	if (amd_filter_mce(m))
949 		return NOTIFY_STOP;
950 
951 	pr_emerg(HW_ERR "%s\n", decode_error_status(m));
952 
953 	pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
954 		m->extcpu,
955 		fam, x86_model(m->cpuid), x86_stepping(m->cpuid),
956 		m->bank,
957 		((m->status & MCI_STATUS_OVER)	? "Over"  : "-"),
958 		((m->status & MCI_STATUS_UC)	? "UE"	  :
959 		 (m->status & MCI_STATUS_DEFERRED) ? "-"  : "CE"),
960 		((m->status & MCI_STATUS_MISCV)	? "MiscV" : "-"),
961 		((m->status & MCI_STATUS_PCC)	? "PCC"	  : "-"),
962 		((m->status & MCI_STATUS_ADDRV)	? "AddrV" : "-"));
963 
964 	if (fam >= 0x15) {
965 		pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-"));
966 
967 		/* F15h, bank4, bit 43 is part of McaStatSubCache. */
968 		if (fam != 0x15 || m->bank != 4)
969 			pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-"));
970 	}
971 
972 	if (boot_cpu_has(X86_FEATURE_SMCA)) {
973 		u32 low, high;
974 		u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
975 
976 		pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));
977 
978 		if (!rdmsr_safe(addr, &low, &high) &&
979 		    (low & MCI_CONFIG_MCAX))
980 			pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
981 	}
982 
983 	/* do the two bits[14:13] together */
984 	ecc = (m->status >> 45) & 0x3;
985 	if (ecc)
986 		pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));
987 
988 	pr_cont("]: 0x%016llx\n", m->status);
989 
990 	if (m->status & MCI_STATUS_ADDRV)
991 		pr_emerg(HW_ERR "Error Addr: 0x%016llx\n", m->addr);
992 
993 	if (boot_cpu_has(X86_FEATURE_SMCA)) {
994 		pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid);
995 
996 		if (m->status & MCI_STATUS_SYNDV)
997 			pr_cont(", Syndrome: 0x%016llx", m->synd);
998 
999 		pr_cont("\n");
1000 
1001 		decode_smca_error(m);
1002 		goto err_code;
1003 	}
1004 
1005 	if (m->tsc)
1006 		pr_emerg(HW_ERR "TSC: %llu\n", m->tsc);
1007 
1008 	if (!fam_ops)
1009 		goto err_code;
1010 
1011 	switch (m->bank) {
1012 	case 0:
1013 		decode_mc0_mce(m);
1014 		break;
1015 
1016 	case 1:
1017 		decode_mc1_mce(m);
1018 		break;
1019 
1020 	case 2:
1021 		decode_mc2_mce(m);
1022 		break;
1023 
1024 	case 3:
1025 		decode_mc3_mce(m);
1026 		break;
1027 
1028 	case 4:
1029 		decode_mc4_mce(m);
1030 		break;
1031 
1032 	case 5:
1033 		decode_mc5_mce(m);
1034 		break;
1035 
1036 	case 6:
1037 		decode_mc6_mce(m);
1038 		break;
1039 
1040 	default:
1041 		break;
1042 	}
1043 
1044  err_code:
1045 	amd_decode_err_code(m->status & 0xffff);
1046 
1047 	return NOTIFY_STOP;
1048 }
1049 
1050 static struct notifier_block amd_mce_dec_nb = {
1051 	.notifier_call	= amd_decode_mce,
1052 	.priority	= MCE_PRIO_EDAC,
1053 };
1054 
1055 static int __init mce_amd_init(void)
1056 {
1057 	struct cpuinfo_x86 *c = &boot_cpu_data;
1058 
1059 	if (c->x86_vendor != X86_VENDOR_AMD)
1060 		return -ENODEV;
1061 
1062 	fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL);
1063 	if (!fam_ops)
1064 		return -ENOMEM;
1065 
1066 	switch (c->x86) {
1067 	case 0xf:
1068 		fam_ops->mc0_mce = k8_mc0_mce;
1069 		fam_ops->mc1_mce = k8_mc1_mce;
1070 		fam_ops->mc2_mce = k8_mc2_mce;
1071 		break;
1072 
1073 	case 0x10:
1074 		fam_ops->mc0_mce = f10h_mc0_mce;
1075 		fam_ops->mc1_mce = k8_mc1_mce;
1076 		fam_ops->mc2_mce = k8_mc2_mce;
1077 		break;
1078 
1079 	case 0x11:
1080 		fam_ops->mc0_mce = k8_mc0_mce;
1081 		fam_ops->mc1_mce = k8_mc1_mce;
1082 		fam_ops->mc2_mce = k8_mc2_mce;
1083 		break;
1084 
1085 	case 0x12:
1086 		fam_ops->mc0_mce = f12h_mc0_mce;
1087 		fam_ops->mc1_mce = k8_mc1_mce;
1088 		fam_ops->mc2_mce = k8_mc2_mce;
1089 		break;
1090 
1091 	case 0x14:
1092 		fam_ops->mc0_mce = cat_mc0_mce;
1093 		fam_ops->mc1_mce = cat_mc1_mce;
1094 		fam_ops->mc2_mce = k8_mc2_mce;
1095 		break;
1096 
1097 	case 0x15:
1098 		xec_mask = c->x86_model == 0x60 ? 0x3f : 0x1f;
1099 
1100 		fam_ops->mc0_mce = f15h_mc0_mce;
1101 		fam_ops->mc1_mce = f15h_mc1_mce;
1102 		fam_ops->mc2_mce = f15h_mc2_mce;
1103 		break;
1104 
1105 	case 0x16:
1106 		xec_mask = 0x1f;
1107 		fam_ops->mc0_mce = cat_mc0_mce;
1108 		fam_ops->mc1_mce = cat_mc1_mce;
1109 		fam_ops->mc2_mce = f16h_mc2_mce;
1110 		break;
1111 
1112 	case 0x17:
1113 		xec_mask = 0x3f;
1114 		if (!boot_cpu_has(X86_FEATURE_SMCA)) {
1115 			printk(KERN_WARNING "Decoding supported only on Scalable MCA processors.\n");
1116 			goto err_out;
1117 		}
1118 		break;
1119 
1120 	default:
1121 		printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
1122 		goto err_out;
1123 	}
1124 
1125 	pr_info("MCE: In-kernel MCE decoding enabled.\n");
1126 
1127 	mce_register_decode_chain(&amd_mce_dec_nb);
1128 
1129 	return 0;
1130 
1131 err_out:
1132 	kfree(fam_ops);
1133 	fam_ops = NULL;
1134 	return -EINVAL;
1135 }
1136 early_initcall(mce_amd_init);
1137 
#ifdef MODULE
/* Module unload: leave the decode chain, then release the decoder ops. */
static void __exit mce_amd_exit(void)
{
	mce_unregister_decode_chain(&amd_mce_dec_nb);
	kfree(fam_ops);
}
module_exit(mce_amd_exit);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("AMD MCE decoder");
MODULE_ALIAS("edac-mce-amd");
#endif
1150